Example #1
def output_mask(passage, answers):
    # returns a vector -- same length as the passage -- with 1 if the token is part of an answer, otherwise 0
    answer_marker = "$$answer$"  # hack: prefix used to flag answer tokens after re-tokenization
    for answer in answers:
        replacement = " ".join([answer_marker + w for w in tokenize(answer)])
        passage = passage.replace(answer, replacement)
    return np.array([(1 if token.startswith(answer_marker) else 0)
                     for token in tokenize(passage)])
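
A quick usage sketch for output_mask. The shared tokenize helper is not shown in this example, so a plain whitespace tokenizer stands in for it here (an assumption, not the project's actual implementation):

import numpy as np

def tokenize(text):
    # Stand-in tokenizer (assumption): the real helper may behave differently.
    return text.split()

passage = "The cat sat on the mat"
answers = ["the mat"]
# output_mask(passage, answers) -> array([0, 0, 0, 0, 1, 1])
# i.e. a 1 for every token covered by an answer span, 0 elsewhere.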
Example #2
def fetch_abstracts():
    # First we need the Id and year of each document.
    results = db.query("SELECT ID, year FROM Document ORDER BY year;")

    rows = []
    for result in results:
        did = result[0]  # Document ID.
        y = result[1]  # Document Year.
        fobj = open(base_path + "nips" + year_to_str(y)
            + "/" + get_str_id(did) + ".txt", "r")

        with fobj as fo:
            abstract = get_abstract(fo.read())

        if abstract:
            text = remove_punctuation(abstract.lower())
            tokens = tokenize(text)
            cls_abs = remove_stopwords(tokens)

            rows.append((did, y, " ".join(cls_abs)))

    db.insert_into_mysql('Abstract', abs_columns, rows)
Example #3
def make_ipsum():
    try:
        url = request.args.get('url')
        para_size = request.args.get('para-size')
        number_para = request.args.get('number-para')
        print('para_size', para_size)
        print('number_para', number_para)
        result = {
            "url": url,
        }
        # return 'Query string example'
        url = url.strip()
        print('url', url)

        # SiteSpider(CrawlSpider, url)
        copy = get_site_markup(url)
        clean = clean_copy(copy)
        tokenized = tokenize(clean)
        matches = match_tokens(tokenized, tags_danish)

        ipsum = ' '.join(matches)
        para_ipsum = make_para(ipsum, para_size, number_para)
        result["ipsum"] = para_ipsum

        return jsonify(data=result), 200
    except Exception:
        return jsonify(
            error={"message": 'Oof, some of these fancy sites block our bot'
                   }), 403
Example #4
  def add_doc(self, doc, flag):
    """ 
      Adds doc and doc components to class data.
        -1 -> unknown
        0 -> negative
        1 -> positive
    """

    self.docs.append((doc, flag))
    
    # add the doc's words
    tokens = tokenize(doc)

    # update the vocab
    self.vocab.update(tokens)

    if flag != -1:
      self.label_count[flag] += len(tokens)

    # get the number of times that this word appears in the doc
    word_counts = {}
    for word in tokens:
      if word not in word_counts:
        word_counts[word] = 0
      word_counts[word] += 1
      
    # increment all of the word counts
    for word in tokens:
      if word not in self.words:
        self.words[word] = {}
      self.words[word][len(self.docs)-1] = word_counts[word]
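
The two loops above build a per-document term-frequency index keyed by word and document position. A standalone sketch of the same bookkeeping with collections.Counter (the function name and toy data are illustrative, not part of the original class):

from collections import Counter

def index_doc(tokens, doc_id, words_index):
    # Record how often each token occurs in document `doc_id`,
    # mirroring the self.words structure built above.
    for word, count in Counter(tokens).items():
        words_index.setdefault(word, {})[doc_id] = count
    return words_index

# index_doc(["good", "good", "movie"], 0, {})
# -> {'good': {0: 2}, 'movie': {0: 1}}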
Example #5
def topic_extraction(collection, max_topics=100):
    """
    :collection -> MongoDB collection obtained with find() or list of documents
    :max_topics -> max number of topics to analyse K-means performance for
    """

    corpus = []
    for tweet in collection:
        corpus.append(tokenize(parse_tweet(tweet)))

    tfidf = TfidfVectorizer(  # parameters can be changed
        min_df=5,
        max_df=0.95,
        max_features=8000,
    )

    tfidf.fit(corpus)
    text = tfidf.transform(corpus)
    labels = tfidf.get_feature_names()

    K = find_optimal_size(text, max_topics)

    clusters = MiniBatchKMeans(n_clusters=K,
                               init_size=1024,
                               batch_size=2048,
                               random_state=2211).fit_predict(text)

    df = pd.DataFrame(text.todense()).groupby(clusters).mean()

    top = []
    for i, r in df.iterrows():
        top_words = ', '.join([labels[t] for t in np.argsort(r)[-10:]])
        top.append("Cluster {}: {}".format(i, top_words))
    return top
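
find_optimal_size is not shown in this example. A rough sketch of what such a helper might do, picking K from the elbow of the MiniBatchKMeans inertia curve (the step size, threshold, and stopping rule are assumptions):

from sklearn.cluster import MiniBatchKMeans

def find_optimal_size(text, max_topics, step=5, random_state=2211):
    # Fit K-means for a range of K and record the inertia (within-cluster SSE).
    ks = list(range(2, max_topics + 1, step))
    inertias = [
        MiniBatchKMeans(n_clusters=k, random_state=random_state).fit(text).inertia_
        for k in ks
    ]
    # Crude elbow rule: stop at the first K whose relative improvement
    # over the previous K drops below 5%.
    for k, prev, curr in zip(ks[1:], inertias, inertias[1:]):
        if (prev - curr) / prev < 0.05:
            return k
    return ks[-1]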
Example #6
    def _predict_doc(self, x, flag):
        """ Get probability of x being positive/negative """

        if flag == 1:
            denom = self.X.num_positive()
        else:
            denom = self.X.num_negative()
        denom += self.X.vocab_size()

        # multiply word probabilities for all words in x
        words = tokenize(x)
        # prob = 1.0
        # for word in words:
        #   wi = self._doc_count_for_word(word, flag=flag)
        #   # utilize the Laplace Smooth
        #   prob *= ((float(wi)+1.0) / (float(denom)+2.0))

        prob = math.log(self.X.priors[str(flag)])
        for word in words:
            wi = self._doc_count_for_word(word, flag=flag)
            # apply Laplace smoothing
            prob += math.log((float(wi) + 1.0) / (float(denom) + 2.0))

        # prob *= math.log(self.X.priors[str(flag)])

        return prob
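
The commented-out block shows why the switch from multiplying probabilities to summing logs matters: the product of many small per-word probabilities underflows to 0.0 in double precision, while the log-sum stays finite. A tiny standalone illustration:

import math

probs = [1e-3] * 400              # e.g. 400 words, each with probability 0.001

product = 1.0
for p in probs:
    product *= p                  # underflows to exactly 0.0

log_sum = sum(math.log(p) for p in probs)

print(product)                    # 0.0
print(log_sum)                    # about -2763.1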
Example #7
def map_line_to_seq(line, inverse_voc):
    """ Converts a string(sentence) to a sequence of integers
    Will also tokenize the sentence
    """
    return [
        inverse_voc[w] if w in inverse_voc else inverse_voc['<below_th>']
        for w in tokenize(line)
    ]
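
A small usage sketch with a toy vocabulary; the '<below_th>' entry absorbs out-of-vocabulary tokens exactly as in the function above. The whitespace tokenizer is an assumption:

inverse_voc = {'<below_th>': 0, 'the': 1, 'cat': 2, 'sat': 3}

def tokenize(line):
    # Stand-in tokenizer (assumption).
    return line.lower().split()

# map_line_to_seq("The cat sat quietly", inverse_voc) -> [1, 2, 3, 0]
# "quietly" is not in the vocabulary, so it falls back to '<below_th>' (id 0).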
Example #8
def vectorize(text, fixed_length=None):
    vocab_size = len(vocab_lookup)
    tokens = tokenize(text)
    if fixed_length is not None:
        tokens = (tokens +
                  [0] * max(0, fixed_length - len(tokens)))[:fixed_length]
    return np.array(
        [vocab_lookup.get(token, vocab_lookup[UNKNOWN]) for token in tokens])
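
A usage sketch with a toy lookup table. The names vocab_lookup and UNKNOWN mirror the globals the function expects; the values and the whitespace tokenizer are assumptions:

UNKNOWN = '<unk>'
vocab_lookup = {'<unk>': 1, 'hello': 2, 'world': 3}

# vectorize("hello there world", fixed_length=5) -> array([2, 1, 3, 1, 1])
# "there" is out of vocabulary and the two 0-padding slots are not real tokens,
# so all three fall back to the UNKNOWN id; longer inputs are cut to fixed_length.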
Example #9
    def send_request(self, text: str, olang: str, odomain: str) -> tuple:
        """Send prepared batch for translation.

        Endpoint receives
        msg = { "src": "hello", "conf": "ger,fml" }
        transferred in bytes via socket communication

        Args:
            text: text to translate
            olang: output language
            odomain: output domain

        Returns:
            Tuple containing response with the translation or an error.
            The type of the first element is either str or bool, respectively.
        """
        msg = {"src": text.strip('|'), "conf": "{},{}".format(olang, odomain)}
        jmsg = bytes(json.dumps(msg), 'ascii')

        if self.connected:
            with self.lock:
                sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                try:
                    sock.connect((self.host, self.port))
                    sock.sendall(b'HI')
                    preresponse = sock.recv(5)
                    assert preresponse == b'okay'
                    sock.sendall(bytes("msize:" + str(len(jmsg) + 13), 'ascii'))
                    politeness = sock.recv(11)
                    assert politeness == b'still okay'
                    sock.sendall(jmsg)
                    rawresponse = sock.recv(2048)
                    if rawresponse.startswith(b"msize:"):
                        in_msg_size = int(rawresponse.strip().split(b":")[1])
                        sock.sendall(b'OK')
                        rawresponse = sock.recv(in_msg_size + 13)
                    try:
                        response = json.loads(rawresponse)
                    except json.decoder.JSONDecodeError as e:
                        app.logger.debug('Received broken json: %s', e)
                        app.logger.debug(rawresponse)
                        return False, f'Cannot decode raw server response: {rawresponse}'
                    try:
                        translation = response['final_trans']
                    except KeyError:
                        app.logger.debug('Response does not contain translation')
                        app.logger.debug(response)
                        return False, f'Server response: {response}'
                    responses = tokenize(translation)
                    return responses
                except Exception:
                    return False, traceback.format_exc()
                finally:
                    sock.close()  # return tuple?
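
For local testing it can help to stand in for the translation endpoint. The framing below (HI/okay handshake, 'msize:' headers, 'final_trans' key) is inferred from the client code above rather than from any documented protocol, so treat this as a sketch only:

import json
import socket

def serve_one_request(host='127.0.0.1', port=12345):
    # Accept a single connection and answer it the way send_request expects.
    srv = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    srv.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    srv.bind((host, port))
    srv.listen(1)
    conn, _ = srv.accept()
    try:
        assert conn.recv(5) == b'HI'
        conn.sendall(b'okay')
        size = int(conn.recv(32).split(b':')[1])     # "msize:<n>" header
        conn.sendall(b'still okay')
        msg = json.loads(conn.recv(size))            # good enough for a sketch
        reply = {'final_trans': msg['src'].upper()}  # fake "translation"
        conn.sendall(json.dumps(reply).encode('ascii'))
    finally:
        conn.close()
        srv.close()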
Example #10
def extract_top_entities(collection, condition={"$exists": True}):
    """
    Extracts number of mentions, retweets, hashtags for a collection
    As well as top 50 words
    :collection -> MongoDB collection obtained with find() or JSON Tweets
    """
    mentions_count = {}
    retweets_count = {}
    hashtags_count = {}
    corpus = []

    for tweet in collection.find({"sentiment": condition}):

        corpus += tokenize(parse_tweet(tweet)).split(" ")
        tweeter = tweet["user"]

        # RETWEETS
        if tweet.get("retweeted_status"):
            rt_user = tweet["retweeted_status"]["user"]["screen_name"]
            if not retweets_count.get(rt_user):
                # Seed with how many times the tweet has already been retweeted.
                retweets_count[rt_user] = tweet["retweeted_status"]["retweet_count"]
            else:
                # We have already seen this retweet: tally up another RT.
                retweets_count[rt_user] += 1

        if tweet.get("truncated"):
            # For truncated tweets, the entities live in the extended tweet.
            tweet = tweet["extended_tweet"]

        # USER MENTIONS
        if tweet["entities"].get("user_mentions"):
            for user in tweet["entities"]["user_mentions"] + [tweeter]:
                user = user["screen_name"]
                if not mentions_count.get(user):
                    mentions_count[user] = 1
                else:
                    mentions_count[user] += 1

        # HASHTAGS
        tweet = get_body(tweet)
        if tweet["entities"].get("hashtags"):
            for h in tweet["entities"]["hashtags"]:
                hl = h["text"].lower()
                if not hashtags_count.get(hl):
                    hashtags_count[hl] = 1
                else:
                    hashtags_count[hl] += 1

    fdist = FreqDist(corpus)
    top_50 = fdist.most_common(50)

    return mentions_count, retweets_count, hashtags_count, top_50
Example #11
def view_word_correlations(training_data, prop_words=0.25, n=200):
    """
    View the most commonly co-occurring words
  """

    import pandas as pd

    from helpers import tokenize

    # get the most commonly occurring prop_words % of words
    word_counts = {}
    for word in training_data.words:
        word_counts[word] = len(training_data.words[word])
    swc = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
    most_common_words = set()
    for i in range(len(swc)):
        if i >= (len(training_data.words) * prop_words):
            break
        most_common_words.add(swc[i][0])

    # map words to lists of word counts
    d = {}
    for word in most_common_words:
        d[word] = []
        for doc in training_data.docs:
            tokens = tokenize(doc[0])
            d[word].append(len([t for t in tokens if t == word]))

    df = pd.DataFrame(data=d)

    def get_redundant_pairs(df):
        '''Get diagonal and lower triangular pairs of correlation matrix'''
        pairs_to_drop = set()
        cols = df.columns
        for i in range(0, df.shape[1]):
            for j in range(0, i + 1):
                pairs_to_drop.add((cols[i], cols[j]))
        return pairs_to_drop

    def get_top_abs_correlations(df, n):
        au_corr = df.corr().abs().unstack()
        labels_to_drop = get_redundant_pairs(df)
        au_corr = au_corr.drop(labels=labels_to_drop).sort_values(
            ascending=False)
        return au_corr[0:n]

    print(get_top_abs_correlations(df, n=n))
Example #12
def sentiment_analysis(collection):
    """
    :collection -> MongoDB collection obtained with find() or list of documents
    """

    scores = []
    ids = []

    for tweet in collection:
        text = parse_tweet(tweet)
        scores.append(find_sentiment_tb(tokenize(text)))
        ids.append(tweet['_id'])
    assert len(scores) == len(ids)

    scores = np.array(scores)
    ids = np.array(ids)

    return scores, ids
Example #13
def generate_heatmap(net, para, question):
    vectors = [vectors_from_question(p, q) for p, q in [(para, question)]]
    questions = np.array([q for ((p, q), mask) in vectors])
    passages = np.array([p for ((p, q), mask) in vectors])
    mask = net.session.run(net.output, {
        net.dropout: 1,
        net.question: questions,
        net.passage: passages
    })[0]

    top_n = sorted(range(len(mask)), key=lambda i: mask[i], reverse=True)[:10]
    mask = [(1 if i in top_n else 0) for i in range(len(mask))]

    tokens = tokenize(para.passage)
    heatmap = u" ".join([
        u"<span style='background-color: rgba(255,0,0,{0})'>{1}</span>".format(
            max(0, min(1, value)), word) for value, word in zip(mask, tokens)
    ])
    html = u"<h1>{0}</h1> <p>{1}</p>".format(question.question, heatmap)
    return html
Example #14
    def _get_clean_html(self,token=True):
        cleaned = nltk.clean_html(self.text)

#        if self.debug:
#            print cleaned

        normalized = normalize(cleaned)

#        if self.debug:
#            print normalized

        if token:
            tok_text = tokenize(normalized)

#            if self.debug:
#                print tok_text

            return tok_text
        else:
            return normalized
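
nltk.clean_html was removed in later NLTK releases (it now raises NotImplementedError and points users at an HTML parser). A hedged replacement for that first step, assuming BeautifulSoup is acceptable in this codebase:

from bs4 import BeautifulSoup

def clean_html(html):
    # Strip tags and return the remaining text, roughly what the old
    # nltk.clean_html call used to provide.
    return BeautifulSoup(html, 'html.parser').get_text(separator=' ')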
Example #15
def fetch_collocations():
    # First we need to fetch all the abstracts for each year.
    for year in years:
        rows = []
        results = db.query(
            "SELECT ID, abstract FROM Abstract WHERE year = 20" + year)

        year_abstracts = ""
        for abstract in results:
            year_abstracts += " " + abstract[1]

        top_colls = get_collocations(tokenize(year_abstracts), 100)

        for r in results:
            colls_to_insert = [c[0] + " " + c[1] for c in top_colls if c[0] + " " + c[1] in r[1]]
            for col in colls_to_insert:
                rows.append((r[0], 2000 + int(year), col))

        db.insert_into_mysql('Collocation', coll_columns, rows)
Example #16
def fetch_collocations():
    # First we need to fetch all the abstracts for each year.
    for year in years:
        rows = []
        results = db.query(
            "SELECT ID, abstract FROM Abstract WHERE year = 20" + year)

        year_abstracts = ""
        for abstract in results:
            year_abstracts += " " + abstract[1]

        top_colls = get_collocations(tokenize(year_abstracts), 100)

        for r in results:
            colls_to_insert = [
                c[0] + " " + c[1] for c in top_colls
                if c[0] + " " + c[1] in r[1]
            ]
            for col in colls_to_insert:
                rows.append((r[0], 2000 + int(year), col))

        db.insert_into_mysql('Collocation', coll_columns, rows)
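
Both fetch_collocations variants rely on a get_collocations helper that is not shown. A plausible sketch using NLTK's bigram collocation finder; the scoring measure and frequency filter are assumptions:

from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

def get_collocations(tokens, n):
    # Return the n highest-scoring bigrams as (word1, word2) tuples,
    # matching the c[0] + " " + c[1] usage above.
    finder = BigramCollocationFinder.from_words(tokens)
    finder.apply_freq_filter(3)  # drop very rare pairs (assumption)
    return finder.nbest(BigramAssocMeasures.likelihood_ratio, n)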
Example #17
def fetch_abstracts():
    # First we need the Id and year of each document.
    results = db.query("SELECT ID, year FROM Document ORDER BY year;")

    rows = []
    for result in results:
        did = result[0]  # Document ID.
        y = result[1]  # Document Year.
        fobj = open(
            base_path + "nips" + year_to_str(y) + "/" + get_str_id(did) +
            ".txt", "r")

        with fobj as fo:
            abstract = get_abstract(fo.read())

        if abstract:
            text = remove_punctuation(abstract.lower())
            tokens = tokenize(text)
            cls_abs = remove_stopwords(tokens)

            rows.append((did, y, " ".join(cls_abs)))

    db.insert_into_mysql('Abstract', abs_columns, rows)
Example #18
    def readfile(self):
        with open(self.path) as fp:
            extra = []
            iterator = FileMacroIterator(fp)
            for line in iterator:
                line = line.rstrip()

                if line in ManpageParser.forbidden_lines:
                    raise NotSupportedFormat(self.path)

                # Fix buggy lines
                line = ManpageParser.line_replacement.get(line, line)

                for k, v in ManpageParser.str_replacement:
                    line = line.replace(k, v)

                if Line.comment in line:
                    # Line has comment
                    line = line.split(Line.comment, 1)[0]

                if line and len(line) > 2:
                    if line[-1] == "\\":
                        if line[-2] != "\\" and line[-2] != "{":
                            extra.append(line[:-1])
                            continue
                    elif line[-2:] == "\\c":
                        extra.append(line[:-2])
                        continue

                if extra:
                    extra.append(line)
                    line = ' '.join(extra)
                    extra = []

                if line == Line.cc or line == Line.c2 or line == '\'.':
                    # Empty line (cc or c2)
                    continue

                if not line:
                    # Empty line
                    self.lines.append(('', ''))
                    continue

                if line[0] in {Line.cc, Line.c2}:
                    chunks = line[1:].lstrip().split(None, 1)

                    if not chunks:
                        # Very special case lvm2create_initrd.8
                        continue

                    macro = chunks[0]

                    if macro == '"':
                        # Bug in run.1
                        continue

                    if macro == 'b':
                        # Bug in devlink-sb.8
                        macro = 'B'

                    if macro in self.custom_macros.macros:
                        iterator.add_lines(self.custom_macros.macros[macro])
                        continue

                    if len(chunks) == 2:
                        rest = chunks[1]
                    else:
                        rest = ""

                    if macro == 'so':
                        raise RedirectedPage(self.path, rest)

                    if line.startswith(".el\\{\\"):
                        # There is a lot of crap in pages (isag.1, for instance)
                        macro = "el"
                        rest = "\\{\\" + rest

                    if macro in Macro.conditional:
                        # FIXME: This needs reworking
                        braces = 0

                        if "\\{" in rest:
                            braces += 1

                        if "\\}" in rest:
                            braces -= 1

                        while braces:
                            macro_line = next(iterator)
                            if "\\{" in macro_line:
                                braces += 1

                            if "\\}" in macro_line:
                                braces -= 1

                            if not braces:
                                break

                        continue

                    if self.parser is None:
                        if macro == "TH":
                            self.parser = ManpageParser.process_man7
                        elif macro == "Dd":
                            self.parser = False

                    if macro == 'ig':
                        while True:
                            macro_line = next(iterator)
                            if macro_line.rstrip().startswith(".."):
                                break

                        continue

                    if macro in {'de', 'de1'}:
                        self.custom_macros.add_macro(rest.strip())
                        while True:
                            macro_line = next(iterator)
                            if macro_line.rstrip().startswith(".."):
                                break
                            else:
                                self.custom_macros.add_line(macro_line)

                        continue

                    if macro in Macro.ignore:
                        continue

                    # Macro start
                    if macro == 'if':
                        # FIXME
                        continue

                    if macro in Macro.vertical_spacing:
                        self.lines.append(('', ''))
                    else:
                        self.lines.append((macro, tokenize(entitize(rest))))
                else:
                    self.lines.append(('', entitize(line)))
Example #19
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--split',
        type=str,
        default='train',
        help=
        'Specify which part of the dataset you want to dump to text. Your options are: train, val, test, test-dev'
    )
    parser.add_argument(
        '--answers',
        type=str,
        default='modal',
        help=
        'Specify if you want to dump just the most frequent answer for each questions (modal), or all the answers (all)'
    )
    args = parser.parse_args()

    #nlp = English()  # used for counting number of tokens
    data_dir = '/fs/project/PAS1315/VQA/Annotations/'
    data_dir1 = '/fs/project/PAS1315/VQA/Questions/'
    if args.split == 'train':
        annFile = data_dir + 'v2_mscoco_train2014_annotations.json'
        quesFile = data_dir1 + 'v2_OpenEnded_mscoco_train2014_questions.json'
        questions_file = 'data/preprocessed/questions_train2014.txt'
        questions_id_file = 'data/preprocessed/questions_id_train2014.txt'
        questions_lengths_file = 'data/preprocessed/questions_lengths_train2014.txt'
        if args.answers == 'modal':
            answers_file = 'data/preprocessed/answers_train2014_modal.txt'
        elif args.answers == 'all':
            answers_file = 'data/preprocessed/answers_train2014_all.txt'
        coco_image_id = 'data/preprocessed/images_train2014.txt'
        coco_image_path = 'data/preprocessed/images_train2014_path.txt'
        data_split = 'training data'
        subtype = 'train2014'
    elif args.split == 'val':
        annFile = data_dir + 'v2_mscoco_val2014_annotations.json'
        quesFile = data_dir1 + 'v2_OpenEnded_mscoco_val2014_questions.json'
        questions_file = 'data/preprocessed/questions_val2014.txt'
        questions_id_file = 'data/preprocessed/questions_id_val2014.txt'
        questions_lengths_file = 'data/preprocessed/questions_lengths_val2014.txt'
        if args.answers == 'modal':
            answers_file = 'data/preprocessed/answers_val2014_modal.txt'
        elif args.answers == 'all':
            answers_file = 'data/preprocessed/answers_val2014_all.txt'
        coco_image_id = 'data/preprocessed/images_val2014_all.txt'
        coco_image_path = 'data/preprocessed/images_val2014_path.txt'
        data_split = 'validation data'
        subtype = 'val2014'
    elif args.split == 'test-dev':
        quesFile = data_dir1 + 'v2_OpenEnded_mscoco_test-dev2015_questions.json'
        questions_file = 'data/preprocessed/questions_test-dev2015.txt'
        questions_id_file = 'data/preprocessed/questions_id_test-dev2015.txt'
        questions_lengths_file = 'data/preprocessed/questions_lengths_test-dev2015.txt'
        coco_image_id = 'data/preprocessed/images_test-dev2015.txt'
        coco_image_path = 'data/preprocessed/images_test-dev2015_path.txt'
        data_split = 'test-dev data'
        subtype = 'test-dev2015'
    elif args.split == 'test':
        quesFile = data_dir1 + 'v2_OpenEnded_mscoco_test2015_questions.json'
        questions_file = 'data/preprocessed/questions_test2015.txt'
        questions_id_file = 'data/preprocessed/questions_id_test2015.txt'
        questions_lengths_file = 'data/preprocessed/questions_lengths_test2015.txt'
        coco_image_id = 'data/preprocessed/images_test2015.txt'
        coco_image_path = 'data/preprocessed/images_test2015_path.txt'
        data_split = 'test data'
        subtype = 'test2015'
    else:
        raise RuntimeError(
            'Incorrect split. Your choices are:\ntrain\nval\ntest-dev\ntest')

    #initialize VQA api for QA annotations
    #vqa=VQA(annFile, quesFile)
    questions = json.load(open(quesFile, 'r'))
    ques = questions['questions']
    if args.split == 'train' or args.split == 'val':
        qa = json.load(open(annFile, 'r'))
        qa = qa['annotations']

    #pbar = progressbar.ProgressBar()
    print(
        'Dumping questions, answers, questionIDs, imageIDs, and questions lengths to text files...'
    )
    imdir = '%s/COCO_%s_%012d.jpg'
    N = len(ques)
    print('')
    print('{} Writing {} questions file {}'.format('*' * 10, args.split,
                                                   '*' * 10))
    with open(questions_file, 'w') as f:
        for i, q in zip(range(N), ques):
            f.write((q['question'] + '\n').encode('utf8'))
            print('{}/{} written.'.format(i, N), end='\r')
            sys.stdout.flush()
    print('{} Done writing {} questions file {}'.format(
        '*' * 10, args.split, '*' * 10))
    print('')
    print('{} Writing {} questions lengths file {}'.format(
        '*' * 10, args.split, '*' * 10))
    with open(questions_lengths_file, 'w') as f:
        for i, q in zip(range(N), ques):
            f.write((str(len(tokenize(q['question']))) + '\n').encode('utf8'))
            print('{}/{} written.'.format(i, N), end='\r')
            sys.stdout.flush()
    print('{} Done writing {} questions length file {}'.format(
        '*' * 10, args.split, '*' * 10))
    print('')
    print('{} Writing {} questions id file {}'.format('*' * 10, args.split,
                                                      '*' * 10))
    with open(questions_id_file, 'w') as f:
        for i, q in zip(range(N), ques):
            f.write((str(q['question_id']) + '\n').encode('utf8'))
            print('{}/{} written.'.format(i, N), end='\r')
            sys.stdout.flush()
    print('{} Done writing {} questions id file {}'.format(
        '*' * 10, args.split, '*' * 10))
    print('')
    print('{} Writing {} coco_image id file {}'.format('*' * 10, args.split,
                                                       '*' * 10))
    with open(coco_image_id, 'w') as f:
        for i, q in zip(range(N), ques):
            f.write((str(q['image_id']) + '\n').encode('utf8'))
            print('{}/{} written.'.format(i, N), end='\r')
            sys.stdout.flush()
    print('{} Done writing {} coco_image id file {}'.format(
        '*' * 10, args.split, '*' * 10))
    print('')
    print('{} Writing {} coco_image_path file {}'.format(
        '*' * 10, args.split, '*' * 10))
    with open(coco_image_path, 'w') as f:
        for i, q in zip(range(N), ques):
            image_path = imdir % (subtype, subtype, int(q['image_id']))
            f.write((image_path + '\n').encode('utf8'))
            print('{}/{} written.'.format(i, N), end='\r')
            sys.stdout.flush()
    print('{} Done writing {} coco_image_path file {}'.format(
        '*' * 10, args.split, '*' * 10))
    print('')
    if args.split in ('train', 'val'):
        # Answers (and answers_file) only exist for the train and val splits.
        print('{} Writing {} answers file {}'.format('*' * 10, args.split,
                                                     '*' * 10))
        with open(answers_file, 'w') as f:
            for i, q in zip(range(N), ques):
                if args.answers == 'modal':
                    f.write(getModalAnswer(qa[i]['answers']).encode('utf8'))
                elif args.answers == 'all':
                    f.write(getAllAnswer(qa[i]['answers']).encode('utf8'))
                f.write('\n'.encode('utf8'))
        print('{} Done writing {} answers file {}'.format('*' * 10, args.split,
                                                          '*' * 10))
    print('')

    print('completed dumping {}'.format(data_split))
Example #20
notfound = 0
regexnotfound = 0
regexincorrect = 0
regexhits = 0
try:
	kattismatrix(a, hmm.stdin)
	kattismatrix(b, hmm.stdin)
	kattismatrix(q, hmm.stdin)

	run = False
	for person in people:
		run = not run
		if run:
			continue
		text = person["description_en"]
		tokens = helpers.tokenize(text)
		hmm.stdin.write(str(len(tokens)) + " " + " ".join([str(toState(token, wordlist)) for token in tokens]) + "\n")
		result = hmm.stdout.readline()
		result = [int(word) for word in result.split(" ")]
		values = [helpers.extract(x, type) for x in person[property].split(";")]
		weguessed = False
		correct = False

		# The regex guess
		r = pattern.search(person["description_en"])
		regexguess = None
		if r:
			regexguess = r.group(0)
			if regexguess in values:
				regexhits += 1
			else:
Example #21
            for img in tweet['medias_files'].split('|'):
                retweets[img] = count

# Creating output folder
os.makedirs(output_folder_path, exist_ok=True)

# Method n°1 - tfidf token selection
# NOTE: method n°1 is inconclusive
dfs = Counter()
for item in metadata:
    captions = item['captions']

    for caption in captions:
        text = caption['caption']
        tokens = tokenize(text)

        for token in tokens:
            dfs[token] += 1

# for item in metadata:
#     captions = item['captions']

#     for caption in captions:
#         text = caption['caption']
#         tokens = tokenize(text)
#         best_token = max(tokens, key=lambda token: math.log(1 / dfs[token]))

#         print('%s -> best token is: %s' % (colored(text, 'cyan'), colored(best_token, 'red')))

# Method n°2 - prefix clustering
Example #22
            output.append(array[i:i + bud_size, j:j + bud_size])
    return output


cam = cv2.imread('input/waterfall1_5.png')  #data.coffee()
cam = cv2.cvtColor(cam, cv2.COLOR_BGR2RGB)
#cam = data.coffee()

colorList = np.unique(cam.reshape(-1, cam.shape[2]), axis=0)
print("colors:", colorList)

color_codebook = dict()
color_reverse_codebook = dict()
for i, color in enumerate(colorList):
    color_codebook[i] = color
    color_reverse_codebook[tokenize(color)] = i

reduced_cam = np.zeros((cam.shape[0], cam.shape[1]))
for i in range(cam.shape[0]):
    for j in range(cam.shape[1]):
        reduced_cam[i][j] = color_reverse_codebook[tokenize(cam[i][j])]

#print(reduced_cam)

a, b, c = cam.shape
offset = 1
newCam = np.zeros((a + 2 * offset, b + 2 * offset, c))

for i in range(cam.shape[0]):
    for j in range(cam.shape[1]):
        if abs(cam.shape[0] -
Example #23
	else:
		addWord('1', countmap, globalcountmap, count)

backtoback = 0
backtoprefix = 0
targettotarget = 0
targettopost = 0

run = True
for person in people:
	run = not run
	if run:
		continue
	things = person[property].split(";")
	things = map(lambda x: helpers.extract(x, type), things)
	things = [helpers.tokenize(thing) for thing in list(set(things))]
	thingsFound = [0 for thing in things]
	text = helpers.tokenize(person["description_en"])
	prevPostEnd = 0
	currWord = 0
	while currWord < len(text):
		for i, thing in enumerate(things):
			if thing[thingsFound[i]] != text[currWord]:
				thingsFound[i] = 0
				continue
			thingsFound[i] += 1
			if thingsFound[i] < len(thing):
				continue

			start = currWord - len(thing) + 1
Example #24
        self.qas = [QA(qa) for qa in data['qas']]


class QA(object):
    def __init__(self, data):
        self.question = data['question']
        self.answers = data[
            'answers']  # array of dictionaries, with keys `answer_start` (a character index) and `text`


def test():
    return Dataset('data/dev-v1.1.json')


def train():
    return Dataset('data/train-v1.1.json')


if __name__ == '__main__':
    t = train()
    passage_length_distribution = [
        len(tokenize(p.passage)) for p in t.paragraphs
    ]
    question_length_distribution = [
        len(tokenize(q.question)) for para in t.paragraphs for q in para.qas
    ]
    print "Passage length distribution (tokens):", print_distribution(
        passage_length_distribution)
    print "Question length distribution (tokens):", print_distribution(
        question_length_distribution)
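
print_distribution is assumed to exist elsewhere in this project; a minimal sketch that summarizes a list of lengths with a few percentiles (the exact output format is a guess):

import numpy as np

def print_distribution(values):
    values = np.asarray(values)
    return "min={} p25={} median={} p75={} max={}".format(
        values.min(),
        int(np.percentile(values, 25)),
        int(np.percentile(values, 50)),
        int(np.percentile(values, 75)),
        values.max())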
Example #25
    del FLAGS.from_json
    config = vars(FLAGS)
    if not os.path.isdir(FLAGS.model_name):
        os.mkdir(FLAGS.model_name)
    with open(os.path.join(FLAGS.model_name, 'config.json'), 'w') as f:
        json.dump(config, f)

data = pd.read_csv(config['file'])[['question1', 'question2', 'is_duplicate']].astype(str)
print('data loaded')

cell = LSTMCell if config['cell'] == 'lstm' else GRUCell
N = data.shape[0]
inds = np.random.permutation(N)

if config['cutoff_type'] == 'count':
    q1, q2, vocab_size, words_inds = helpers.tokenize(data, cutoff_count=config['cutoff_count'])
else:
    q1, q2, vocab_size, words_inds = helpers.tokenize(data, cutoff_number=config['cutoff_nr'])

split = int(N*config['cv_ratio'])

with open(os.path.join(config['model_name'], 'words.json'), 'w') as f:
    json.dump(words_inds, f)

train = helpers.pair_iterator(q1[inds[:split]], q2[inds[:split]], data.ix[inds[:split], 'is_duplicate'].astype(int), batch=config['batch'])
test = helpers.pair_iterator(q1[inds[split:]], q2[inds[split:]], data.ix[inds[split:], 'is_duplicate'].astype(int), batch=config['batch']*4)
print('iterators created')

model = siamese.siamese(hidden_units=config['hidden'], embedding_size=config['embed'], vocab_size=vocab_size, cell=cell, 
                bidirectional=config['bidirectional'], clipping='none')
Example #26
    def _get_title_and_desc(self,token=True):
        try:
            soup = BeautifulSoup(self.text, convertEntities=BeautifulSoup.HTML_ENTITIES)
        except:
            t,v,tb = sys.exc_info()
            l = traceback.format_exception(t, v,tb)
            if self.debug:
                print "".join(l)
            del t
            del v
            del tb
            return {
                "title": [],
                "description": []
            }

        title =""
        try:
            title = smart_str(soup.title.text)
            if self.debug:
                print "\n\ntitle :"
                print title
        except:
            t,v,tb = sys.exc_info()
            l = traceback.format_exception(t, v,tb)
            if self.debug:
                print "".join(l)
            del t
            del v
            del tb
            pass

        desc=""
        try:
            d = pq(self.text)
            desc = d('meta').filter("[name=description]").attr('content')
            if self.debug:
                print "\n\ndescription :"
                print desc
        except:
            t,v,tb = sys.exc_info()
            l = traceback.format_exception(t, v,tb)
            if self.debug:
                print "".join(l)
            del t
            del v
            del tb
            pass

        if token:
            tok_title = []
            tok_desc = []
            if title and len(title):
                if self.debug:
                    print 'dans le if title'
                tok_title = tokenize(normalize(title))
                if self.debug:
                    print str(type(tok_title))
                    print tok_title
            else:
                if self.debug:
                    print 'dans le else title T_T'
                pass

            if desc and len(desc):
                if self.debug:
                    print 'dans le if desc'
                tok_desc = tokenize(normalize(desc))
                if self.debug:
                    print str(type(tok_desc))
                    print tok_desc
            else:
                if self.debug:
                    print 'dans le else desc T_T'
                pass

            if self.debug:
                print tok_title
                print tok_desc

            tok_title.extend(tok_desc)
            if self.debug:
                print "retour token title desc : \n" + str(type(tok_title))
            return tok_title

        else:
            return {
                "title": title,
                "description": desc
            }
Example #27
from argparser import parser
from generator import Generator
from helpers import tokenize, load_text

if __name__ == '__main__':
    generator = Generator()
    args = parser.parse_args()

    if args.action == 'fit':
        if not args.source:
            print('You must specify an input text source to fit the model.')
            exit(1)
        text = load_text(args.source)
        generator.fit(tokenize(text))
        generator.save(args.file)
    elif args.action == 'generate':
        try:
            generator.load(args.file)
        except FileNotFoundError:
            print('Data file does not exist.')
            exit(0)
        completed, sentence = generator.generate(args.n)
        if not completed:
            print('The sentence is incomplete')
        print(sentence)
    else:
        print('Please specify a correct action - either "fit" or "generate". '
              'Type -h for more info.')