def gen_distance_vec(tweet, trigger_word):
    trig = tk.tokenize(trigger_word)
    words = tk.tokenize(tweet)
    pos = -1
    for i in range(len(words)):
        if trig[len(trig) // 2] == words[i]:
            pos = i
            break
    first_p, last_p = -1, -1
    for i in range(len(words)):
        if trig[0] == words[i]:
            first_p = i
            break
    if first_p == -1:
        for i in range(len(words)):
            if trig[0] in words[i] or words[i] in trig[0]:
                first_p = i
                break
    for i in range(len(words) - 1, -1, -1):
        if trig[-1] == words[i]:
            last_p = i
            break
    if last_p == -1:
        for i in range(len(words)):
            if trig[-1] in words[i] or words[i] in trig[-1]:
                last_p = i
                break
    return [i - pos for i in range(len(words))], [first_p, last_p]
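A quick usage sketch for the function above. SimpleTok is a hypothetical stand-in for the tk tokenizer it relies on (something like nltk's TweetTokenizer in the original); the expected outputs are shown as comments.

# Hypothetical stand-in for the `tk` tokenizer used by gen_distance_vec.
class SimpleTok:
    def tokenize(self, text):
        return text.split()

tk = SimpleTok()
dist, span = gen_distance_vec("a big earthquake hit the city", "earthquake hit")
# dist: each token's offset from the matched middle trigger token "hit"
#   -> [-3, -2, -1, 0, 1, 2]
# span: positions of the first and last trigger tokens -> [2, 3]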
Example #2
def normalize_data(stem = True):
	global contexts
	for word in words:
		# converting each sense-definition pair to sense-normalized_definition_tokens
		words[word] = map(lambda pair: [pair[0], tokenize(pair[1], stem)], words[word])
	# Normalizing contexts as well similarly
	contexts = map(lambda triple: [triple[0], triple[1], tokenize(triple[2], stem)], contexts)
Example #3
def test_paragraph_markers() -> None:
    s = "[[Stutt setning.]][[]][[Önnur setning.]]"
    #    012345678901234567890123456789012345678901234567
    #    ^^^    ^       ^^ ^ ^ ^ ^    ^       ^^
    #                                   x
    toks = tokenizer.parse_tokens(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
    assert char_indexes == [0, 2, 7, 15, 16, 18, 20, 22, 24, 29, 37, 38]
    assert byte_indexes == [0, 2, 7, 15, 16, 18, 20, 22, 24, 30, 38, 39]
    toks = tokenizer.parse_tokens(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks,
                                                             last_is_end=True)
    assert char_indexes == [0, 2, 7, 15, 16, 18, 20, 22, 24, 29, 37, 38, 40]
    assert byte_indexes == [0, 2, 7, 15, 16, 18, 20, 22, 24, 30, 38, 39, 41]

    # The tokenize function does things to paragraph markers. Test that the
    # indexes are properly calculated after that.
    # Note that the text of the dropped empty paragraph markers disappears.
    s = "[[Stutt setning.]][[]][[Önnur setning.]]"
    #    012345678901234567890123456789012345678901234567
    #    ^ ^    ^       ^^ ^     ^    ^       ^^
    #                                   x
    toks = tokenizer.tokenize(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
    assert char_indexes == [0, 2, 7, 15, 16, 18, 24, 29, 37, 38]
    assert byte_indexes == [0, 2, 7, 15, 16, 18, 24, 30, 38, 39]
    toks = tokenizer.tokenize(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks,
                                                             last_is_end=True)
    assert char_indexes == [0, 2, 7, 15, 16, 18, 24, 29, 37, 38, 40]
    assert byte_indexes == [0, 2, 7, 15, 16, 18, 24, 30, 38, 39, 41]
Example #4
def main():

    # takes in last argument given from console
    argument = sys.argv[-1]

    # get the file path or list of files we need, also return the name we will call our .asm file
    files = find_files(argument)

    # open the file(s) and start writing into nocomments.out
    for i in files:

        # create the nocomments.out file we will use to parse through,
        # each time it loops, it will erase and create a blank
        # nocomments.out file
        open('nocomments.out', 'w+').close()

        # run the .jack file through the strip comments to generate
        stripcomments(i)
        #now we have the .jack file with no comments

        #get the name of the file
        name = i.split('/')[-1]
        name = name.partition(".")[0]

        #tokenize to create the <name>T.xml file
        tokenize(name)
Example #5
 def __init__(self, id, url, title, text):
     self.id = id.lower()
     self.url = url.lower()
     self._title = title
     self.title = list(tokenizer.tokenize(title.lower()))
     self._text = text
     self.text = list(tokenizer.tokenize(text.lower()))
Example #6
def build_vocab(file, threshold, wiki_file=None):
    """Build a simple vocabulary wrapper."""
    captions = pd.read_csv(file, encoding="utf-8").text.values
    counter = Counter()

    for i, caption in enumerate(tqdm(captions)):
        tokens = tokenize(caption)
        counter.update(tokens)

        # if (i + 1) % 1000 == 0:
        #    print("[{}/{}] Tokenized the captions.".format(i + 1, len(captions)))

    if wiki_file is not None:
        with open(wiki_file) as f:
            # Create a progress bar using the file size
            for line in tqdm(f):
                tokens = tokenize(line)
                counter.update(tokens)

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word("<pad>")
    vocab.add_word("<start>")
    vocab.add_word("<end>")
    vocab.add_word("<unk>")

    # Add the words to the vocabulary.
    for i, word in enumerate(words):
        vocab.add_word(word)

    return vocab
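The Vocabulary wrapper itself is not shown in this snippet; a minimal sketch consistent with how it is used above (an add_word method plus the usual word-to-index lookup) could look like this.

# Hypothetical minimal Vocabulary wrapper matching the usage in build_vocab.
class Vocabulary:
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        # Ignore duplicates so repeated add_word calls are harmless.
        if word not in self.word2idx:
            self.word2idx[word] = len(self.idx2word)
            self.idx2word.append(word)

    def __call__(self, word):
        # Unknown words map to the <unk> index.
        return self.word2idx.get(word, self.word2idx.get("<unk>"))

    def __len__(self):
        return len(self.idx2word)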
Example #7
def parseFiles_important(html: str):
    '''creates and returns a list of tokens found in headers, bold, and emphasis'''
    soup = BeautifulSoup(html, "lxml")
    '''a list of head tag tokens (this one includes stop words)'''
    heads = soup.find_all(re.compile('head'))
    list_of_heads = tokenizer.tokenize(" ".join(
        [head.get_text() for head in heads]))
    '''a list of heading tag tokens'''
    headers = soup.find_all(re.compile('^h[1-6]$'))
    list_of_headers = tokenizer.tokenize_remove_stopwords(" ".join(
        [header.get_text() for header in headers]))
    '''a list of title tokens (this one includes stop words)'''
    titles = soup.find_all(re.compile('title'))
    list_of_titles = tokenizer.tokenize(" ".join(
        [title.get_text() for title in titles]))
    '''a list of bold'''
    bolds = soup.find_all(re.compile('b'))
    list_of_bolds = tokenizer.tokenize_remove_stopwords(" ".join(
        [bold.get_text() for bold in bolds]))
    '''a list of strong'''
    strongs = soup.find_all(re.compile('strong'))
    list_of_strongs = tokenizer.tokenize_remove_stopwords(" ".join(
        [strong.get_text() for strong in strongs]))
    '''a list of emphasis tags'''
    emphasis = soup.find_all(re.compile('em'))
    list_of_emphasis = tokenizer.tokenize_remove_stopwords(" ".join(
        [empha.get_text() for empha in emphasis]))

    return list_of_heads + list_of_headers + list_of_titles + list_of_bolds + list_of_strongs + list_of_emphasis
def calculate_wmd_scores(references, candidates, wmd_model):
    '''
    Calculate Word Mover's Distance for each (reference, candidate)
    pair in a list of reference texts and candidate texts.
    
    The lower the distance, the more similar the texts are.

    Parameters
    ----------
    references : list
        Input texts
    candidates : list
        Output texts (e.g. from a style transfer model)
    wmd_model : gensim.models.word2vec.Word2Vec
        Trained Word2Vec model
        
    Returns
    -------
    wmd_scores : list
        WMD scores for all pairs 

    '''

    wmd_scores = []

    for i in range(len(references)):
        wmd = wmd_model.wv.wmdistance(tokenize(references[i]),
                                      tokenize(candidates[i]))
        wmd_scores.append(wmd)

    return wmd_scores
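A minimal end-to-end sketch of calling calculate_wmd_scores, assuming gensim 4.x and a whitespace tokenize stand-in (the corpus and hyperparameters are illustrative only).

# Hypothetical usage; a real run would reuse the project's own tokenize().
from gensim.models import Word2Vec

def tokenize(text):
    return text.lower().split()

corpus = [tokenize(t) for t in ["the cat sat on the mat",
                                "a dog slept on the rug"]]
# Tiny toy model; wmdistance additionally needs an EMD solver installed
# (pyemd or POT, depending on the gensim version).
wmd_model = Word2Vec(corpus, vector_size=50, min_count=1, epochs=10)

references = ["the cat sat on the mat"]
candidates = ["a cat was sitting on a mat"]
scores = calculate_wmd_scores(references, candidates, wmd_model)
# Lower scores mean more similar texts.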
def test_till_calculate_plagiarism_score():
    origin_text = 'the big cat is sleeping'
    susp_text = 'the cat is big'

    origin_tokens = tokenize(origin_text)
    susp_tokens = tokenize(susp_text)

    print(f'Raw text: {origin_text}')
    print(f'Tokenized text: {origin_tokens}\n\n')

    lcs_length = main.find_lcs_length(origin_tokens,
                                      susp_tokens,
                                      plagiarism_threshold=0.0)
    print('The length of the longest common subsequence for \n\n'
        f'{origin_text} \n\nand \n\n{susp_text}: \n\n{lcs_length} \n')

    matrix = main.fill_lcs_matrix(origin_tokens, susp_tokens)
    print('A matrix:')
    print(*matrix, sep='\n', end='\n\n')

    longest_lcs = main.find_lcs(origin_tokens, susp_tokens, matrix)
    print(f'The longest common subsequence: {longest_lcs}')

    score = main.calculate_plagiarism_score(lcs_length, susp_tokens)
    print(f'The plagiarism score: {score:.2f}\n')
    return score
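main.find_lcs_length is not shown here; the standard dynamic-programming LCS length that this test exercises could look roughly like the sketch below (an illustration, not the lab's actual implementation, and it ignores the plagiarism_threshold argument).

# Hypothetical sketch of an LCS-length routine similar to main.find_lcs_length.
def find_lcs_length_sketch(first_tokens, second_tokens):
    rows, cols = len(first_tokens), len(second_tokens)
    matrix = [[0] * (cols + 1) for _ in range(rows + 1)]
    for i in range(1, rows + 1):
        for j in range(1, cols + 1):
            if first_tokens[i - 1] == second_tokens[j - 1]:
                matrix[i][j] = matrix[i - 1][j - 1] + 1
            else:
                matrix[i][j] = max(matrix[i - 1][j], matrix[i][j - 1])
    return matrix[rows][cols]

# find_lcs_length_sketch(('the', 'big', 'cat', 'is', 'sleeping'),
#                        ('the', 'cat', 'is', 'big'))  ->  3  ('the', 'cat', 'is')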
Example #10
def test_composite_phrases() -> None:
    s = "Orða- og tengingasetning."
    #    0123456789012345678901234
    #    ^   ^^  ^               ^
    #      x
    toks = tokenizer.parse_tokens(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
    assert char_indexes == [0, 4, 5, 8, 24]
    assert byte_indexes == [0, 5, 6, 9, 25]
    toks = tokenizer.parse_tokens(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks,
                                                             last_is_end=True)
    assert char_indexes == [0, 4, 5, 8, 24, 25]
    assert byte_indexes == [0, 5, 6, 9, 25, 26]

    # The whole thing gets squished together into a single token.
    s = "Orða- og tengingasetning."
    #    0123456789012345678901234
    #    ^                       ^
    #      x
    toks = tokenizer.tokenize(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
    assert char_indexes == [0, 24]
    assert byte_indexes == [0, 25]
    toks = tokenizer.tokenize(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks,
                                                             last_is_end=True)
    assert char_indexes == [0, 24, 25]
    assert byte_indexes == [0, 25, 26]
Example #11
    def _clean(self, source, target, source_cleaned, target_cleaned, m):
        self.info('Cleaning...')
        source_in = codecs.open(source, 'rb', 'utf-8')
        target_in = codecs.open(target, 'rb', 'utf-8')
        source_out = codecs.open(source_cleaned, 'wb', 'utf-8')
        target_out = codecs.open(target_cleaned, 'wb', 'utf-8')

        for num_lines, _ in enumerate(source_in): pass
        source_in.seek(0, 0)

        pbar = ProgressBar(maxval=num_lines).start()
        for l in count(0):
            source_line = source_in.readline()
            target_line = target_in.readline()

            if not source_line or not target_line:
                break

            source_tokens = tokenize(source_line)
            target_tokens = tokenize(target_line)

            if len(source_tokens) == 0 or len(source_tokens) > m \
                    or len(target_tokens) == 0 or len(target_tokens) > m:
                continue

            source_out.write(' '.join(source_tokens) + '\n')
            target_out.write(' '.join(target_tokens) + '\n')
            pbar.update(l)
        pbar.finish()

        source_in.close()
        target_in.close()
        source_out.close()
        target_out.close()
def common_words(tweet1, tweet2):
    tweet_1 = tk.tokenize(tweet1)
    tweet_2 = tk.tokenize(tweet2)
    counts = 0
    for i in tweet_1:
        if i in tweet_2:
            counts += 1
    return counts
Example #13
 def test_slash_within_paren_works(self):
     tks = tk.tokenize('(my/example)')
     self.assertEqual(len(tks), 1)
     self.assertEqual(tks[0][0], ChunkType.Paren)
     tks = tk.tokenize('(my/example)', parse_slash=True)
     self.assertEqual(len(tks), 1,
                      "expected one element, got: " + repr(tks))
     self.assertEqual(tks[0][0], ChunkType.Paren)
Example #14
def get_msr_feats(corpus):
    feats1 = []
    feats2 = []
    for sample in corpus:
        words1 = [word.lower() for word in tokenize(sample[1])]
        words2 = [word.lower() for word in tokenize(sample[2])]
        feats1.append([words1])
        feats2.append([words2])
    return feats1, feats2
Example #15
 def test_enclosing_chars_have_precedence_over_delimiters(self):
     tks = tk.tokenize('(a,b;c|d/e) {a,b;c|d/e} [a,b;c|d/e]')
     self.assertEqual(len(tks), 3)
     for chunk in tks:
         self.assertEqual(chunk[1], 'a,b;c|d/e')
     # test slash
     tks = tk.tokenize('x /a,b;c|d/ y', parse_slash=True)
     self.assertEqual(len(tks), 3)
     self.assertEqual(tks[1][1], 'a,b;c|d')
Example #16
 def test_only_expressions_with_no_spaces_withing_slash_slash_parsed(self):
     print("===")
     tks = tk.tokenize('/AB/', parse_slash=True)
     print("---")
     self.assertEqual(len(tks), 1)
     self.assertEqual(tks[0][0], ChunkType.Slash)
     tks = tk.tokenize('A / B/', parse_slash=True)
     self.assertEqual(len(tks), 1)
     self.assertEqual(tks[0][0], ChunkType.Word)
def get_msr_feats(corpus):
    feats1 = []
    feats2 = []
    for sample in corpus:
        words1 = [word.lower() for word in tokenize(sample[1])]
        words2 = [word.lower() for word in tokenize(sample[2])]
        feats1.append([words1])
        feats2.append([words2])
    return feats1, feats2
Example #18
def get_tokens(obj):
    if isinstance(obj, basestring):
        return tokenize(obj)
    elif isinstance(obj, file):
        return tokenize(obj)
    else:
        # object not valid
        raise TypeError('Got unexpected object type {0!r}'.format(
            obj.__class__.__name__))
Example #19
 def get_kb_numbers(self, kb):
     title = tokenize(re.sub(r'[^\w0-9\.,]', ' ', kb.facts['item']['Title']))
     description = tokenize(re.sub(r'[^\w0-9\.,]', ' ', ' '.join(kb.facts['item']['Description'])))
     numbers = set()
     for token in chain(title, description):
         try:
             numbers.add(float(self.process_string(token)))
         except ValueError:
             continue
     return numbers
Example #20
 def test_make_choosable_tokens(self):
     src = "a = 0 /* @ 1, 2, 3 */"
     tokens = tokenize(src)
     choosable_tokens = make_choosable_tokens(tokens)
     self.assertEqual(choosable_tokens, [
                      "a", " ", "=", " ", ["1", "2", "3"]])
     src = "a = 0 /* @ 1, 2, 3 */0/* @ 1, 2, 3 */"
     tokens = tokenize(src)
     choosable_tokens = make_choosable_tokens(tokens)
     self.assertEqual(choosable_tokens, [
                      "a", " ", "=", " ", ["1", "2", "3"], ["1", "2", "3"]])
Example #21
 def test_generate_every_possible_code(self):
     src = "a = 3 /* @ 1, 2, 3 */"
     possible_codes = generate_every_possible_code(
         make_choosable_tokens(tokenize(src)))
     self.assertEqual(possible_codes, ["a = 1", "a = 2", "a = 3"])
     src = "a = 3 /* @ 1, 2, 3 */0/* @ 1, 2, 3 */"
     possible_codes = generate_every_possible_code(
         make_choosable_tokens(tokenize(src)))
     self.assertEqual(possible_codes, [
         "a = 11", "a = 12", "a = 13", "a = 21", "a = 22", "a = 23",
         "a = 31", "a = 32", "a = 33"
     ])
Example #22
def test_iterator_cases() -> None:
    s = [
        "Þessi ", "setning ", "er ", "í ", "lengra ", "lagi ", "og ", "er ",
        "með ", "bæði ", "eins ", "og ", "tveggja ", "bæta ", "stafi."
    ]
    # (char and byte indexes in a similar test above)
    toks = tokenizer.parse_tokens(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
    assert char_indexes == [
        0, 5, 13, 16, 18, 25, 30, 33, 36, 40, 45, 50, 53, 61, 66, 72
    ]
    assert byte_indexes == [
        0, 6, 14, 17, 20, 27, 32, 35, 38, 43, 50, 55, 58, 66, 72, 78
    ]
    toks = tokenizer.parse_tokens(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks,
                                                             last_is_end=True)
    assert char_indexes == [
        0, 5, 13, 16, 18, 25, 30, 33, 36, 40, 45, 50, 53, 61, 66, 72, 73
    ]
    assert byte_indexes == [
        0, 6, 14, 17, 20, 27, 32, 35, 38, 43, 50, 55, 58, 66, 72, 78, 79
    ]

    s = ["Stutt setning.", "", "Önnur setning."]
    #     01234567890123        45678901234567
    #     ^    ^       ^        ^    ^       ^
    #                           x
    toks = tokenizer.parse_tokens(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
    assert char_indexes == [0, 5, 13, 14, 19, 27]
    assert byte_indexes == [0, 5, 13, 14, 20, 28]
    toks = tokenizer.parse_tokens(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks,
                                                             last_is_end=True)
    assert char_indexes == [0, 5, 13, 14, 19, 27, 28]
    assert byte_indexes == [0, 5, 13, 14, 20, 28, 29]

    # parse_tokens does some implementation-detail stuff here. Use tokenize instead.
    s = [" Stutt setning. ", "\n \n", "Önnur setning."]
    #     0123456789012345    6 78     90123456789012
    #     ^     ^       ^^                  ^       ^
    #                                  x
    toks = tokenizer.tokenize(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
    assert char_indexes == [0, 6, 14, 15, 24, 32]
    assert byte_indexes == [0, 6, 14, 15, 25, 33]
    toks = tokenizer.tokenize(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks,
                                                             last_is_end=True)
    assert char_indexes == [0, 6, 14, 15, 24, 32, 33]
    assert byte_indexes == [0, 6, 14, 15, 25, 33, 34]
Example #23
    def iter_sents(ets, total):
        for et in tools.tqdm(ets, total=total):
            name = tokenize(et['name'])
            yield tools.replace_num(name.split())

            desc = et.get('description')
            if desc:
                desc = tokenize(desc)
                yield tools.replace_num(desc.split())

            for syn in et.get('synonyms', []) + et.get('mistypes', []):
                sname = tokenize(syn['name'])
                yield tools.replace_num(sname.split())
Example #24
def build_canonical_line(vocab, cols):
    if cols[1].strip() == "not_entailment":
        return None
    label = cols[1].strip()
    if label in label_dict:
        label = label_dict[label]
    else:
        label = float(label)

    data = {"uid": cols[0].strip(), "label": label}
    if len(cols) == 3:
        token_id = []
        type_id = []
        seq1s = tokenizer.tokenize(cols[2].lower())
        token_id.append(vocab["<cls>"])
        for seq in seq1s:
            if seq in vocab:
                token_id.append(vocab[seq])
            else:
                token_id.append(vocab["<unk>"])
        token_id.append(vocab["<sep>"])
        type_id.extend([0] * (len(seq1s) + 2))
        data["token_id"] = token_id
        data["type_id"] = type_id
    elif len(cols) == 4:
        token_id = []
        type_id = []
        seq1s = tokenizer.tokenize(cols[2].lower())
        seq2s = tokenizer.tokenize(cols[3].lower())
        token_id.append(vocab["<cls>"])
        for seq in seq1s:
            if seq in vocab:
                token_id.append(vocab[seq])
            else:
                token_id.append(vocab["<unk>"])
        token_id.append(vocab["<sep>"])
        for seq in seq2s:
            if seq in vocab:
                token_id.append(vocab[seq])
            else:
                token_id.append(vocab["<unk>"])
        token_id.append(vocab["<sep>"])
        type_id.extend([0] * (len(seq1s) + 2))
        type_id.extend([1] * (len(seq2s) + 1))
        data["token_id"] = token_id
        data["type_id"] = type_id
    else:
        print(cols)
        return None
    return data
Example #25
def min_word_count(ex):
    try:
        isl_toks = [
            tok for tok in tokenizer.tokenize(ex["is"])
            if tok.txt is not None and tok.kind == tokenizer.TOK.WORD
        ]
        eng_toks = [
            tok for tok in tokenizer.tokenize(ex["en"])
            if tok.txt is not None and tok.kind == tokenizer.TOK.WORD
        ]
    except TypeError as e:
        return True
    return len(isl_toks) >= DEFAULT_MIN_WORD_COUNT and len(
        eng_toks) >= DEFAULT_MIN_WORD_COUNT
Example #26
def process_query(query_words):
    query_type = query_words[:1]
    query_words = query_words[1:]
    if query_type == "1":
        # call tokenizer
        query_words = tokenizer.tokenize(query_words.replace("\n", " "))
        # call stemmer(stem tokens)
        query_words = tokenizer.stem(query_words)
        # take set of query words to remove duplicates
        docs = conjunctive_matcher(
            list(sorted(set(query_words), key=query_words.index)))
        open("results.txt", "a").write(str(docs) + "\n")
    elif query_type == "2":
        # call tokenizer
        query_words = tokenizer.tokenize(query_words.replace("\n", " "))
        # call stemmer(stem tokens)
        query_words = tokenizer.stem(query_words)
        docs = phrase_matcher(query_words)
        open("results.txt", "a").write(str(docs) + "\n")
    elif query_type == "3":
        # call tokenizer
        query_words = tokenizer.tokenize(query_words.replace("\n", " "))
        # call stemmer(stem tokens)
        query_words = tokenizer.stem(query_words)

        # maps a word index to its proximity value, e.g. {0: 3} means '/3' follows the 0th word
        proximity_index_dict = {}
        proximity_index = 0
        for word in query_words:
            match = re.match("^\/\d+$", word)
            if match:
                proximity_index_dict.update({
                    (proximity_index - 1):
                    int(match.group(0).replace("/", ""))
                })
            else:
                proximity_index += 1

        query_words = [
            word for word in query_words if not re.match("^\/\d+$", word)
        ]
        for i in range(0, len(query_words)):
            if i not in proximity_index_dict.keys():
                proximity_index_dict.update({i: 0})

        docs = proximity_matcher(query_words, proximity_index_dict)
        open("results.txt", "a").write(str(docs) + "\n")

    else:
        print("Unsupported query type is given!!!")
Example #27
def test_converted_measurements() -> None:
    s = "Stillið ofninn á 12° C til að baka kökuna."
    #    012345678901234567890123456789012345678901
    #    ^      ^      ^ ^     ^   ^  ^    ^      ^
    #          x        x   x        x       x
    toks = tokenizer.tokenize(s, convert_measurements=True)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
    assert char_indexes == [0, 7, 14, 16, 22, 26, 29, 34, 41]
    assert byte_indexes == [0, 8, 15, 18, 25, 29, 33, 38, 46]
    toks = tokenizer.tokenize(s, convert_measurements=True)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks,
                                                             last_is_end=True)
    assert char_indexes == [0, 7, 14, 16, 22, 26, 29, 34, 41, 42]
    assert byte_indexes == [0, 8, 15, 18, 25, 29, 33, 38, 46, 47]
Example #28
def export_data(lines):
    """ Parse "raw" ingredient lines into CRF-ready output """
    output = []
    for line in lines:
        line_clean = re.sub('<[^<]+?>', '', line)
        tokens = tokenizer.tokenize(line_clean)
        # Need a copy, otherwise somehow it gets used up
        tokens_copy = tokenizer.tokenize(line_clean)

        for i, token in enumerate(tokens_copy):
            features = getFeatures(token, i + 1, tokens)
            output.append(joinLine([token] + features))
        output.append('')
    return '\n'.join(output)
Example #29
    def extract_answer(self,q_text,e_texts,word2id,char2id,maxlen=10,threshold=0.1):
        Qc,Qw,Ec,Ew= [],[],[],[]
        qc = list(q_text)
        Qc,q_mask=sent2id([qc],char2id)

        qw = alignWord2Char(tokenize(q_text))
        Qw,q_mask_=sent2id([qw],word2id)

        assert torch.all(q_mask == q_mask_)

        tmp = [(list(e),alignWord2Char(tokenize(e))) for e in e_texts]
        ec,ew = zip(*tmp)

        Ec,e_mask=sent2id(list(ec),char2id)
        Ew,e_mask_=sent2id(list(ew),word2id)
        assert torch.all(e_mask == e_mask_)

        totensor=lambda x: torch.from_numpy(np.array(x)).long()

        L=[Qc,Qw,q_mask,Ec,Ew,e_mask]
        L=[totensor(x) for x in L]

        As_ , Ae_ = self.best_model(L)

        R={}
        for as_ ,ae_ , e in zip(As_,Ae_,e_texts):
            as_ ,ae_ = as_[:len(e)].numpy() , ae_[:len(e)].numpy()
            sidx = np.where(as_ > threshold)[0]
            eidx = np.where(ae_ > threshold)[0]
            result = { }
            for i in sidx:
                cond = (eidx >= i) & (eidx < i+maxlen)
                for j in eidx[cond]:
                    key=e[i:j+1]
                    result[key]=max(result.get(key,0),as_[i] * ae_[j])
            if result:
                for k,v in result.items():
                    if k not in R:
                        R[k]=[]
                    R[k].append(v)        
        # sort all answer
        R= [
            [k,((np.array(v)**2).sum()/(sum(v)+1))]
            for k , v in R.items()
        ]

        R.sort(key=lambda x: x[1], reverse=True)
        # R: (answer, possibility) pairs in descending order
        return R
Example #30
def min_word_count(ex):
    isl_toks = [
        tok
        for tok in tokenizer.tokenize(ex["is"])
        if tok.txt is not None and tok.kind == tokenizer.TOK.WORD
    ]
    eng_toks = [
        tok
        for tok in tokenizer.tokenize(ex["en"])
        if tok.txt is not None and tok.kind == tokenizer.TOK.WORD
    ]
    return (
        len(isl_toks) >= DEFAULT_MIN_WORD_COUNT
        and len(eng_toks) >= DEFAULT_MIN_WORD_COUNT
    )
Example #31
    def test(self, path):
        corp = Corpus(path)
        bs = Bayesian()
        count = 0
        sender_bl = load_pickle('sender_bl.pickle')
        # scan email and define if msg is SPAM or HAM
        # first check if sender occurs in sender Blacklist
        # then count spamicity of the word using the Bayes approach
        for fname, body in corp.emails():
            sender = find_sender(body)
            if sender in sender_bl:
                self.tag_it(path, fname, 'SPAM')
                continue

            spamicity_list = []
            count += 1
            tokens = tokenize(body)
            # compute spamicity for each word and create list of the values
            for el in tokens:
                word_spamicity = [el, bs.word_spamicity(el)]
                spamicity_list.append(word_spamicity)
            # prepare list for Bayes
            spamicity_list = [list(i) for i in set(map(tuple, spamicity_list))]  # remove duplicates from list
            spamicity_list.sort(key=lambda x: abs(0.5 - x[1]), reverse=True)
            prediction = bs.bayes_pred(spamicity_list[:15])  # Consider only 15 'words'
            if prediction > 0.9 or sender in sender_bl:
                self.tag_it(path, fname, 'SPAM')
            else:
                self.tag_it(path, fname, 'OK')
Example #32
 def test_not(self):
     test_str = "(not (and true false))"
     actual_tokens = tokenize(test_str)
     consumed, remaining = S(actual_tokens)
     code = generate_code(consumed)
     print test_str
     print code
Example #33
def build_index(docs):
    """VOTRE CODE ICI

       A partir de la collection des documents, construisez une structure
       des donnees qui vous permettra d'identifier des documents pertinents
       pour une question (e.g., l'index inversee qu'on a vu en classe).
    """
    # Initialize index (empty list)
    index = {}

    print("Build index ... ")
    # Loop for all documents: 1400
    for docID in docs:
        # Get frequencies for document number docID
        freqs = frequencies(tokenize(docs[docID]))
        # For each word in this document
        for word in freqs:
            # If word is not in our index
            if word not in index.keys():
                # Add a new entry
                index[word] = []
            # In all case, add a new value
            index[word].append((docID, freqs[word]))
        if docID % 140 == 0:
            percent = docID / 14
            print(str(percent) + "%")

    print("Build index : Done")
    return index
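A small usage sketch with stand-ins for the helpers the function relies on; the tokenize and frequencies definitions below are assumptions, not the course's versions.

# Hypothetical stand-ins for the helpers used by build_index (and rank_docs below).
from collections import Counter

def tokenize(text):
    return text.lower().split()

def frequencies(tokens):
    return Counter(tokens)

docs = {1: "information retrieval systems",
        2: "retrieval of relevant documents"}
index = build_index(docs)
# index["retrieval"] -> [(1, 1), (2, 1)]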
Example #34
 def test_tokenize_quoted_string(self):
     input = 'name = "value one"'
     expected_output = ['name', '=', 'value one']
     output = tokenizer.tokenize(input)
     for element_position in range(0, len(output)):
         self.assertTrue(
             output[element_position] == expected_output[element_position])
Example #35
def main(argv = sys.argv):

    argv = argv[1:]
    
    if len(argv) and argv[0] == "--arabic":
        argv = argv[1:]
        lang = arabic.ArabicModule()
        known_words = read_known_file.read_known_file("arabic_known_words.txt", lang) | read_known_file.read_known_file("arabic_ignore_list.txt", lang)
    elif len(argv) and argv[0] == "--turkish":
        argv = argv[1:]
        lang = turkish.TurkishModule()
        known_words = read_known_file.read_known_file("turkish_known_words.txt", lang) | read_known_file.read_known_file("turkish_ignore_list.txt", lang)
    else:
        lang = french.FrenchModule()
        known_words = read_known_file.read_known_file("french_known_words.txt", lang) | read_known_file.read_known_file("french_ignore_list.txt", lang)
        

    uw = uniquify.UniqueWords()
    for fname in argv:
    
        data = filereader.read_file(fname)
        tokens = tokenizer.tokenize(data)
    
        uw.uniquify(tokens, lang)

    uw.weed_out_uninteresting_words(lang)

    uniquify.rank(uw, lang, known_words)
Example #36
 def extractCoordinates(self):
     self.inputfile = open(self.ifilename, "r") 
     line = self.inputfile.readline()
     coords_times_list = []
     i = 0
     while len(line) > 0:
         i = i + 1
         #print i
         try:
             tweet = jsonpickle.decode(line)
         except ValueError, e:
             print repr(e)
             line = self.inputfile.readline()
             continue
         if tweet.has_key("delete") or tweet.has_key("scrub_geo") or tweet.has_key("limit"):
             print "unimplemented data item"
         else:
             #print tweet["text"]
             text = tweet["text"]
             tweet_w = time.strptime(tweet["created_at"], "%a %b %d %H:%M:%S +0000 %Y")
             tokens = tokenizer.tokenize(text)
             if tweet.has_key("coordinates"):
                 coord = tweet["coordinates"]
                 if coord == None:
                     print "coordinates null"
                 elif coord.has_key("type") and coord["type"] == "Point":
                     coords_times_list.append([coord["coordinates"], tweet_w])
                 else:
                     print "not a point"
         line = self.inputfile.readline()
Example #37
	def compute_ave_words_in_sentence(self):
		sentences = tokenizer.split_sentence(self.text)
		average = 0
		for sentence in sentences:
			average += len(tokenizer.tokenize(sentence))
		self.ave_words_in_sentence = 1.0 * average / len(sentences)
		return self.ave_words_in_sentence
Example #38
def main():

    try:
        settings = open('settings.cfg', 'r').read()
    except IOError as e:
        print "Do you have your settings entered correctly?"
        exit(1)

    settings_obj = yaml.load(settings)

    input_path = os.path.abspath(settings_obj['in'])
    output_path = os.path.abspath(settings_obj['out'])
    extensions = settings_obj['extensions']


    main_template = os.path.join(input_path, 'templates/main.html')
    try:
        main_template_html = open(main_template, 'r').read()
    except IOError as e:
        import html_templates
        main_template_html = html_templates.basic

    posts = os.path.join(input_path, 'posts/')
    for post in os.listdir(posts):
        if any([post.endswith(x) for x in extensions]):
            post_path = os.path.join(os.path.dirname(posts), post)
            tokens = tokenizer.tokenize(open(post_path, 'r').read())
            parsed_post = parser.parse(tokens)
            parsed_page = main_template_html.safe_substitute(title=post, body=parsed_post)
            make_html_output(output_path, post, parsed_page)
Example #39
def query(query, offset, rpp):

    # Load the indexed data
    ids = pickle.load(open(config.data_directory + '/monuments.ids', 'r'))
    dictionary = corpora.Dictionary.load(config.data_directory + '/monuments.dict')
    corpus = corpora.MmCorpus(config.data_directory + '/monuments.mm') 
    lsi = models.LsiModel.load(config.data_directory + '/monuments.lsi')
    tfidf = models.TfidfModel.load(config.data_directory + '/monuments.tfidf')
    tfidfIndex = similarities.Similarity.load(config.data_directory + '/monuments.tfidf.index')
    lsiIndex = similarities.Similarity.load(config.data_directory + '/monuments.lsi.index')

    # Convert query to a tokenized document and project it as a vector in tfidf and lsi
    tokenized = tokenizer.tokenize(query)
    vector = dictionary.doc2bow(tokenized)
    tfidf_vector = tfidf[vector]
    lsi_vector = lsi[vector]
    
    # Determine how similar the query vector is to the other documents in
    # the same spaces (tfidf and lsi), and select the most similar documents
    tfidf_similarity = tfidfIndex[tfidf_vector]
    lsi_similarity = lsiIndex[lsi_vector]
    similarity = np.array(lsi_similarity) * np.array(tfidf_similarity)
    similarity = sorted(enumerate(similarity), key=lambda item: -item[1])
    sims = similarity
    sims = [s for s in sims if s[1] > 0]
    offset = int(min(offset, len(sims)))
    results = [str(ids[sim[0]]) for sim in sims[offset:int(min(offset+rpp, len(sims)))]]
    
    # Print json result
    print json.dumps({
        'nrOfResults': len(sims),
        'startResult': offset,
        'endResult': min(offset+rpp, len(sims)),
        'results': results})
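The index files loaded above have to be built beforehand; below is a hedged sketch of that indexing side using gensim's standard APIs and the same tokenizer. The function name, paths and num_topics are illustrative assumptions, not the project's actual code.

# Hypothetical indexing step producing the files that query() loads.
import pickle
from gensim import corpora, models, similarities

def build_monument_index(monuments, data_directory):
    # monuments: list of (id, text) pairs
    ids = [m[0] for m in monuments]
    texts = [tokenizer.tokenize(m[1]) for m in monuments]

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(t) for t in texts]
    corpora.MmCorpus.serialize(data_directory + '/monuments.mm', corpus)

    tfidf = models.TfidfModel(corpus)
    lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=200)

    tfidf_index = similarities.Similarity(data_directory + '/monuments.tfidf.index',
                                          tfidf[corpus], num_features=len(dictionary))
    lsi_index = similarities.Similarity(data_directory + '/monuments.lsi.index',
                                        lsi[corpus], num_features=200)

    pickle.dump(ids, open(data_directory + '/monuments.ids', 'w'))
    dictionary.save(data_directory + '/monuments.dict')
    tfidf.save(data_directory + '/monuments.tfidf')
    lsi.save(data_directory + '/monuments.lsi')
    tfidf_index.save(data_directory + '/monuments.tfidf.index')
    lsi_index.save(data_directory + '/monuments.lsi.index')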
Example #40
def getScore(newtitle):
    query = tokenizer.tokenize(newtitle)
    res = idx.queryVector(query, 1)
    #print("{0} results.".format(len(res)))
    
    # Take average of (upvotes-downvotes) weighted by similarity score ^ 2
    # but only for posts with simscore > max(simscore)/2
    totalweight = 0.0
    totalscore = 0.0
    for n in res:
        simscore = n[1]
        simscore = simscore ** 0.5
        #if simscore < 0.75:
        #    continue

        post = postdata.posts[n[0]]
        #score = post["day"][1] - post["day"][2] # ups - downs^2
        #score = post["day"][1]
        #score = post["day"][1] + post["num_comments"]
        score = post["day"][1] - post["day"][2] + post["num_comments"]*2.5

        totalscore += float(score) * simscore
        totalweight += simscore

        #return float(score) # test

    if totalweight == 0:
        return 0.0 # couldn't make a score for this

    finalscore = (totalscore / totalweight)

    return finalscore
Example #41
    def guess(self, text):
        doc_counts = {}
        doc_inverse_counts = {}
        tokens = tokenize(text)
        scores = {}
        for label in self.labels:
            doc_counts[label] = self.doc_count(label)
            doc_inverse_counts[label] = self.doc_inverse_count(label)
            total = self.total_doc_count()
        for label in self.labels:
            logSum = 0.0
            for word in tokens:
                stem_total_count = self.stem_total_count(word)
                if stem_total_count == 0.0:
                    continue
                else:
                    word_prob = self.stem_label_count(label, word) / doc_counts[label]
                    word_inverse_prob = self.stem_inverse_label_count(label, word) / doc_inverse_counts[label]
                    wordicity = word_prob / (word_prob + word_inverse_prob)

                    wordicity = (( 1.0 * 0.5) + (stem_total_count * wordicity) ) / (1.0 + stem_total_count )
                    if wordicity == 0.0:
                        wordicity = 0.01
                    elif wordicity == 1:
                        wordicity = 0.99
                try:
                    logSum += math.log(1.0 - wordicity) - math.log(wordicity)
                except ValueError:
                    print "ValueError"
            try:
                scores[label] = 1.0 / (1.0 + math.exp(logSum))
            except OverflowError:
                print "OverflowError"
        return scores
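To make the smoothing step above concrete, here is a worked check with assumed counts (all numbers are made up for illustration).

# Assumed counts, purely illustrative:
stem_label_count = 3.0           # docs with this label that contain the stem
doc_count = 10.0                 # docs with this label
stem_inverse_label_count = 1.0   # docs with other labels that contain the stem
doc_inverse_count = 10.0         # docs with other labels
stem_total_count = 4.0           # docs containing the stem overall

word_prob = stem_label_count / doc_count                          # 0.30
word_inverse_prob = stem_inverse_label_count / doc_inverse_count  # 0.10
wordicity = word_prob / (word_prob + word_inverse_prob)           # 0.75
# Pull the estimate toward 0.5, weighted by how often the stem was seen:
wordicity = ((1.0 * 0.5) + (stem_total_count * wordicity)) / (1.0 + stem_total_count)
# -> (0.5 + 3.0) / 5.0 = 0.70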
Example #42
 def test_while(self):
     test_str = "(while (< 3 i) (assign i (+ i 1)))"
     expected_tokens = [Token("(", "L_PAREN"),
                        Token("while", "E_WHILE"),
                        Token("(", "L_PAREN"),
                        Token("<", "O_LT"),
                        Token("3", "V_INT"),
                        Token("i", "V_STRING"),
                        Token(")", "R_PAREN"),
                        Token("(", "L_PAREN"),
                        Token("assign", "E_ASSIGN"),
                        Token("i", "V_STRING"),
                        Token("(", "L_PAREN"),
                        Token("+", "O_ADD"),
                        Token("i", "V_STRING"),
                        Token("1", "V_INT"),
                        Token(")", "R_PAREN"),
                        Token(")", "R_PAREN"),
                        Token(")", "R_PAREN")]
     actual_tokens = tokenize(test_str)
     for actual, expected in izip(actual_tokens, expected_tokens):
         self.assertEqual(actual, expected)
     consumed, remaining = S(actual_tokens)
     self.assertEqual(remaining, [])
     if not remaining:
         print "accepted"
         print consumed
Example #43
def rank_docs(index, query):
    """VOTRE CODE ICI

       Retournez la serie des docIDs ordonner par leur pertinence vis-a-vis
       la question 'query'.
    """
    # Initialize new list
    ranking = {}
    for i in range(1, 1401):
        ranking[i] = 0

    # For each word in query
    for word in tokenize(query):
        # If we have this word in our index
        if word in index.keys():
            # For each document in which we can find the word
            for item in index[word]:
                # Increase the score
                ranking[item[0]] += item[1]

    # Sort the list with score
    sorted_ranking = sorted(ranking.items(), key=operator.itemgetter(1), reverse=True)

    ranking = []
    for couple in sorted_ranking:
        ranking.append(couple[0])
        
    return ranking
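Continuing the stand-in example given after build_index above (rank_docs hard-codes docIDs 1..1400, so documents that never match simply keep a score of 0 and trail the ranking).

# Hypothetical continuation of the build_index sketch above.
import operator

ranked = rank_docs(index, "retrieval systems")
# ranked[:2] -> [1, 2]   (doc 1 matches both query terms, doc 2 only one)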
Example #44
    def build_lattice(self, pt, sentence):
        '''
        Gets a phrase table and the tokenized sentence and outputs a lattice
        file formatted as follows:
            whole sentence
            1-1:
            <English translation> <Translation score>
            <English translation> <Translation score>
            ...
            1-2:
            <English translation> <Translation score>
            <English translation> <Translation score>
            ...
            2-2:

        The spans n-n refer to the tokens of the input Spanish sentence
        '''
        sentence = tokenize(sentence)
        self.sentence = sentence
        for start in xrange(len(sentence)):
            self.phrases[start] = {}
            for end in xrange(start+1, len(sentence)+1):
                foreign = sentence[start:end]
                p = Phrase(foreign, start, end)
                if len(foreign) == 1 and foreign[0] == ',':
                    p.translations = [Translation(foreign, (',',), 0)]
                else:
                    p.translations = pt.translate(foreign)
                self.phrases[start][end] = p
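The docstring describes the lattice file format, but the writer itself is not shown; the sketch below shows one way the phrases built above could be dumped in that layout. The translation attribute names (english, score) are assumptions about the Translation class, not confirmed by this snippet.

# Hypothetical writer for the lattice format described in the docstring above.
def write_lattice(self, path):
    with open(path, 'w') as out:
        out.write(' '.join(self.sentence) + '\n')
        for start in sorted(self.phrases):
            for end in sorted(self.phrases[start]):
                p = self.phrases[start][end]
                # Spans are reported 1-based, as in the docstring ("1-1:", "1-2:", ...).
                out.write('%d-%d:\n' % (start + 1, end))
                for t in p.translations:
                    # t.english / t.score are assumed attribute names.
                    out.write('%s %s\n' % (' '.join(t.english), t.score))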
Example #45
 def _parse(self, path, content, addWords):
    words = tokenizer.tokenize(path, content)
    wordList = []
    currNode = ParseNode(path, 0, None)
    currLine = [0, currNode]
    nodeId = 1
    for token, start, type in words:
       if type == tokenizer.NOTHING:
          if addWords:
             self.words.add(token)
          wordList.append((token, start, currLine))
       elif type == tokenizer.NEWLINE:
          wordList.append(('\\n', start, currLine))
          prevLine = currLine
          currLine = [currLine[0]+1, currNode]
       elif type == tokenizer.DEDENT:
          wordList.append(('\\d', start, currLine))
          currNode = currNode.parent
          currLine[1] = currNode
       elif type == tokenizer.INDENT:
          wordList.append(('\\i', start, currLine))
          currNode = ParseNode(path, nodeId, currNode)
          nodeId += 1
          prevLine[1] = currNode
          currLine[1] = currNode
    if len(wordList) == 0:
       wordList.append(('\\n', 0, currLine))
    return wordList
Example #46
 def processFile(self):
     self.inputfile = open(self.ifilename, "r") 
     line = self.inputfile.readline()
     i = 0
     while len(line) > 0:
         i = i + 1
         #print i
         try:
             tweet = jsonpickle.decode(line)
         except ValueError, e:
             print repr(e)
             line = self.inputfile.readline()
             continue
         if tweet.has_key("delete") or tweet.has_key("scrub_geo") or tweet.has_key("limit"):
             print "unimplemented data item"
         else:
             #print tweet["text"]
             text = tweet["text"]
             tweet_w = time.strptime(tweet["created_at"], "%a %b %d %H:%M:%S +0000 %Y")
             tokens = tokenizer.tokenize(text)
             #print tokens
             #print tokens[0]
             self.countTokens(tokens)
             self.recordTokens(tokens, tweet_w)
         line = self.inputfile.readline()
Example #47
def indexDir(dirname):
  basename = os.path.basename(dirname.rstrip("/"))
  indexFile = open('./indexes/%s_index' % basename, 'w');

  idMap = {}
  indexDict = {}
  docId = 0
  for (root, dirnames, filenames) in os.walk(dirname):
    for filename in filenames:
      if (re.search("\.sw[op]$", filename) == None):
        with open(os.path.join(root, filename), 'r') as fh:
          idMap[docId] = filename

          tokens = tokenize(fh)
          for (pos, token) in tokens:
            token = stem(alias(token))
            try:
              positionMap = indexDict[token]
              try:
                positionMap[docId].append(pos)
              except KeyError:
                positionMap[docId] = [pos]
            except KeyError:
              indexDict[token] = {docId: [pos]}

          docId += 1

  fullIndex = {"id_map" : idMap, "index" : indexDict}
  indexFile.write(json.dumps(fullIndex))
  indexFile.close()
Example #48
def _phrase_search(user, query):
    n = normalize(query)
    keywords = tokenize(n)
    logging.info('phrase_search: query: '+query)
    logging.info('n: '+n)
    logging.info('keywords:' + str(keywords))
    if not len(keywords):
        return []

    logging.info('%d - %s' % (0, keywords[0]));
    results = _lookup(user, keywords[0])
    if not results:
        return []
    logging.info('%s' % str(results));
    for i in range(1, len(keywords)):
        logging.info('%d - %s' % (i, keywords[i]));
        id_pos_dict = _lookup(user, keywords[i])
        logging.info('%s' % str(id_pos_dict));
        if id_pos_dict:
            for id in results.keys():
                if id not in id_pos_dict:
                    del results[id]
                else:
                    poses = []
                    for pos in id_pos_dict[id]:
                        if pos - 1 in results[id]:
                            poses.append(pos)
                    if not len(poses):
                        del results[id]
                    else:
                        results[id] = poses
        else:
            return []
    return results.keys()
Example #49
def parse_paragraph(parag, mim_tags, fast_p):
    """ Parse a single paragraph in free text form and compare to MIM POS tags """

    tokens = tokenize(parag)
    tlist = list(tokens)
    result = parse_tokens(tlist, mim_tags, fast_p)
    print("{0}\n--> {1} sentences, {2} parsed".format(parag, result["num_sent"], result["num_parsed_sent"]))
Example #50
    def test_simple_file(self):
        input = """#include GLFW_INCLUDE_GLU
                   #include <GLFW/glfw3.h>
                   #include <cstdio>
                   
                   /* Random function */
                   static void glfw_key_callback(int key, int scancode, int action, int mod){
                     if(glfw_key_callback){
                       // Comment here
                       input_event_queue->push(inputaction);   
                     }
                   }"""
        (final_stats, final_tokens, file_times) = tokenizer.tokenize(input, comment_inline_pattern, comment_open_close_pattern, separators)
        (file_hash,lines,LOC,SLOC) = final_stats
        (tokens_count_total,tokens_count_unique,token_hash,tokens) = final_tokens

        self.assertEqual(lines,11)
        self.assertEqual(LOC,10)
        self.assertEqual(SLOC,8)

        self.assertEqual(tokens_count_total,24)
        self.assertEqual(tokens_count_unique,18)
        self.assert_common_properties(tokens)

        hard_tokens = set(['int@@::@@4','void@@::@@1','cstdio@@::@@1','action@@::@@1','static@@::@@1','key@@::@@1','glfw_key_callback@@::@@1','mod@@::@@1','if@@::@@1','glfw3@@::@@1','scancode@@::@@1','h@@::@@1','GLFW_INCLUDE_GLU@@::@@1','input_event_queue@@::@@2','GLFW@@::@@1','push@@::@@1','inputaction@@::@@1','include@@::@@3'])
        this_tokens = set(tokens[3:].split(','))
        self.assertTrue(len(hard_tokens - this_tokens),0)

        m = hashlib.md5()
        m.update(tokens[3:])
        self.assertEqual(m.hexdigest(),token_hash)
Example #51
	def __init__(self, source):
		self.numTemps = 0
		self.macros = []
		self.mlMacros = []
		for mem in dir(self):
			mem = getattr(self, mem)
			if isinstance(mem, type) and issubclass(mem, Macro):
				if issubclass(mem, MLMacro):
					self.mlMacros.append(mem(self))
				else:
					self.macros.append(mem(self))
		self.macros.sort()
		self.mlMacros.sort()
		
		tokens = tokenizer.tokenize(source)
		pprint.pprint(tokens)
		
		code = self.compile(tokens)
		pprint.pprint(code)
		
		code = Module(
				None,
				Stmt(code)
			)
		
		set_filename('<macropy>', code)
		self.compiled = ModuleCodeGenerator(code).getCode()
Example #52
 def test_math2(self):
     test_str = "(* 34 (- 23 45))"
     actual_tokens = tokenize(test_str)
     consumed, remaining = S(actual_tokens)
     code = generate_code(consumed)
     print test_str
     print code
Example #53
 def test_sin(self):
     test_str = "(sin 3.2)"
     actual_tokens = tokenize(test_str)
     consumed, remaining = S(actual_tokens)
     code = generate_code(consumed)
     print test_str
     print code
Example #54
 def test_lt_gt(self):
     test_str = "(and (< 3 5) true)"
     actual_tokens = tokenize(test_str)
     consumed, remaining = S(actual_tokens)
     code = generate_code(consumed)
     print test_str
     print code
Example #55
def compute(topic):
    raw, ref = get_article(topic)

    sent = tokenize(raw)

    df = pd.DataFrame()

    ratio = len(ref) / len(raw)

    # TextRank
    result = text_rank(raw, sent, ref)

    r = Rouge()
    rouge = r.get_scores(result, ref)

    df = df.append(gen_serie('TextRank', rouge, result), ignore_index=True)

    # Gensim
    ret = summarize(raw, ratio)
    r = Rouge()
    rouge = r.get_scores(ret, ref)
    df = df.append(gen_serie('Gensim', rouge, ret), ignore_index=True)

    # KMean
    df = df.append(kmean(sent, ret))

    # Cosine
    df = df.append(cosine(sent, ref), ignore_index=True)

    # Rearrange columns
    df = df[columns]

    df.to_csv('out/' + topic + '.csv')

    return df.to_json(orient='records')
Example #56
 def test_bool_value(self):
     test_str = "(iff (and (or true false) true) true)"
     actual_tokens = tokenize(test_str)
     consumed, remaining = S(actual_tokens)
     code = generate_code(consumed)
     print test_str
     print code
Example #57
 def test_tokenize_multi_propety(self):
     input = "name = {\n\tvalue1 value2\n}"
     expected_output = ['name', '=', '{', 'value1', 'value2', '}']
     output = tokenizer.tokenize(input)
     for element_position in range(0, len(output)):
         self.assertTrue(
             output[element_position] == expected_output[element_position])
Example #58
 def test_math1(self):
     test_str = "(+ (- 234.3 1.1) 23)"
     actual_tokens = tokenize(test_str)
     consumed, remaining = S(actual_tokens)
     code = generate_code(consumed)
     print test_str
     print code
Example #59
 def test_tokenize_single_property(self):
     input = "name = value"
     expected_output = ['name', '=', 'value']
     output = tokenizer.tokenize(input)
     for element_position in range(0, len(output)):
         self.assertTrue(
             output[element_position] == expected_output[element_position])
Example #60
def generate_model(input_filename, output_filename=None):
    if output_filename is None:
        input_file, input_ext = os.path.splitext(input_filename)
        output_filename = input_file + '.dat'

    model = {
        'id2word':          [],
        'word2id':          {},
        'wordgrams':        [],
        'normalizedgrams':  [],
        'words_count':      0,
        'normalized_count': 0,
        'words_sum':        0,

        'coefficients':     {
            'wordgrams':        [1] * WORDGRAMS_SIZE,
            'normalizedgrams':  [1] * NORMALIZEDGRAMS_SIZE,
        },

        'unknown':          0.0,
    }

    # READ TOKENS
    with open(input_filename, 'r') as _file:
        tokens = tokenizer.tokenize(_file.read())

    print('##', input_filename)
    make_dictionary(tokens, model)
    gather_wordgrams(tokens, model)
    gather_normalizedgrams(tokens, model)
    calculate_unknown(tokens, model)
    calculate_coefficients(model)

    save_model(model, output_filename)