예제 #1
0
def test_keywords_only():
    """Build an automaton from bare keywords and verify the matches in "cashew".

    Every reported match must slice back to its keyword at the reported
    index, and exactly three matches are expected.
    """
    automaton = py_aho_corasick.Automaton(['cash', 'shew', 'ew'])
    text = "cashew"
    found = automaton.get_keywords_found(text)
    for start, keyword, _value in found:
        stop = start + len(keyword)
        assert text[start:stop] == keyword
    assert len(found) == 3
예제 #2
0
def wc_by_ner(file_name):
    """Render a word cloud for ``file_name`` and send the resulting image.

    Entity keywords (two built-in pairs plus every ``word<TAB>tag`` row of
    ``wiki_entity_100000.txt``) are searched for in each input line after
    spaces have been removed from it. The stripped line is also passed to
    ``mecab.pos``; a token is counted only when its tag is in ``tag_list``,
    it is non-empty, and it is not a substring of an entity found on the
    same line. Everything is counted in upper case.

    The image is written to ``wc.png`` under ``d`` and then handed to
    ``utils.send_image``.

    Args:
        file_name: Path of the UTF-8 text file to summarise.
    """
    import io
    from collections import Counter

    entities = [(u'SK텔레콤', u'COMPANY'), (u'5G', u'TERM')]
    # io.open decodes while reading on both Python 2 and Python 3, so no
    # per-line .decode() call is needed (str has no .decode on Python 3).
    with io.open("wiki_entity_100000.txt", "r", encoding="utf-8") as f:
        for line in f:
            fields = line.strip().split("\t")
            if len(fields) < 2:
                # Blank or tab-less row: skip it instead of raising IndexError.
                continue
            entities.append((fields[0], fields[1]))

    automaton = py_aho_corasick.Automaton(entities)
    # Named word_counts rather than ``dict`` so the builtin is not shadowed.
    word_counts = Counter()
    with io.open(file_name, encoding="utf-8") as f:
        for line in f:
            compact = line.replace(" ", "")
            line_entities = [
                keyword.upper()
                for _idx, keyword, _value in automaton.get_keywords_found(compact)
            ]
            word_counts.update(line_entities)

            for token in mecab.pos(line.strip()):
                word = token[0].upper()
                tag = token[1]
                if tag not in tag_list or not word:
                    continue
                # Tokens already covered by an entity on this line are dropped
                # so the same text is not counted twice.
                if any(word in entity for entity in line_entities):
                    continue
                word_counts[word] += 1

    # NOTE(review): ``d + 'fonts'`` only yields a fonts sub-directory if ``d``
    # ends with a path separator - confirm against where ``d`` is defined.
    wordcloud = WordCloud(background_color="white", font_path=path.join(d + 'fonts', "NanumMyeongjo.ttf"))
    wordcloud.fit_words(dict(word_counts))

    image_path = path.join(d, "wc.png")
    wordcloud.to_file(image_path)
    utils.send_image(image_path)
예제 #3
0
def test_utf8():
    """Verify matching when keywords and text are UTF-8 encoded.

    Each match must slice back to its keyword at the reported index and
    carry the value registered for that keyword; three matches are expected.
    """
    pairs = [
        (u'哈哈'.encode('utf8'), 1),
        (u'你好'.encode('utf8'), 2),
        (u'算我shu'.encode('utf8'), 3),
    ]
    expected = dict(pairs)
    automaton = py_aho_corasick.Automaton(pairs)
    haystack = u'你好哈哈算我shu咯'.encode('utf8')
    found = automaton.get_keywords_found(haystack)
    for start, keyword, value in found:
        stop = start + len(keyword)
        assert haystack[start:stop] == keyword
        assert value == expected[keyword]
    assert len(found) == 3
예제 #4
0
def test_keywords_and_values():
    """Verify that matches in "cashew" report both position and stored value.

    Each match must slice back to its keyword at the reported index and
    carry the value it was registered with; three matches are expected.
    """
    pairs = [('cash', 1), ('shew', 2), ('ew', 3)]
    expected = dict(pairs)
    automaton = py_aho_corasick.Automaton(pairs)
    text = "cashew"
    found = automaton.get_keywords_found(text)
    for start, keyword, value in found:
        stop = start + len(keyword)
        assert text[start:stop] == keyword
        assert value == expected[keyword]
    assert len(found) == 3
예제 #5
0
def test_pickle():
    """Verify that an automaton still matches after a pickle round trip.

    The automaton is dumped and reloaded with ``pickle``; the reloaded copy
    must report three matches in "cashew", each at the right position and
    with the value it was registered with.
    """
    import pickle

    pairs = [('cash', 1), ('shew', 2), ('ew', 3)]
    expected = dict(pairs)
    original = py_aho_corasick.Automaton(pairs)

    restored = pickle.loads(pickle.dumps(original))

    text = "cashew"
    found = restored.get_keywords_found(text)
    for start, keyword, value in found:
        stop = start + len(keyword)
        assert text[start:stop] == keyword
        assert value == expected[keyword]
    assert len(found) == 3
예제 #6
0
def init_py_aho_corasick():
    """Build and return a ``py_aho_corasick.Automaton`` from ``keyword_list``."""
    automaton = py_aho_corasick.Automaton(keyword_list)
    return automaton
예제 #7
0
        A.add_word(key, (idx, key))
    A.make_automaton()
    delta_build1 = time.time() - start_t

    start_t = time.time()
    cnt1 = 0
    for end_index, (insert_order, original_value) in A.iter(text):
        start_index = end_index - len(original_value) + 1
        assert text[start_index:start_index +
                    len(original_value)] == original_value
        cnt1 += 1
    delta_search1 = time.time() - start_t

    # py_aho_corasick
    start_t = time.time()
    A = py_aho_corasick.Automaton(keywords)
    delta_build2 = time.time() - start_t

    start_t = time.time()
    kv = A.get_keywords_found(text)
    cnt2 = 0
    for idx, k, v in kv:
        assert text[idx:idx + len(k)] == k
        cnt2 += 1
    delta_search2 = time.time() - start_t

    # brute force
    start_t = time.time()
    cnt3 = 0
    for kw in keywords:
        beg = 0
예제 #8
0
sentences = splitSentence(a1)

# ------------------------ import disease list as kv ------------------------ #
# Each non-blank line of the disease list is "<name>,<integer value>";
# kv collects them as (name, value) pairs for the automaton.
kv = []
direDisease = 'diseasesList\\epidemicDiseaseWHO.txt'
# "with" guarantees the handle is closed once the lines have been read.
with open(direDisease, 'r') as f:
    lines = f.readlines()
for line in lines:
    if not line.strip():
        # A blank line (e.g. at the end of the file) has no fields to parse.
        continue
    temp = line.split(',')
    kv.append((temp[0], int(temp[1])))

# -------------------------- Aho Corasick algorithm -------------------------- #
i = 0
A = py_aho_corasick.Automaton(kv)
# Built once here instead of calling dict(kv) again for every match.
kv_lookup = dict(kv)

diseaseList = []
for sentence in sentences:
    # NOTE(review): ``lower`` is not a builtin; it must be defined or
    # imported elsewhere in this file - confirm.
    text = lower(sentence)
    for idx, k, v in A.get_keywords_found(text):
        # Sanity checks: the match sits where reported and carries the
        # value registered for its keyword.
        assert text[idx:idx + len(k)] == k
        assert v == kv_lookup[k]
        diseaseList.append(v)
print(diseaseList)
'''
#output direction
dire = 'temp\\diseaseExtraction' + a2[0] + a2[1] + '.tmp'
f = open(dire,'w')

for sentence in sentences:
예제 #9
0
from py_aho_corasick import py_aho_corasick


def load_entity(filename):
    """Read tab-separated ``word<TAB>tag`` rows from a UTF-8 text file.

    Only the first two columns of each row are used. Rows with fewer than
    two tab-separated fields (for example a blank trailing line) are skipped
    instead of raising ``IndexError``.

    Args:
        filename: Path of the entity file to read.

    Returns:
        A list of ``(word, tag)`` tuples in file order.
    """
    content = []
    with open(filename, encoding='utf-8') as f:
        for line in f:
            fields = line.rstrip("\n").split('\t')
            if len(fields) < 2:
                continue
            content.append((fields[0], fields[1]))
    return content


# Entities are (keyword, tag) pairs, for example:
# [(u'SK텔레콤', u'COMPANY'), (u'5G', u'TERM')]

entity_dict = load_entity("wiki_entity_100000.txt")
A = py_aho_corasick.Automaton(entity_dict)

with open("data/17670_N1.txt", "r", encoding='utf-8') as f:
    for line in f:
        # The file is opened in text mode with an explicit encoding, so each
        # line is already decoded text; only the spaces are removed before
        # matching.
        line_concat = line.replace(" ", "")
        for idx, k, v in A.get_keywords_found(line_concat):
            # One match per output line: index, keyword, tag.
            print(idx, k, v)