def test_keywords_only():
    """A keyword-only automaton reports every match with a correct start index."""
    automaton = py_aho_corasick.Automaton(['cash', 'shew', 'ew'])
    haystack = "cashew"
    hits = automaton.get_keywords_found(haystack)
    # "cashew" contains all three keywords exactly once each.
    assert len(hits) == 3
    for pos, kw, _val in hits:
        # The reported offset must point at the keyword inside the haystack.
        assert haystack[pos:pos + len(kw)] == kw
def wc_by_ner(file_name):
    """Count NER-matched entities and remaining content words in *file_name*,
    then render and send a word cloud of the frequencies.

    Entities come from a seed list plus ``wiki_entity_100000.txt`` (one
    ``word<TAB>tag`` pair per line) and are matched with an Aho-Corasick
    automaton over the whitespace-stripped line. Words POS-tagged by ``mecab``
    that are not part of any matched entity are counted separately.

    Depends on module-level names: ``py_aho_corasick``, ``mecab``,
    ``tag_list``, ``WordCloud``, ``path``, ``d``, ``utils``.
    """
    entities = [(u'SK텔레콤', u'COMPANY'), (u'5G', u'TERM')]
    # FIX: open in text mode with an explicit encoding; the original called
    # bytes.decode() on str objects (a Python 2 leftover that raises on Py3).
    with open("wiki_entity_100000.txt", "r", encoding="utf-8") as f:
        for line in f:  # iterate lazily instead of readlines()
            sp = line.strip().split("\t")
            entities.append((sp[0], sp[1]))
    B = py_aho_corasick.Automaton(entities)
    # FIX: renamed from `dict`, which shadowed the builtin.
    counts = {}
    with open(file_name, encoding="utf-8") as f:
        for line in f:
            line_entity = []
            line_word = []
            # Match against the line with all spaces removed so multi-token
            # entities are found as contiguous runs.
            line_concat = line.replace(" ", "")
            for idx, k, v in B.get_keywords_found(line_concat):
                line_entity.append(k.upper())
            for word, tag in mecab.pos(line.strip()):
                word = word.upper()
                if tag not in tag_list:
                    continue
                if len(word) < 1:
                    continue
                # Keep the word only if it is not a substring of any entity
                # already matched on this line.
                if not any(word in entity for entity in line_entity):
                    line_word.append(word)
            # Entities and leftover words contribute equally to the counts;
            # both lists are already upper-cased.
            for word in line_entity:
                counts[word] = counts.get(word, 0) + 1
            for word in line_word:
                counts[word] = counts.get(word, 0) + 1
    wordcloud = WordCloud(background_color="white",
                          font_path=path.join(d + 'fonts', "NanumMyeongjo.ttf"))
    wordcloud.fit_words(counts)
    wordcloud.to_file(path.join(d, "wc.png"))
    utils.send_image(path.join(d, "wc.png"))
def test_utf8():
    """UTF-8 encoded (byte-string) keywords are matched with byte offsets and values."""
    pairs = [(u'哈哈'.encode('utf8'), 1),
             (u'你好'.encode('utf8'), 2),
             (u'算我shu'.encode('utf8'), 3)]
    expected = dict(pairs)
    automaton = py_aho_corasick.Automaton(pairs)
    haystack = u'你好哈哈算我shu咯'.encode('utf8')
    found = automaton.get_keywords_found(haystack)
    # All three keywords occur exactly once in the haystack.
    assert len(found) == 3
    for pos, kw, val in found:
        # Offsets are byte offsets into the encoded haystack.
        assert haystack[pos:pos + len(kw)] == kw
        # Each hit carries the value registered for its keyword.
        assert expected[kw] == val
def test_keywords_and_values():
    """(keyword, value) pairs round-trip: hits report both index and stored value."""
    pairs = [('cash', 1), ('shew', 2), ('ew', 3)]
    expected = dict(pairs)
    automaton = py_aho_corasick.Automaton(pairs)
    haystack = "cashew"
    found = automaton.get_keywords_found(haystack)
    assert len(found) == 3
    for pos, kw, val in found:
        # Reported slice must reproduce the keyword...
        assert haystack[pos:pos + len(kw)] == kw
        # ...and the attached value must match the registration.
        assert expected[kw] == val
def test_pickle():
    """An Automaton survives a pickle round-trip and still matches correctly."""
    import pickle

    pairs = [('cash', 1), ('shew', 2), ('ew', 3)]
    expected = dict(pairs)
    original = py_aho_corasick.Automaton(pairs)
    # Serialize and restore; the restored copy must behave identically.
    restored = pickle.loads(pickle.dumps(original))
    haystack = "cashew"
    found = restored.get_keywords_found(haystack)
    assert len(found) == 3
    for pos, kw, val in found:
        assert haystack[pos:pos + len(kw)] == kw
        assert expected[kw] == val
def init_py_aho_corasick():
    """Build and return an Automaton over the module-level ``keyword_list``."""
    automaton = py_aho_corasick.Automaton(keyword_list)
    return automaton
    # Tail of a preceding (unseen) loop that registers every keyword with the
    # pyahocorasick Automaton, tagging each with its insertion order.
    A.add_word(key, (idx, key))
# --- pyahocorasick: finalize the trie, then time a full scan of `text` ---
A.make_automaton()
delta_build1 = time.time() - start_t
start_t = time.time()
cnt1 = 0
for end_index, (insert_order, original_value) in A.iter(text):
    # iter() reports END indices; recover the start to verify the match.
    start_index = end_index - len(original_value) + 1
    assert text[start_index:start_index + len(original_value)] == original_value
    cnt1 += 1
delta_search1 = time.time() - start_t

# py_aho_corasick
# --- pure-Python py_aho_corasick: rebuild from `keywords`, time the same scan ---
start_t = time.time()
A = py_aho_corasick.Automaton(keywords)
delta_build2 = time.time() - start_t
start_t = time.time()
kv = A.get_keywords_found(text)
cnt2 = 0
for idx, k, v in kv:
    # get_keywords_found() reports START indices, unlike pyahocorasick above.
    assert text[idx:idx + len(k)] == k
    cnt2 += 1
delta_search2 = time.time() - start_t

# brute force
# --- baseline: repeated substring search, timed for comparison ---
# NOTE(review): fragment is cut off here — the scan loop body continues
# past this chunk.
start_t = time.time()
cnt3 = 0
for kw in keywords:
    beg = 0
sentences = splitSentence(a1) #--------------------------------------------------import disease list as kv-----------------------------------------------------# #import disease list kv = [] direDisease = 'diseasesList\\epidemicDiseaseWHO.txt' f = open(direDisease, 'r') lines = f.readlines() for line in lines: temp = line.split(',') kv.append((temp[0], int(temp[1]))) #print(kv) #--------------------------------------------------Aho Corasick algorithm-----------------------------------------------------# i = 0 A = py_aho_corasick.Automaton(kv) diseaseList = [] for sentence in sentences: text = lower(sentence) for idx, k, v in A.get_keywords_found(text): assert text[idx:idx + len(k)] == k assert v == dict(kv)[k] diseaseList.append(v) print(diseaseList) ''' #output direction dire = 'temp\\diseaseExtraction' + a2[0] + a2[1] + '.tmp' f = open(dire,'w') for sentence in sentences:
from py_aho_corasick import py_aho_corasick


def load_entity(filename):
    """Load (word, tag) pairs from a tab-separated UTF-8 file.

    Each line is ``word<TAB>tag``; stray newlines are stripped from both
    fields. Returns a list of 2-tuples suitable for Automaton construction.
    """
    content = []
    with open(filename, encoding='utf-8') as f:
        for line in f:  # iterate lazily instead of readlines()
            tt = line.split('\t')
            word = tt[0].replace("\n", "")
            tag = tt[1].replace("\n", "")
            content.append((word, tag))
    return content


# keywords only
# entity_dict = [(u'SK텔레콤',u'COMPANY'), (u'5G',u'TERM')]
entity_dict = load_entity("wiki_entity_100000.txt")
A = py_aho_corasick.Automaton(entity_dict)
with open("data/17670_N1.txt", "r", encoding='utf-8') as f:
    for line in f:
        # FIX: the file is opened in text mode with encoding=, so `line` is
        # already str — the original `line.decode("utf-8")` raised on Py3.
        # Remove spaces so multi-token entities match as contiguous runs.
        line_concat = line.replace(" ", "")
        # print line
        for idx, k, v in A.get_keywords_found(line_concat):
            # FIX: Python 2 `print` statement was a SyntaxError under the
            # Python 3 open(..., encoding=) used above; also drop the
            # .encode('utf-8') which would print a bytes repr on Py3.
            print(idx, k, v)