def __init__(self, kg, env):
    """Wire up KG lookup helpers and the fastHan NLP model.

    kg: knowledge-graph object exposing an `entity_industry` iterable.
    env: mapping with 'GRAPH_BUCKET' and 'KG_VOCAB_KEY' used to load the vocab.
    """
    # (removed dead commented-out input/output bucket assignments)
    self.kg = kg
    # Prefix-searchable trie over the KG's industry entity names.
    self.trie = marisa_trie.Trie(list(kg.entity_industry))
    self.vocab = Vocab(env['GRAPH_BUCKET'], env['KG_VOCAB_KEY'])
    self.model = FastHan()
def get_info(sentence, n=-1):
    '''Extract speakers and their stated opinions from a sentence.

    sentence: the sentence to extract opinions from
    n: count of synonyms of '说' ("say") to load; -1 presumably means
       all of them — TODO confirm against load_similar_words
    Returns a dict mapping each speaker (person) to that person's opinion.
    NOTE(review): if the same speaker appears twice, the later opinion
    overwrites the earlier one.
    '''
    s_simp = to_simplified(sentence)  # traditional -> simplified Chinese
    sim_words = load_similar_words(n)  # load synonyms of '说' ("say")
    # Dependency parsing
    model = FastHan()
    dep_answer = model(s_simp, target="Parsing")
    dep_tree = sen2tree(dep_answer)
    # Named entity recognition
    ner = model(s_simp, target="NER")
    ner_word = [ner[0][i].word for i in range(len(ner[0]))]
    # Indices of dependency-tree nodes whose word was recognized as an entity.
    ner_ind = [i for i in range(len(dep_tree)) if dep_tree[i].word in ner_word]
    # Information extraction: every verb that is a "say"-synonym anchors a quote.
    answer = {}
    for node in dep_tree:
        if node.pos == 'VV' and node.word in sim_words:
            # extrat_info (sic — name defined elsewhere) returns (speaker, point).
            speaker, point = extrat_info(dep_tree, node, ner_ind)
            answer[speaker] = point
    return answer
def test_call(self):
    """Smoke-test the large model on every task target with a sentence batch.

    Bug fix: the original list was missing a comma after the third string,
    so implicit string concatenation silently merged sentences 3 and 4 into
    one element (a 3-sentence batch instead of the intended 4).
    """
    sentence = ['一行人下得山来,走不多时,忽听前面猛兽大吼之声一阵阵的传来。',
                '韩宝驹一提缰,胯下黄马向前窜出,奔了一阵,忽地立定,不论如何催迫,黄马只是不动。',
                '韩宝驹心知有异,远远望去,只见前面围了一群人,有几头猎豹在地上乱抓乱扒。',
                '他知坐骑害怕豹子,跃下马来,抽出金龙鞭握在手中。']
    targets = ['CWS', 'POS', 'NER', 'Parsing']
    model = FastHan('large')
    for target in targets:
        model(sentence, target)
def __init__(self, model_name, ip=None):
    """Load exactly one backing NER model selected by name.

    model_name: one of "fasthan", "stanford", "bbc".
    ip: required for "bbc" (BertClient server address); unused otherwise.
    Raises ValueError when model_name == "bbc" and no ip is given.
    """
    config = Config()
    self.model_name = model_name
    if self.model_name == "fasthan":
        self.nltk_model = FastHan(model_type="base")
    elif self.model_name == "stanford":
        path = config.project_dir
        # Bug fix: the original used a literal backslash inside the string
        # ('model\stanford-…'), an invalid escape and a Windows-only
        # separator; join the components portably instead.
        self.stanford_model = StanfordCoreNLP(
            os.path.join(path, 'model', 'stanford-corenlp-full-2016-10-31'),
            lang='zh')
    elif self.model_name == "bbc":
        if not ip:
            raise ValueError("bbc模型必须填入ip")
        self.bbc_model = BertClient(ip, ner_model_dir=None,
                                    show_server_config=False,
                                    check_version=False,
                                    check_length=False, mode='NER')
def reprocess(path, file):
    """Re-run subject extraction over the unprocessed entries of a JSON file.

    Reads <path>/<file> (a dict of annotation records), re-extracts subjects
    for every entry whose 'is_process' flag is falsy, and writes all entries
    to <path>/new_<file>. Prints each reprocessed key and a final count.
    """
    model_fastHan = FastHan(model_type='large')
    result = {}
    with open(os.path.join(path, file), 'r', encoding='utf-8') as f:
        data = json.load(f)
    cnt = 0
    for k, v in data.items():
        # Narrowed from a bare `except: pass`, which silently hid every
        # failure (including KeyboardInterrupt); now failures are reported
        # but the batch still continues best-effort.
        try:
            if not v['is_process']:
                print(k, v['sentence'])
                cnt += 1
                subjects, tuples, simplified_tuples = extract_subjects(
                    v['sentence'], model_fastHan)
                v['subjects'] = subjects
                v['tuples'] = tuples
                v['simplified_tuples'] = simplified_tuples
                # NOTE(review): kept False as in the original — the flag
                # appears to mean "human-annotated", not "machine-processed";
                # confirm before flipping to True.
                v['is_process'] = False
            result[k] = v
        except Exception as e:
            print('reprocess failed for {}: {}'.format(k, e))
    print(cnt)
    with open(os.path.join(path, 'new_{}'.format(file)), 'w',
              encoding='utf-8') as f:
        json.dump(result, f)
import sys
import io

from fastHan import FastHan

# Force UTF-8 on stdout so Chinese tokens print correctly on any locale.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')

model = FastHan()
sentence = sys.argv[1]
answer = model(sentence, target="Parsing")

# Bug fix: the original accumulated into a variable named `str`, shadowing
# the builtin, and built the string with quadratic `+=`. Output format is
# unchanged: every token word followed by a comma (empty input prints '').
words = [token.word for parsed in answer for token in parsed]
print(''.join(word + ',' for word in words))
def test_init(self):
    """FastHan should construct cleanly for the default and 'large' models."""
    for ctor_args in ((), ('large',)):
        model = FastHan(*ctor_args)
def __init__(self, kg):
    """Hold the knowledge graph plus its lookup helpers and NLP model."""
    self.kg = kg
    # Industry entity names go into a trie for fast prefix lookups.
    industry_names = list(kg.entity_industry)
    self.trie = marisa_trie.Trie(industry_names)
    self.vocab = Vocab()
    self.model = FastHan()
# NOTE(review): this chunk begins mid-way inside an enclosing function
# (its `def` — presumably extract_subjects — is not visible here) and is
# truncated mid-dict at the end; indentation below is a best-effort
# reconstruction from the flattened source — confirm nesting before relying
# on it.
            next_word, _, _, next_pos = tuples[j]
            if next_pos in ['DEC', 'DEV']:
                # Fold the trailing particle into the current subject span.
                subject = subject + next_word
                subj_pos = '+'.join([subj_pos, next_pos])
            break
        subjects.append([subject, subj_pos])
        position += len(word)
    return subjects, tuples, simplified_tuples


def write_data(data, index):
    # Serialize `data` to static/data/data_<index>.json; ensure_ascii=False
    # keeps CJK characters readable.
    # NOTE(review): no encoding= on open() — on Windows this may raise for
    # non-default-codepage characters; confirm the deployment platform.
    with open('static/data/data_{}.json'.format(index), 'w') as f:
        data = json.dumps(data, ensure_ascii=False)
        f.write(data)


if __name__ == '__main__':
    # Build fresh annotation records from '0subject_rand5000.txt', one line
    # per record; each line looks like "<prefix>::<sentence>".
    model_fastHan = FastHan()
    result = {}
    with open('0subject_rand5000.txt', 'r', encoding='utf-8') as f:
        cnt = 0
        temp = enumerate(f.readlines())
        for index, line in tqdm(temp):
            line = line.strip()
            subjects, tuples, simplified_tuples = extract_subjects(line.split('::')[1], model_fastHan)
            # 'is_process' False marks the record as awaiting annotation.
            result[index] = {
                'id': str(index),
                'is_process': False,
                'tokens': [],
                'type': None,
                'validity': None,
                'subjects': subjects,
                'tuples': tuples,
def fasthan_cws():
    """Demo: Chinese word segmentation with the user dictionary disabled."""
    cws_model = FastHan()
    text = "郭靖是金庸笔下的一名男主。"
    tokens = cws_model(text, 'CWS', use_dict=False)
    print(tokens)
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description: fastHan demo — parsing, NER, CWS styles, and batch parsing.
"""
from fastHan import FastHan

model = FastHan()

# Dependency parsing then NER on the same sentence.
demo = "郭靖是金庸笔下的一名男主。"
for task in ("Parsing", "NER"):
    print(model(demo, target=task))

# Segmentation before and after switching to the 'cnc' CWS style.
apple = "一个苹果。"
print(model(apple, 'CWS'))
model.set_cws_style('cnc')
print(model(apple, 'CWS'))

# Batch parsing: print each sentence index, then every token's attributes.
batch = ["我爱踢足球。", "林丹是冠军"]
parsed_batch = model(batch, 'Parsing')
for idx, parsed_sentence in enumerate(parsed_batch):
    print(idx)
    for tok in parsed_sentence:
        print(tok, tok.pos, tok.head, tok.head_label)
def __init__(self):
    """Instantiate the wrapper around a default fastHan model."""
    # Imported lazily so fastHan is only required when this class is used.
    import fastHan
    self.model = fastHan.FastHan()