def initialize_files(self, entity_type):
    """
    Check the entity type and load the corresponding CRF model.
    Currently supported for entity_type='city' and entity_type='date'.
    The loaded CRF++ tagger is cached in a module-level global so the model
    file is read only once, and self.tagger / self._model_path are set accordingly.

    Args:
        entity_type: type of entity
    """
    global CITY_MODEL_OBJECT, DATE_MODEL_OBJECT
    if entity_type == CITY_ENTITY_TYPE:
        self._model_path = CITY_MODEL_PATH
        if not CITY_MODEL_OBJECT:
            CITY_MODEL_OBJECT = CRFPP.Tagger("-m %s -v 3 -n2" % self._model_path)
            ner_logger.debug('CITY CRF model loaded %s' % self._model_path)
        self.tagger = CITY_MODEL_OBJECT
    elif entity_type == DATE_ENTITY_TYPE:
        self._model_path = DATE_MODEL_PATH
        if not DATE_MODEL_OBJECT:
            DATE_MODEL_OBJECT = CRFPP.Tagger("-m %s -v 3 -n2" % self._model_path)
            ner_logger.debug('DATE CRF model loaded %s' % self._model_path)
        self.tagger = DATE_MODEL_OBJECT
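# A minimal usage sketch (an assumption, not part of the source): once
# initialize_files has set self.tagger, a detector could feed one token per
# line and read back the predicted label for each token. The tagger methods
# used here (clear/add/parse/size/y2) match those used elsewhere in this file.
def tag_tokens(self, tokens):
    self.tagger.clear()
    for token in tokens:
        self.tagger.add(token)  # one observation row per token
    self.tagger.parse()         # run Viterbi decoding
    return [self.tagger.y2(i) for i in range(self.tagger.size())]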
def create_model(seg_file=None, pos_file=None):
    # Default to the bundled segmentation / POS models next to this module.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    if not seg_file:
        seg_file = os.path.join(base_dir, 'data', 'crf_seg.model')
    if not pos_file:
        pos_file = os.path.join(base_dir, 'data', 'crf_pos.model')
    seg_model = CRFPP.Model("-m %s" % seg_file)
    pos_model = CRFPP.Model("-m %s" % pos_file)
    return seg_model, pos_model
def locationNER(text):
    # tagger = load_model('model')
    tagger = CRFPP.Tagger('-m model -v 3 -n2')
    for c in text:
        tagger.add(c)
    result = []
    tagger.parse()
    # print(tagger.size())
    word = ''
    for i in range(0, tagger.size()):
        for j in range(0, tagger.xsize()):
            ch = tagger.x(i, j)
            tag = tagger.y2(i)
            # Reassemble entities from BMES tags: B starts a word, M continues
            # it, E closes it, S is a single-character word.
            if tag == 'B':
                word = ch
            elif tag == 'M':
                word += ch
            elif tag == 'E':
                word += ch
                result.append(word)
            elif tag == 'S':
                word = ch
                result.append(word)
    return result
def getCx(content_words):
    """
    Get part-of-speech tags.
    :param content_words: words, in the order they appear in the sentence
    :return: the tag for each word
    """
    tags = list()
    try:
        tagger = CRFPP.Tagger("-m " + "crf_model_1")
        tagger.clear()
        for word in content_words:
            word = word.strip()
            if word:
                tagger.add(word.encode('utf-8'))
        tagger.parse()
        size = tagger.size()    # number of rows added via tagger.add
        xsize = tagger.xsize()  # number of feature columns
        for i in range(0, size):
            for j in range(0, xsize):
                # char = tagger.x(i, j).decode('utf-8')
                tag = tagger.y2(i)
                tags.append(tag)
    except RuntimeError as e:
        print("RuntimeError: ", e)
    return tags
def __init__(self, model_path=None, args="-v 3 -n1"):
    if model_path is None:
        model_path = os.path.join(os.path.dirname(__file__),
                                  "./../model/seg.crf.model")
    if args is None:
        args = "-v 3 -n1"
    self.model = CRFPP.Tagger("-m " + model_path + " " + args)
def __init__(self, name):
    model_path = train_path + name + "/"
    which_model = model_path + "model"
    arguments = "-m " + which_model + " -v 3 -n2"
    self.labels = read_labels(model_path)
    self.tagger = CRFPP.Tagger(arguments)
    self.name = name
def load_model(path):
    import os, CRFPP
    # -v 3: access deep information like alpha, beta, prob
    # -nN : enable nbest output. N should be >= 2
    if os.path.exists(path):
        return CRFPP.Tagger('-m {0} -v 3 -n2'.format(path))
    return None
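# A hedged usage sketch (the path 'model' and the token loop are illustrative
# assumptions): because the tagger above is created with -v 3, per-position
# marginal probabilities are available through prob(i, j), the same call used
# by selectDocByEntropy and selectDocByTfidf below.
tagger = load_model('model')
if tagger is not None:
    tagger.clear()
    for token in ['John', 'lives', 'in', 'Berlin']:
        tagger.add(token)
    tagger.parse()
    for i in range(tagger.size()):
        best_tag = tagger.y2(i)
        # marginal probability of every candidate tag at position i
        marginals = [tagger.prob(i, j) for j in range(tagger.ysize())]
        print(tagger.x(i, 0), best_tag, max(marginals))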
def selectDocByEntropy(pool_sentence_index, index_dic, datalines, model, n):
    tagger = CRFPP.Tagger("-m " + model + " -v 3 -n2")
    dic_sentence_entropy = []
    for index in pool_sentence_index:
        begin, end = index_dic[index]
        tagger.clear()
        for line in datalines[begin:end]:
            words = line.split('\t')
            newLine = '\t'.join(words[:-1])  # drop the gold label column
            tagger.add(newLine)
        tagger.parse()
        ysize = tagger.ysize()  # tag1 tag2 tag3 ...
        size = tagger.size()    # char1 char2 char3 ...
        entropy = 0.0
        for i in range(0, size):
            # wordCurr = tagger.x(i, 0)
            # tagCurr = tagger.y2(i)
            for j in range(0, ysize):
                prob = tagger.prob(i, j)
                entropy -= prob * math.log(prob) / math.log(2)
        # entropy /= size
        dic_sentence_entropy.append((index, entropy))
    dic_sentence_entropy.sort(key=lambda x: x[1], reverse=True)
    if len(dic_sentence_entropy) > n:
        return [x[0] for x in dic_sentence_entropy[:n]]
    return [x[0] for x in dic_sentence_entropy]
def __init__(self, pathModel=r"./crfpp_model.bin"):
    """
    Initializer.

    Args:
        pathModel: string, path to the CRF++ model
    Return:
        None
    """
    # -v 3: access deep information like alpha, beta, prob
    # -n N: enable N best output. N should be >= 2
    self.tagger = CRFPP.Tagger("-m " + pathModel + " -v 3 -n 2")
    # clear internal context
    self.tagger.clear()
    # number of input feature columns
    print("column size: {}".format(self.tagger.xsize()))
    # number of input tokens (i.e. the sentence length)
    # print("token size: {}".format(self.tagger.size()))
    # number of output tag classes
    print("tag size: {}".format(self.tagger.ysize()))
    self.tagsList = [
        self.tagger.yname(i) for i in range(self.tagger.ysize())
    ]
    print("tag set: {}".format(self.tagsList))
    self.classes_ = self.tagsList
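# A small illustration (hypothetical method on the same class, not from the
# source): since the constructor above passes "-n 2", the n-best decodes can
# be walked with next()/prob(), mirroring the sample script shipped with the
# CRF++ Python bindings. It assumes observation rows have already been add()ed.
def print_nbest(self, n=2):
    self.tagger.parse()
    for k in range(n):
        if not self.tagger.next():  # advance to the k-th best tag sequence
            break
        seq = [self.tagger.y2(i) for i in range(self.tagger.size())]
        print("nbest", k, "conditional prob", self.tagger.prob(), seq)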
def processing():
    jieba.load_userdict("../../../data/userdict.txt")
    tagger = CRFPP.Tagger("-m ../../NER/train/model")
    TestCol = db["test"]
    data = TestCol.find()
    try:
        for d in data:
            view = []
            content = d["content"]
            # _content = []
            words = pseg.cut(content)
            for w in words:
                tag = w.word + " " + w.flag
                # _content.append(w.word)
                tagger.add(tag)
            tagger.parse()
            ysize = tagger.ysize()
            size = tagger.size()
            xsize = tagger.xsize()
            for i in range(size):
                if tagger.y2(i) == "Y":
                    view.append(tagger.x(i, 0))
            d["view"] = view
            # d["content"] = _content
            TestCol.save(d)
            tagger.clear()
    except RuntimeError as e:
        print("RuntimeError: ", e)
def __init__(self, table, sym, conf):
    super(LTGenCRF, self).__init__(table, sym)
    model = conf.get("log_template_crf", "model_filename")
    self._middle = conf.get("log_template_crf", "middle_label")
    self._crf = CRFPP.Tagger("-m " + model + " -v 3 -n2")
    if self._middle == "re":
        import label_word
        self._lwobj = label_word.LabelWord(conf)
def __init__(self, model_file, verb_level=2, best_out_n=2):
    try:
        self.m, self.v, self.bn = model_file, verb_level, best_out_n
        self.tagger = CRFPP.Tagger("-m %s -v %i -n%i"
                                   % (model_file, verb_level, best_out_n))
        logger.info("CRFPP Tagger initialized with command %s"
                    % ("-m %s -v %i -n%i" % (self.m, self.v, self.bn)))
    except RuntimeError as e:
        print("RuntimeError: ", e)
def use_model(self, model_path):
    """
    Use an already trained model.

    @type model_path: string
    @param model_path: the path of the model file
    """
    # self.model = model_path
    self.tagger = CRFPP.Tagger("-m %s -v3" % model_path)
def __init__(self):
    self.tagger = CRFPP.Tagger("-m /app/models/MentionExtractionUB.Model")
    self.stemmer = SnowballStemmer('english')
    # self.pos_tagger = PerceptronTagger()
    self.regex_dna_mutation_str = utils.readlines(
        '/app/models/tmvar_regexes/DNAMutation.RegEx.txt')
    self.regex_protein_mutation_str = utils.readlines(
        '/app/models/tmvar_regexes/ProteinMutation.RegEx.txt')
    self.regex_snp_mutation_str = utils.readlines(
        '/app/models/tmvar_regexes/SNP.RegEx.txt')
def crfTest(lst, model_name):
    if model_name == "DNA":
        tagger = CRFPP.Tagger("-m bioNER/static/model-5-20-DNA-protein")
    elif model_name == "RNA":
        tagger = CRFPP.Tagger("-m bioNER/static/model-5-20-RNA-protein")
    elif model_name == "cell":
        tagger = CRFPP.Tagger("-m bioNER/static/model-5-20-cell-cell")
    else:
        raise ValueError("unknown model_name: %s" % model_name)
    for line in lst:
        tagger.add('\t'.join(map(str, line)))
    tagger.parse()
    ysize = tagger.ysize()
    size = tagger.size()
    xsize = tagger.xsize()
    taglst = [tagger.y2(i) for i in range(len(lst))]
    result = []
    for i in range(len(lst)):
        tmp = [lst[i][0], taglst[i]]
        result.append(tmp)
    return result
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('model', help='model path.')
    # Default to 2 so a missing --num does not inject the string "None"
    # into the tagger arguments.
    parser.add_argument('-n', '--num', default='2', help='top n result.')
    parser.add_argument('-e', '--evaldata', help='evaluation data with ground truth.')
    parser.add_argument('-t', '--testdata', help='test data without ground truth.')
    opt = parser.parse_args()
    tagger = CRFPP.Tagger("-m {} -n {} ".format(opt.model, opt.num))
    if opt.testdata:
        test(tagger, opt.testdata)
    if opt.evaldata:
        evaluate(tagger, opt.evaldata)
def selectDocByTfidf(trainset, poolset, model, n):
    chosen = []
    char_dic, char_filename_dic = calculateCharRate(poolset)
    tagger = CRFPP.Tagger("-m " + model + " -v 3 -n2")
    dic_filename_entropy = []
    for filename in poolset:
        tagger.clear()
        f = open(os.path.join(samplefolder, filename + '.crf'), 'r')
        entropy_sum = 0.0
        line_num = 0
        line = f.readline()
        while line:
            if len(line.strip()) <= 1:
                # blank line = sentence boundary: decode the buffered sentence
                tagger.parse()
                ysize = tagger.ysize()  # tag1 tag2 tag3 ...
                size = tagger.size()    # char1 char2 char3 ...
                entropy = 0.0
                for i in range(0, size):
                    wordCurr = tagger.x(i, 0)
                    word_entropy = 0.0
                    rate = len(char_filename_dic[wordCurr])
                    tagCurr = tagger.y2(i)
                    for j in range(0, ysize):
                        prob = tagger.prob(i, j)
                        word_entropy -= prob * math.log(prob) / math.log(2)
                    entropy += word_entropy * rate
                entropy /= size
                tagger.clear()
                line_num += 1
                entropy_sum += entropy
                line = f.readline()
            else:
                words = line.split('\t')
                newLine = '\t'.join(words[:-1])
                tagger.add(newLine)
                line = f.readline()
        entropy_sum /= line_num
        dic_filename_entropy.append((filename, entropy_sum))
        f.close()
    dic_filename_entropy.sort(key=lambda x: x[1], reverse=True)
    if len(dic_filename_entropy) > n:
        return [x[0] for x in dic_filename_entropy[:n]]
    return [x[0] for x in dic_filename_entropy]
def segment(sentence, model):
    try:
        tagger = CRFPP.Tagger("-m " + model)
        # clear internal context
        tagger.clear()
        l_features = features
        splitted = split_enumerate(sentence, '.')
        raws = [word for word, span in splitted]
        words = [(normalize(word), span) for word, span in splitted]
        words_length = len(words)
        raws = iter(raws)
        if words_length == 0:
            return [(0, 0)]
        else:
            lastword, last_span = words.pop()
            words = iter(words)
            last_span = str(last_span[0]), str(last_span[0])
            # add context: the first two words carry LF* features
            for i, (w, span) in enumerate(words):
                nextline = '\t'.join((next(raws), )
                                     + l_features(w, u'LF%s' % (i, ))).encode('utf-8')
                tagger.add(nextline)
                if i >= 1:
                    break
            for w, span in words:
                # s_span = (str(span[0]), str(span[1]))
                nextline = '\t'.join((next(raws), )
                                     + l_features(w, u'MID')).encode('utf-8')
                tagger.add(nextline)
            nextline = '\t'.join((next(raws), )
                                 + l_features(lastword, u'RHT')).encode('utf-8')
            tagger.add(nextline)
            # Parse and change internal state to 'parsed'
            tagger.parse()
            anchors = crf_anchors(tagger, splitted)
            # print "Done tagging crf"
            return anchors
    except RuntimeError as e:
        print("RuntimeError: ", e, end=' ')
def __init__(
    self,
    modelfile=None,
    nbest=None,
):
    if not nbest:
        nbest = 1
    if not modelfile:
        raise ValueError("modelfile is required")
    self.tagger = CRFPP.Tagger('-n ' + str(nbest) + ' -m ' + modelfile)
    self.tagger.clear()
    self.begin = "#SENT_BEG#\tbegin\tOUT"
    self.end = "#SENT_BEG#\tend\tOUT"
    self.terms = []
def parse_sentence(phrase):
    try:
        text = nltk.word_tokenize(phrase)
        POS = nltk.pos_tag(text)  # get the POS for CRF++
        tagger = CRFPP.Tagger("-m model_file -v 3 -n2")
        # clear internal context
        tagger.clear()
        for i in range(0, len(POS)):
            # add the word and its POS together
            word = (POS[i][0] + ' ' + POS[i][1])
            tagger.add(word)
        tagger.parse()  # parse the sentence
    except RuntimeError as e:
        print("RuntimeError: ", e)
def initialize_files(self, entity_type):
    """
    Check the entity type and load the corresponding CRF model.
    Currently supported for entity_type='city'. If entity_type is city, the
    CRF model trained for city is loaded and the tagger and model_path are
    initialized accordingly.

    Args:
        entity_type: type of entity
    """
    if entity_type == CITY_ENTITY_TYPE:
        self._model_path = CITY_MODEL_PATH
        self.tagger = CRFPP.Tagger("-m %s -v 3 -n2" % self._model_path)
def chunk(self, string):
    import CRFPP
    tagger = CRFPP.Tagger("-m " + self.modelFile)
    indexes, tokens = tokenize(string)
    tags = self.tagger.tag(tokens)  # POS tags from the instance tagger
    tagger.clear()
    for word, posTag in tags:
        tagger.add(str("%s %s" % (word, posTag)))
    tagger.parse()
    # for i, (index, token) in enumerate(zip(indexes, tokens)):
    #     print(index, token, tagger.y2(i))
    labels = [tagger.y2(i) for i in range(len(tokens))]
    return indexes, tokens, labels
def add_tagger(self, tag_data):
    """
    Add text to the tagger.
    :param tag_data: raw text; each character is fed as one observation
    :return: the parsed CRF++ tagger
    """
    word_str = tag_data.strip()
    if not os.path.exists(self.model):
        print('Model not found, please check that the model path is correct!')
        exit()
    tagger = CRFPP.Tagger("-m {} -v 3 -n2".format(self.model))
    tagger.clear()
    for word in word_str:
        tagger.add(word)
    tagger.parse()
    return tagger
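# A hedged follow-up sketch: reading segments back out of the tagger returned
# by add_tagger above, assuming the model emits B/M/E/S labels as in the
# locationNER and crf_segmenter snippets elsewhere in this file.
def decode_segments(tagger):
    words, buf = [], ''
    for i in range(tagger.size()):
        ch, tag = tagger.x(i, 0), tagger.y2(i)
        if tag == 'B':
            buf = ch
        elif tag == 'M':
            buf += ch
        elif tag == 'E':
            words.append(buf + ch)
            buf = ''
        else:  # 'S': single-character word
            words.append(ch)
    return words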
def __init__(self, model_dir=__default_model_dir__):
    MIDic = path.join(model_dir, 'DefaultMI.pickle')
    WordDic = path.join(model_dir, 'WordDic.pickle')
    CRFModel = path.join(model_dir, 'DefaultModel')
    # load char-bigram mutual information
    self.MIDic = pickle.load(open(MIDic, 'rb'))
    # load the simple dictionary
    self.WordDic = pickle.load(open(WordDic, 'rb'))
    # path of the CRF model file
    self.CRFModel = CRFModel
    # CRF module
    self.tagger = CRFPP.Tagger("-m " + self.CRFModel)
    self._to_tokens_rule = re.compile(Segmentor._to_tokens_pattern)
def add_tagger(self, tag_data_file):
    """
    Add a corpus file to the tagger.
    :param tag_data_file: path of a file with one pre-formatted CRF++ row per line
    :return: the parsed CRF++ tagger
    """
    tag_data_file = open(tag_data_file, 'r')
    if not os.path.exists(self.model):
        print('Model not found, please check that the model path is correct!')
        exit()
    tagger = CRFPP.Tagger("-m {} -v 3 -n2".format(self.model))
    tagger.clear()
    while True:
        line = tag_data_file.readline()
        if line == '':
            break
        tagger.add(line.strip())
    tagger.parse()
    tag_data_file.close()
    return tagger
def locationNER(text):
    tagger = CRFPP.Tagger("-m {0} -v 3 -n2".format("data/modelwithflag"))
    for c in text:
        tagger.add(c)
    res = []
    tagger.parse()
    builder = ""
    for i in range(tagger.size()):
        for j in range(tagger.xsize()):
            ch = tagger.x(i, j)
            tag = tagger.y2(i)
            if tag == "B":
                builder = ch
            elif tag == "M":
                builder += ch
            elif tag == "E":
                builder += ch
                res.append(builder)
            elif tag == "S":
                builder = ch
                res.append(builder)
    return res
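# Hypothetical call (the input sentence and extracted spans are illustrative,
# not output reproduced from the source): characters tagged B/M/E are joined
# into multi-character place names, and S yields single-character ones.
# locations = locationNER(u'我去北京和上海')
# locations might then be ['北京', '上海'] if the model tags those spans.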
def crf_segmenter(input_file, output_file, tagger):
    input_data = codecs.open(input_file, 'r', 'utf-8')
    output_data = codecs.open(output_file, 'w', 'utf-8')
    for line in input_data.readlines():
        tagger.clear()
        for word in line.strip():
            word = word.strip()
            if word:
                tagger.add((word + "\to\tB").encode('utf-8'))
        tagger.parse()
        size = tagger.size()
        xsize = tagger.xsize()
        for i in range(0, size):
            for j in range(0, xsize):
                char = tagger.x(i, j).decode('utf-8')
                tag = tagger.y2(i)
                if tag == 'B':
                    output_data.write(char)
                elif tag == 'M':
                    output_data.write(char)
                elif tag == 'E':
                    output_data.write(char + ' ')
                else:  # tag == 'S'
                    output_data.write(char + ' ')
        output_data.write('\n')
    input_data.close()
    output_data.close()


if __name__ == '__main__':
    if len(sys.argv) != 4:
        print("pls use: python crf_segmenter.py model input output")
        sys.exit()
    crf_model = sys.argv[1]
    input_file = sys.argv[2]
    output_file = sys.argv[3]
    tagger = CRFPP.Tagger("-m " + crf_model)
    crf_segmenter(input_file, output_file, tagger)
            if word:
                tagger.add((word + '\to\tB'))
        tagger.parse()
        size = tagger.size()
        xsize = tagger.xsize()
        for i in range(size):
            for j in range(xsize):
                char = tagger.x(i, j)
                tag = tagger.y2(i)
                if tag == 'B':
                    test_result_file.write(' ' + char)
                elif tag == 'M':
                    test_result_file.write(char)
                elif tag == 'E':
                    test_result_file.write(char + ' ')
                else:
                    test_result_file.write(' ' + char + ' ')
        test_result_file.write('\n')
    test_file.close()
    test_result_file.close()


if __name__ == '__main__':
    crf = CRF()
    # crf.tagging()
    tagger = CRFPP.Tagger('-m' + crf_model)
    crf.segment(tagger)
    e = Evaluate()
    e.evaluate()
    e.result()
def load_model(path):
    if os.path.exists(path):
        return CRFPP.Tagger('-m{0} -v 3 -n 2'.format(path))
    return None
def __setstate__(self, state):
    self.__dict__.update(state)
    self.tagger = CRFPP.Tagger("-m %s -v %i -n%i" % (self.m, self.v, self.bn))
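# A hedged counterpart sketch (not shown in the source): __setstate__ above
# rebuilds the tagger from self.m / self.v / self.bn because the SWIG-wrapped
# CRFPP.Tagger handle itself cannot be pickled, so a matching __getstate__
# would drop it before serialization.
def __getstate__(self):
    state = self.__dict__.copy()
    state.pop('tagger', None)  # the CRF++ tagger handle is not picklable
    return state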