def instances(fi):
    """Yield one ``crfsuite.ItemSequence`` per sequence read from ``fi``.

    ``fi`` is an iterable of text lines. Each non-empty line describes one
    item as TAB-separated fields: the first field is skipped and every
    remaining field is an attribute written either as ``name`` (no explicit
    weight) or ``name:weight``. An empty line ends the current sequence.

    A trailing sequence that is not followed by an empty line is not yielded.

    Raises:
        ValueError: if the text after the last ``:`` of a field is not a
            valid float.
    """
    # The previous version overwrote ``fi`` with a hard-coded debug string
    # and printed a banner for every line, so the caller's input was never
    # read. Both leftovers are removed here.
    xseq = crfsuite.ItemSequence()
    for line in fi:
        line = line.strip('\n')
        if not line:
            # An empty line marks the end of a sequence.
            yield xseq
            xseq = crfsuite.ItemSequence()
            continue

        item = crfsuite.Item()
        for field in line.split('\t')[1:]:
            p = field.rfind(':')
            if p == -1:
                # Attribute without an explicit weight.
                item.append(crfsuite.Attribute(field))
            else:
                # Weighted attribute: the text after the last colon is the weight.
                item.append(
                    crfsuite.Attribute(field[:p], float(field[p + 1:])))
        xseq.append(item)
def instances(fi):
    """Generate ``(item_sequence, labels)`` pairs from TAB-separated lines.

    Every non-empty line of ``fi`` is one item: the first field is its label
    and each following field is an attribute, either ``name`` or
    ``name:weight``. An empty line closes the current sequence, which is then
    yielded as a ``crfsuite.ItemSequence`` together with a tuple of labels.
    """
    items = crfsuite.ItemSequence()
    labels = crfsuite.StringList()
    for raw in fi:
        text = raw.strip('\n')
        if text:
            columns = text.split('\t')
            entry = crfsuite.Item()
            for column in columns[1:]:
                # Split on the LAST colon; no colon means no explicit weight.
                name, colon, weight = column.rpartition(':')
                if colon:
                    entry.append(crfsuite.Attribute(name, float(weight)))
                else:
                    entry.append(crfsuite.Attribute(column))
            items.append(entry)
            labels.append(columns[0])
        else:
            # Sequence boundary: hand out what was collected and start afresh.
            yield items, tuple(labels)
            items = crfsuite.ItemSequence()
            labels = crfsuite.StringList()
def read_file_to_crfsuite(crf_input_file, crf_trainer, feature_inclusion_list,
                          participant_list):
    """Read a TAB-separated feature file and append its sequences to a trainer.

    Line handling, in order of precedence:

    * blank lines are ignored;
    * a line containing ``label`` is the header: it is split on TABs to get
      the attribute names and passed to ``get_feature_index_list`` together
      with ``feature_inclusion_list`` to select the feature columns;
    * a line containing ``START`` is skipped;
    * a line containing ``END`` closes the current sequence, which is passed
      to ``crf_trainer.append`` with the group index of the participant of
      the most recent data row (only if that participant is listed);
    * any other line is a data row: column 0 is the label, column 1 the
      participant (surrounding double quotes are stripped from both), and
      the selected columns are numeric attributes where ``NA`` becomes 0.

    Rows whose participant is not in ``participant_list`` are reported on
    stdout and dropped.

    Args:
        crf_input_file: path of the file to read.
        crf_trainer: object exposing ``append(xseq, yseq, group)``.
        feature_inclusion_list: forwarded to ``get_feature_index_list``.
        participant_list: list of accepted participants; a participant's
            position in the list is used as its group number.
    """
    import crfsuite

    feature_index_list = []
    header = []
    # Bound up front so that an END marker seen before any data row is
    # skipped instead of raising NameError.
    participant = None
    participant_group = None
    xseq = crfsuite.ItemSequence()
    yseq = crfsuite.StringList()

    # The context manager closes the file even if a row fails to parse.
    with open(crf_input_file, 'r') as f:
        for line in f:
            if line.strip(' \t\n\r') == "":
                continue
            if "label" in line:
                feature_index_list = get_feature_index_list(
                    line, feature_inclusion_list)
                header = line.split('\t')
                continue
            if "START" in line:
                continue
            if "END" in line:
                print('found END')
                if participant in participant_list:
                    crf_trainer.append(xseq, yseq, participant_group)
                # Always start a fresh sequence so that rows collected for a
                # skipped sequence cannot leak into the next one.
                xseq = crfsuite.ItemSequence()
                yseq = crfsuite.StringList()
                continue

            fields = line.split('\t')
            participant = fields[1].strip('"')
            if participant not in participant_list:
                print('participant ' + participant + ' not found')
                continue
            participant_group = participant_list.index(participant)

            item = crfsuite.Item()
            for i, value in enumerate(fields):
                if i in feature_index_list:
                    attribute_val = 0 if value == 'NA' else float(value)
                    item.append(crfsuite.Attribute(header[i], attribute_val))
            xseq.append(item)
            yseq.append(fields[0].strip('"'))
def read_file_to_crfsuite(crf_input_file, feature_inclusion_list, crf_tagger,
                          output, options_dict):
    """Tag every sequence of a TAB-separated feature file and write results.

    Line handling, in order of precedence:

    * blank lines are ignored;
    * a line containing ``label`` is the header: it is split on TABs to get
      the attribute names and passed to ``crf_train.get_feature_index_list``
      to select the feature columns;
    * a line containing ``START`` is skipped;
    * a line containing ``END`` closes the current sequence: it is tagged
      with ``crf_tagger.tag`` and the predictions are written out;
    * any other line is a data row: column 0 is the label (double quotes
      stripped) and the selected columns are numeric attributes where ``NA``
      becomes 0.

    When ``options_dict['sliding_window_length']`` is non-zero the
    predictions are handed to ``write_prediction_to_file``; otherwise one
    ``prediction,label`` line per item is written to ``output``.

    Args:
        crf_input_file: path of the file to read.
        feature_inclusion_list: forwarded to ``get_feature_index_list``.
        crf_tagger: object exposing ``tag(xseq)``.
        output: writable file-like object.
        options_dict: must contain ``'sliding_window_length'``.
    """
    sliding_window_length = options_dict['sliding_window_length']
    import crfsuite

    # Bound up front so that a data row appearing before the header yields an
    # item without attributes instead of raising NameError.
    feature_index_list = []
    header = []
    xseq = crfsuite.ItemSequence()
    yseq = crfsuite.StringList()
    overlapped_predictions = []

    # The context manager closes the file even if tagging or parsing fails.
    with open(crf_input_file, 'r') as f:
        for line in f:
            if line.strip(' \t\n\r') == "":
                continue
            if "label" in line:
                feature_index_list = crf_train.get_feature_index_list(
                    line, feature_inclusion_list)
                header = line.split('\t')
                continue
            if "START" in line:
                continue
            if "END" in line:
                prediction_seq = crf_tagger.tag(xseq)
                label_seq = util.convert_to_python_list(yseq)
                if sliding_window_length != 0:
                    overlapped_predictions = write_prediction_to_file(
                        prediction_seq, label_seq, overlapped_predictions,
                        output, options_dict)
                else:
                    # Walk predictions and gold labels in lockstep.
                    y_itr = yseq.iterator()
                    for prediction in prediction_seq:
                        label = y_itr.next()
                        output.write(
                            prediction.strip() + "," + label.strip() + "\n")
                xseq = crfsuite.ItemSequence()
                yseq = crfsuite.StringList()
                continue

            fields = line.split('\t')
            item = crfsuite.Item()
            for i, value in enumerate(fields):
                if i in feature_index_list:
                    attribute_val = 0 if value == 'NA' else float(value)
                    item.append(crfsuite.Attribute(header[i], attribute_val))
            xseq.append(item)
            yseq.append(fields[0].strip('"'))
def instances(self, fi):
    """Yield a ``crfsuite.ItemSequence`` for each blank-line-terminated block.

    Every non-empty line of ``fi`` is one item made of TAB-separated fields.
    The first field is skipped; each remaining field becomes an attribute,
    written as ``name`` or as ``name:weight`` (split on the last colon).
    """
    sequence = crfsuite.ItemSequence()
    for raw in fi:
        text = raw.strip('\n')
        if text:
            entry = crfsuite.Item()
            for column in text.split('\t')[1:]:
                name, colon, weight = column.rpartition(':')
                if colon:
                    entry.append(crfsuite.Attribute(name, float(weight)))
                else:
                    entry.append(crfsuite.Attribute(column))
            sequence.append(entry)
        else:
            # Blank line: the sequence is complete.
            yield sequence
            sequence = crfsuite.ItemSequence()
def selectActiveData(unselected, selected, model, num):
    """Compute, for every line of ``unselected``, its summed tag entropy.

    Each line is turned into boundary features, tagged with the CRF model
    stored at ``model``, and scored by adding up, over all positions of the
    sequence, the base-2 entropy of the marginal probabilities of the five
    boundary tags. The ``(line, entropy)`` pairs are collected in a local
    list.

    NOTE(review): in the code visible here ``selected`` and ``num`` are never
    read and the collected scores are not returned; this looks like the head
    of a longer function, so confirm against the full source.

    Args:
        unselected: iterable of raw sample lines.
        selected: unused in the visible body.
        model: path of the CRF model file opened by the tagger.
        num: unused in the visible body.
    """
    tagger_bp = crfsuite.Tagger()
    tagger_bp.open(model)
    bieso = ['B-entity', 'I-entity', 'E-entity', 'S-entity', 'O']
    entropy = []
    for line in unselected:
        sentence, entities_in_sentence = processing.generateSenEntities(
            line, '')
        new_sentence, new_entities = processing.symbolProcess(
            sentence, entities_in_sentence)
        # The two results below are not used further here; the calls are kept
        # so that malformed input still fails at the same place as before.
        sentence_unicode = new_sentence.decode('utf-8')
        tag_seq = processing.generateTagSeq(sentence_unicode, new_entities)
        feature_string, tags = generateFeature.boundaryFeatureGeneration(
            new_sentence, [], ebao_dic, 'demo', '0')
        try:
            instances = feature_string.strip().split('\n')
        except AttributeError:
            # No usable feature string for this line: report it and move on
            # instead of silently re-scoring the previous line's features.
            print('feature_string:%s.' % line)
            continue

        xseq = crfsuite.ItemSequence()
        for instance in instances:
            item = crfsuite.Item()
            for field in instance.split('\t')[2:]:
                item.append(crfsuite.Attribute(field))
            xseq.append(item)
        tagger_bp.set(xseq)
        yseq_b = tagger_bp.viterbi()

        ie_entity = 0.0
        for j in range(len(yseq_b)):
            ie = 0.0  # information entropy at position j
            for ent_tag in bieso:
                tag_prob = tagger_bp.marginal(ent_tag, j)
                # p * log(1/p) tends to 0 as p -> 0; skipping the term avoids
                # the ZeroDivisionError that used to end the process through
                # a blanket ``except`` followed by ``exit(0)``.
                if tag_prob > 0:
                    ie += tag_prob * math.log(1 / tag_prob, 2)
            ie_entity += ie
        entropy.append((line, ie_entity))
def tag_raw(self, data):
    """Wrap plain Python features in crfsuite types and tag them.

    ``data`` is a list with one list of features per item, e.g.
    ``[['The'], ['man'], ['barked']]``. A feature is either a string or a
    tuple that is unpacked into the attribute constructor, e.g.
    ``[['The', ('first', 1)], ['man', 'human', ('first', 0)], ...]``.

    Returns whatever ``self.tag`` returns for the built item sequence.
    """
    sequence = crfsuite.ItemSequence()
    for features in data:
        entry = crfsuite.Item()
        for feature in features:
            if isinstance(feature, tuple):
                attribute = crfsuite.Attribute(*feature)
            else:
                attribute = crfsuite.Attribute(feature)
            entry.append(attribute)
        sequence.append(entry)
    return self.tag(sequence)
def to_crfsuite(X):
    """Build a ``crfsuite.ItemSequence`` from a sequence of mappings.

    @type   X: list of mapping objects
    @param  X: The sequence; the value stored under the key ``'F'`` of each
               element is iterated and every entry becomes one attribute.
    @rtype     crfsuite.ItemSequence
    @return    The same sequence expressed with crfsuite types.
    """
    sequence = crfsuite.ItemSequence()
    for mapping in X:
        entry = crfsuite.Item()
        for name in mapping['F']:
            entry.append(crfsuite.Attribute(name))
        sequence.append(entry)
    return sequence
def append_raw(self, features_seq, labels):
    """Wrap plain Python features in crfsuite types and append them.

    ``features_seq`` holds one iterable of features per item; a feature is a
    string or a tuple that is unpacked into the attribute constructor.
    ``labels`` is converted to a tuple and passed along with the item
    sequence and group ``0`` to ``self.append``.
    """
    sequence = crfsuite.ItemSequence()
    for features in features_seq:
        entry = crfsuite.Item()
        for feature in features:
            if isinstance(feature, tuple):
                entry.append(crfsuite.Attribute(*feature))
            else:
                entry.append(crfsuite.Attribute(feature))
        sequence.append(entry)
    self.append(sequence, tuple(labels), 0)
def predictValue(feature_str):
    """Return the probability of the best label sequence for ``feature_str``.

    ``feature_str`` holds one item per line; fields are TAB-separated and the
    first two fields of every line are skipped, the rest become attributes.
    The sequence is tagged with the module-level ``tagger_b`` and the
    probability of its Viterbi label sequence is returned.

    Raises:
        AttributeError: if ``feature_str`` has no ``strip`` method (for
            example ``None``). The value is printed first.
    """
    try:
        instances = feature_str.strip().split('\n')
    except AttributeError:
        print('feature_string:%s.' % feature_str)
        # Re-raise the real cause; falling through used to fail a few lines
        # later with an unrelated "local variable referenced before
        # assignment" error.
        raise

    xseq = crfsuite.ItemSequence()
    for instance in instances:
        item = crfsuite.Item()
        # Layout per line: two leading tag fields, then the features.
        for field in instance.split('\t')[2:]:
            item.append(crfsuite.Attribute(field))
        xseq.append(item)
    tagger_b.set(xseq)
    yseq_b = tagger_b.viterbi()
    return tagger_b.probability(yseq_b)
def read_svm_format(lines):
    """Iterate over sentences stored in an SVM-like, space-separated format.

    Input lines look like::

        Y 15:0.4 16:0.01 19:3.4
        X 4 9 23

    The first cell of a line is the label; each following cell is an
    attribute, optionally followed by ``:weight`` (split on the last colon).
    A cell that is exactly ``:`` is taken as an attribute name, not split.
    Sentences are the groups produced by ``group_by_newline(lines)``.

    Yields ``(crfsuite.ItemSequence, tuple_of_labels)`` pairs, one per
    sentence.
    """
    for sentence_lines in group_by_newline(lines):
        items = crfsuite.ItemSequence()
        tags = crfsuite.StringList()
        for row in sentence_lines:
            columns = row.strip().split(' ')
            entry = crfsuite.Item()
            for column in columns[1:]:
                if column == ':':
                    # A literal colon is the attribute itself.
                    pieces = [column]
                else:
                    pieces = column.rsplit(':', 1)
                if len(pieces) > 1:
                    # Explicit weight given after the last colon.
                    entry.append(
                        crfsuite.Attribute(pieces[0], float(pieces[1])))
                else:
                    entry.append(crfsuite.Attribute(pieces[0]))
            items.append(entry)
            tags.append(columns[0])
        # One sentence finished.
        yield (items, tuple(tags))
def mainfunction(sen, postProcess, texttype, index):
    """Run the two-layer (boundary, then class) tagger on one sentence.

    The boundary model ``./models/boundarymodel-<index>`` finds entity spans
    and the class model ``./models/classmodel-<index>`` assigns a type to
    each span. Both paths are resolved against the module-level ``root``.
    The typed spans are written back as a per-character tag sequence, which
    goes through ``postProcessing.twoProcessings`` when ``postProcess`` is
    ``'1'``.

    Args:
        sen: UTF-8 encoded byte string holding the sentence.
        postProcess: ``'1'`` enables post-processing of the tag sequence.
        texttype: forwarded to feature generation and post-processing.
        index: suffix of the model file names.

    Returns:
        ``(entity_list, new_yseq)``: a string with one ``content[type]``
        line per entity, and the per-character tag list.
    """
    model_b = os.path.join(root, './models/boundarymodel-' + index)
    model_c = os.path.join(root, './models/classmodel-' + index)
    boundary_tagger = crfsuite.Tagger()
    boundary_tagger.open(model_b)
    class_tagger = crfsuite.Tagger()
    class_tagger.open(model_c)

    line = sen.strip()

    # ---- layer 1: entity boundaries ----
    feature_string, tags = generateFeature.boundaryFeatureGeneration(
        line, [], ebao_dic, 'demo', '0')
    rows = []
    try:
        rows = feature_string.strip().split('\n')
    except AttributeError:
        print('feature_string:%s.' % feature_string)
    boundary_seq = crfsuite.ItemSequence()
    for row in rows:
        entry = crfsuite.Item()
        for field in row.split('\t')[2:]:
            entry.append(crfsuite.Attribute(field))
        boundary_seq.append(entry)
    boundary_tagger.set(boundary_seq)
    yseq_b = boundary_tagger.viterbi()
    line_unicode = line.decode('utf-8')

    # ---- layer 2: entity classes ----
    sen_ent_list1, start_end_list1 = evaluation.generateEntList([yseq_b])
    entities = []
    for span in sen_ent_list1[0]:  # may be empty
        ent_start, ent_end, ent_type = span[0], span[1], span[2]
        ent_content = line_unicode[ent_start:ent_end].encode('utf-8')
        entities.append(Entity(ent_content, ent_start, ent_end, ent_type))
    feature_c, sen_ent4error = generateFeature.classFeatureGeneration(
        line, entities, ebao_dic, texttype)
    ents_type = []
    for block in feature_c.strip().split('\n\n'):
        # Each block is classified as a one-item sequence.
        entry = crfsuite.Item()
        for field in block.split('\t')[1:]:
            entry.append(crfsuite.Attribute(field))
        class_seq = crfsuite.ItemSequence()
        class_seq.append(entry)
        class_tagger.set(class_seq)
        ents_type.append(class_tagger.viterbi()[0])

    # ---- write the typed spans back as B/I/E/S tags ----
    new_yseq = ['O'] * len(line_unicode)
    for j, entity in enumerate(entities):
        start = entity.start_pos
        end = entity.end_pos
        enttype = ents_type[j]
        if start + 1 == end:
            new_yseq[start] = 'S-' + enttype
        else:
            new_yseq[start] = 'B-' + enttype
            for k in range(start + 1, end - 1):
                new_yseq[k] = 'I-' + enttype
            new_yseq[end - 1] = 'E-' + enttype

    if postProcess == '1':
        # The start_end_list used in evaluation is not adjusted.
        new_yseq = postProcessing.twoProcessings(line_unicode, new_yseq,
                                                 ebao_dic, texttype)
    # NOTE(review): the original indentation was lost; the entity list is
    # assumed to be rebuilt whether or not post-processing ran -- confirm.
    ents1, s_e_list1 = evaluation.generateEntList([new_yseq])
    new_entities = ents1[0]

    entity_list = ''
    for found in new_entities:
        content = line_unicode[found[0]:found[1]]
        enttype = found[2]
        if enttype == '':
            print('%s %s' % (line_unicode.encode('utf8'),
                             line_unicode[found[0]:found[1]].encode('utf8')))
        entity_list += content.encode(
            'utf8') + '[' + en_cn_dic[enttype] + ']\n'
    return entity_list, new_yseq
def mainfunction(inputstring, taggerb, taggerc):
    """Run two-layer NER over a multi-line text and render it as HTML.

    The text is normalised with ``tools.uniformSignal``, split into lines
    and then into sentences with ``tools.sentence_split``. Each sentence is
    tagged for entity boundaries with ``taggerb`` and for entity classes
    with ``taggerc``; the result is rendered by ``generateNerInSentence``.

    Args:
        inputstring: UTF-8 encoded byte string.
        taggerb: opened boundary tagger.
        taggerc: opened class tagger.

    Returns:
        For an empty ``inputstring``, a single prompt string. Otherwise
        ``(ner_lines, new_term_list)``: the HTML built from ``<p>`` and
        ``<br/>`` fragments, and the concatenated entity lists.
    """
    if inputstring == '':
        return '请输入句子'

    # Some sentence pre-processing.
    inputsentence = tools.uniformSignal(inputstring)
    ner_lines = ''
    new_term_list = ''
    for single_line in inputsentence.split('\n'):
        ner_line = ''
        term_list = ''
        for line in tools.sentence_split(single_line):
            line = line.strip()
            # Drop markup: text starting with '<' and ending with '>'.
            if line == '' or (line[0] == '<' and line[-1] == '>'):
                continue

            # ---- layer 1: entity boundaries ----
            feature_string, tags = generateFeature.boundaryFeatureGeneration(
                line, [], ebao_dic, 'demo', '0')
            rows = []
            try:
                rows = feature_string.strip().split('\n')
            except AttributeError:
                print('feature_string:%s.' % feature_string)
            boundary_seq = crfsuite.ItemSequence()
            for row in rows:
                entry = crfsuite.Item()
                for field in row.split('\t')[2:]:
                    entry.append(crfsuite.Attribute(field))
                boundary_seq.append(entry)
            taggerb.set(boundary_seq)
            yseq_b = taggerb.viterbi()
            taggerb.probability(yseq_b)  # value is not used afterwards
            line_unicode = line.decode('utf-8')
            model_chosen = '2layer'

            # ---- layer 2: entity classes ----
            sen_ent_list1, start_end_list1 = evaluation.generateEntList(
                [yseq_b])
            entities = []
            for span in sen_ent_list1[0]:  # may be empty
                ent_start, ent_end, ent_type = span[0], span[1], span[2]
                ent_content = line.decode(
                    'utf-8')[ent_start:ent_end].encode('utf-8')
                entities.append(
                    Entity(ent_content, ent_start, ent_end, ent_type))
            feature_c, sen_ent4error = generateFeature.classFeatureGeneration(
                line, entities, ebao_dic, texttype)
            ents_type = []
            for block in feature_c.strip().split('\n\n'):
                # Each block is classified as a one-item sequence.
                entry = crfsuite.Item()
                for field in block.split('\t')[1:]:
                    entry.append(crfsuite.Attribute(field))
                class_seq = crfsuite.ItemSequence()
                class_seq.append(entry)
                taggerc.set(class_seq)
                ents_type.append(taggerc.viterbi()[0])

            # ---- write the typed spans back as B/I/E/S tags ----
            new_yseq = ['O'] * len(line_unicode)
            for j, entity in enumerate(entities):
                start = entity.start_pos
                end = entity.end_pos
                if start + 1 == end:
                    new_yseq[start] = 'S-' + ents_type[j]
                else:
                    new_yseq[start] = 'B-' + ents_type[j]
                    for k in range(start + 1, end - 1):
                        new_yseq[k] = 'I-' + ents_type[j]
                    new_yseq[end - 1] = 'E-' + ents_type[j]

            sen_ent_colored, ent_list = generateNerInSentence(
                line_unicode, new_yseq, model_chosen, ebao_dic)
            new_term_list += ent_list
            if sen_ent_colored == '':
                sen_ent_colored = line
            ner_line += sen_ent_colored
            term_list += ent_list
        ner_lines += '<p>' + ner_line + '</p>'
        ner_lines += '<p>' + term_list + '</p>'
        ner_lines += '<br/>'
    return ner_lines, new_term_list
def __instances(self, fileRead, wordVectors, windowSize, useManualFeature):
    """Yield one ``(item_sequence, labels)`` pair per line of ``fileRead``.

    Every line of the UTF-8 file is a sentence of space-separated
    ``token<sep>label`` cells, where ``<sep>`` is
    ``self.__tokenLabelSeparator`` and the split is made on its last
    occurrence. For each token an item is built from:

    * optionally (``useManualFeature``) digit / capital / hyphen flags,
      prefixes and suffixes of length 1-4, and the templates in
      ``CRFSuite.__featuresTemplates``;
    * for every position of a window of ``windowSize`` tokens centred on the
      token, the components of that token's vector in each lexicon of
      ``wordVectors``, stored as weighted attributes.

    Args:
        fileRead: path of the data file.
        wordVectors: iterable of mappings from token to vector.
        windowSize: number of window positions.
        useManualFeature: enables the hand-crafted features.

    NOTE(review): the original indentation was lost. The template features
    are assumed to belong to the ``useManualFeature`` branch, and the token
    filters are assumed to apply to the sentence-boundary symbols as well.
    Confirm both against the original file.
    """
    defval = u''
    halfWindowSize = windowSize / 2

    # The context manager releases the file handle once the generator is
    # exhausted or closed; it used to stay open.
    with codecs.open(fileRead, 'r', 'utf-8') as dataset:
        for line in dataset:
            xseq = crfsuite.ItemSequence()
            yseq = crfsuite.StringList()
            tokens = []
            labels = []
            for token in line.rstrip().split(' '):
                if token.isspace() or not token:
                    continue
                t = token.rsplit(self.__tokenLabelSeparator, 1)
                if len(t[1]) == 0:
                    logging.getLogger("Logger").warning(
                        "It was not found the label from "
                        "the token " + token + ". We give to this token "
                        " a label equal to"
                        " the tokenLabelSeparator( " +
                        self.__tokenLabelSeparator + ")")
                    t[1] = self.__tokenLabelSeparator
                tokens.append(t[0])
                labels.append(t[1])

            for i in range(len(tokens)):
                tok = tokens[i]
                beginIndex = i - halfWindowSize
                item = crfsuite.Item()

                if useManualFeature:
                    item.append(crfsuite.Attribute(self.__createFeature(
                        "num", str(int(re.search('\d', tok) is not None)))))
                    item.append(crfsuite.Attribute(self.__createFeature(
                        "cap", str(any(c.isupper() for c in tok)))))
                    item.append(crfsuite.Attribute(self.__createFeature(
                        "hyp", str(int(re.search('-', tok) is not None)))))

                    # Prefixes of length 1..4 (empty when the token is shorter).
                    for n in range(1, 5):
                        item.append(crfsuite.Attribute(self.__createFeature(
                            "p" + str(n),
                            tok[:n] if len(tok) >= n else defval)))
                    # Suffixes of length 1..4.
                    for n in range(1, 5):
                        item.append(crfsuite.Attribute(self.__createFeature(
                            "s" + str(n),
                            tok[-n:] if len(tok) >= n else defval)))

                    for featureTemplate in CRFSuite.__featuresTemplates:
                        namesFeature = []
                        valuesFeature = []
                        for name, index in featureTemplate:
                            namesFeature.append(name + "[" + str(index) + "]")
                            valuesFeature.append(
                                tokens[i + index]
                                if 0 <= i + index < len(tokens) else defval)
                        item.append(crfsuite.Attribute(self.__createFeature(
                            "|".join(namesFeature), "|".join(valuesFeature))))

                for j in range(windowSize):
                    index = beginIndex + j
                    label = str(j) + u'|'
                    if index < 0:
                        token = self.__startSentenceSymbol
                    elif index >= len(tokens):
                        token = self.__endSentenceSymbol
                    else:
                        token = tokens[index]
                    for tokenFilter in self.__filters:
                        token = tokenFilter.filter(token)

                    k = 0
                    for wordvector in wordVectors:
                        if token in wordvector:
                            wv = wordvector[token]
                        else:
                            # NOTE(review): if none of the unknown-token
                            # symbols is in this lexicon, ``wv`` keeps its
                            # previous value (or is unbound the first time).
                            for unknownToken in self.__unknownTokens:
                                if unknownToken in wordvector:
                                    wv = wordvector[unknownToken]
                                    break
                        for number in wv:
                            item.append(crfsuite.Attribute(
                                self.__createFeature(label + str(k), '_'),
                                number))
                            k += 1

                xseq.append(item)
                yseq.append(unicodeToSrt(labels[i]))

            yield xseq, tuple(yseq)
# semiSupervisedProcessing(model_previous, fsamples, ie_value, ebao_dic)
#
# Builds candidate training samples from `fsamples` using the CRF model stored
# at `model_previous`.
#
# For every line of `fsamples` the visible code:
#   * derives the sentence and its annotated entities with generateSenEntities
#     and symbolProcess, and a reference tag sequence with generateTagSeq;
#   * generates boundary features, tags them with the model (viterbi) and
#     extracts the predicted entity spans with evaluation.generateEntList;
#   * for each predicted span, sums over its positions the base-2 entropy of
#     the marginal probabilities of the five tags in `bieso` and compares the
#     sum with `ie_value`;
#   * collects Entity objects in `ents`, consulting the annotated spans and
#     `ebao_dic`, and rebuilds a feature string with the tags returned by
#     generateFeature.getCharEntityFPTag / getCharEntityPartialTag.
#
# Returns (cdd4training_semi, cdd4training_semi_number): the list of rebuilt
# feature strings and a counter incremented once per appended string.
#
# NOTE(review): this function is stored on two physical lines and its original
# indentation is lost, so the nesting of the `if ie_entity > ie_value`,
# `if flag == 0` and `if not ebao_dic.has_key(...)` branches cannot be
# recovered from here. The code is left untouched for that reason.
# NOTE(review): the second `ent_index = end_m` reads `end_m`, which is only
# assigned inside the loop over `start_end_list2[0]` -- verify it is always
# bound on that path (possibly `ent_end` was intended).
# NOTE(review): `math.log(1 / tag_prob, 2)` raises ZeroDivisionError if a
# marginal probability is exactly 0.
# NOTE(review): if `feature_string.strip()` raises AttributeError, `instances`
# keeps the value from the previous line (or is unbound on the first line).
def semiSupervisedProcessing(model_previous, fsamples, ie_value, ebao_dic): tagger_bp = crfsuite.Tagger() tagger_bp.open(model_previous) bieso = ['B-entity', 'I-entity', 'E-entity', 'S-entity', 'O'] cdd4training_semi = [] cdd4training_semi_number = 0 for line in fsamples: # filter the samples using the recognized entities sentence, entities_in_sentence = generateSenEntities(line) new_sentence, new_entities = symbolProcess(sentence, entities_in_sentence) sentence_unicode = new_sentence.decode('utf-8') tag_seq = generateTagSeq(sentence_unicode, new_entities) feature_string, tags = generateFeature.boundaryFeatureGeneration( new_sentence, [], ebao_dic, 'demo', '0') try: instances = feature_string.strip().split('\n') except AttributeError as e: print 'feature_string:%s.' % line xseq = crfsuite.ItemSequence() features = [] for instance in instances: fields = instance.split('\t') features.append(fields[2:]) item = crfsuite.Item() for field in fields[2:]: item.append(crfsuite.Attribute(field)) xseq.append(item) tagger_bp.set(xseq) yseq_b = tagger_bp.viterbi() length = len(yseq_b) yseq = [] for i in range(length): yseq.append(yseq_b[i]) # tag refinement sen_ent_list1, start_end_list1 = evaluation.generateEntList([yseq_b]) sen_ent_list2, start_end_list2 = evaluation.generateEntList([tag_seq]) tagged_ents_length = len(start_end_list1[0]) if tagged_ents_length == 0: continue ents = [] selected_entity = 0 ent_index = 0 for i in range(tagged_ents_length): ent_start = start_end_list1[0][i][0] if ent_start < ent_index: continue flag = 0 ent_end = start_end_list1[0][i][1] ent_content = sentence_unicode[ent_start:ent_end].encode('utf-8') ie_entity = 0.0 for j in range(ent_start, ent_end): ie = 0.0 # information entropy for ent_tag in bieso: tag_prob = tagger_bp.marginal(ent_tag, j) ie += tag_prob * math.log(1 / tag_prob, 2) ie_entity += ie # ie_ave = ie_entity / (ent_end - ent_start) # if ebao_dic.has_key(ent_content) and ie_ave > ie_value: if ie_entity > ie_value: for k in range(len(start_end_list2[0])): start_m = start_end_list2[0][k][0] 
end_m = start_end_list2[0][k][1] if ent_start >= start_m and ent_end <= end_m: # if end_m - start_m < 3: break ents.append( Entity( sentence_unicode[start_m:end_m].encode( 'utf-8'), int(start_m), int(end_m), 'entity')) ent_index = end_m flag = 1 break if flag == 0: continue if not ebao_dic.has_key(ent_content): continue ents.append( Entity(ent_content, int(ent_start), int(ent_end), 'entity')) ent_index = end_m selected_entity += 1 if selected_entity == 0: continue char_entity_tag_list = generateFeature.getCharEntityFPTag( sentence_unicode, ents, '1') char_entity_tag_list = generateFeature.getCharEntityPartialTag( char_entity_tag_list) new_feature_str = '' for j in range(length): new_feature_str += '%s\t%s\n' % (char_entity_tag_list[j][1][0], '\t'.join(features[j])) cdd4training_semi.append(new_feature_str.strip()) cdd4training_semi_number += 1 return cdd4training_semi, cdd4training_semi_number
def predictClassAfterBoundaryAndEval(boundary_result, sentence_list,
                                     sen_tags_list, classmodel_file, ebao_dic,
                                     post_processing, texttype):
    """Classify the spans found by the boundary model and evaluate them.

    The entity spans extracted from ``boundary_result`` are typed, sentence
    by sentence, with the class model stored at ``classmodel_file``. When
    ``post_processing`` is ``'1'`` the typed spans are turned into a
    per-character tag sequence, passed through
    ``postProcessing.twoProcessings`` and re-extracted; otherwise the
    predicted type is attached to each boundary span directly. The result is
    compared with the gold tags ``sen_tags_list`` through
    ``evaluation.countEntList`` and reported with ``evaluation.measurePRF``.

    Args:
        boundary_result: boundary-model output, read by
            ``evaluation.generateTagList``.
        sentence_list: UTF-8 encoded sentences, aligned with the results.
        sen_tags_list: gold tag sequences.
        classmodel_file: path of the class model.
        ebao_dic: dictionary forwarded to feature generation.
        post_processing: ``'1'`` enables post-processing.
        texttype: forwarded to feature generation and post-processing.
    """
    tagger = crfsuite.Tagger()
    tagger.open(classmodel_file)

    result_tags_list = evaluation.generateTagList(boundary_result)
    # List 1 is the system output (single "entity" class); list 2 is the
    # gold standard (several classes).
    sen_ent_list1, start_end_list1 = evaluation.generateEntList(
        result_tags_list)
    sen_ent_list2, start_end_list2 = evaluation.generateEntList(sen_tags_list)

    new_sen_ent_list1 = []
    sen_inside_ent_list = []
    for i, predicted in enumerate(sen_ent_list1):
        # Build the entity objects for this sentence.
        sentence = sentence_list[i]
        sentence_unicode = sentence.decode('utf-8')
        if len(predicted) == 0:
            sen_inside_ent_list.append([['']])
            new_sen_ent_list1.append(predicted)
            continue

        entities = []
        s_e_list = []
        for span in predicted:
            ent_start, ent_end, ent_type = span[0], span[1], span[2]
            ent_content = sentence_unicode[ent_start:ent_end].encode('utf-8')
            entities.append(Entity(ent_content, ent_start, ent_end, ent_type))
            s_e_list.append([sentence, ent_content])
        sen_inside_ent_list.append(s_e_list)

        feature_c, sen_ent4error = generateFeature.classFeatureGeneration(
            sentence, entities, ebao_dic, texttype)
        ents_type = []
        for block in feature_c.strip().split('\n\n'):
            # Each block is classified as a one-item sequence.
            entry = crfsuite.Item()
            for field in block.split('\t')[1:]:
                entry.append(crfsuite.Attribute(field))
            class_seq = crfsuite.ItemSequence()
            class_seq.append(entry)
            tagger.set(class_seq)
            ents_type.append(tagger.viterbi()[0])

        new_entities = []
        if post_processing == '1':
            # The start_end_list used in evaluation is not adjusted.
            new_yseq = ['O'] * len(sentence_unicode)
            for j, entity in enumerate(entities):
                start = entity.start_pos
                end = entity.end_pos
                enttype = ents_type[j]
                if start + 1 == end:
                    new_yseq[start] = 'S-' + enttype
                else:
                    new_yseq[start] = 'B-' + enttype
                    for k in range(start + 1, end - 1):
                        new_yseq[k] = 'I-' + enttype
                    new_yseq[end - 1] = 'E-' + enttype
            new_yseq1 = postProcessing.twoProcessings(
                sentence_unicode, new_yseq, ebao_dic, texttype)
            ents1, s_e_list1 = evaluation.generateEntList([new_yseq1])
            new_entities = ents1[0]
        else:
            for k, predicted_type in enumerate(ents_type):
                try:
                    new_entities.append(
                        (predicted[k][0], predicted[k][1], predicted_type))
                except Exception as e:
                    print(e)
                    print('%s %s' % (len(predicted), len(ents_type)))
                    print(sentence)
                    print(feature_c)
        new_sen_ent_list1.append(new_entities)

    # Error analysis.
    ent_count_result, ent_count_result_o = evaluation.countEntList(
        new_sen_ent_list1, sen_ent_list2, start_end_list1, start_end_list2,
        sen_inside_ent_list)
    evaluation.measurePRF(ent_count_result_o)
def getNerResult(inputstring, tagger_b, tagger_c, bieso):
    """Run two-layer NER over ``inputstring`` and concatenate the results.

    The input is split with ``tools.sentence_split``. Every non-empty
    sentence is tagged for entity boundaries with ``tagger_b`` and for entity
    classes with ``tagger_c``; the typed spans are written back as a
    per-character tag sequence and handed to ``generateNerInSentence``.

    Args:
        inputstring: UTF-8 encoded byte string.
        tagger_b: opened boundary tagger.
        tagger_c: opened class tagger.
        bieso: not read by this function.

    Returns:
        The concatenation of what ``generateNerInSentence`` returned for
        each sentence.
    """
    ent_list = ''
    for line in tools.sentence_split(inputstring):
        line = line.strip()
        if line == '':
            continue

        # ---- layer 1: entity boundaries ----
        feature_string, tags = generateFeature.boundaryFeatureGeneration(
            line, [], ebao_dic, 'demo', '0')
        rows = []
        try:
            rows = feature_string.strip().split('\n')
        except AttributeError:
            print('feature_string:%s.' % feature_string)
        boundary_seq = crfsuite.ItemSequence()
        for row in rows:
            entry = crfsuite.Item()
            for field in row.split('\t')[2:]:
                entry.append(crfsuite.Attribute(field))
            boundary_seq.append(entry)
        tagger_b.set(boundary_seq)
        yseq_b = tagger_b.viterbi()
        tagger_b.probability(yseq_b)  # value is not used afterwards
        line_unicode = line.decode('utf-8')
        model_chosen = '2layer'

        # ---- layer 2: entity classes ----
        sen_ent_list1, start_end_list1 = evaluation.generateEntList([yseq_b])
        entities = []
        for span in sen_ent_list1[0]:  # may be empty
            ent_start, ent_end, ent_type = span[0], span[1], span[2]
            ent_content = line.decode('utf-8')[ent_start:ent_end].encode(
                'utf-8')
            entities.append(Entity(ent_content, ent_start, ent_end, ent_type))
        feature_c, sen_ent4error = generateFeature.classFeatureGeneration(
            line, entities, ebao_dic, '')
        ents_type = []
        for block in feature_c.strip().split('\n\n'):
            # Each block is classified as a one-item sequence.
            entry = crfsuite.Item()
            for field in block.split('\t')[1:]:
                entry.append(crfsuite.Attribute(field))
            class_seq = crfsuite.ItemSequence()
            class_seq.append(entry)
            tagger_c.set(class_seq)
            ents_type.append(tagger_c.viterbi()[0])

        # ---- write the typed spans back as B/I/E/S tags ----
        new_yseq = ['O'] * len(line_unicode)
        for j, entity in enumerate(entities):
            start = entity.start_pos
            end = entity.end_pos
            if start + 1 == end:
                new_yseq[start] = 'S-' + ents_type[j]
            else:
                new_yseq[start] = 'B-' + ents_type[j]
                for k in range(start + 1, end - 1):
                    new_yseq[k] = 'I-' + ents_type[j]
                new_yseq[end - 1] = 'E-' + ents_type[j]

        ents = generateNerInSentence(line_unicode, new_yseq, model_chosen,
                                     ebao_dic)
        ent_list += ents
    return ent_list