def generateFullTagFile(full_tag_file, boundary4training, class4training, sen_ent4developing, not4train, ebao_dic, datatype, tag_strategy, texttype): f = open(full_tag_file, 'r') lines = f.readlines() f.close() fw_b = open(boundary4training, 'w') fw_c = open(class4training, 'w') if not4train == '1': fw_e = open(sen_ent4developing, 'w') sen_tags = [] sentences = [] for line in lines: line = line.replace('\n', '') # 保留原句中的起始空格 line = line.replace('\r', '') line = line.replace(' ', ' ') try: sentence, entities_in_sentence = generateSenEntities( line, texttype) # 替换全角空格 except Exception as e: print line print sentence continue # 过滤训练数据的ds中整个句子标为一个实体的例子 if datatype == 'train': if len(entities_in_sentence) == 1 and entities_in_sentence[ 0].content == sentence and entities_in_sentence[ 0].type == 'specifications': continue # 增加符号替换及空格处理 #(该部分操作重复,在分词的时候做了该处理,不过该操作在加字典特征的时候起到了作用) new_sentence, new_entities = symbolProcess(sentence, entities_in_sentence) if len(new_entities) == 0: continue # 类别评价 feature_b, tags_in_sen = generateFeature.boundaryFeatureGeneration( new_sentence, new_entities, ebao_dic, 'full', tag_strategy) fw_b.write(feature_b) feature_c, sen_ent4error = generateFeature.classFeatureGeneration( new_sentence, new_entities, ebao_dic, texttype) fw_c.write(feature_c) if not4train == '1': fw_e.write(sen_ent4error) sentences.append(new_sentence.replace('\r', '')) sen_tags.append(tags_in_sen) fw_b.close() fw_c.close() if not4train == '1': fw_e.close() print sen_ent4developing + 'generated!' print boundary4training + ' generated!' print class4training + ' generated!' if not4train == '1': return sentences, sen_tags
def selectActiveData(unselected, selected, model, num):
    """Score unselected samples by token-level marginal entropy (active learning).

    Runs the boundary CRF model over every line in `unselected` and sums, per
    sentence, the Shannon entropy of the BIESO tag marginals at each position,
    collecting (line, entropy) pairs into `entropy`.

    NOTE(review): `selected`, `num` and `select` are never used, and the
    function falls off the end without returning `entropy` — presumably the
    top-`num` selection step is missing or happens elsewhere; confirm intent.
    Relies on module-level `crfsuite`, `processing`, `generateFeature`,
    `ebao_dic` and `math`.
    """
    select = []
    tagger_bp = crfsuite.Tagger()
    tagger_bp.open(model)
    # Boundary-only tag inventory used for the marginal/entropy computation.
    bieso = ['B-entity', 'I-entity', 'E-entity', 'S-entity', 'O']
    entropy = []
    for line in unselected:
        sentence, entities_in_sentence = processing.generateSenEntities(line, '')
        new_sentence, new_entities = processing.symbolProcess(sentence, entities_in_sentence)
        sentence_unicode = new_sentence.decode('utf-8')
        # NOTE(review): tag_seq is computed but never used in this function.
        tag_seq = processing.generateTagSeq(sentence_unicode, new_entities)
        feature_string, tags = generateFeature.boundaryFeatureGeneration(new_sentence, [], ebao_dic, 'demo', '0')
        try:
            instances = feature_string.strip().split('\n')
        except AttributeError as e:
            # NOTE(review): when feature_string is None, `instances` keeps its
            # value from the previous iteration (or is unbound on the first
            # line) — a `continue` here would be safer; confirm.
            print 'feature_string:%s.' % line
        xseq = crfsuite.ItemSequence()
        features = []
        for instance in instances:
            fields = instance.split('\t')
            features.append(fields[2:])  # drop the two leading tag columns
            item = crfsuite.Item()
            for field in fields[2:]:
                item.append(crfsuite.Attribute(field))
            xseq.append(item)
        tagger_bp.set(xseq)
        yseq_b = tagger_bp.viterbi()
        length = len(yseq_b)
        yseq = []
        ie_entity = 0.0  # sentence-level entropy accumulator
        for i in range(length):
            yseq.append(yseq_b[i])
        for j in range(len(yseq)):
            ie = 0.0  # information entropy at position j
            for ent_tag in bieso:
                try:
                    tag_prob = tagger_bp.marginal(ent_tag, j)
                    # Shannon entropy term: p * log2(1/p)
                    ie += tag_prob * math.log(1 / tag_prob, 2)
                except Exception, e:
                    # NOTE(review): hard process abort on any marginal failure
                    # (e.g. tag_prob == 0) — consider skipping instead.
                    print line
                    exit(0)
            ie_entity += ie
        entropy.append((line, ie_entity))
def generatePartialTagFile(partial_tag_file, ebao_dic, tag_strategy, sen_num): f = open(partial_tag_file, 'r') lines = f.readlines() f.close() predict_value_dic = {} sentence_num = 0 for line in lines: if sentence_num > sen_num: break sentence_num += 1 line = line.replace('\n', '') # 保留原句中的起始空格 line = line.replace('\r', '') line = line.replace(' ', ' ') try: sentence, entities_in_sentence = generateSenEntities(line) # 过滤实体:全英文、长度为1、实体类型不对http: if len(entities_in_sentence) == 0: continue feature_string, tags_in_sen = generateFeature.boundaryFeatureGeneration( sentence, entities_in_sentence, ebao_dic, 'partial', tag_strategy) # S3tag\tS1tag\tFeatures predict_value = predictValue(feature_string) except Exception as e: print e print sentence continue if feature_string == None: print 'None: %s' % line continue predict_value_dic[sentence + '\t\t\t' + feature_string.strip()] = predict_value # sort predict_value_dic sorted_predict_value_dic = sorted(predict_value_dic.iteritems(), key=lambda d: d[1], reverse=True) sample_feature_list = [] for key_value in sorted_predict_value_dic: print str(key_value[1]) sen_feature = key_value[0].split('\t\t\t') print sen_feature[0] + '\n' sample_feature_list.append(sen_feature[1]) return sample_feature_list
def generateBoundaryTagFile(source, target_file, ebao_dic, not4train, sen_ent4developing): if isinstance(source, list): lines = source else: f = open(source, 'r') lines = f.readlines() f.close() fw_b = open(target_file, 'w') if not4train == '1': fw_e = open(sen_ent4developing, 'w') sen_tags = [] sentences = [] for line in lines: line = line.replace('\n', '') # 保留原句中的起始空格 line = line.replace('\r', '') line = line.replace(' ', ' ') try: sentence, entities_in_sentence = generateSenEntities(line, '') # 替换全角空格 except Exception as e: print line print sentence continue new_sentence, new_entities = symbolProcess(sentence, entities_in_sentence) if len(new_entities) == 0: continue # 类别评价 feature_b, tags_in_sen = generateFeature.boundaryFeatureGeneration( new_sentence, new_entities, ebao_dic, 'full', '') fw_b.write(feature_b) if not4train == '1': feature_c, sen_ent4error = generateFeature.classFeatureGeneration( new_sentence, new_entities, ebao_dic, '') fw_e.write(sen_ent4error) sentences.append(new_sentence.replace('\r', '')) sen_tags.append(tags_in_sen) fw_b.close() print target_file + ' generated!' if not4train == '1': fw_e.close() print sen_ent4developing + 'generated!' return sentences, sen_tags
def mainfunction(sen, postProcess, texttype, index):
    """Run the two-layer NER pipeline (boundary CRF, then class CRF) on one sentence.

    Loads boundary/class models selected by `index`, tags `sen`, optionally
    applies post-processing, and returns (entity_list, new_yseq) where
    entity_list is 'content[chinese-type]\\n' per entity and new_yseq is the
    final per-character BIESO+type tag sequence.

    Relies on module-level `root`, `ebao_dic`, `en_cn_dic`, `crfsuite`,
    `generateFeature`, `evaluation`, `postProcessing`, `Entity`.
    """
    model_b = os.path.join(root, './models/boundarymodel-' + index)
    model_c = os.path.join(root, './models/classmodel-' + index)
    ner_lines = ''
    tagger_b = crfsuite.Tagger()
    tagger_b.open(model_b)
    tagger_c = crfsuite.Tagger()
    tagger_c.open(model_c)
    bieso = ['B-entity', 'I-entity', 'E-entity', 'S-entity', 'O']
    line = sen.strip()
    # model_2_layer
    # --- layer 1: boundary tagging ---
    feature_string = ''
    instances = []
    feature_string, tags = generateFeature.boundaryFeatureGeneration(
        line, [], ebao_dic, 'demo', '0')
    try:
        instances = feature_string.strip().split('\n')
    except AttributeError as e:
        # feature_string was None; `instances` stays [] and tagging is skipped.
        print 'feature_string:%s.' % feature_string
    xseq = crfsuite.ItemSequence()
    for instance in instances:
        fields = instance.split('\t')
        item = crfsuite.Item()
        # Columns 0-1 are tag columns; the rest are feature attributes.
        for field in fields[2:]:
            item.append(crfsuite.Attribute(field))
        xseq.append(item)
    tagger_b.set(xseq)
    yseq_b = tagger_b.viterbi()
    line_unicode = line.decode('utf-8')
    model_chosen = '2layer'
    # --- layer 2: entity classification ---
    sen_ent_list1, start_end_list1 = evaluation.generateEntList([yseq_b])
    length = len(sen_ent_list1[0])
    # When length is 0 the loops below are simply skipped.
    entities = []
    new_entities = []
    for j in range(length):
        ent_start = sen_ent_list1[0][j][0]
        ent_end = sen_ent_list1[0][j][1]
        ent_type = sen_ent_list1[0][j][2]
        # Offsets index the unicode string; content is re-encoded to utf-8.
        ent_content = line_unicode[ent_start:ent_end].encode('utf-8')
        entities.append(Entity(ent_content, ent_start, ent_end, ent_type))
    feature_c, sen_ent4error = generateFeature.classFeatureGeneration(
        line, entities, ebao_dic, texttype)
    # One blank-line-separated feature block per detected entity.
    instances = feature_c.strip().split('\n\n')
    ents_type = []
    for instance in instances:
        xseq = crfsuite.ItemSequence()
        fields = instance.split('\t')
        item = crfsuite.Item()
        for field in fields[1:]:
            item.append(crfsuite.Attribute(field))
        xseq.append(item)
        tagger_c.set(xseq)
        yseq_c = tagger_c.viterbi()
        # Single-item sequence: the first label is the entity's class.
        ents_type.append(yseq_c[0])
    # --- merge boundary + class into a per-character tag sequence ---
    new_yseq = ['O' for i in range(len(line_unicode))]
    for j in range(len(entities)):
        start = entities[j].start_pos
        end = entities[j].end_pos
        enttype = ents_type[j]
        if start + 1 == end:
            new_yseq[start] = 'S-' + enttype
            continue
        new_yseq[start] = 'B-' + enttype
        for k in range(start + 1, end - 1):
            new_yseq[k] = 'I-' + enttype
        new_yseq[end - 1] = 'E-' + enttype
    if postProcess == '1':
        # NOTE: start_end_list used in evaluation is not adjusted here.
        new_yseq = postProcessing.twoProcessings(line_unicode, new_yseq,
                                                 ebao_dic, texttype)
    ents1, s_e_list1 = evaluation.generateEntList([new_yseq])
    new_entities = ents1[0]
    entity_list = ''
    length = len(new_entities)
    for i in range(length):
        content = line_unicode[new_entities[i][0]:new_entities[i][1]]
        enttype = new_entities[i][2]
        if enttype == '':
            # NOTE(review): after printing, the en_cn_dic[''] lookup below
            # still runs — presumably raises KeyError unless '' is a key.
            print line_unicode.encode('utf8'), line_unicode[
                new_entities[i][0]:new_entities[i][1]].encode('utf8')
        entity_list += content.encode(
            'utf8') + '[' + en_cn_dic[enttype] + ']\n'
    return entity_list, new_yseq
def mainfunction(inputstring, taggerb, taggerc):
    """Web-facing NER entry point: tag `inputstring` and return HTML output.

    Splits the input into sentences, runs the two-layer pipeline (boundary
    CRF `taggerb`, then class CRF `taggerc`) per sentence, and assembles
    colored sentence HTML plus a term list.

    Returns (ner_lines, new_term_list) normally, but a bare string for empty
    input — NOTE(review): inconsistent return arity; callers must handle both.
    NOTE(review): `texttype` below is not a parameter — assumed to be a
    module-level global; confirm.
    """
    if inputstring == '':
        sentence_ner = '请输入句子'
        return sentence_ner
    # Input normalization (punctuation/signal unification).
    inputsentence = tools.uniformSignal(inputstring)
    ner_lines = ''
    bieso = ['B-entity', 'I-entity', 'E-entity', 'S-entity', 'O']
    new_term_list = ''
    for single_line in inputsentence.split('\n'):
        lines = tools.sentence_split(single_line)
        ner_line = ''
        term_list = ''
        for line in lines:
            line = line.strip()
            # Skip empties and markup-only lines (starts with '<' AND ends with '>').
            if line == '' or line[0] == '<' and line[-1] == '>':
                continue
            # model_2_layer
            # --- layer 1: boundary tagging ---
            feature_string = ''
            instances = []
            feature_string, tags = generateFeature.boundaryFeatureGeneration(
                line, [], ebao_dic, 'demo', '0')
            try:
                instances = feature_string.strip().split('\n')
            except AttributeError as e:
                # feature_string was None; `instances` stays [] for this line.
                print 'feature_string:%s.' % feature_string
            xseq = crfsuite.ItemSequence()
            for instance in instances:
                fields = instance.split('\t')
                item = crfsuite.Item()
                for field in fields[2:]:
                    item.append(crfsuite.Attribute(field))
                xseq.append(item)
            taggerb.set(xseq)
            yseq_b = taggerb.viterbi()
            prob_b = taggerb.probability(yseq_b)
            line_unicode = line.decode('utf-8')
            # for t, y in enumerate(yseq_b):
            #     # Output the predicted labels with their marginal probabilities.
            #     ner_line += '%s:%f\n' % (y, taggerb.marginal(y, t))
            model_chosen = '2layer'
            # --- layer 2: entity classification ---
            sen_ent_list1, start_end_list1 = evaluation.generateEntList(
                [yseq_b])
            length = len(sen_ent_list1[0])
            # When length is 0 the loops below are simply skipped.
            sentence = line
            entities = []
            for j in range(length):
                ent_start = sen_ent_list1[0][j][0]
                ent_end = sen_ent_list1[0][j][1]
                ent_type = sen_ent_list1[0][j][2]
                ent_content = sentence.decode(
                    'utf-8')[ent_start:ent_end].encode('utf-8')
                entities.append(
                    Entity(ent_content, ent_start, ent_end, ent_type))
            feature_c, sen_ent4error = generateFeature.classFeatureGeneration(
                sentence, entities, ebao_dic, texttype)
            # One blank-line-separated feature block per detected entity.
            instances = feature_c.strip().split('\n\n')
            ents_type = []
            for instance in instances:
                xseq = crfsuite.ItemSequence()
                fields = instance.split('\t')
                item = crfsuite.Item()
                for field in fields[1:]:
                    item.append(crfsuite.Attribute(field))
                xseq.append(item)
                taggerc.set(xseq)
                yseq_c = taggerc.viterbi()
                ents_type.append(yseq_c[0])
            # --- merge boundary + class into per-character tags ---
            new_yseq = ['O' for i in range(len(line_unicode))]
            for j in range(len(entities)):
                start = entities[j].start_pos
                end = entities[j].end_pos
                if start + 1 == end:
                    new_yseq[start] = 'S-' + ents_type[j]
                    continue
                new_yseq[start] = 'B-' + ents_type[j]
                for k in range(start + 1, end - 1):
                    new_yseq[k] = 'I-' + ents_type[j]
                new_yseq[end - 1] = 'E-' + ents_type[j]
            sen_ent_colored, ent_list = generateNerInSentence(
                line_unicode, new_yseq, model_chosen, ebao_dic)
            new_term_list += ent_list
            if sen_ent_colored == '':
                sen_ent_colored = line
            # ner_lines += '<p>' + sen_ent_colored + '</p>'
            # ner_lines += '<p>' + ent_list + '</p>'
            ner_line += sen_ent_colored
            term_list += ent_list
        ner_lines += '<p>' + ner_line + '</p>'
        ner_lines += '<p>' + term_list + '</p>'
        ner_lines += '<br/>'
    return ner_lines, new_term_list
def getNerResult(inputstring, tagger_b, tagger_c, bieso):
    """Tag `inputstring` with the two-layer pipeline and return the term list.

    Same boundary-then-class CRF flow as `mainfunction`, but returns only the
    accumulated entity-list string from `generateNerInSentence` (no HTML).

    NOTE(review): the `bieso` parameter is accepted but never used here.
    Relies on module-level `tools`, `generateFeature`, `evaluation`,
    `ebao_dic`, `crfsuite`, `Entity`, `generateNerInSentence`.
    """
    # inputstring = unicode(inputstring)
    # inputsentence = tools.uniformSignal(inputstring.encode('utf8'))
    lines = tools.sentence_split(inputstring)
    ent_list = ''
    for line in lines:
        line = line.strip()
        # Markup filtering disabled — only empty lines are skipped now.
        #if line == '' or line[0] == '<' and line[-1] == '>' : continue
        if line == '':
            continue
        # model_2_layer
        # --- layer 1: boundary tagging ---
        feature_string = ''
        instances = []
        feature_string, tags = generateFeature.boundaryFeatureGeneration(
            line, [], ebao_dic, 'demo', '0')
        try:
            instances = feature_string.strip().split('\n')
        except AttributeError as e:
            # feature_string was None; `instances` stays [] for this line.
            print 'feature_string:%s.' % feature_string
        xseq = crfsuite.ItemSequence()
        for instance in instances:
            fields = instance.split('\t')
            item = crfsuite.Item()
            for field in fields[2:]:
                item.append(crfsuite.Attribute(field))
            xseq.append(item)
        tagger_b.set(xseq)
        yseq_b = tagger_b.viterbi()
        prob_b = tagger_b.probability(yseq_b)
        line_unicode = line.decode('utf-8')
        model_chosen = '2layer'
        # --- layer 2: entity classification ---
        sen_ent_list1, start_end_list1 = evaluation.generateEntList([yseq_b])
        length = len(sen_ent_list1[0])
        # When length is 0 the loops below are simply skipped.
        sentence = line
        entities = []
        for j in range(length):
            ent_start = sen_ent_list1[0][j][0]
            ent_end = sen_ent_list1[0][j][1]
            ent_type = sen_ent_list1[0][j][2]
            ent_content = sentence.decode('utf-8')[ent_start:ent_end].encode(
                'utf-8')
            entities.append(Entity(ent_content, ent_start, ent_end, ent_type))
        feature_c, sen_ent4error = generateFeature.classFeatureGeneration(
            sentence, entities, ebao_dic, '')
        # One blank-line-separated feature block per detected entity.
        instances = feature_c.strip().split('\n\n')
        ents_type = []
        for instance in instances:
            xseq = crfsuite.ItemSequence()
            fields = instance.split('\t')
            item = crfsuite.Item()
            for field in fields[1:]:
                item.append(crfsuite.Attribute(field))
            xseq.append(item)
            tagger_c.set(xseq)
            yseq_c = tagger_c.viterbi()
            ents_type.append(yseq_c[0])
        # --- merge boundary + class into per-character tags ---
        new_yseq = ['O' for i in range(len(line_unicode))]
        for j in range(len(entities)):
            start = entities[j].start_pos
            end = entities[j].end_pos
            if start + 1 == end:
                new_yseq[start] = 'S-' + ents_type[j]
                continue
            new_yseq[start] = 'B-' + ents_type[j]
            for k in range(start + 1, end - 1):
                new_yseq[k] = 'I-' + ents_type[j]
            new_yseq[end - 1] = 'E-' + ents_type[j]
        ents = generateNerInSentence(line_unicode, new_yseq, model_chosen,
                                     ebao_dic)
        ent_list += ents
    return ent_list
def semiSupervisedProcessing(model_previous, fsamples, ie_value, ebao_dic):
    """Select confidently/verifiably tagged samples for semi-supervised training.

    For each annotated line in `fsamples`: run the previous boundary model,
    compute per-entity marginal entropy, and keep entities either matched
    against the gold annotation spans (high-entropy case, threshold
    `ie_value`) or found in the dictionary; emit partially tagged feature
    strings for the kept sentences.

    Returns (cdd4training_semi, cdd4training_semi_number): the feature
    strings and their count.
    """
    tagger_bp = crfsuite.Tagger()
    tagger_bp.open(model_previous)
    bieso = ['B-entity', 'I-entity', 'E-entity', 'S-entity', 'O']
    cdd4training_semi = []
    cdd4training_semi_number = 0
    for line in fsamples:
        # Filter samples using the recognized entities.
        sentence, entities_in_sentence = generateSenEntities(line)
        new_sentence, new_entities = symbolProcess(sentence,
                                                  entities_in_sentence)
        sentence_unicode = new_sentence.decode('utf-8')
        # Gold tag sequence reconstructed from the annotation.
        tag_seq = generateTagSeq(sentence_unicode, new_entities)
        feature_string, tags = generateFeature.boundaryFeatureGeneration(
            new_sentence, [], ebao_dic, 'demo', '0')
        try:
            instances = feature_string.strip().split('\n')
        except AttributeError as e:
            # NOTE(review): when feature_string is None, `instances` keeps its
            # previous value (or is unbound on the first line); a `continue`
            # here would be safer — confirm.
            print 'feature_string:%s.' % line
        xseq = crfsuite.ItemSequence()
        features = []
        for instance in instances:
            fields = instance.split('\t')
            features.append(fields[2:])  # drop the two leading tag columns
            item = crfsuite.Item()
            for field in fields[2:]:
                item.append(crfsuite.Attribute(field))
            xseq.append(item)
        tagger_bp.set(xseq)
        yseq_b = tagger_bp.viterbi()
        length = len(yseq_b)
        yseq = []
        for i in range(length):
            yseq.append(yseq_b[i])
        # Entity spans from the model's prediction vs. the gold annotation.
        sen_ent_list1, start_end_list1 = evaluation.generateEntList([yseq_b])
        sen_ent_list2, start_end_list2 = evaluation.generateEntList([tag_seq])
        tagged_ents_length = len(start_end_list1[0])
        if tagged_ents_length == 0:
            continue
        ents = []
        selected_entity = 0
        ent_index = 0  # rightmost position already covered by a kept entity
        for i in range(tagged_ents_length):
            ent_start = start_end_list1[0][i][0]
            if ent_start < ent_index:
                continue
            flag = 0
            ent_end = start_end_list1[0][i][1]
            ent_content = sentence_unicode[ent_start:ent_end].encode('utf-8')
            ie_entity = 0.0  # summed marginal entropy over the entity span
            for j in range(ent_start, ent_end):
                ie = 0.0  # information entropy at position j
                for ent_tag in bieso:
                    tag_prob = tagger_bp.marginal(ent_tag, j)
                    ie += tag_prob * math.log(1 / tag_prob, 2)
                ie_entity += ie
            # ie_ave = ie_entity / (ent_end - ent_start)
            # if ebao_dic.has_key(ent_content) and ie_ave > ie_value:
            if ie_entity > ie_value:
                # High uncertainty: try to replace the predicted span with the
                # enclosing gold annotation span.
                for k in range(len(start_end_list2[0])):
                    start_m = start_end_list2[0][k][0]
                    end_m = start_end_list2[0][k][1]
                    if ent_start >= start_m and ent_end <= end_m:
                        # if end_m - start_m < 3:
                        break
                    ents.append(
                        Entity(
                            sentence_unicode[start_m:end_m].encode(
                                'utf-8'), int(start_m), int(end_m), 'entity'))
                    ent_index = end_m
                    flag = 1
                    break
                if flag == 0:
                    continue
            # Dictionary check for the predicted entity content.
            if not ebao_dic.has_key(ent_content):
                continue
            ents.append(
                Entity(ent_content, int(ent_start), int(ent_end), 'entity'))
            # NOTE(review): `end_m` here is stale (or unbound when the gold
            # list is empty and entropy was low) — `ent_end` looks intended;
            # confirm before changing.
            ent_index = end_m
            selected_entity += 1
        if selected_entity == 0:
            continue
        # Build the partially tagged feature rows for the kept entities.
        char_entity_tag_list = generateFeature.getCharEntityFPTag(
            sentence_unicode, ents, '1')
        char_entity_tag_list = generateFeature.getCharEntityPartialTag(
            char_entity_tag_list)
        new_feature_str = ''
        for j in range(length):
            new_feature_str += '%s\t%s\n' % (char_entity_tag_list[j][1][0],
                                             '\t'.join(features[j]))
        cdd4training_semi.append(new_feature_str.strip())
        cdd4training_semi_number += 1
    return cdd4training_semi, cdd4training_semi_number