import sys


def pipeline_test(json_file, brown_file, featurefile, labelfile, outdir):
    """Generate test features using the feature/label maps produced at train time."""
    # load feature mapping and label mapping
    feature_map = load_map(featurefile)
    label_map = load_map(labelfile)
    reader = MentionReader(json_file)
    ner_feature = NERFeature(is_train=False, brown_file=brown_file,
                             feature_mapping=feature_map, label_mapping=label_map)
    count = 0
    gx = open(outdir + '/test_x.txt', 'w')
    gy = open(outdir + '/test_y.txt', 'w')
    print 'start test feature generation'
    while reader.has_next():
        if count % 1000 == 0:
            print count
        sentence = reader.next()
        for mention in sentence.mentions:
            try:
                m_id = '%s_%d_%d_%d' % (sentence.fileid, sentence.senid,
                                        mention.start, mention.end)
                feature_ids, label_ids = ner_feature.extract(sentence, mention)
                # only write mentions that received at least one mapped label
                if len(label_ids) > 0:
                    gx.write(m_id + '\t' + ','.join([str(x) for x in feature_ids]) + '\n')
                    gy.write(m_id + '\t' + ','.join([str(x) for x in label_ids]) + '\n')
                count += 1
            except Exception as e:
                print e.message, e.args
                print sentence.fileid, sentence.senid
                print sentence
                continue
    print count
    reader.close()
    gx.close()
    gy.close()

def pipeline_test(json_file, brown_file, featurefile, labelfile, outdir):
    """Variant of pipeline_test that also dumps mention text and offsets."""
    # load feature mapping and label mapping
    feature_map = load_map(featurefile)
    label_map = load_map(labelfile)
    reader = MentionReader(json_file)
    ner_feature = NERFeature(is_train=False, brown_file=brown_file,
                             feature_mapping=feature_map, label_mapping=label_map)
    count = 0
    gx = open(outdir + '/test_x.txt', 'w')
    gy = open(outdir + '/test_y.txt', 'w')
    # map from mention_id to text, saved in "mention_text.map"
    gz = open(outdir + '/mention_text.map', 'w')
    print 'start test feature generation'
    while reader.has_next():
        if count % 1000 == 0:
            print count
        sentence = reader.next()
        for mention in sentence.mentions:
            try:
                m_id = '%s_%d_%d_%d' % (sentence.fileid, sentence.senid,
                                        mention.start, mention.end)
                feature_ids, label_ids = ner_feature.extract(sentence, mention)
                if len(label_ids) > 0:
                    gx.write(m_id + '\t' + ','.join([str(x) for x in feature_ids]) + '\n')
                    gy.write(m_id + '\t' + ','.join([str(x) for x in label_ids]) + '\n')
                if mention.start >= 0:
                    # record entity text plus token and character offsets
                    c_start = mention.get_c_start()
                    c_end = mention.get_c_end()
                    gz.write(m_id + '\t' + mention.get_entity() + '\t' +
                             str(mention.start) + '\t' + str(mention.end) + '\t' +
                             str(c_start) + '\t' + str(c_end) + '\t' +
                             sentence.get_text() + '\n')
                count += 1
            except Exception as e:
                print e.message, e.args
                print sentence.fileid, sentence.senid
                print sentence
                continue
    print count
    reader.close()
    gx.close()
    gy.close()
    gz.close()

def pipeline_test(json_file, brown_file, featurefile, labelfile, outdir,
                  requireEmType, isEntityMention):
    """Test feature generation for either entity mentions or relation mentions."""
    # load feature mapping and label mapping
    feature_map = load_map(featurefile)
    label_map = load_map(labelfile)
    reader = MentionReader(json_file)
    ner_feature = NERFeature(is_train=False, brown_file=brown_file,
                             requireEmType=requireEmType, isEntityMention=isEntityMention,
                             feature_mapping=feature_map, label_mapping=label_map)
    count = 0
    gx = open(outdir + '/test_x.txt', 'w')
    gy = open(outdir + '/test_y.txt', 'w')
    print 'start test feature generation'
    while reader.has_next():
        if count % 10000 == 0 and count != 0:
            sys.stdout.write('process ' + str(count) + ' lines\r')
            sys.stdout.flush()
        sentence = reader.next()
        if isEntityMention:
            mentions = sentence.entityMentions
        else:
            mentions = sentence.relationMentions
        for mention in mentions:
            try:
                if isEntityMention:
                    m_id = '%s_%s_%d_%d' % (sentence.articleId, sentence.sentId,
                                            mention.start, mention.end)
                else:
                    # relation mentions are keyed by the two entity mention texts
                    label1 = sentence.get_em_text(mention.em1Start, mention.em1End)
                    label2 = sentence.get_em_text(mention.em2Start, mention.em2End)
                    m_id = '%s_%s_%s_%s' % (label1, label2,
                                            sentence.articleId, sentence.sentId)
                    # m_id = '%s_%d_%d_%d_%d_%d' % (sentence.articleId, sentence.sentId,
                    #                               mention.em1Start, mention.em1End,
                    #                               mention.em2Start, mention.em2End)
                feature_ids, label_ids = ner_feature.extract(sentence, mention)
                gx.write(m_id + '\t' + ','.join([str(x) for x in feature_ids]) + '\n')
                gy.write(m_id + '\t' + ','.join([str(x) for x in label_ids]) + '\n')
                count += 1
            except Exception as e:
                print e.message, e.args
                print sentence.articleId, sentence.sentId
                print mention
                continue
    # dump the label mapping actually used at test time
    type_test = open(outdir + '/type_test.txt', 'w')
    write_map(ner_feature.label_mapping, type_test)
    type_test.close()
    print '\n'
    reader.close()
    gx.close()
    gy.close()
    print ner_feature.lc

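# load_map and write_map are defined elsewhere in this repo; the commented
# sketch below is a minimal assumption of their behavior, inferred from how
# feature.map / type.txt are written by pipeline() and re-read by
# pipeline_test(): one tab-separated "name<TAB>id" pair per line.
#
# def load_map(path):
#     # read "name\tid" lines back into a dict {name: int(id)}
#     mapping = {}
#     with open(path) as fin:
#         for line in fin:
#             name, idx = line.rstrip('\n').split('\t')
#             mapping[name] = int(idx)
#     return mapping
#
# def write_map(mapping, fout):
#     # dump a {name: id} dict as "name\tid" lines to an already-open file
#     for name, idx in mapping.items():
#         fout.write(str(name) + '\t' + str(idx) + '\n')
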
def pipeline(json_file, brown_file, outdir):
    """Train feature generation: builds the feature and label maps from scratch."""
    reader = MentionReader(json_file)
    ner_feature = NERFeature(is_train=True, brown_file=brown_file)
    count = 0
    gx = open(outdir + '/train_x.txt', 'w')
    gy = open(outdir + '/train_y.txt', 'w')
    f = open(outdir + '/feature.map', 'w')
    t = open(outdir + '/type.txt', 'w')
    print 'start train feature generation'
    mention_count = 0
    while reader.has_next():
        if count % 10000 == 0:
            print count
        sentence = reader.next()
        for mention in sentence.mentions:
            try:
                m_id = '%s_%d_%d_%d' % (sentence.fileid, sentence.senid,
                                        mention.start, mention.end)
                feature_ids, label_ids = ner_feature.extract(sentence, mention)
                gx.write(m_id + '\t' + ','.join([str(x) for x in feature_ids]) + '\n')
                gy.write(m_id + '\t' + ','.join([str(x) for x in label_ids]) + '\n')
                mention_count += 1
                count += 1
            except Exception as e:
                print e.message, e.args
                print sentence.fileid, sentence.senid, len(sentence.tokens)
                print mention
    print 'mention :%d' % mention_count
    print 'feature :%d' % len(ner_feature.feature_mapping)
    print 'label :%d' % len(ner_feature.label_mapping)
    # persist the mappings accumulated during training
    write_map(ner_feature.feature_mapping, f)
    write_map(ner_feature.label_mapping, t)
    reader.close()
    gx.close()
    gy.close()
    f.close()
    t.close()

def pipeline(json_file, brown_file, outdir, requireEmType, isEntityMention):
    """Train feature generation for either entity mentions or relation mentions."""
    reader = MentionReader(json_file)
    ner_feature = NERFeature(is_train=True, brown_file=brown_file,
                             requireEmType=requireEmType, isEntityMention=isEntityMention,
                             feature_mapping={}, label_mapping={})
    count = 0
    gx = open(outdir + '/train_x.txt', 'w')
    gy = open(outdir + '/train_y.txt', 'w')
    f = open(outdir + '/feature.map', 'w')
    t = open(outdir + '/type.txt', 'w')
    label_counts_file = open(outdir + '/label_counts.txt', 'w')
    print 'start train feature generation'
    mention_count = 0
    mentionCountByNumOfLabels = {}
    while reader.has_next():
        if count % 10000 == 0:
            sys.stdout.write('process ' + str(count) + ' lines\r')
            sys.stdout.flush()
        sentence = reader.next()
        if isEntityMention:
            mentions = sentence.entityMentions
        else:
            mentions = sentence.relationMentions
        for mention in mentions:
            try:
                if isEntityMention:
                    m_id = '%s_%s_%d_%d' % (sentence.articleId, sentence.sentId,
                                            mention.start, mention.end)
                else:
                    # sentId formatted with %s to match the entity-mention branch
                    m_id = '%s_%s_%d_%d_%d_%d' % (sentence.articleId, sentence.sentId,
                                                  mention.em1Start, mention.em1End,
                                                  mention.em2Start, mention.em2End)
                feature_ids, label_ids = ner_feature.extract(sentence, mention)
                # histogram of how many labels each mention receives
                if len(label_ids) not in mentionCountByNumOfLabels:
                    mentionCountByNumOfLabels[len(label_ids)] = 1
                else:
                    mentionCountByNumOfLabels[len(label_ids)] += 1
                gx.write(m_id + '\t' + ','.join([str(x) for x in feature_ids]) + '\n')
                gy.write(m_id + '\t' + ','.join([str(x) for x in label_ids]) + '\n')
                mention_count += 1
                count += 1
            except Exception as e:
                print e.message, e.args
                print sentence.articleId, sentence.sentId, len(sentence.tokens)
                print mention
                raise
    print '\n'
    print 'mention :%d' % mention_count
    print 'feature :%d' % len(ner_feature.feature_mapping)
    print 'label :%d' % len(ner_feature.label_mapping)
    # write the label-count histogram, sorted by number of labels
    for num_labels, num_mentions in sorted(mentionCountByNumOfLabels.items(),
                                           key=lambda x: x[0]):
        label_counts_file.write(str(num_labels) + '\t' + str(num_mentions) + '\n')
    write_map(ner_feature.feature_mapping, f)
    write_map(ner_feature.label_mapping, t)
    reader.close()
    gx.close()
    gy.close()
    f.close()
    t.close()
    label_counts_file.close()

def pipeline(json_file, brown_file, outdir):
    """Variant of the train pipeline that also dumps each parsed sentence."""
    reader = MentionReader(json_file)
    ner_feature = NERFeature(is_train=True, brown_file=brown_file)
    count = 0
    gx = open(outdir + '/train_x.txt', 'w')
    gy = open(outdir + '/train_y.txt', 'w')
    gd = open(outdir + '/mention_reader_debug.txt', 'w')
    f = open(outdir + '/feature.map', 'w')
    t = open(outdir + '/type.txt', 'w')
    print 'start train feature generation'
    mention_count = 0
    while reader.has_next():
        if count % 10000 == 0:
            print count
        sentence = reader.next()
        # dump every parsed sentence so reader problems can be inspected
        gd.write(str(sentence) + '\n')
        for mention in sentence.mentions:
            try:
                m_id = '%s_%d_%d_%d' % (sentence.fileid, sentence.senid,
                                        mention.start, mention.end)
                feature_ids, label_ids = ner_feature.extract(sentence, mention)
                gx.write(m_id + '\t' + ','.join([str(x) for x in feature_ids]) + '\n')
                gy.write(m_id + '\t' + ','.join([str(x) for x in label_ids]) + '\n')
                mention_count += 1
                count += 1
            except Exception as e:
                print e.message, e.args
                print sentence.fileid, sentence.senid, len(sentence.tokens)
                print mention
    print 'mention :%d' % mention_count
    print 'feature :%d' % len(ner_feature.feature_mapping)
    print 'label :%d' % len(ner_feature.label_mapping)
    write_map(ner_feature.feature_mapping, f)
    write_map(ner_feature.label_mapping, t)
    reader.close()
    gx.close()
    gy.close()
    gd.close()
    f.close()
    t.close()

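# Output format shared by the train/test pipelines above (one mention per line):
#   train_x.txt / test_x.txt : <mention_id>\t<feature_id>,<feature_id>,...
#   train_y.txt / test_y.txt : <mention_id>\t<label_id>,<label_id>,...
# where <mention_id> is fileid_senid_start_end here, or the
# articleId/sentId-based ids used by the flag-driven variants.
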
def pipeline_qa(json_file, brown_file, featurefile, labelfile, outdir,
                requireEmType, isEntityMention):
    """QA feature generation: groups mentions by question and sentence label."""
    feature_map = load_map_qa(featurefile)
    label_map = load_map(labelfile)
    reader = MentionReader(json_file)
    ner_feature = NERFeature(is_train=True, brown_file=brown_file,
                             requireEmType=requireEmType, isEntityMention=isEntityMention,
                             feature_mapping=feature_map, label_mapping=label_map)
    ner_feature.feature_count = len(feature_map)
    count = 0
    gx = open(outdir + '/qa_x.txt', 'w')
    gy = open(outdir + '/qa_y.txt', 'w')
    f = open(outdir + '/feature.map', 'w')
    print 'start qa em pairs feature generation'
    mention_count = 0
    mentionCountByNumOfLabels = {}
    question2mentions = {}
    while reader.has_next():
        if count % 10000 == 0:
            sys.stdout.write('process ' + str(count) + ' lines\r')
            sys.stdout.flush()
        sentence = reader.next()
        question = sentence.articleId
        sentLabel = sentence.label
        assert sentLabel is not None
        if isEntityMention:
            mentions = sentence.entityMentions
        else:
            mentions = sentence.relationMentions
        if sentLabel == 'pos':
            # a positive sentence is expected to carry exactly one mention
            assert len(mentions) == 1
        for mention in mentions:
            try:
                if isEntityMention:
                    m_id = '%s_%s_%d_%d' % (sentence.articleId, sentence.sentId,
                                            mention.start, mention.end)
                else:
                    m_id = '%s_%d_%d_%d_%d_%d' % (sentence.articleId, sentence.sentId,
                                                  mention.em1Start, mention.em1End,
                                                  mention.em2Start, mention.em2End)
                # group mention ids per question, split by 'pos'/'neg' label
                if question not in question2mentions:
                    question2mentions[question] = {}
                if sentLabel in question2mentions[question]:
                    question2mentions[question][sentLabel].add(m_id)
                else:
                    question2mentions[question][sentLabel] = set([m_id])
                feature_ids, label_ids = ner_feature.extract(sentence, mention)
                if len(label_ids) not in mentionCountByNumOfLabels:
                    mentionCountByNumOfLabels[len(label_ids)] = 1
                else:
                    mentionCountByNumOfLabels[len(label_ids)] += 1
                gx.write(m_id + '\t' + ','.join([str(x) for x in feature_ids]) + '\n')
                gy.write(m_id + '\t' + ','.join([str(x) for x in label_ids]) + '\n')
                mention_count += 1
                count += 1
            except Exception as e:
                print e.message, e.args
                print sentence.articleId, sentence.sentId, len(sentence.tokens)
                print mention
                raise
    print '\n'
    print 'mention :%d' % mention_count
    print 'feature :%d' % len(ner_feature.feature_mapping)
    print 'label :%d' % len(ner_feature.label_mapping)
    write_map(ner_feature.feature_mapping, f)
    reader.close()
    gx.close()
    gy.close()
    f.close()
    # emit question/mention supervision files:
    #   qa_pair.txt  - (mention_id, question_id, 1.0/0.0)
    #   qa_mpair.txt - (pos mention, other mention, 1/0) pairs per question
    #   question.txt - question text to question_id
    qa_pair = open(outdir + '/qa_pair.txt', 'w')
    qa_mpair = open(outdir + '/qa_mpair.txt', 'w')
    question_map = open(outdir + '/question.txt', 'w')
    qid = 0
    for question in question2mentions:
        # skip questions that lack both positive and negative mentions
        if len(question2mentions[question]) < 2:
            continue
        for mid in question2mentions[question]['pos']:
            qa_pair.write(mid + '\t' + str(qid) + '\t1.0\n')
        for mid in question2mentions[question]['neg']:
            qa_pair.write(mid + '\t' + str(qid) + '\t0.0\n')
        question_map.write(question + '\t' + str(qid) + '\n')
        qid += 1
        for mid1 in question2mentions[question]['pos']:
            for mid2 in question2mentions[question]['pos']:
                if mid1 == mid2:
                    continue
                qa_mpair.write(mid1 + '\t' + mid2 + '\t1\n')
            for mid2 in question2mentions[question]['neg']:
                qa_mpair.write(mid1 + '\t' + mid2 + '\t0\n')
    qa_mpair.close()
    qa_pair.close()
    question_map.close()

def pipeline_qa(json_file, brown_file, featurefile, labelfile, outdir,
                requireEmType, isEntityMention):
    """Variant of pipeline_qa that writes features only (no label file)."""
    feature_map = load_map_qa(featurefile)
    label_map = load_map(labelfile)
    reader = MentionReader(json_file)
    ner_feature = NERFeature(is_train=True, brown_file=brown_file,
                             requireEmType=requireEmType, isEntityMention=isEntityMention,
                             feature_mapping=feature_map, label_mapping=label_map)
    ner_feature.feature_count = len(feature_map)
    count = 0
    gx = open(outdir + '/qa_x.txt', 'w')
    f = open(outdir + '/feature.map', 'w')
    if not isEntityMention:
        question_file = open(outdir + '/question.txt', 'w')
    print 'start train feature generation'
    mention_count = 0
    while reader.has_next():
        if count % 10000 == 0:
            sys.stdout.write('process ' + str(count) + ' lines\r')
            sys.stdout.flush()
        sentence = reader.next()
        question = sentence.articleId
        if isEntityMention:
            mentions = sentence.entityMentions
        else:
            mentions = sentence.relationMentions
        for mention in mentions:
            if not isEntityMention:
                # record the question (and its token positions, if available)
                if sentence.questionPositions:
                    questionPositions = ' '.join(
                        [str(p) for p in sentence.questionPositions])
                    question_file.write(str(mention_count) + '\t' + question +
                                        '\t' + questionPositions + '\n')
                else:
                    question_file.write(str(mention_count) + '\t' + question + '\n')
            try:
                if isEntityMention:
                    m_id = '%s_%d_%d' % (sentence.sentId, mention.start, mention.end)
                else:
                    m_id = '%d_%d_%d_%d_%d' % (sentence.sentId,
                                               mention.em1Start, mention.em1End,
                                               mention.em2Start, mention.em2End)
                feature_ids, label_ids = ner_feature.extract(sentence, mention)
                gx.write(m_id + '\t' + ','.join([str(x) for x in feature_ids]) + '\n')
                mention_count += 1
                count += 1
            except Exception as e:
                print e.message, e.args
                print sentence.articleId, sentence.sentId, len(sentence.tokens)
                print mention
                raise
    print '\n'
    print 'mention :%d' % mention_count
    print 'feature :%d' % len(ner_feature.feature_mapping)
    print 'label :%d' % len(ner_feature.label_mapping)
    write_map(ner_feature.feature_mapping, f)
    reader.close()
    gx.close()
    f.close()
    if not isEntityMention:
        question_file.close()

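# Hypothetical command-line driver (not the project's actual CLI): the argument
# order and the '1'/'0' boolean flags below are illustrative assumptions only.
if __name__ == '__main__':
    mode = sys.argv[1]  # 'train', 'test', or 'qa'
    if mode == 'train':
        # json_file, brown_file, outdir
        pipeline(sys.argv[2], sys.argv[3], sys.argv[4])
    elif mode == 'test':
        # json_file, brown_file, featurefile, labelfile, outdir,
        # requireEmType, isEntityMention
        pipeline_test(sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5],
                      sys.argv[6], sys.argv[7] == '1', sys.argv[8] == '1')
    elif mode == 'qa':
        pipeline_qa(sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5],
                    sys.argv[6], sys.argv[7] == '1', sys.argv[8] == '1')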