def test1():
    """Run the entity finder on the first 21 titles of the title file.

    Loads the question configuration, reads the file named by its
    ``title_pos`` entry and, for each of the first 21 lines, builds an
    ``EntityFinder`` from the line and calls ``find(display=True)`` on it.
    """
    conf = config("../../conf/question.conf")

    # Read under a context manager so the handle is always closed.
    with open(conf["title_pos"]) as f:
        titles = f.readlines()

    # The original loop broke once its index exceeded 20, i.e. it handled
    # the titles at indices 0..20 inclusive.
    for title in titles[:21]:
        naive_finder = EntityFinder(title)
        naive_finder.find(display=True)
def main():
    """
    Command-line entry point; ``-t`` selects which task to run.

    e.g.
    ./read_raw_data.py -t extract_title
    ./read_raw_data.py -t extract_title_nbest
    """
    option_parser = OptionParser()
    option_parser.add_option(
        "-t", "--task",
        dest="task",
        default="error",
        help="你需要选择哪个任务",
    )
    option_parser.add_option(
        "-s", "--store",
        dest="store",
        action="store_true",
        default=False,
        help="选择存储与否",
    )

    # Parse the command line; positional arguments are not used.
    opts, _positional = option_parser.parse_args()

    # Error check: "error" is the default, so it means no task was given.
    print(opts)
    if opts.task == "error":
        print("请选择任务")
        sys.exit(1)

    # Function that determines the format written to the output file.
    writer = get_task_function(opts.task)
    # Load the question configuration.
    question_conf = config("../../conf/question.conf")
    # Name of the file the extraction will write to.
    target_file = get_extract_file(opts.task, question_conf)

    # Run the extraction; the "filter_qa" path is passed along only with -s.
    store_path = question_conf["filter_qa"] if opts.store else None
    extract(
        question_conf["car_pos"],
        target_file,
        writer,
        min_answer_count=10,
        pass_filter=word_counts_filter,
        store_file=store_path,
    )
def main():
    """
    Main function; the ``-t`` option names the task to perform.

    e.g.
    ./read_raw_data.py -t extract_title
    ./read_raw_data.py -t extract_title_nbest
    """
    cli = OptionParser()
    cli.add_option("-t", "--task", dest="task", default="error",
                   help="你需要选择哪个任务")
    cli.add_option("-s", "--store", dest="store", action="store_true",
                   help="选择存储与否", default=False)

    # Analyse the command-line arguments.
    parsed = cli.parse_args()
    chosen = parsed[0]

    # Check for errors: the default task value "error" means none was chosen.
    print(chosen)
    if chosen.task == "error":
        print("请选择任务")
        sys.exit(1)

    # Get the function describing how records are written to the file.
    formatter = get_task_function(chosen.task)

    # Get the configuration.
    settings = config("../../conf/question.conf")

    # Get the name of the file that will be written.
    destination = get_extract_file(chosen.task, settings)

    # Perform the extraction; store_file stays None unless -s was given.
    extract_options = {
        "min_answer_count": 10,
        "pass_filter": word_counts_filter,
        "store_file": None,
    }
    if chosen.store:
        extract_options["store_file"] = settings["filter_qa"]
    extract(settings["car_pos"], destination, formatter, **extract_options)
#!/usr/bin/python3
#coding=utf-8

import sys
import pickle
from question_table import question_table

sys.path.append("..")
import insummer
from insummer.read_conf import config
from insummer.util import NLP
from insummer.query_expansion.entity_finder import NgramEntityFinder

# Read the question configuration and pick out the paths of the two
# question sets and of their statistics files.
ques_conf = config('../../conf/question.conf')
filter_path = ques_conf['filter_qa']
duc_path = ques_conf['duc_question']
fil_spath = ques_conf['filter_statistic']
duc_spath = ques_conf['duc_statistic']

nlp = NLP()

# Load the question sets of the two corpora. Each file is opened under a
# context manager so the handle is closed as soon as unpickling finishes.
# NOTE(review): pickle.load executes arbitrary code from the file; these
# paths must only ever point at trusted, locally produced data.
with open(filter_path, 'rb') as finfile:
    fil_data = pickle.load(finfile)

with open(duc_path, 'rb') as dinfile:
    duc_data = pickle.load(dinfile)
#!/usr/bin/python3
#coding=utf-8
'''
This script prints a direct comparison between the overall statistics of
the extracted corpus and those of the DUC corpus.
The code is very rough and relies entirely on the question structure being
fixed; it will do for now.
'''

import pickle
import sys
from question_table import question_table

sys.path.append('..')
from insummer.read_conf import config

question_conf = config('../../conf/question.conf')
fil_path = question_conf['filter_statistic']
duc_path = question_conf['duc_statistic']

# Load both statistics tables. The original rebound ``infile`` without
# closing the first handle; context managers close each file right after
# it has been unpickled.
with open(fil_path, 'rb') as infile:
    fil_table = pickle.load(infile)

with open(duc_path, 'rb') as infile:
    duc_table = pickle.load(infile)


def get_total_avg(duc_list,isavg,nq):
    # NOTE(review): only the initialisation of these four running totals is
    # visible in this excerpt; how duc_list, isavg and nq are used cannot be
    # determined from here -- confirm against the full file.
    qa_total = 0
    entitle_total = 0
    wdtitle_total = 0
    enanser_total = 0
import sys sys.path.append("..") import insummer from insummer.read_conf import config from insummer.knowledge_base import concept_tool from insummer.knowledge_base.relation import relation_tool import pickle from abc import ABCMeta, abstractmethod #others import csv from optparse import OptionParser conf = config("../../conf/cn_data.conf") data_pos = conf["csv_pos"] part = [i for i in range(0, 8)] cn_tool = concept_tool() rel_tool = relation_tool() #得到第i份part的名字 def get_ipart_name(i): return "%spart_0%s.csv" % (data_pos, part[i]) def get_ipart_handler(i):
'''
The main purpose of this file is to collect statistics on relation data.
'''

import sys
sys.path.append("..")
import insummer
from insummer.read_conf import config
from insummer.knowledge_base import concept_tool
from insummer.knowledge_base.relation import relation_tool

#others
import csv

conf = config("../../conf/cn_data.conf")
data_pos = conf["csv_pos"]

# Valid part indices: 0 through 7.
part = list(range(8))

cp_tool = concept_tool()
rel_tool = relation_tool()


def get_ipart_name(i):
    """Return the file name of the i-th part: ``<data_pos>part_0<n>.csv``."""
    part_number = part[i]
    return "{}part_0{}.csv".format(data_pos, part_number)


def get_ipart_handler(i):
    # NOTE(review): only this validity check is visible in the excerpt; the
    # rest of the function, if any, is not shown -- confirm against the file.
    assert int(i) in part
def test1():
    """Parse the raw question/answer dump, grouping answers by question.

    Reads the file named by ``computer_pos`` in the question configuration
    line by line. Every non-blank line is parsed as a JSON object after
    removing surrounding whitespace and one trailing comma. An object with
    an ``answercount`` key starts a new question; an object with a
    ``content`` key is an answer to the current question. A ``Question`` is
    constructed for each question that collected at least one answer, and
    once more after the last line for whatever is still pending. The
    constructed objects are neither stored nor returned; only progress
    counters are printed.

    The process exits with status 1 when a line is not valid JSON (the line
    is printed first) or contains neither key (``error`` is printed).
    """
    conf = config("../../conf/question.conf")

    # State of the question currently being assembled.
    title = ""
    nbest = []
    answer_count = -1
    author = ""
    question_indx = 0

    # The context manager closes the dump even when sys.exit() is raised.
    with open(conf["computer_pos"]) as f:
        for indx, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                # Blank lines used to crash on line[-1] (IndexError).
                continue
            # Drop the comma that terminates the record, if present.
            if line.endswith(","):
                line = line[:-1]

            # json.JSONDecodeError is a ValueError; a bare except would
            # also have swallowed KeyboardInterrupt and SystemExit.
            try:
                line_json = json.loads(line)
            except ValueError:
                print(line)
                sys.exit(1)

            if "answercount" in line_json:
                # A new question begins: finish the previous one first.
                # It is built with empty content and best answer; an empty
                # nbest means nobody answered, so it is not built at all.
                if nbest:
                    m_question = Question(title, "", "", nbest, author,
                                          answer_count)
                    question_indx += 1
                    if question_indx % 100 == 0:
                        print("question indx", question_indx)

                # Reset the state; an empty answer count is treated as 0.
                count_text = line_json["answercount"].strip()
                answer_count = int(count_text) if count_text else 0
                nbest, author = [], ""
                title = line_json["subject"]
            elif "content" in line_json:
                # An answer belonging to the current question.
                emp_answer = Answer(
                    line_json["content"],
                    int(line_json["supportnum"]),
                    int(line_json["opposenum"]),
                    line_json["answeruser"],
                )
                nbest.append(emp_answer)
            else:
                print("error")
                sys.exit(1)

            if indx % 1000 == 0:
                print("indx", indx)

    # Build the last pending question, as the original did unconditionally.
    m_question = Question(title, "", "", nbest, author, answer_count)
#!/usr/bin/python3 ''' 这个文件主要测试载入数据 ''' import sys sys.path.append("..") import insummer from insummer.read_conf import config import pickle qconf = config("../../conf/question.conf") def get_data(): data_dir = qconf["filter_qa"] f = open(data_dir,'rb') data = pickle.load(f) return data def get_duc(): duc_dir = qconf['duc_question'] f = open(duc_dir,'rb') data = pickle.load(f) return data if __name__ == '__main__':
def test1():
    """Parse the raw question/answer dump, grouping answers by question.

    Reads the file named by ``computer_pos`` in the question configuration
    line by line. Every non-blank line is parsed as a JSON object after
    removing surrounding whitespace and one trailing comma. An object with
    an ``answercount`` key starts a new question; an object with a
    ``content`` key is an answer to the current question. A ``Question`` is
    constructed for each question that collected at least one answer, and
    once more after the last line for whatever is still pending. The
    constructed objects are neither stored nor returned; only progress
    counters are printed.

    The process exits with status 1 when a line is not valid JSON (the line
    is printed first) or contains neither key (``error`` is printed).
    """
    conf = config("../../conf/question.conf")

    # State of the question currently being assembled.
    title = ""
    nbest = []
    answer_count = -1
    author = ""
    question_indx = 0

    # The context manager closes the dump even when sys.exit() is raised.
    with open(conf["computer_pos"]) as f:
        for indx, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                # Blank lines used to crash on line[-1] (IndexError).
                continue
            # Drop the comma that terminates the record, if present.
            if line.endswith(","):
                line = line[:-1]

            # json.JSONDecodeError is a ValueError; a bare except would
            # also have swallowed KeyboardInterrupt and SystemExit.
            try:
                line_json = json.loads(line)
            except ValueError:
                print(line)
                sys.exit(1)

            if "answercount" in line_json:
                # A new question begins: finish the previous one first.
                # It is built with empty content and best answer; an empty
                # nbest means nobody answered, so it is not built at all.
                if nbest:
                    m_question = Question(title, "", "", nbest, author,
                                          answer_count)
                    question_indx += 1
                    if question_indx % 100 == 0:
                        print("question indx", question_indx)

                # Reset the state; an empty answer count is treated as 0.
                count_text = line_json["answercount"].strip()
                answer_count = int(count_text) if count_text else 0
                nbest, author = [], ""
                title = line_json["subject"]
            elif "content" in line_json:
                # An answer belonging to the current question.
                emp_answer = Answer(
                    line_json["content"],
                    int(line_json["supportnum"]),
                    int(line_json["opposenum"]),
                    line_json["answeruser"],
                )
                nbest.append(emp_answer)
            else:
                print("error")
                sys.exit(1)

            if indx % 1000 == 0:
                print("indx", indx)

    # Build the last pending question, as the original did unconditionally.
    m_question = Question(title, "", "", nbest, author, answer_count)