예제 #1
0
def test1():
    """Run the entity finder over the first 21 titles and display each result.

    The path of the title file is read from the ``title_pos`` entry of the
    question configuration. Every line of that file is passed unchanged to
    ``EntityFinder`` as one title.
    """
    conf = config("../../conf/question.conf")

    # The context manager closes the file even if a finder call raises, and
    # iterating the handle avoids loading the whole file for a 21-line sample.
    with open(conf["title_pos"]) as f:
        for indx, title in enumerate(f):
            # Only indices 0..20 (inclusive) are inspected.
            if indx > 20:
                break

            naive_finder = EntityFinder(title)
            naive_finder.find(display=True)
예제 #2
0
def test1():
    """Run the entity finder over the first 21 titles and display each result.

    The path of the title file is read from the ``title_pos`` entry of the
    question configuration. Every line of that file is passed unchanged to
    ``EntityFinder`` as one title.
    """
    conf = config("../../conf/question.conf")

    # The context manager closes the file even if a finder call raises, and
    # iterating the handle avoids loading the whole file for a 21-line sample.
    with open(conf["title_pos"]) as f:
        for indx, title in enumerate(f):
            # Only indices 0..20 (inclusive) are inspected.
            if indx > 20:
                break

            naive_finder = EntityFinder(title)
            naive_finder.find(display=True)
예제 #3
0
def main():
    """
    Command-line entry point; -t selects which task to run.
    e.g. ./read_raw_data.py -t extract_title
         ./read_raw_data.py -t extract_title_nbest
    """
    option_parser = OptionParser()
    option_parser.add_option(
        "-t", "--task",
        dest="task",
        default="error",
        help="你需要选择哪个任务",
    )
    option_parser.add_option(
        "-s", "--store",
        dest="store",
        action="store_true",
        help="选择存储与否",
        default=False,
    )

    # Parse the command-line arguments; positional arguments are not used.
    options, _positional = option_parser.parse_args()

    # "error" is the default of --task, so seeing it means no task was given.
    print(options)
    if options.task == "error":
        print("请选择任务")
        sys.exit(1)

    # Function associated with the chosen task (per the original comment it
    # decides the format written to the output file -- confirm in
    # get_task_function).
    task_function = get_task_function(options.task)

    # Question configuration, then the output file name for this task.
    qconf = config("../../conf/question.conf")
    extract_file = get_extract_file(options.task, qconf)

    # Only pass a store file when -s/--store was given.
    store = qconf["filter_qa"] if options.store else None

    # Run the extraction.
    extract(
        qconf["car_pos"],
        extract_file,
        task_function,
        min_answer_count=10,
        pass_filter=word_counts_filter,
        store_file=store,
    )
예제 #4
0
def main():
    """
    Command-line entry point; -t selects which task to run.
    e.g. ./read_raw_data.py -t extract_title
         ./read_raw_data.py -t extract_title_nbest
    """
    cli = OptionParser()
    cli.add_option(
        "-t",
        "--task",
        dest="task",
        default="error",
        help="你需要选择哪个任务",
    )
    cli.add_option(
        "-s",
        "--store",
        dest="store",
        action="store_true",
        help="选择存储与否",
        default=False,
    )

    # Parse the command-line arguments; positional arguments are not used.
    options, _unused_args = cli.parse_args()

    # "error" is the default of --task, so seeing it means no task was given.
    print(options)
    if options.task == "error":
        print("请选择任务")
        sys.exit(1)

    # Function associated with the chosen task (per the original comment it
    # decides the format written to the output file -- confirm in
    # get_task_function).
    task_function = get_task_function(options.task)

    # Question configuration.
    qconf = config("../../conf/question.conf")

    # Name of the file the extraction writes to.
    extract_file = get_extract_file(options.task, qconf)

    # Only pass a store file when -s/--store was given.
    store = None
    if options.store:
        store = qconf["filter_qa"]

    # Run the extraction.
    extract(
        qconf["car_pos"],
        extract_file,
        task_function,
        min_answer_count=10,
        pass_filter=word_counts_filter,
        store_file=store,
    )
예제 #5
0
#!/usr/bin/python3
#coding=utf-8

import sys
import pickle

from question_table import question_table

sys.path.append("..")
import insummer
from insummer.read_conf import config
from insummer.util import NLP
from insummer.query_expansion.entity_finder import NgramEntityFinder

# Read the locations of the two question sets (and of their statistics
# files) from the question configuration.
ques_conf = config('../../conf/question.conf')
filter_path = ques_conf['filter_qa']
duc_path = ques_conf['duc_question']

fil_spath = ques_conf['filter_statistic']
duc_spath = ques_conf['duc_statistic']

nlp = NLP()

# Load the question sets of both corpora. The context managers close each
# file as soon as it has been unpickled.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# come from a trusted source.
with open(filter_path, 'rb') as finfile:
    fil_data = pickle.load(finfile)
with open(duc_path, 'rb') as dinfile:
    duc_data = pickle.load(dinfile)

예제 #6
0
#!/usr/bin/python3
#coding=utf-8

'''
这个主要是将抽取的语料与duc语料的整体统计特征做一个直观的输出比较,
整体代码糙的不行,全赖问题结构固定,先这么将就着看吧。
'''

import pickle
import sys

from question_table import question_table
sys.path.append('..')
from insummer.read_conf import config

# Locations of the two pickled statistics tables, taken from the question
# configuration.
question_conf = config('../../conf/question.conf')

fil_path = question_conf['filter_statistic']
duc_path = question_conf['duc_statistic']

# Load both tables. The original rebound `infile` without closing the first
# handle; the context managers close each file once it has been unpickled.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# come from a trusted source.
with open(fil_path, 'rb') as infile:
    fil_table = pickle.load(infile)

with open(duc_path, 'rb') as infile:
    duc_table = pickle.load(infile)

def get_total_avg(duc_list,isavg,nq):
    qa_total = 0
    entitle_total = 0
    wdtitle_total = 0
    enanser_total = 0
예제 #7
0
import sys
sys.path.append("..")
import insummer
from insummer.read_conf import config
from insummer.knowledge_base import concept_tool
from insummer.knowledge_base.relation import relation_tool

import pickle

from abc import ABCMeta, abstractmethod

#others
import csv
from optparse import OptionParser

# Configuration of the data set; "csv_pos" is the prefix used when building
# the part file names.
conf = config("../../conf/cn_data.conf")
data_pos = conf["csv_pos"]

# Part indices 0..7.
part = list(range(8))

# Shared tool instances used by the rest of the module.
cn_tool = concept_tool()
rel_tool = relation_tool()


def get_ipart_name(i):
    """Return the file name of the i-th part: ``data_pos`` + ``part_0<part[i]>.csv``."""
    return "{}part_0{}.csv".format(data_pos, part[i])


def get_ipart_handler(i):
예제 #8
0
'''
这个文件的主要作用是统计relation的数据
'''
import sys
sys.path.append("..")
import insummer
from insummer.read_conf import config
from insummer.knowledge_base import concept_tool
from insummer.knowledge_base.relation import relation_tool


#others
import csv

# Configuration of the data set; "csv_pos" is the prefix used when building
# the part file names.
conf = config("../../conf/cn_data.conf")
data_pos = conf["csv_pos"]

# Part indices 0..7.
part = list(range(8))

# Shared tool instances used by the rest of the module.
cp_tool = concept_tool()
rel_tool = relation_tool()

def get_ipart_name(i):
    """Return the file name of the i-th part: ``data_pos`` + ``part_0<part[i]>.csv``."""
    part_id = part[i]
    return "{}part_0{}.csv".format(data_pos, part_id)

def get_ipart_handler(i):
    assert int(i) in part
    
예제 #9
0
def test1():
    """Read the raw question/answer dump line by line and rebuild questions.

    The file at ``conf["computer_pos"]`` is read one line at a time. Each
    non-blank line, after removing surrounding whitespace and one trailing
    comma, is parsed as JSON. A record containing ``answercount`` starts a
    new question; a record containing ``content`` is an answer appended to
    the current question. Progress is printed every 1000 parsed lines and
    every 100 completed questions.

    Exits the process with status 1 when a line is not valid JSON (the
    offending line is printed first) or when a record is neither a question
    nor an answer.
    """
    conf = config("../../conf/question.conf")

    indx = 0
    question_indx = 0

    # State of the question currently being assembled.
    title = ""
    nbest = []
    answer_count = -1
    author = ""

    # The context manager closes the file on every exit path, including
    # sys.exit().
    with open(conf["computer_pos"]) as f:
        for raw_line in f:
            # Drop surrounding whitespace and a single trailing comma.
            line = raw_line.strip()
            if line.endswith(','):
                line = line[:-1]

            # Blank lines carry no record; indexing line[-1] on them used to
            # raise IndexError.
            if not line:
                continue

            # json.JSONDecodeError is a ValueError; anything else is a real
            # bug and should propagate instead of being swallowed.
            try:
                line_json = json.loads(line)
            except ValueError:
                print(line)
                sys.exit(1)

            if "answercount" in line_json:
                # A question record: first finish the previous question.
                # An empty nbest means nobody answered it, so it is ignored.
                if nbest:
                    m_question = Question(title, "", "", nbest, author,
                                          answer_count)
                    question_indx += 1
                    if question_indx % 100 == 0:
                        print("question indx", question_indx)

                # An empty answercount string counts as zero answers.
                raw_count = line_json["answercount"].strip()
                answer_count = int(raw_count) if raw_count else 0

                # Reset the per-question state and start the new question.
                nbest, author = [], ""
                title = line_json["subject"]

            elif "content" in line_json:
                # An answer record belonging to the current question.
                content = line_json["content"]
                support = int(line_json["supportnum"])
                oppose = int(line_json["opposenum"])
                ans_author = line_json["answeruser"]

                nbest.append(Answer(content, support, oppose, ans_author))

            else:
                print("error")
                sys.exit(1)

            indx += 1
            if indx % 1000 == 0:
                print("indx", indx)

    # Build the last question; as before this happens unconditionally.
    m_question = Question(title, "", "", nbest, author, answer_count)
예제 #10
0
#!/usr/bin/python3
'''
这个文件主要测试载入数据
'''

import sys
sys.path.append("..")
import insummer
from insummer.read_conf import config

import pickle

# Question configuration; get_data() and get_duc() index it by key to find
# the pickled data files.
qconf = config("../../conf/question.conf")

def get_data():
    """Load and return the pickled object stored at ``qconf["filter_qa"]``.

    NOTE(review): pickle.load can execute arbitrary code; only use this on
    files from a trusted source.
    """
    data_dir = qconf["filter_qa"]

    # The context manager closes the file once it has been unpickled.
    with open(data_dir, 'rb') as f:
        return pickle.load(f)

def get_duc():
    """Load and return the pickled object stored at ``qconf['duc_question']``.

    NOTE(review): pickle.load can execute arbitrary code; only use this on
    files from a trusted source.
    """
    duc_dir = qconf['duc_question']

    # The context manager closes the file once it has been unpickled.
    with open(duc_dir, 'rb') as f:
        return pickle.load(f)


if __name__ == '__main__':
예제 #11
0
def test1():
    """Read the raw question/answer dump line by line and rebuild questions.

    The file at ``conf["computer_pos"]`` is read one line at a time. Each
    non-blank line, after removing surrounding whitespace and one trailing
    comma, is parsed as JSON. A record containing ``answercount`` starts a
    new question; a record containing ``content`` is an answer appended to
    the current question. Progress is printed every 1000 parsed lines and
    every 100 completed questions.

    Exits the process with status 1 when a line is not valid JSON (the
    offending line is printed first) or when a record is neither a question
    nor an answer.
    """
    conf = config("../../conf/question.conf")

    indx = 0
    question_indx = 0

    # State of the question currently being assembled.
    title = ""
    nbest = []
    answer_count = -1
    author = ""

    # The context manager closes the file on every exit path, including
    # sys.exit().
    with open(conf["computer_pos"]) as f:
        for raw_line in f:
            # Drop surrounding whitespace and a single trailing comma.
            line = raw_line.strip()
            if line.endswith(','):
                line = line[:-1]

            # Blank lines carry no record; indexing line[-1] on them used to
            # raise IndexError.
            if not line:
                continue

            # json.JSONDecodeError is a ValueError; anything else is a real
            # bug and should propagate instead of being swallowed.
            try:
                line_json = json.loads(line)
            except ValueError:
                print(line)
                sys.exit(1)

            if "answercount" in line_json:
                # A question record: first finish the previous question.
                # An empty nbest means nobody answered it, so it is ignored.
                if nbest:
                    m_question = Question(title,"","",nbest,author,answer_count)
                    question_indx += 1
                    if question_indx % 100 == 0:
                        print("question indx",question_indx)

                # An empty answercount string counts as zero answers.
                raw_count = line_json["answercount"].strip()
                answer_count = int(raw_count) if raw_count else 0

                # Reset the per-question state and start the new question.
                nbest, author = [], ""
                title = line_json["subject"]

            elif "content" in line_json:
                # An answer record belonging to the current question.
                content = line_json["content"]
                support = int(line_json["supportnum"])
                oppose = int(line_json["opposenum"])
                ans_author = line_json["answeruser"]

                nbest.append(Answer(content,support,oppose,ans_author))

            else:
                print("error")
                sys.exit(1)

            indx += 1
            if indx % 1000 == 0:
                print("indx",indx)

    # Build the last question; as before this happens unconditionally.
    m_question = Question(title,"","",nbest,author,answer_count)