Пример #1
0
def readTSV(filename, name):
    """Load a tab-separated article file into the database.

    Each non-blank line of ``filename`` is split on tab characters; the
    first four fields are stored as ``id``, ``url``, ``title`` and
    ``content`` in a fresh dict that is handed to ``insertArticleInDB``
    together with ``name``, the connection and the cursor.

    The line terminator is not stripped, so the last field of a line keeps
    its trailing newline (unchanged from the previous behaviour).

    Args:
        filename: Path of the UTF-8 encoded TSV file to read.
        name: Passed through unchanged to ``insertArticleInDB``.

    Raises:
        IndexError: If a non-blank line has fewer than four fields.
    """
    # The file is opened first so that a missing file fails before a
    # database connection is made; the context manager guarantees it is
    # closed again.
    with open(filename, "r", encoding="utf8") as tsv_file:
        conn = get_conn()
        try:
            cur = conn.cursor()
            try:
                # Stream the file line by line instead of loading it whole.
                for line in tsv_file:
                    # A blank line carries no fields and would otherwise
                    # raise IndexError below.
                    if not line.rstrip("\n"):
                        continue
                    fields = line.split("\t")
                    # Build a new dict per row so rows never share state.
                    article = {
                        "id": fields[0],
                        "url": fields[1],
                        "title": fields[2],
                        "content": fields[3],
                    }
                    insertArticleInDB(article, name, conn, cur)
            finally:
                cur.close()
        finally:
            conn.close()
Пример #2
0
def combineMeniton(tablename):
    """Pair up mentions that occur in the same sentence and store them.

    Joins ``mention1_<tablename>`` with ``mention2_<tablename>`` on
    ``doc_id`` and ``sentence_index``. Every joined row is unpacked as six
    leading values (first mention) followed by six more (second mention);
    pairs that pass the distance check are passed to
    ``insertCandidationDB``. Progress is shown on a console progress bar
    that only advances for pairs that were inserted.

    Args:
        tablename: Suffix appended to the ``mention1_`` / ``mention2_``
            table names.
    """
    conn = get_conn()
    cur = conn.cursor()
    query = (
        "select * from mention1_" + tablename
        + " as A, mention2_" + tablename + " as B "
        + "WHERE A.doc_id=B.doc_id and A.sentence_index=B.sentence_index"
    )
    cur.execute(query)
    pairs = cur.fetchall()
    total = len(pairs)
    label = "%s:N=%d|" % ("candidate", total)
    bar_widgets = [label, pb.Percentage(), pb.Bar(), pb.ETA()]
    pbar = pb.ProgressBar(widgets=bar_widgets, maxval=total)
    pbar.start()
    inserted = 0
    for joined in pairs:
        first_mention = joined[:6]
        second_mention = joined[6:]
        (mention_id, mention_text, doc_id,
         sentence_index, begin_index, end_index) = first_mention
        (mention_id2, mention_text2, doc_id2,
         sentence_index2, begin_index2, end_index2) = second_mention
        # Keep only pairs whose word distance is not too long.
        if begin_index2 - end_index <= 25 and end_index2 - begin_index <= 25:
            insertCandidationDB(mention_id, mention_text, mention_id2,
                                mention_text2, tablename, conn, cur)
            inserted += 1
            pbar.update(inserted)
    pbar.finish()
    cur.close()
    conn.close()
Пример #3
0
def selectMentionDB(filePath, index, tablename, mode = "dict"):
    """Extract mentions from every article that has none recorded yet.

    Selects the articles of ``articles_<tablename>`` whose id does not yet
    appear in ``mention<index>_<tablename>``. For each article a key list is
    built with ``getKeyListPerDoc``; then every sentence of that article
    whose text length is between 10 and 100 characters (inclusive) is passed
    to ``getMentionFromSen``. Progress is shown on a console progress bar
    and an initial status line is appended to the module-level ``data``
    collection.

    Args:
        filePath: Passed to ``loadNERFile`` to load the NER data.
        index: String inserted into the mention table name
            (``mention<index>_<tablename>``) and forwarded to
            ``getMentionFromSen``.
        tablename: Suffix of the article / sentence / mention tables.
        mode: Forwarded to ``loadNERFile`` and ``getKeyListPerDoc``.
    """
    conn = get_conn()
    try:
        cur = conn.cursor()
        try:
            # NOTE: table names cannot be bound as parameters, so
            # ``tablename`` and ``index`` must come from trusted input.
            sql = "select id,title,content from articles_" + tablename +" where id not in (select distinct doc_id from mention"+index+"_" + tablename+")"
            cur.execute(sql)
            datarows = cur.fetchall()
            taskNum = len(datarows)
            string = "%s:N=%d|" % ("findNer", taskNum)
            pbar = pb.ProgressBar(widgets=[string, pb.Percentage(), pb.Bar(), pb.ETA()], maxval=taskNum)
            pbar.start()
            num_completed = 0
            # Guard the percentage against an empty result set, which
            # previously raised ZeroDivisionError.
            percent = (num_completed / taskNum) * 100 if taskNum else 0.0
            data.append(string + "|" + "已完成" + str(percent) + "%")
            nerData = loadNERFile(filePath, mode)
            # The document id is bound as a query parameter instead of being
            # formatted into the SQL text, so ids containing quotes work.
            sentence_sql = ("select sentence_index,sentence_text,tokens from sentences_"
                            + tablename + " where doc_id = %s")
            for docId, title, content in datarows:
                keyList = getKeyListPerDoc(title, content, nerData, mode)
                # Walk article by article: fetch all sentences of the
                # current article and process them together.
                cur.execute(sentence_sql, (docId,))
                senrows = cur.fetchall()
                for sentence_index, sentence_text, tokens in senrows:
                    senLength = len(sentence_text)
                    if senLength > 100 or senLength < 10:
                        continue
                    getMentionFromSen(docId, sentence_index, sentence_text, tokens,
                                      keyList, index, tablename, conn, cur)
                num_completed += 1
                pbar.update(num_completed)
            pbar.finish()
        finally:
            cur.close()
    finally:
        conn.close()
Пример #4
0
def selectMentionDB2(filePath,index,tablename, mode = "dict"):
    """Run dictionary-based mention extraction over all stored sentences.

    Reads every row of ``sentences_<tablename>``, skips sentences whose
    text is shorter than 10 or longer than 100 characters, and, when
    ``mode`` is ``"dict"``, passes the remaining ones to ``mentionDict``.
    The executed query text is appended to the module-level ``data``
    collection and progress is shown on a console progress bar that
    advances once per sentence that was not skipped.

    Args:
        filePath: Passed to ``loadNERFile`` to load the NER data.
        index: Forwarded unchanged to ``mentionDict``.
        tablename: Suffix of the sentences table; also forwarded.
        mode: Forwarded to ``loadNERFile``; only ``"dict"`` triggers
            extraction.
    """
    conn = get_conn()
    cur = conn.cursor()
    query = "select doc_id,sentence_index,sentence_text,tokens from sentences_" + tablename
    cur.execute(query)
    data.append(query)
    sentences = cur.fetchall()
    total = len(sentences)
    label = "%s:N=%d|" % ("findNer", total)
    bar_widgets = [label, pb.Percentage(), pb.Bar(), pb.ETA()]
    pbar = pb.ProgressBar(widgets=bar_widgets, maxval=total)
    pbar.start()
    processed = 0
    nerData = loadNERFile(filePath, mode)
    for doc_id, sentence_index, sentence_text, tokens in sentences:
        # Filter sentences by length.
        if not 10 <= len(sentence_text) <= 100:
            continue
        if mode == "dict":
            mentionDict(doc_id, sentence_index, sentence_text, tokens,
                        nerData, index, tablename, conn, cur)
        processed += 1
        pbar.update(processed)
    pbar.finish()
    cur.close()
    conn.close()
Пример #5
0
def selectfeatureMentionDB(tablename):
    """Compute features for every candidate pair that has none yet.

    Joins the candidate, both mention tables and the sentences table for
    ``tablename``, restricted to candidates whose id is not yet present in
    ``feature_<tablename>``, and hands each resulting row to
    ``featureMention``. A status message and the query text are appended to
    the module-level ``data`` collection; progress is shown on a console
    progress bar.

    Args:
        tablename: Suffix shared by all tables involved.
    """
    conn = get_conn()
    cur = conn.cursor()
    # Pull the two-mention feature inputs out of the candidate set.
    query = """select A.id,A.p1_id,A.p2_id,B.begin_index,B.end_index,C.begin_index, C.end_index,D.tokens,D.lemmas,D.pos_tags,D.ner_tags,D.dep_types,D.dep_tokens 
        from candidate_""" + tablename + """ as A, mention1_""" + tablename + """ as B,mention2_""" + tablename + """ as C,sentences_""" + tablename + """ as D
        where A.p1_id=B.mention_id and A.p2_id=C.mention_id and B.doc_id=D.doc_id
        and B.sentence_index=D.sentence_index and A.id not in (select distinct cid from feature_""" + tablename + """)"""
    cur.execute(query)
    data.append("实体关系选取中......")
    data.append(query)
    candidates = cur.fetchall()
    total = len(candidates)
    label = "%s:N=%d|" % ("featureGet", total)
    bar_widgets = [label, pb.Percentage(), pb.Bar(), pb.ETA()]
    pbar = pb.ProgressBar(widgets=bar_widgets, maxval=total)
    pbar.start()
    # Every row counts towards progress, so the 1-based position is the count.
    for done, candidate in enumerate(candidates, 1):
        featureMention(candidate, tablename, conn, cur)
        pbar.update(done)
    pbar.finish()
    cur.close()
    conn.close()
Пример #6
0
def insertCandidationAll(tablename):
    """Rebuild ``candidate_<tablename>`` with a single bulk insert.

    Truncates the candidate table, resets its id sequence to 1, and then
    inserts every pair of mentions from ``mention1_<tablename>`` and
    ``mention2_<tablename>`` that share ``doc_id`` and ``sentence_index``
    and lie within the distance limit. The insert statement is appended to
    the module-level ``data`` collection before it is executed, and the
    transaction is committed afterwards.

    The distance filter keeps a pair only when *both* distances are at most
    25, i.e. it is the exact negation of the skip condition used by the
    row-by-row variant ``combineMeniton``
    (``begin2 - end1 > 25 or end2 - begin1 > 25``). The earlier ``or`` form
    let pairs through that ``combineMeniton`` would have skipped.

    Args:
        tablename: Suffix shared by the candidate and mention tables.
    """
    conn = get_conn()
    try:
        cur = conn.cursor()
        try:
            cur.execute("TRUNCATE TABLE candidate_" + tablename)  # empty the existing table
            cur.execute("select setval( 'candidate_" + tablename +
                        "_id_seq',1,false);")  # reset the auto-increment id to 1
            sql = "insert into candidate_" + tablename + " (p1_id, p1_name, p2_id, p2_name) " +\
            "select p1_id, p1_name, p2_id, p2_name from ("+\
            "select A.mention_id as p1_id, A.mention_text as p1_name, A.doc_id, A.sentence_index, A.begin_index,A.end_index,"+\
            "B.mention_id as p2_id, B.mention_text as p2_name, B.doc_id, B.sentence_index, B.begin_index,B.end_index "+\
            "from mention1_" + tablename + " as A, mention2_" + tablename + " as B "+\
            "WHERE A.doc_id=B.doc_id and A.sentence_index=B.sentence_index "+\
            "and (B.begin_index - A.end_index <= 25 and B.end_index - A.begin_index <= 25)" +\
            ") as C"
            data.append(sql)
            cur.execute(sql)
            conn.commit()
        finally:
            cur.close()
    finally:
        conn.close()
Пример #7
0
def getSensNLP(tablename, nlp):
    """Run the NLP parser over every unflagged sentence that has a candidate.

    Selects the distinct (doc_id, sentence_index) pairs of sentences with
    ``flag = 0`` that are referenced by a candidate pair, parses each one
    with ``parsertext`` and stores the result via ``buildSentenceDB``.
    A failure on one sentence is printed and that sentence is skipped; it
    does not advance the progress bar. A status message and the query text
    are appended to the module-level ``data`` collection.

    Args:
        tablename: Suffix shared by all tables involved.
        nlp: Passed through unchanged to ``parsertext``.
    """
    conn = get_conn()
    cur = conn.cursor()
    inner_select = (
        "select D.doc_id as doc_id, D.sentence_index as sen_id, D.sentence_text as txt, D.tokens as tokens "
        + "from candidate_" + tablename + " as A, mention1_" + tablename
        + " as B,mention2_" + tablename + " as C,sentences_" + tablename + " as D "
        + " where A.p1_id=B.mention_id and A.p2_id=C.mention_id and B.doc_id=D.doc_id "
        + "and B.sentence_index=D.sentence_index and D.flag = 0 "
    )
    query = (
        "select doc_id, sen_id,max(txt),max(tokens) from ( "
        + inner_select
        + ") as E group by doc_id,sen_id"
    )
    data.append("实体关系选取中......")
    data.append(query)
    cur.execute(query)
    sentences = cur.fetchall()
    total = len(sentences)
    label = "%s:N=%d|" % ("senNLP", total)
    bar_widgets = [label, pb.Percentage(), pb.Bar(), pb.ETA()]
    pbar = pb.ProgressBar(widgets=bar_widgets, maxval=total)
    pbar.start()
    parsed = 0
    for docid, sen_id, sen_txt, tokens in sentences:
        try:
            print("processing doc id:" + docid)
            parse_result = parsertext(sen_txt, tokens, docid, sen_id, nlp)
            buildSentenceDB(parse_result, tablename, conn, cur)
        except Exception as e:
            # Best effort: report the failure and move on to the next sentence.
            print("1", e)
        else:
            parsed += 1
            pbar.update(parsed)
    pbar.finish()
    cur.close()
    conn.close()
Пример #8
0
def createAllDB(name):
    """Create the six per-dataset tables if they do not exist yet.

    For the suffix ``name`` this creates, in order, ``articles_``,
    ``sentences_``, ``mention1_``, ``mention2_``, ``candidate_`` and
    ``feature_`` tables and assigns each to the ``postgres`` owner. After
    every table the transaction is committed, and the executed SQL, a
    completion message and a separator line are appended to the
    module-level ``data`` collection.

    The cursor and the connection are always closed, including when a
    statement fails (the cursor was previously never closed).

    Args:
        name: Suffix appended to every table name. It is concatenated into
            the SQL text (identifiers cannot be bound as parameters), so it
            must come from trusted input.
    """
    table_statements = [
        # id: unique article id, url: article link, title: article title,
        # content: article body.
        ("articles_", """CREATE TABLE if not exists articles_""" + name + """ (
                id text COLLATE "default",
                url text COLLATE "default",
                title text COLLATE "default",
                content text COLLATE "default"
            )
            WITH (OIDS=FALSE);
            ALTER TABLE articles_""" + name + """ OWNER TO postgres"""),
        # doc_id: article id, sentence_index: sentence number,
        # sentence_text: sentence content, tokens: segmented words,
        # lemmas: base forms, pos_tags: part-of-speech tags,
        # ner_tags: entity-recognition tags, doc_offsets: word position in
        # the article, dep_types: dependency types, dep_tokens: dependency
        # tree.
        ("sentences_", """CREATE TABLE if not exists sentences_""" + name + """ (
                id SERIAL primary key,
                doc_id text COLLATE "default",
                sentence_index int4,
                sentence_text text COLLATE "default",
                tokens text[] COLLATE "default",
                lemmas text[] COLLATE "default",
                pos_tags text[] COLLATE "default",
                ner_tags text[] COLLATE "default",
                doc_offsets int4[],
                dep_types text[] COLLATE "default",
                dep_tokens int4[],
                flag int4
            )
            WITH (OIDS=FALSE);
            ALTER TABLE sentences_""" + name + """ OWNER TO postgres"""),
        # mention_id: entity-1 id, mention_text: entity text,
        # doc_id: article id, sentence_index: sentence number,
        # begin_index / end_index: start / end position within the sentence.
        ("mention1_", """CREATE TABLE if not exists mention1_""" + name + """ (
                   mention_id text COLLATE "default",
                   mention_text text COLLATE "default",
                   doc_id text COLLATE "default",
                   sentence_index int4,
                   begin_index int4,
                   end_index int4
               )
               WITH (OIDS=FALSE);
               ALTER TABLE mention1_""" + name + """ OWNER TO postgres"""),
        # Same layout as mention1_, holding the second entity of a pair.
        ("mention2_", """CREATE TABLE if not exists mention2_""" + name + """ (
                   mention_id text COLLATE "default",
                   mention_text text COLLATE "default",
                   doc_id text COLLATE "default",
                   sentence_index int4,
                   begin_index int4,
                   end_index int4
               )
               WITH (OIDS=FALSE);
               ALTER TABLE mention2_""" + name + """ OWNER TO postgres"""),
        # Relation-pair candidates: p1_id / p1_name for entity 1,
        # p2_id / p2_name for entity 2.
        ("candidate_", """CREATE TABLE if not exists candidate_""" + name + """ (
                    id SERIAL primary key,
                    p1_id text COLLATE "default",
                    p1_name text COLLATE "default",
                    p2_id text COLLATE "default",
                    p2_name text COLLATE "default"
                )
                WITH (OIDS=FALSE);
                ALTER TABLE candidate_""" + name + """ OWNER TO postgres"""),
        # cid: candidate id, feature: feature of the relation pair.
        ("feature_", """CREATE TABLE if not exists feature_""" + name + """ (
                    cid int NOT NULL,
                    feature text COLLATE "default"
                )
                WITH (OIDS=FALSE);
                ALTER TABLE feature_""" + name + """ OWNER TO postgres"""),
    ]

    # Connect to the database.
    conn = get_conn()
    try:
        cur = conn.cursor()
        try:
            for prefix, sql in table_statements:
                # Create the table, then persist it before logging success.
                cur.execute(sql)
                conn.commit()
                data.append(sql)
                data.append("表" + prefix + name + "创建完成")
                data.append("=========================================")
        finally:
            cur.close()
    finally:
        conn.close()