示例#1
0
def mainFunction():
    """Extract topics from every input file and record the outcome in the DB.

    For each path returned by ``readFiles(files_path)`` the file is loaded
    with ``readTXT`` and an id is obtained with ``getID``.  When the id is
    positive, ``extractTopic`` is called; a non-empty result is handed to
    ``insertTopic``, while an empty result sets ``status=2`` on the matching
    ``dlurl1`` row (the original comment describes this as "mark as
    extracted").  Files whose id is not positive are only reported.

    The module-level ``cur`` and ``conn`` are closed when the function ends,
    including when an unexpected exception aborts the loop.
    """
    try:
        # Read the list of files to process.
        filePathList = readFiles(files_path)
        print('read is ready')
        for fileP in filePathList:
            html = readTXT(fileP)
            eid = getID(html)
            if not eid > 0:
                print('eid error: ' + fileP)
                continue

            topicID = extractTopic(eid, html)
            if len(topicID) > 0:
                insertTopic(eid, topicID, fileP)
                continue

            # Build the statement before entering the try block so the
            # except handler can always reference it; previously a failure
            # in the first print left ``updateSQL`` unbound and the handler
            # itself raised.
            updateSQL = 'update dlurl1 set status=2 where id=' + str(eid)
            try:
                print('empty: ' + fileP)
                cur.execute(updateSQL)  # mark as extracted
                conn.commit()
            except Exception:
                # Best effort: report the failing statement and move on.
                print('error: ' + updateSQL)
    finally:
        # Release the DB handles even if the loop was aborted by an error.
        cur.close()
        conn.close()
示例#2
0
def mainFunction():
    """Load institution names from each CSV and insert them into ``experience1``.

    The id is taken from the file name: the text after ``'inst_'`` with
    ``'.csv'`` removed.  Every entry returned by ``readList`` is cleaned with
    ``cleanInstit`` and inserted together with that id.  Processing of a file
    stops at the first entry shorter than 4 characters.  After each file the
    matching ``dlurl1`` row gets ``tem=5``.  The module-level ``cur`` and
    ``conn`` are closed at the end.
    """
    for csvPath in readFiles(files_path):
        eid = csvPath.split('inst_')[1].replace('.csv', '')
        eidText = str(eid)

        for rawInst in readList(csvPath):
            # A too-short entry ends the processing of this file.
            if len(rawInst) < 4:
                break
            cleaned = cleanInstit(rawInst)
            insertSQL = ''.join([
                'insert into experience1 (eid,institution) values(',
                eidText,
                ',"',
                cleaned,
                '")',
            ])
            try:
                cur.execute(insertSQL)
                conn.commit()
            except Exception:
                # A failed insert is reported; the remaining entries go on.
                print('insert error' + eidText)

        print('complete:' + eidText)
        cur.execute('update dlurl1 set tem=5 where id=' + eidText)
        conn.commit()

    cur.close()
    conn.close()
def mainFunction():
    """Extract institutions from every input file and record the outcome.

    For each path returned by ``readFiles(files_path)`` the file is loaded
    with ``readTXT`` and an id is obtained with ``getID``.  When the id is
    positive, ``extractInstitut`` is called; a non-empty result is handed to
    ``insertInstitution``, while an empty result sets ``tem=5`` on the
    matching ``dlurl1`` row (the original comment describes this as "mark as
    extracted").  Files whose id is not positive are only reported.

    The module-level ``cur`` and ``conn`` are closed when the function ends,
    including when an unexpected exception aborts the loop.
    """
    try:
        # Read the list of files to process.
        filePathList = readFiles(files_path)
        print('read is ready')
        for fileP in filePathList:
            html = readTXT(fileP)
            eid = getID(html)
            if not eid > 0:
                print('eid error: ' + fileP)
                continue

            instit = extractInstitut(html)
            if len(instit) > 0:
                insertInstitution(eid, instit, fileP)
                continue

            # Build the statement before entering the try block so the
            # except handler can always reference it; previously a failure
            # in the first print left ``updateSQL`` unbound and the handler
            # itself raised.
            updateSQL = 'update dlurl1 set tem=5 where id=' + str(eid)
            try:
                print('empty: ' + fileP)
                cur.execute(updateSQL)  # mark as extracted
                conn.commit()
            except Exception:
                # Best effort: report the failing statement and move on.
                print('error: ' + updateSQL)
    finally:
        # Release the DB handles even if the loop was aborted by an error.
        cur.close()
        conn.close()
def insertNull():
    """Give every id stored under the 'single' pickle folder a new ``paperid``.

    Each file listed by ``readFiles`` is loaded with ``readSeriz``; for every
    entry a counter starting at 4263215 is incremented and written to
    ``paper.paperid`` for the row with that id.  Each update is committed
    individually.

    An earlier, now disabled, pass handled the sibling 'same' folder by
    copying the ``paperid`` of the second id of each pair onto the first.
    """
    lastPID = 4263215
    for picklePath in readFiles('E:/Code/Pickle/samesingle/single'):
        print('update: ' + picklePath)
        for rowID in readSeriz(picklePath):
            lastPID += 1
            statement = ''.join((
                'update paper set paperid=',
                str(lastPID),
                ' where id=',
                str(rowID),
            ))
            cur.execute(statement)
            conn.commit()
            print('now is ' + str(lastPID))
示例#5
0
def mainFunction():
    """Run ``insertPaperSQL`` on every file listed by ``readFiles(files_path)``.

    A connection/cursor pair is obtained from ``getCursor`` up front and is
    closed when the function ends, including when processing a file raises.
    """
    conn, cur = getCursor()
    try:
        for fileP in readFiles(files_path):
            insertPaperSQL(fileP)
    finally:
        # Previously the handles leaked whenever a file failed mid-run.
        cur.close()
        conn.close()
def compareNull():
    """Split the entries of each "null" pickle into matched and unmatched ids.

    For every file listed by ``readFiles(nullDict)``:

    1. ``extractYearTitle`` yields two indexes; the first selects a year from
       the null year list, which is then located in the regular year list.
    2. The regular pickle for that year index and the second index is loaded.
    3. Each null entry is compared with every regular entry on element ``[2]``
       (presumably the title -- confirm against the pickle writer).  Every
       match contributes ``[null_entry[0], regular_entry[0]]`` to the "same"
       list; a null entry without any match contributes its element ``[0]``
       to the "single" list.
    4. Both lists are written with ``constructSeriz``.

    Files whose year index is out of range, or whose year is missing from
    the regular year list, are reported and skipped.
    """
    yearListNull = readSeriz(yearList_pickle_null)
    yearList = readSeriz(yearList_pickle)

    for fp in readFiles(nullDict):
        print('now begin: ' + str(fp))
        nullYearTitle = readSeriz(fp)
        yidx_null, nidx_null = extractYearTitle(fp)

        # ``>=`` rather than ``>``: an index equal to the list length is
        # already out of range and used to raise IndexError below.
        if yidx_null >= len(yearListNull):
            print('error!!!!!!!!!!????')
            continue
        year = yearListNull[yidx_null]

        try:
            yidx = yearList.index(year)
        except ValueError:
            print('error!!!!!!!!!!')
            continue

        suffix = str(yidx) + '_' + str(nidx_null) + '.pickle'
        yearTitle = readSeriz(idyeartitle_path + suffix)

        sameList = []
        single = []
        # When ``yearTitle`` is empty every entry simply lands in ``single``
        # once; the former separate pre-loop for that case appended each id
        # a second time.
        for nullRow in nullYearTitle:
            matches = [[nullRow[0], row[0]]
                       for row in yearTitle
                       if nullRow[2] == row[2]]
            if matches:
                sameList.extend(matches)
            else:
                single.append(nullRow[0])

        constructSeriz(sameList_pickle + suffix, sameList)
        constructSeriz(single_pickle + suffix, single)
示例#7
0
def mainFunction():
    """Feed every file listed by ``readFiles(dict_path)`` to ``insertPaperSQL``.

    The id passed along is the part of the file name after ``'topicsupply_'``
    with ``'.csv'`` removed, run through ``cleanID``.
    """
    for csvPath in readFiles(dict_path):
        rawID = csvPath.split('topicsupply_')[1].replace('.csv', '')
        insertPaperSQL(cleanID(rawID), csvPath)