def prepareTopic():
    """Build the deduplicated, stemmed topic-word vocabulary from the
    2012-2015 subject pickles and serialize it to ``word_pickle``.

    Bug fixes vs. original:
      * ``stopword`` was referenced although the line loading it was
        commented out (NameError on first use); it is loaded again via
        ``readStopWord()`` as the commented line showed.
      * ``newTopic = list(set(newTopic))`` inside the loop converted the
        current word into a list of characters and discarded it; the
        intended deduplication is applied to ``topicWord`` once, after
        the loop.
    """
    sub_paths = [
        'E:/Code/experience/pickle/subPickle 2012.pickle',
        'E:/Code/experience/pickle/subPickle 2013.pickle',
        'E:/Code/experience/pickle/subPickle 2014.pickle',
        'E:/Code/experience/pickle/subPickle 2015.pickle',
    ]
    stopword = readStopWord()  # stop-word list (load was commented out but the list is used below)
    sub = []
    for path in sub_paths:
        sub += readSeriz(path)
    sub = list(set(sub))  # drop duplicate subject strings
    topicWord = []
    for s in sub:
        # strip the BOM and treat '/'-separated subjects as separate words
        s = s.replace('\ufeff', '').replace('/', ' ')
        for w in s.split():
            if w not in stopword and len(w) > 2:
                nw = wn.morphy(w)  # WordNet stemming; returns None for unknown words
                topicWord.append(nw if nw is not None else w)
    topicWord = list(set(topicWord))  # dedupe the final vocabulary (was mis-applied per word)
    constructSeriz(word_pickle, topicWord)  # serialize
def buildCoauthorByYear(year):
    """Populate table coauthornet (xid, yid, score) and pickle the score
    matrix, using the precomputed per-pair co-authored-paper lists.

    Bug fixes vs. original:
      * ``yearList`` was referenced although the query producing it was
        commented out (NameError on the first populated pair); the guard
        now skips pairs whose template cell is still empty/zero.
      * the output pickle path was overwritten with ``''`` right before
        saving; it now embeds ``year`` as the original TODO comment
        ("这个地址要重写" / path must include year) intended.

    :param year: cut-off year; currently only used in the output file name
        (the year-filtered DB query remains disabled, as in the original).
    """
    paperNetYear = readSeriz(paperNetYear_pickle)  # per-pair co-authored paper-id lists
    idList = readSeriz(idList_pickle)
    coauthorNet = readSeriz(expertNet_pickle)  # zero matrix used as a template
    for i in range(len(idList)):
        xid = str(idList[i])
        for j in range(i + 1, len(idList)):
            yid = str(idList[j])
            coauthorList = paperNetYear[i][j]
            # template cells with no co-authorship are 0 (or an empty list)
            if not coauthorList:
                continue
            score = calculateCoauthor(coauthorList)
            insertSQL = ('insert into coauthornet (xid,yid,score) values('
                         + xid + ',' + yid + ',' + str(score) + ')')
            cur.execute(insertSQL)
            conn.commit()
            coauthorNet[i][j] = score  # keep an in-memory record as well
    # embed the year in the pickle name (original set the path to '' — a bug)
    coauthorNet_year_pickle = 'coauthorNet_' + str(year) + '.pickle'  # TODO confirm target directory
    constructSeriz(coauthorNet_year_pickle, coauthorNet)
def buildTopic():
    """Populate table topicnet (xid, yid, score) from each pair of experts'
    shared topics, and pickle the score matrix, graph, and topic lists.

    Bug fixes vs. original:
      * ``xid``/``yid`` were undefined inside the insert (NameError); they
        are now derived from ``idList`` like the sibling build functions.
      * ``topicGraph(*(i,j), similar=score)`` called the Graph object
        itself; replaced with ``topicGraph.add_edge``.
    """
    topicNet = readSeriz(expertNet_pickle)      # zero template for scores
    topicNetMore = readSeriz(expertNet_pickle)  # zero template for topic lists
    topicGraph = nx.Graph()
    # pre-read: one (topic -> num) dict per expert
    totalSet = []
    for i in range(len(idList)):
        # 'other' acts as the topic id here — TODO confirm column semantics
        xResult = getResult('select other from topic where eid=' + str(idList[i]), cur)
        xDict = {}
        for x in xResult:
            xDict[x['other']] = x['num']
        totalSet.append((xDict, xDict.keys()))
    for i in range(len(idList)):
        for j in range(i + 1, len(idList)):
            topicList = list(set(totalSet[i][1]).intersection(totalSet[j][1]))
            if len(topicList) > 0:
                score = calculateTopic(topicList)
                xid = str(idList[i])  # was undefined in the original
                yid = str(idList[j])
                insertSQL = ('insert into topicnet (xid,yid,score) values('
                             + xid + ',' + yid + ',' + str(score) + ')')
                cur.execute(insertSQL)
                conn.commit()
                topicNet[i][j] = score  # in-memory record
                topicGraph.add_edge(i, j, similar=score)  # was topicGraph(*(i,j), ...)
                topicNetMore[i][j] = topicList
    constructSeriz(topicNet_pickle, topicNet)
    constructSeriz(topicGraph_pickle, topicGraph)
    constructSeriz(topicNetMore_pickle, topicNetMore)
def buidPaperNet():
    """Build the pairwise co-authored-paper count matrix, the per-pair
    paper-id matrix, and the co-authorship graph; populate table
    papernet (xid, yid, perid, year).

    Bug fixes vs. original:
      * the per-expert paper query called ``str()`` with no argument, so
        every expert ran the same malformed query; it now interpolates
        ``str(idList[i])``.
      * ``insertSQL`` was built but never executed; it is now executed and
        committed like the sibling build functions.
    """
    paperNet = readSeriz(expertNet_pickle)      # zero template: co-paper counts
    paperNetYear = readSeriz(expertNet_pickle)  # zero template: co-paper id lists
    paperGraph = nx.Graph()
    # pre-read: one (paperid -> year) dict per expert
    totalSet = []
    for i in range(len(idList)):
        xResult = getResult('select id,time from paper where eid=' + str(idList[i]), cur)  # str() was empty
        xDict = {}
        for x in xResult:
            xDict[x['id']] = x['time']
        totalSet.append((xDict, xDict.keys()))
    for i in range(len(idList)):
        for j in range(i + 1, len(idList)):
            coauthorList = list(set(totalSet[i][1]).intersection(totalSet[j][1]))
            if len(coauthorList) > 0:
                for cid in coauthorList:
                    year = totalSet[i][0][cid]
                    insertSQL = ('insert into papernet (xid,yid,perid,year) values('
                                 + str(idList[i]) + ',' + str(idList[j]) + ','
                                 + str(cid) + ',' + str(year) + ')')
                    cur.execute(insertSQL)  # missing in the original
                conn.commit()
                paperNet[i][j] = len(coauthorList)
                paperNetYear[i][j] = coauthorList
                paperGraph.add_edge(i, j, coauthoryear=coauthorList)  # one edge per co-authoring pair
    constructSeriz(paperNet_pickle, paperNet)
    constructSeriz(paperNetYear_pickle, paperNetYear)
    constructSeriz(paperGraph_pickle, paperGraph)
def prepareDistanceMetrx():
    """Compute one shard (``PART`` of 10) of the pairwise collaborator
    distance matrix, checkpointing to a PART-specific pickle.

    Resumes from the ``distanceMetrx0`` base pickle and writes to
    ``distanceMetrx<PART>``. Fixes vs. original: the shard-end check ran
    AFTER the per-row DB query (one wasted round trip past the boundary),
    and the final matrix was serialized twice back-to-back.
    """
    PART = 5  # which tenth of the id range this run processes
    totalEID = list(set(readSeriz(totalEID_pickle)))
    lenEID = len(totalEID)
    limit = int(lenEID / 10)  # shard width
    print('len is ' + str(lenEID))
    distanceMetrx = readSeriz(dictPath + 'distanceMetrx0.pickle')  # resume base
    distanceMetrx_pickle_new = dictPath + 'distanceMetrx' + str(PART) + '.pickle'
    for i in range(limit * (PART - 1), lenEID - 1):
        if i > (limit * PART):  # shard boundary — hoisted above the DB query
            break
        py1 = getResultList(
            'select paperid from tmp_paper where eid = ' + str(totalEID[i]),
            'paperid', cur)
        set1 = set(py1)
        distanceMetrx[i][i] = 0  # self-distance
        for j in range(i + 1, lenEID):
            py2 = getResultList(
                'select paperid from tmp_paper where eid = ' + str(totalEID[j]),
                'paperid', cur)
            sameset = list(set1 & set(py2))
            value = 0
            if len(sameset) == 0:
                distanceMetrx[i][j] = 0  # symmetric; normalized to M later per original note
                distanceMetrx[j][i] = 0
            else:
                value = measureCoauthor(py1, py2, sameset)
                distanceMetrx[i][j] = value
                distanceMetrx[j][i] = value
            if j % 100 == 0:  # periodic checkpoint so a crash loses little work
                constructSeriz(distanceMetrx_pickle_new, distanceMetrx)
                print('completed: ' + str(i) + ', ' + str(j) + ' value: ' + str(value))
    constructSeriz(distanceMetrx_pickle_new, distanceMetrx)  # final save (was duplicated)
def insertNull():
    """Assign fresh, unique paperid values to every record listed in the
    'single' pickles (papers that matched nothing in the dedup pass).

    The earlier pass over the 'same' pickles — which copied paperid from
    the matched record — is kept disabled in version history.
    """
    next_pid = 4263215  # highest paperid already in use; new ids continue from here
    for path in readFiles('E:/Code/Pickle/samesingle/single'):
        print('update: ' + path)
        for record_id in readSeriz(path):
            next_pid += 1
            cur.execute('update paper set paperid=' + str(next_pid)
                        + ' where id=' + str(record_id))
            conn.commit()
            print('now is ' + str(next_pid))
def prepareNULL():
    """Bucket every null-DOI paper by (year-index, title-hash) and
    serialize each non-empty bucket to its own pickle.

    ``yearList`` records the distinct years in first-seen order and is
    serialized alongside the buckets so indices can be mapped back.
    """
    nullDOI = readSeriz(nullDOI_pickle)
    yearTitle = [[[] for _ in range(27)] for _ in range(60)]  # 60 year slots x 27 title buckets
    yearList = []
    counter = 0
    for nid in nullDOI:
        row = getResult('select time,title from paper where id=' + str(nid), cur)[0]
        # skip records with a missing title or a non-positive year
        if not (len(row['title']) > 0 and row['time'] > 0):
            continue
        if row['time'] not in yearList:
            yearList.append(row['time'])  # first time this year is seen
        yidx = yearList.index(row['time'])
        nidx = assignNum(row['title'])  # title -> bucket in [0, 27)
        yearTitle[yidx][nidx].append((nid, row['time'], row['title']))
        print('now is ' + str(counter) + ' id is ' + str(nid))
        counter += 1
    print('begin saving')
    constructSeriz(yearList_pickle_null, yearList)
    for yi in range(len(yearList)):
        for ni in range(27):
            bucket = yearTitle[yi][ni]
            if len(bucket) > 0:  # only persist non-empty buckets
                constructSeriz(
                    idyeartitle_path_null + str(yi) + '_' + str(ni) + '.pickle',
                    bucket)
def compareNull():
    """Match null-DOI papers against the indexed (year, title-bucket)
    pickles: exact title matches go to a ``sameList`` pickle, the rest to
    a ``single`` pickle, one pair of files per bucket.

    Bug fixes vs. original:
      * the sanity check used ``>`` — an index equal to ``len(...)`` is
        already out of range, so it is now ``>=``.
      * when the indexed bucket was empty, a special-case loop pre-filled
        ``single`` and then the main loop ran anyway (flag never set), so
        every id landed in ``single`` TWICE; the main loop already handles
        the empty bucket correctly, so the special case is removed.
      * after a title match the inner loop used ``continue``, which could
        record the same null paper against several duplicate titles; it
        now ``break``s on the first match.
    """
    yearListNull = readSeriz(yearList_pickle_null)
    yearList = readSeriz(yearList_pickle)
    for fp in readFiles(nullDict):
        sameList = []
        single = []
        print('now begin: ' + str(fp))
        nullYearTitle = readSeriz(fp)
        yidx_null, nidx_null = extractYearTitle(fp)  # indices encoded in the file name
        if yidx_null >= len(yearListNull):  # was '>': off-by-one
            print('error!!!!!!!!!!????')
            continue
        year = yearListNull[yidx_null]
        if year in yearList:
            yidx = yearList.index(year)
        else:
            print('error!!!!!!!!!!')
            continue
        path = idyeartitle_path + str(yidx) + '_' + str(nidx_null) + '.pickle'
        yearTitle = readSeriz(path)
        for i in range(len(nullYearTitle)):
            flag = False
            for j in range(len(yearTitle)):
                if nullYearTitle[i][2] == yearTitle[j][2]:  # exact title match
                    sameList.append([nullYearTitle[i][0], yearTitle[j][0]])
                    flag = True
                    break  # was 'continue': stop at the first match
            if flag == False:
                single.append(nullYearTitle[i][0])
        sameList_path = sameList_pickle + str(yidx) + '_' + str(nidx_null) + '.pickle'
        single_path = single_pickle + str(yidx) + '_' + str(nidx_null) + '.pickle'
        constructSeriz(sameList_path, sameList)
        constructSeriz(single_path, single)
def insertGN():
    """Flush the pickled group -> member-id mapping into table name2name,
    one row per (gid, yid) pair, committing after each insert."""
    GN = readSeriz(GN_pickle_path)
    for gid, members in GN.items():
        for yid in members:
            sql = ('insert into name2name (gid,yid) values ('
                   + str(gid) + ',' + str(yid) + ')')
            cur.execute(sql)
            conn.commit()  # commit per row, matching the original cadence
        print('now is ' + str(gid))
def buildFinalGraph():
    """Sum the coauthor and institution score matrices into a symmetric
    finalNet matrix and serialize the resulting weighted graph.

    Bug fix vs. original: ``finalLen = len(coauthorNet)`` ran BEFORE
    ``coauthorNet`` was loaded (NameError on every call); the length is
    now taken after loading. The dead ``if i == j`` guard (``j`` always
    starts at ``i + 1``) was removed.
    """
    coauthorNet = readSeriz(coauthorNet_pickle)
    institutionNet = readSeriz(institutionNet_pickle)
    finalNet = readSeriz(expertNet_pickle)  # zero template
    finalLen = len(coauthorNet)  # must come after the load (was before)
    finalGraph = nx.Graph()
    for i in range(finalLen):
        for j in range(i + 1, finalLen):
            finalNet[i][j] = coauthorNet[i][j] + institutionNet[i][j]
            finalNet[j][i] = finalNet[i][j]  # keep the matrix symmetric
            if finalNet[i][j] != 0:  # only materialize non-zero edges
                finalGraph.add_edge(i, j, score=finalNet[i][j])
    constructSeriz(finalGraph_pickle, finalGraph)
def measureCOI():
    """Build the paper x reviewer conflict-of-interest matrix: for each
    (paper, reviewer) pair, COI = 1 - min distance between the reviewer
    and any of the paper's authors in the shortest-distance matrix.

    Improvement vs. original: ``paperList.index``, ``reviewerList.index``
    and ``totalEID.index`` were called inside the nested loops (O(n) each,
    O(n^3) overall); positions are now precomputed once. Results are
    unchanged (``totalEID`` is deduplicated upstream, so first-occurrence
    and dict lookup agree — TODO confirm no duplicates remain).
    """
    reviewerDict = readSeriz(reviewerDict_pickle)
    personDict = readSeriz(personDict_pickle)
    reviewer_ID = readSeriz(reviewer_ID_pickle)
    totalEID = readSeriz(totalEID_pickle)
    paperDict = readSeriz(paperDict_pickle)
    shortDistanceMetrx = readSeriz(shortDistanceMetrx_pickle)
    paperList = list(paperDict.keys())
    reviewerList = list(reviewerDict.keys())
    eid_pos = {eid: k for k, eid in enumerate(totalEID)}  # hoisted: was totalEID.index() per lookup
    COI = [[0 for _ in range(len(reviewerList))] for _ in range(len(paperList))]
    for i, title in enumerate(paperList):  # same order as paperDict.keys()
        for j, rName in enumerate(reviewerList):
            idy = eid_pos[reviewer_ID[rName]]
            coiList = []
            # NOTE(review): assumes paperDict[title] yields keys of personDict — confirm vs indexEID
            for name in paperDict[title]:
                idx = eid_pos[personDict[name]]
                coiList.append(shortDistanceMetrx[idx][idy])
            COI[i][j] = 1 - min(coiList)  # closest author dominates the reviewer's COI
    constructSeriz(COI_pickle, COI)
def mainFunction():
    """Open a Chrome session and run searchPage for every paper title,
    logging successes and swallowing per-title failures."""
    browser = webdriver.Chrome()
    for title in readSeriz(paperDict_pickle).keys():
        try:
            searchPage(browser, title)
        except Exception:
            print('error: ' + title)  # best-effort: keep going on failure
        else:
            print('completed: ' + title)
    browser.quit()
def insertGN_Amount():
    """Persist each name-group's size into table namegroup, committing
    per row and logging progress."""
    GN_Amount = readSeriz(GN_Amount_pickle_path)
    for counter, gid in enumerate(list(GN_Amount.keys())):
        amount = GN_Amount[gid]
        cur.execute('insert into namegroup (gid,amount) values ('
                    + str(gid) + ',' + str(amount) + ')')
        conn.commit()
        print('completed ' + str(gid) + ',' + str(amount))
        print('now is ' + str(counter))
def flody():
    """Iterated Floyd-style relaxation of the distance matrix until it
    stops changing (or MAX_TIME passes), tracking intermediate nodes in
    ASet; serializes the converged matrix.

    Bug fixes vs. original:
      * the SAME ``D0``/``A0`` objects were appended to DSet/ASet on every
        pass, so all ``DSet[k]`` (k>=1) aliased one matrix and the
        convergence test ``DSet[k] == DSet[k-1]`` compared an object with
        itself — the loop stopped after two passes with corrupted values.
        Fresh matrices are now allocated per pass.
      * in the skip branch the cell was reset to the stale
        ``DSet[k-1][i][j]`` even after an earlier ``x`` had improved it;
        it now keeps the best-so-far ``MIN_old`` (identical before any
        improvement, since MIN_old starts at DSet[k-1][i][j]).
      * ``shortDistanceMetrx`` was undefined if the loop body never ran;
        it is now initialised to the input matrix.
    """
    distanceMetrx = readSeriz(distanceMetrx_pickle)
    lenD = len(distanceMetrx)
    DSet = [distanceMetrx]
    ASet = [[[[] for _ in range(lenD)] for _ in range(lenD)]]
    shortDistanceMetrx = distanceMetrx  # safe default if MAX_TIME <= 1
    for k in range(1, MAX_TIME):  # bounded number of relaxation passes
        # fresh matrices per pass (the original re-appended the same objects)
        DSet.append([[0 for _ in range(lenD)] for _ in range(lenD)])
        ASet.append([[[] for _ in range(lenD)] for _ in range(lenD)])
        for i in range(lenD):
            for j in range(lenD):
                if i == j:
                    DSet[k][j][j] = 0
                    ASet[k][j][j] = 0
                    continue
                MIN_old = DSet[k - 1][i][j]  # best value seen so far for (i, j)
                for x in range(lenD):
                    if (x == i) or (x == j):
                        continue
                    if (DSet[k - 1][i][x] >= MIN_old) or (DSet[k - 1][x][j] >= MIN_old):
                        # x cannot improve the path; keep the best-so-far value
                        DSet[k][i][j] = MIN_old
                    else:
                        MIN_new = min(MIN_old, DSet[k - 1][i][x] + DSet[k - 1][x][j])
                        DSet[k][i][j] = MIN_new
                        if MIN_new < MIN_old:
                            # record x as the intermediate node of the improved path
                            ASet[k][i][j] = [ASet[k - 1][i][x], x, ASet[k - 1][x][j]]
                        else:
                            ASet[k][i][j] = ASet[k - 1][i][j]
                        MIN_old = MIN_new
        print('now is ' + str(k))
        shortDistanceMetrx = DSet[k][:]
        if DSet[k] == DSet[k - 1]:  # converged: nothing changed this pass
            break
    constructSeriz(shortDistanceMetrx_pickle, shortDistanceMetrx)
def insertSimilarity():
    """Persist pickled (xid, yid, rate) name-similarity triples into table
    namecompare, committing per row and logging progress."""
    similarity = readSeriz(similarity_pickle_path)
    for counter, triple in enumerate(similarity):
        xid, yid, rate = triple[0], triple[1], triple[2]
        cur.execute('insert into namecompare (xid,yid,nameRate) values ('
                    + str(xid) + ',' + str(yid) + ',' + str(rate) + ')')
        conn.commit()
        print('completed ' + str(xid) + ',' + str(yid) + ',' + str(rate))
        print('now is ' + str(counter))
def updateDOI():
    """Report the largest paperid in the pickled (paperid, id) pairs.

    The actual UPDATE pass over the pairs is kept disabled in the
    original's history; only the max-paperid report remains live.
    """
    idPaperid = readSeriz(idPaperid_pickle)
    paperids = [pair[0] for pair in idPaperid]
    record_ids = [pair[1] for pair in idPaperid]  # kept for parity (avoids shadowing builtin id)
    print('max is ' + str(max(paperids)))
def _lookupEID(name, institution):
    """Resolve a person's expert id: prefer a name+institution match in
    experience1, fall back to a name-only match in dlurl1.

    Returns a list with at most one eid, or an empty list if no match.
    NOTE(review): names/institutions are spliced into SQL by string
    concatenation — injection-prone and breaks on quotes; parameterize via
    cur.execute placeholders when possible.
    """
    eidList = getResultList(
        'select distinct eid from experience1 where eid in (select id from dlurl1 where name like "'
        + cleanName(name) + '") and institution like "' + institution + '" limit 1',
        'eid', cur)
    if len(eidList) == 0:
        eidList = getResultList(
            'select id from dlurl1 where name like "' + cleanName(name) + '" limit 1',
            'id', cur)
    return eidList

def indexEID():
    """Map every paper author and every reviewer to an expert id (eid)
    via their institution in dlurl1/experience1; serialize the id pools
    and the per-paper author-name lists.

    Changes vs. original:
      * the duplicated two-stage lookup (name+institution, then name-only)
        is extracted into ``_lookupEID``.
      * BUG FIX: a repeated author (already in personDict) was skipped
        entirely, so papers after the first lost that author from
        paperAuthorDict; the name is now still appended for later papers.
    """
    paperDict = readSeriz(paperDict_pickle)
    reviewerDict = readSeriz(reviewerDict_pickle)
    reviewer_ID = {}
    totalEID = []
    personDict = {}      # author name -> eid
    paperAuthorDict = {} # paper title -> resolved author names
    # ---- paper authors ----
    for title in paperDict.keys():
        paperAuthorDict[title] = []
        for ai in paperDict[title]:  # ai = (name, institution)
            name = ai[0]
            if name in personDict:
                # already resolved from an earlier paper — keep it on this paper too
                paperAuthorDict[title].append(name)
                continue
            eidList = _lookupEID(name, ai[1])
            if len(eidList) > 0:
                personDict[name] = eidList[0]
                totalEID += eidList
                paperAuthorDict[title].append(name)
                print('comppleted: ' + name)
            else:
                print('error!!!!!!1 ' + name)
        print('-----------completed:' + title)
    totalEID = list(set(totalEID))
    constructSeriz(totalEID_pickle, totalEID)
    constructSeriz(personDict_pickle, personDict)
    constructSeriz(paperAuthorDict_pickle, paperAuthorDict)
    print('completed Paper')
    # ---- reviewers ----
    for name in reviewerDict.keys():
        eidList = _lookupEID(name, reviewerDict[name][0])
        if len(eidList) > 0:
            reviewer_ID[name] = eidList[0]
            totalEID += eidList
            print('comppleted: ' + name)
        else:
            print('error!!!!!!1 ' + name)
            reviewer_ID[name] = -1  # sentinel: unresolved reviewer
    constructSeriz(totalEID_pickle, totalEID)
    constructSeriz(reviewer_ID_pickle, reviewer_ID)
def measure_topicSIM(topicsNum=30):
    """Train an LDA model over the bag-of-words of submitted papers plus
    reviewers' papers, then fill a paper x reviewer topic-similarity
    matrix (TOPIC) via a gensim similarity index.

    NOTE(review): the tail of this function is broken and cannot run
    as-is — ``paperList``, ``reviewerList`` and ``paperDictT`` are only
    defined inside the commented-out block below (NameError at the TOPIC
    loop), the assignment inside that block is itself incomplete
    (``paperDictT[paperID[i][1]] =``), and the bare ``return`` makes the
    final constructSeriz calls dead code. Left byte-identical pending a
    decision on the intended corpus_lda -> paper/reviewer mapping.

    :param topicsNum: number of LDA topics to train.
    """
    paperDictTopic = readSeriz(paperDictTopic_pickle)
    reviewerDictTopic = readSeriz(reviewerDictTopic_pickle)
    wordOfBagSet = []
    i = 0
    paperID = {}  # maps bag-of-words row index -> (is_submission, title[, reviewer])
    for title in paperDictTopic:
        wordOfBagSet.append(paperDictTopic[title])
        paperID[i] = (True, title)
        i += 1
    for reviewer in reviewerDictTopic:
        for title in reviewerDictTopic[reviewer]:
            wordOfBagSet.append(title)
            paperID[i] = (False, title, reviewer)
            i += 1
    texts = wordOfBagSet[:]
    dictionary = corpora.Dictionary(texts)
    #dictionary.save('F:/newsAnalysis/data/newswordsall.dict')
    corpus = [dictionary.doc2bow(text) for text in texts]
    #corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    ldaModel = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=topicsNum)
    ldaModel.show_topics()
    corpus_lda = ldaModel[corpus]  # per-document topic distribution
    index = similarities.MatrixSimilarity(ldaModel[corpus_lda])
    #corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)
    #corpus_lda = [[(d[0],round(float(d[1]),5)) for d in doc] for doc in corpus_lda0]
    #for doc in corpus_lda:
    #    print(doc)
    '''
    #参考部分
    doc = "Human computer interaction"
    vec_bow = dictionary.doc2bow(doc.lower().split())
    vec_lsi = ldaModel[vec_bow] # convert the query to LSI space
    index = similarities.MatrixSimilarity(ldaModel[corpus_lda])
    sims = index[vec_lsi]
    print(sims)
    '''
    '''
    paperList = list(paperDictT.keys())
    reviewerList = list(reviewerDictT.keys())
    TOPIC = [[0 for j in range(len(reviewerList)) ] for i in range(len(paperList))]
    paperDictT = {}
    #reviewerDictT = {}
    for i in range(len(corpus_lda)):
        if paperID[i][0] == True: #投稿文章 {title:[1,2,3,4,5],}
            #paperDictT[paperID[i][1]] = corpus_lda[i]
            paperDictT[paperID[i][1]] =
        else:
            #pass
            if paperID[i][2] in reviewerDictT.keys():
                reviewerDictT[paperID[i][2]].append(corpus_lda[i])
            else:
                reviewerDictT[paperID[i][2]] = []
                reviewerDictT[paperID[i][2]].append(corpus_lda[i])
    '''
    corpusLen = 0
    # NOTE(review): everything below references names defined only in the
    # commented-out block above — this loop raises NameError as written.
    TOPIC = [[0 for j in range(len(reviewerList))] for i in range(len(paperList))]
    #metrx = [[0 for i in range(corpusLen)] for ]
    for i in range(len(paperList)):
        paperList[i]  # no-op expression (dead statement in the original)
        corpus_lda_vector = paperDictT[paperList[i]]
        sims = index[corpus_lda_vector]  # similarity of this paper against all docs
        for j in range(len(reviewerList)):
            TOPIC[i][j] = sims[j]
    return
    # NOTE(review): unreachable — the bare return above makes these dead.
    #print(paperDictT_pickle)
    constructSeriz(paperDictT_pickle, paperDictT)
    constructSeriz(reviewerDictT_pickle, reviewerDictT)
    constructSeriz(corpus_lda_pickle, corpus_lda)
def basePrepare():  # expertNet xid,yid,status,attr
    """Build the zero expert-to-expert matrix template used by the other
    build* functions, insert every (xid, yid) pair into table expertNet,
    and serialize the template.

    BUG FIXES vs. original (the source here was visibly truncated):
      * ``for j in range()`` was incomplete and ``yid`` was undefined —
        reconstructed as the upper-triangle pair loop used everywhere
        else in this file (TODO confirm intended range).
      * the ``expertNet`` zero-matrix initialisation was commented out
        although the matrix is serialized below — re-enabled.
      * the built insertSQL was never executed — now executed, with one
        commit at the end.
    """
    idList = readSeriz(idList_pickle)
    idLength = len(idList)
    # zero matrix used as the summation template (was commented out)
    expertNet = [[0 for j in range(idLength)] for i in range(idLength)]
    for i in range(idLength):
        for j in range(i + 1, idLength):  # was 'for j in range()' — incomplete
            insertSQL = ('insert into expertNet (xid,yid) values ('
                         + str(idList[i]) + ',' + str(idList[j]) + ')')
            cur.execute(insertSQL)
    conn.commit()
    constructSeriz(expertNet_pickle, expertNet)  # serialize the template
    del expertNet  # free the big matrix before returning
    gc.collect()
    print('all complete')

def buildInstitution():  # colleaguenet xid,yid,score
    '''
    institutionNet = readSeriz(expertNet_pickle)
    institutionGraph = nx.Graph()
    #预读
    totalSet = []
    for i in range(len(idList)):
        #iid还没有设置,应该是status之类的项
        xResult = getResult('select iid from experience1 where eid='+str(idList[i]),cur)
        #需要有个pid list 建立一个字典 key是id,time是内容
        xDict = {}
        for x in xResult:
            xDict[x['id']] = x['time']
        xID = xDict.keys()
        totalSet.append((xDict,xID))
    #这里与paper/coauthor不一样,不用进行一次控制,可以直接放入,类似coauthornet那种
    for i in range(len(idList)):
        for j in range((i+1),len(idList)):
            institutionList = list(set(totalSet[i][1]).intersection(totalSet[j][1]))
            if len(institutionList)>0:
                #全部插入数据库中
                score = calculateColleague(institutionList)
                #xid yid 已经转成str
                insertSQL = 'insert into colleaguenet (xid,yid,score) values('+xid+','+yid+','+str(score)+')'
                cur.execute(insertSQL)
                conn.commit()
                institutionNet[i][j] = score
                #这里也做一个记录
                institutionGraph(*(i,j),coinstitution=institutionList, weight=score)
    constructSeriz(institutionNet_pickle,institutionNet)
    constructSeriz(institutionGraph_pickle,institutionGraph)
    '''
    # NOTE(review): entire body is a disabled draft (string literal above);
    # this function is currently a no-op, as in the original.

def calculateColleague(institutionList):
    """Score a pair of experts from their shared institutions — not
    implemented yet (returns None, so the disabled buildInstitution draft
    would store None scores)."""
    pass

def buildFinalGraph():  # sum the two score matrices into one weighted graph
    """Sum the coauthor and institution score matrices into a symmetric
    finalNet matrix and serialize the resulting weighted graph.

    BUG FIX: ``finalLen = len(coauthorNet)`` ran before ``coauthorNet``
    was loaded (NameError); the dead ``i == j`` guard was removed.
    """
    coauthorNet = readSeriz(coauthorNet_pickle)
    institutionNet = readSeriz(institutionNet_pickle)
    finalNet = readSeriz(expertNet_pickle)  # zero template
    finalLen = len(coauthorNet)
    finalGraph = nx.Graph()
    for i in range(finalLen):
        for j in range(i + 1, finalLen):
            finalNet[i][j] = coauthorNet[i][j] + institutionNet[i][j]
            finalNet[j][i] = finalNet[i][j]  # keep symmetric
            if finalNet[i][j] != 0:
                finalGraph.add_edge(i, j, score=finalNet[i][j])
    constructSeriz(finalGraph_pickle, finalGraph)

def mainFunction():
    """Run the network-building pipeline in dependency order."""
    basePrepare()
    buidPaperNet()
    buildCoauthorByYear(2012)  # cut-off year for co-authorship scoring
    buildInstitution()

def analysisGraph(finalGraph):
    """Report whether finalGraph is connected; otherwise count and
    enumerate its connected components.

    BUG FIX: ``number_connected_components`` was missing its ``nx.``
    prefix (NameError on the disconnected branch).
    """
    if nx.is_connected(finalGraph):
        print('Yes')
    else:
        nx.number_connected_components(finalGraph)
        # returns per-component subgraphs (generator in modern networkx)
        nx.connected_component_subgraphs(finalGraph)

def buildTopic():  # colleaguenet-style table: topicnet xid,yid,score
    """Populate table topicnet from each pair of experts' shared topics.

    BUG FIXES (same defects as the sibling copy of this function):
    undefined ``xid``/``yid`` in the insert, and ``topicGraph(*(i,j),...)``
    calling the Graph object instead of ``add_edge``.
    """
    topicNet = readSeriz(expertNet_pickle)
    topicNetMore = readSeriz(expertNet_pickle)
    topicGraph = nx.Graph()
    # pre-read: one (topic -> num) dict per expert
    totalSet = []
    for i in range(len(idList)):
        xResult = getResult('select other from topic where eid=' + str(idList[i]), cur)
        xDict = {}
        for x in xResult:
            xDict[x['other']] = x['num']
        totalSet.append((xDict, xDict.keys()))
    for i in range(len(idList)):
        for j in range(i + 1, len(idList)):
            topicList = list(set(totalSet[i][1]).intersection(totalSet[j][1]))
            if len(topicList) > 0:
                score = calculateTopic(topicList)
                xid = str(idList[i])  # was undefined
                yid = str(idList[j])
                insertSQL = ('insert into topicnet (xid,yid,score) values('
                             + xid + ',' + yid + ',' + str(score) + ')')
                cur.execute(insertSQL)
                conn.commit()
                topicNet[i][j] = score
                topicGraph.add_edge(i, j, similar=score)  # was topicGraph(*(i,j), ...)
                topicNetMore[i][j] = topicList
    constructSeriz(topicNet_pickle, topicNet)
    constructSeriz(topicGraph_pickle, topicGraph)
    constructSeriz(topicNetMore_pickle, topicNetMore)

def calculateTopic(topicList=None):
    """Score a pair of experts from their shared topics — not implemented.

    BUG FIX: callers pass ``topicList`` but the original signature took no
    arguments (TypeError); the new default keeps zero-arg calls working.
    """
    pass

if __name__ == '__main__':
    basePrepare()
def selectEID():
    """Expand totalEID by one hop of co-authors for every paper's author
    set, checkpointing the growing pool after each paper.

    For each paper: seed with its authors' eids, query tmp_paper for
    everyone sharing a paper with each seed (<=100 rows per seed, <=~300
    new ids per seed), add the unseen ids to the pool, and serialize.

    Improvements vs. original:
      * membership tests against ``totalEID`` were O(n) list scans inside
        the tight expansion loop; a parallel set now gives O(1) lookups
        with identical accept/reject decisions.
      * the redundant back-to-back ``list(set(totalEID))`` dedups were
        collapsed. (The earlier reviewer-seeded pass stays disabled, as
        in the original.)
    """
    totalEID = list(set(readSeriz(totalEID_new_pickle)))
    DictAuthor = readSeriz(paperAuthorDict_pickle)  # title -> author names
    DictID = readSeriz(personDict_pickle)           # author name -> eid
    seen = set(totalEID)  # O(1) membership mirror of totalEID
    print('begin totleEID len is ' + str(len(totalEID)))
    for title in DictAuthor.keys():
        names = DictAuthor[title]
        if len(names) == 0:
            continue
        frontier = [DictID[na] for na in names]  # eids to expand next hop
        completed = []                           # eids already expanded
        for _hop in range(1):  # single hop; raise the range to go deeper
            if len(frontier) == 0:
                break
            current = frontier[:]
            frontier = []
            for eid in current:
                print('--query: ' + str(eid))
                selectSQL = ('select distinct eid from tmp_paper where paperid in '
                             '(select distinct paperid from tmp_paper where eid='
                             + str(eid) + ' ) limit 100')
                newEidList = getResultList(selectSQL, 'eid', cur)
                added = 0
                for kid in newEidList:
                    if (kid not in completed) and (kid not in seen) and (kid not in frontier):
                        frontier.append(kid)
                        added += 1
                        if added > 300:  # per-seed growth cap
                            break
                completed.append(eid)
                print('++add: ' + str(added))
            totalEID += frontier
            seen.update(frontier)
            totalEID = list(set(totalEID))
        print('===========completed: ' + title)
        print('now totleEID len is ' + str(len(totalEID)))
        constructSeriz(totalEID_new_pickle, totalEID)  # checkpoint per paper
    print('total totleEID len is ' + str(len(totalEID)))
    constructSeriz(totalEID_new_pickle, totalEID)