def getLevelWithConfNum():
    """Bucket authors by conference count; report paper/coauthor statistics.

    For every author, the author's paper count and coauthor count are
    appended to lists keyed by the number of conferences attended.  One
    tab-separated line is then written per conference count:

        confNum  avgPapers  maxPapers  minPapers  avgCoaus  maxCoaus  minCoaus

    Output goes to OUTPUT_ACADEMIC_LEVEL_PAPER_COAU_NUM.
    Fix: removed the explicit fileWriter.close() -- the with statement
    already closes the file.
    """
    mRedis = RedisHelper()
    authors = mRedis.getAllAuthors()
    confPaperNumDict = {}  # confNum -> list of per-author paper counts
    confCoauNumDict = {}   # confNum -> list of per-author coauthor counts
    for index, author in enumerate(authors, 1):
        if index % 1000 == 0:
            logging.info(index)  # progress heartbeat
        confNum = len(mRedis.getAuConfs(author))
        paperNum = len(mRedis.getAuPapers(author))
        coauNum = len(mRedis.getAuCoauthors(author))
        confPaperNumDict.setdefault(confNum, []).append(paperNum)
        confCoauNumDict.setdefault(confNum, []).append(coauNum)
    with open(OUTPUT_ACADEMIC_LEVEL_PAPER_COAU_NUM, 'w') as fileWriter:
        for index, confNum in enumerate(confPaperNumDict.keys(), 1):
            if index % 1000 == 0:
                logging.info(index)
            avg_P, max_num_P, min_num_P = statisticOfList(confPaperNumDict[confNum])
            avg_C, max_num_C, min_num_C = statisticOfList(confCoauNumDict[confNum])
            fileWriter.write(str(confNum) + '\t' + str(avg_P) + '\t' +
                             str(max_num_P) + '\t' + str(min_num_P) + '\t' +
                             str(avg_C) + '\t' + str(max_num_C) + '\t' +
                             str(min_num_C) + '\n')
def __init__(self):
    """Set up the collaboration graph and load star/target author sets.

    Order matters: the Redis handle must exist before buildGraph() runs,
    and loadStarsAndTargets() fills self.stars / self.targets first.
    """
    self.G = nx.Graph()
    # author name -> conference count, populated from the star/target files
    self.stars = {}
    self.targets = {}
    self.loadStarsAndTargets()
    logging.info('loadStarsAndTargets done---------------')
    # author -> list of shortest-path lengths to each star
    self.shortestPathLength = {}
    self.mRedis = RedisHelper()
    self.buildGraph()
    logging.info('---------------')
def __init__(self):
    """Initialise all SAX parsing state for a fresh DBLP document.

    The is*Tag flags are set in startElement() and consumed (and reset)
    in characters(); the per-paper accumulators are flushed on each
    closing paper element.
    """
    self.mRedis = RedisHelper()
    self.isPaperTag = False
    self.conf = ''          # venue key of the paper currently being parsed
    self.isTitleTag = False
    self.paperId = -1       # incremented on each title, so ids start at 0
    self.isAuthorTag = False
    self.authors = []       # authors collected for the current paper
    self.isYearTag = False
    self.year = ''
class dblpHandler(handler.ContentHandler):
    """SAX handler that streams DBLP XML paper records into Redis.

    Tracks one paper element at a time.  When a paper element closes,
    the collected authors, sequential paper id, conference key and year
    are pushed to Redis via RedisHelper.addPaperItem().
    """

    def __init__(self):
        self.mRedis = RedisHelper()
        self.isPaperTag = False
        self.conf = ''
        self.isTitleTag = False
        self.paperId = -1
        self.isAuthorTag = False
        self.authors = list()
        self.isYearTag = False
        self.year = ''

    def startDocument(self):
        logging.info('Document start...')

    def endDocument(self):
        logging.info('Document End...')

    def startElement(self, name, attrs):
        # A paper element opens a record; its 'key' attribute looks like
        # "conf/<venue>/..." so the venue is the second path segment.
        if name in paperLabels:
            self.conf = attrs.get('key').split('/')[1]
            self.isPaperTag = True
        if self.isPaperTag:
            # Flags are consumed by characters() on the next text chunk.
            # 'name' matches at most one branch, so elif is equivalent
            # to the original independent ifs.
            if name == 'title':
                self.isTitleTag = True
            elif name == 'author':
                self.isAuthorTag = True
            elif name == 'year':
                self.isYearTag = True

    def endElement(self, name):
        # Flush the finished paper record and reset the accumulators.
        if name in paperLabels and self.isPaperTag:
            self.isPaperTag = False
            self.mRedis.addPaperItem(self.authors, self.paperId,
                                     self.conf, self.year)
            if self.paperId % 1000 == 0:
                logging.info(self.paperId)  # progress heartbeat
            self.conf = ''
            self.authors = []
            self.year = ''

    def characters(self, content):
        if self.isTitleTag:
            # A title marks a new paper: ids are assigned sequentially.
            self.paperId += 1
            self.isTitleTag = False
        if self.isYearTag:
            self.year = content
            self.isYearTag = False
        if self.isAuthorTag:
            self.authors.append(content)
            self.isAuthorTag = False
def getConfPaperNum():
    """Write a histogram of papers-per-conference.

    (Original docstring: 统计每个会议收录了多少论文。格式: 会议名-->收录论文数)

    Each output line of OUTPUT_STATISTIC_CONF_PAPER_NUM is
    "paperNum<TAB>numberOfConferencesWithThatManyPapers".
    Fix: removed the explicit fileWriter.close() -- the with statement
    already closes the file.
    """
    with open(OUTPUT_STATISTIC_CONF_PAPER_NUM, 'w') as fileWriter:
        mRedis = RedisHelper()
        confPaperNum = {}  # paperNum -> count of conferences with that many papers
        for index, conf in enumerate(mRedis.getAllConfs(), 1):
            logging.info(str(index) + '--' + conf)
            paperNum = len(mRedis.getConfPapers(conf))
            confPaperNum[paperNum] = confPaperNum.get(paperNum, 0) + 1
        for paperNum, confNum in confPaperNum.items():
            fileWriter.write(str(paperNum) + '\t' + str(confNum) + '\n')
def extracStarsAndTargets():
    """Sample 'target' and 'star' authors and write both sets to disk.

    Targets: for each conference count 1..100, sample 10 distinct authors
    attending exactly that many conferences.
    Stars: sample 100 distinct authors attending more than 100 conferences.

    Bug fixed: the dict was created as ``starts`` but every later use says
    ``stars``, so the original raised NameError before writing any output.
    """
    mRedis = RedisHelper()
    stars = {}    # was mistakenly named 'starts' at creation
    targets = {}
    authors = mRedis.getAllAuthors()
    confNumAuthors = {}  # confNum -> list of authors with that count
    for index, author in enumerate(authors, 1):
        if index % 1000 == 0:
            logging.info(index)
        confNum = len(mRedis.getAuConfs(author))
        confNumAuthors.setdefault(confNum, []).append(author)
    logging.info('-------')
    for i in range(1, 101):
        logging.info(i)
        confAuthors = confNumAuthors[i]  # KeyError if no author has exactly i confs
        count = 0
        # NOTE(review): loops forever if fewer than 10 distinct authors
        # exist for this conference count -- same as the original.
        while count < 10:
            au = random.choice(confAuthors)
            if au not in targets:
                targets[au] = i
                count += 1
    count = 0
    while count < 100:
        author = random.choice(authors)
        confNum = len(mRedis.getAuConfs(author))
        if author not in stars and confNum > 100:
            stars[author] = confNum
            count += 1
            logging.info(count)
    authors = []          # release the big author list early
    confNumAuthors = {}
    with open(OUTPUT_STAR_AUTHORS, 'w') as fileWriter:
        for author, confNum in stars.items():
            fileWriter.write(author + '\t' + str(confNum) + '\n')
    with open(OUTPUT_TARGET_AUTHORS, 'w') as fileWriter:
        for author, confNum in targets.items():
            fileWriter.write(author + '\t' + str(confNum) + '\n')
def getConfPaperNum():
    """Write a histogram of papers-per-conference.

    (Original docstring: 统计每个会议收录了多少论文。格式: 会议名-->收录论文数)

    Output lines in OUTPUT_STATISTIC_CONF_PAPER_NUM are
    "paperNum<TAB>confCount".
    Fix: removed the redundant fileWriter.close() inside the with block.
    """
    index = 0
    with open(OUTPUT_STATISTIC_CONF_PAPER_NUM, 'w') as fileWriter:
        mRedis = RedisHelper()
        confPaperNum = {}
        for conf in mRedis.getAllConfs():
            index += 1
            logging.info(str(index) + '--' + conf)
            paperNum = len(mRedis.getConfPapers(conf))
            # histogram bump: how many conferences hold `paperNum` papers
            confPaperNum[paperNum] = confPaperNum.get(paperNum, 0) + 1
        for paperNum, confNum in confPaperNum.items():
            fileWriter.write(str(paperNum) + '\t' + str(confNum) + '\n')
def getAuthorPaperNum():
    """Write a histogram of papers-per-author.

    (Original docstring: 统计每个学者发表了多少论文。格式: 论文数-->学者数)

    Each line of OUTPUT_STATISTIC_AUTHOR_PAPER_NUM is
    "paperNum<TAB>authorCount".
    Fix: removed the explicit fileWriter.close() -- the with statement
    already closes the file.
    """
    with open(OUTPUT_STATISTIC_AUTHOR_PAPER_NUM, 'w') as fileWriter:
        mRedis = RedisHelper()
        authorPaperNum = {}  # paperNum -> number of authors with that many papers
        for index, author in enumerate(mRedis.getAllAuthors(), 1):
            if index % 1000 == 0:
                logging.info(index)  # progress heartbeat
            paperNum = len(mRedis.getAuPapers(author))
            authorPaperNum[paperNum] = authorPaperNum.get(paperNum, 0) + 1
        for k, v in authorPaperNum.items():
            fileWriter.write(str(k) + '\t' + str(v) + '\n')
def getAuthorConfNum():
    """Write a histogram of conferences-per-author.

    (Original docstring: 统计每个学者的参会数。格式: 参会数-->学者数)

    Each line of OUTPUT_STATISTIC_AUTHOR_CONF_NUM is
    "confNum<TAB>authorCount".
    Fix: removed the explicit fileWriter.close() -- the with statement
    already closes the file.
    """
    with open(OUTPUT_STATISTIC_AUTHOR_CONF_NUM, 'w') as fileWriter:
        mRedis = RedisHelper()
        authorConfNum = {}  # confNum -> number of authors with that many confs
        for index, author in enumerate(mRedis.getAllAuthors(), 1):
            if index % 1000 == 0:
                logging.info(index)  # progress heartbeat
            confNum = len(mRedis.getAuConfs(author))
            authorConfNum[confNum] = authorConfNum.get(confNum, 0) + 1
        for k, v in authorConfNum.items():
            fileWriter.write(str(k) + '\t' + str(v) + '\n')
def getLevelWithConfNum():
    """Group authors by conference count; write paper/coauthor statistics.

    Buckets each author's paper count and coauthor count under the number
    of conferences the author attended, then emits one tab-separated line
    per conference count to OUTPUT_ACADEMIC_LEVEL_PAPER_COAU_NUM:

        confNum avg_P max_P min_P avg_C max_C min_C

    Fix: removed the redundant fileWriter.close() inside the with block.
    """
    mRedis = RedisHelper()
    authors = mRedis.getAllAuthors()
    confPaperNumDict = {}  # confNum -> paper counts of authors in that bucket
    confCoauNumDict = {}   # confNum -> coauthor counts of the same authors
    index = 0
    for author in authors:
        index += 1
        if index % 1000 == 0:
            logging.info(index)
        confNum = len(mRedis.getAuConfs(author))
        paperNum = len(mRedis.getAuPapers(author))
        coauNum = len(mRedis.getAuCoauthors(author))
        confPaperNumDict.setdefault(confNum, []).append(paperNum)
        confCoauNumDict.setdefault(confNum, []).append(coauNum)
    index = 0
    with open(OUTPUT_ACADEMIC_LEVEL_PAPER_COAU_NUM, 'w') as fileWriter:
        for confNum in confPaperNumDict.keys():
            index += 1
            if index % 1000 == 0:
                logging.info(index)
            avg_P, max_num_P, min_num_P = statisticOfList(confPaperNumDict[confNum])
            avg_C, max_num_C, min_num_C = statisticOfList(confCoauNumDict[confNum])
            fields = [confNum, avg_P, max_num_P, min_num_P,
                      avg_C, max_num_C, min_num_C]
            fileWriter.write('\t'.join(str(f) for f in fields) + '\n')
class CollaborationFeature(object):
    """Derive collaboration-vs-conference features from the Redis store.

    Answers two questions per author: did attending a conference lead to
    a collaboration, and did a collaboration follow shared conference
    attendance -- plus how many potential coauthors conferences expose.

    NOTE(review): uses dict.has_key(), which exists only in Python 2.
    """
    def __init__(self):
        # Shared Redis connection for all feature queries.
        self.mRedis = RedisHelper()

    def isConfLeadCollab(self, author, conf):
        """Return True if attending `conf` apparently led `author` to a
        new collaboration: some coauthor attended `conf` at the same
        time(s), and their first joint paper came after the author's
        first attendance.
        """
        authorConfTime = self.mRedis.getAuConfTimes(author, conf)
        confAuthors = self.mRedis.getConfAuthors(conf)
        coAuthors = self.mRedis.getAuCoauthors(author)
        # coauthors of `author` who also attended `conf`
        sameConfCoaus = set(confAuthors) & set(coAuthors)
        if len(sameConfCoaus) != 0:
            for coau in sameConfCoaus:
                coauConfTime = self.mRedis.getAuConfTimes(coau, conf)
                if authorConfTime == coauConfTime:
                    coauTime = self.mRedis.getAuCoauTimes(author, coau)
                    # first collaboration strictly after first attendance
                    if min(coauTime) > min(authorConfTime):
                        # explicit release of the large locals before return
                        coauTime = []
                        coauConfTime = []
                        authorConfTime = []
                        confAuthors = []
                        coAuthors = []
                        sameConfCoaus = []
                        return True
        coauTime = []
        coauConfTime = []
        authorConfTime = []
        confAuthors = []
        coAuthors = []
        sameConfCoaus = []
        return False

    def isCoauLeadByConf(self, author, coau):
        """Return True if the collaboration between `author` and `coau`
        apparently followed shared conferences: they share more than one
        conference and their first joint paper came after the author's
        first attendance at any shared conference.
        """
        auCoauTimes = self.mRedis.getAuCoauTimes(author, coau)
        authorConfs = self.mRedis.getAuConfs(author)
        coauConfs = self.mRedis.getAuConfs(coau)
        sameConfs = set(authorConfs) & set(coauConfs)
        if len(sameConfs) > 1:
            confTimes = list()
            for conf in sameConfs:
                confTimes.extend(self.mRedis.getAuConfTimes(author, conf))
            if min(auCoauTimes) > min(confTimes):
                auCoauTimes = []
                authorConfs = []
                coauConfs = []
                sameConfs = []
                confTimes = []
                return True
        auCoauTimes = []
        authorConfs = []
        coauConfs = []
        sameConfs = []
        confTimes = []
        return False

    def getConfLeadCollabProb(self):
        """Sample 200k distinct authors (with >= 2 conferences) and write,
        per conference count, the mean fraction of an author's conferences
        that led to a collaboration.
        """
        confCountCLCPDictList = dict()  # confCnt -> list of per-author ratios
        authors = self.mRedis.getAllAuthors()
        authorDict = dict()  # sampled-author set (dedup for random.choice)
        index = 0
        while index < 200000:
            author = random.choice(authors)
            if authorDict.has_key(author):  # Python 2 only
                continue
            authorDict[author] = True
            authorConfs = self.mRedis.getAuConfs(author)
            ConfCnt = len(authorConfs)
            if ConfCnt < 2:
                continue
            index += 1
            if index % 1000 == 0:
                logging.info(index)
            CLCPNum = 0
            for conf in authorConfs:
                if self.isConfLeadCollab(author, conf):
                    CLCPNum += 1
            tmp = confCountCLCPDictList.setdefault(ConfCnt, [])
            tmp.append(CLCPNum * 1.0 / ConfCnt)
        authorConfs = []
        authors = []
        with open(OUTPUT_COLLAB_CONF_LEAD_COLLAB_PROB, 'w') as fileWriter:
            for k, v in confCountCLCPDictList.items():
                if len(v) == 0:
                    avg = 0
                else:
                    avg = sum(v) * 1.0 / len(v)
                fileWriter.write(str(k) + '\t' + str(avg) + '\n')
            fileWriter.close()  # redundant: with already closes the file
        confCountCLCPDictList = {}

    def getCoauLeadByConf(self):
        """Sample 200k distinct authors (with >= 2 conferences) and write,
        per conference count, the mean number of an author's coauthors
        whose collaboration followed shared conference attendance.
        """
        ConfCountCoauDictList = dict()  # confCnt -> list of per-author counts
        authors = self.mRedis.getAllAuthors()
        authorDict = dict()
        index = 0
        while index < 200000:
            author = random.choice(authors)
            if authorDict.has_key(author):  # Python 2 only
                continue
            authorDict[author] = True
            auCoauthors = self.mRedis.getAuCoauthors(author)
            authorConfs = self.mRedis.getAuConfs(author)
            confCnt = len(authorConfs)
            if confCnt < 2:
                continue
            index += 1
            if index % 1000 == 0:
                logging.info(index)
            CLCsNum = 0
            for coau in auCoauthors:
                if self.isCoauLeadByConf(author, coau):
                    CLCsNum += 1
            tmp = ConfCountCoauDictList.setdefault(confCnt, [])
            tmp.append(CLCsNum)
        authorConfs = []
        auCoauthors = []
        authors = []
        with open(OUTPUT_COLLAB_COAU_NUM_LEAD_BY_CONF, 'w') as fileWriter:
            for k, v in ConfCountCoauDictList.items():
                if len(v) == 0:
                    avg = 0
                else:
                    avg = sum(v) * 1.0 / len(v)
                fileWriter.write(str(k) + '\t' + str(avg) + '\n')
            fileWriter.close()  # redundant: with already closes the file
        ConfCountCoauDictList = {}

    def getConfLeadPotentialCoaus(self):
        """For every author, count conference attendees who are NOT yet
        coauthors (potential coauthors), and write the mean per
        conference count.
        """
        ConfCountPotentialCoausDict = dict()  # confCnt -> potential-coau counts
        confAuthorDict = dict()  # conf -> attendee list, cached up front
        confs = self.mRedis.getAllConfs()
        for conf in confs:
            confAuthorDict[conf] = self.mRedis.getConfAuthors(conf)
        authors = self.mRedis.getAllAuthors()
        authorDict = dict()
        index = 0
        for author in authors:
            index += 1
            if index % 1000 == 0:
                logging.info(index)
            confs = self.mRedis.getAuConfs(author)
            confCnt = len(confs)
            potentialCoaus = list()
            for conf in confs:
                potentialCoaus.extend(confAuthorDict[conf])
            coAuthors = self.mRedis.getAuCoauthors(author)
            # attendees met at conferences minus existing coauthors
            PotenCoauNum = len(set(potentialCoaus) - set(coAuthors))
            tmp = ConfCountPotentialCoausDict.setdefault(confCnt, [])
            tmp.append(PotenCoauNum)
            confs = []
            potentialCoaus = []
            coAuthors = []
        authors = []
        confAuthorDict = {}
        with open(OUTPUT_COLLAB_CONF_LEAD_POTENRIAL_COAU, 'w') as fileWriter:
            for k, v in ConfCountPotentialCoausDict.items():
                if len(v) == 0:
                    avg = 0
                else:
                    avg = sum(v) * 1.0 / len(v)
                fileWriter.write(str(k) + '\t' + str(avg) + '\n')
            fileWriter.close()  # redundant: with already closes the file
        ConfCountPotentialCoausDict = {}
def __init__(self):
    """Open the Redis helper used by all subsequent queries."""
    self.mRedis = RedisHelper()
class BaconNumber(object):
    """Compute 'Bacon numbers' (shortest-path lengths) from target authors
    to star authors over a graph of author--coauthor and author--conference
    edges.

    Fixes over the original:
      * bare ``except:`` narrowed to ``except Exception:`` (the bare form
        also swallows KeyboardInterrupt/SystemExit);
      * ``loadStarsAndTargets`` stripped the line's trailing newline, so
        conference counts no longer carry a ``'\\n'`` into the output line,
        and the line is split once instead of twice;
      * redundant ``close()`` calls inside ``with`` blocks removed;
      * ``iteritems()`` replaced by ``items()`` (same result, also works
        on Python 3).
    """

    def __init__(self):
        self.G = nx.Graph()
        # author name -> conference count (as read from the files)
        self.stars = dict()
        self.targets = dict()
        self.loadStarsAndTargets()
        logging.info('loadStarsAndTargets done---------------')
        # target author -> list of path lengths, one per star
        self.shortestPathLength = dict()
        self.mRedis = RedisHelper()
        self.buildGraph()
        logging.info('---------------')

    def buildGraph(self):
        """Add an edge author--coauthor and author--conference for every
        author in Redis.  NOTE: conference nodes share the graph, so path
        lengths count conference hops as well.
        """
        authors = self.mRedis.getAllAuthors()
        index = 0
        for author in authors:
            index += 1
            if index % 1000 == 0:
                logging.info(index)
            for coau in self.mRedis.getAuCoauthors(author):
                self.G.add_edge(author, coau)
            for conf in self.mRedis.getAuConfs(author):
                self.G.add_edge(author, conf)

    def getGraphNodeCount(self):
        """Number of nodes (authors + conferences) in the graph."""
        return len(self.G.nodes())

    def getGraphEdgeCount(self):
        """Number of edges in the graph."""
        return len(self.G.edges())

    def shortestPath(self, s, t):
        """Shortest-path length between nodes s and t (raises if none)."""
        return nx.shortest_path_length(self.G, s, t)

    def getShortestPathLength(self):
        """Compute path lengths from every target to every star and write
        one line per target:  author  confNum  avgLength  length...
        A length of -1 marks an unreachable (or missing) star.
        """
        # Process targets in ascending conference-count order.
        self.targets = dict(sorted(self.targets.items(), key=lambda d: d[1]))
        index = 0
        for author, confNum in self.targets.items():
            for star in self.stars.keys():
                try:
                    length = self.shortestPath(author, star)
                except Exception:
                    # no path, or node absent from the graph
                    length = -1
                self.shortestPathLength.setdefault(author, []).append(length)
                index += 1
                logging.info(str(index) + '---' + str(length))
        with open(OUTPUT_AUTHORS_BACOM_NUM, 'w') as fileWriter:
            nodeCount = self.getGraphNodeCount()
            edgesCount = self.getGraphEdgeCount()
            header = ('nodes:' + str(nodeCount) + '\t' +
                      'edges:' + str(edgesCount) + '\n')
            fileWriter.write(header)
            logging.info(header)
            for author, bacons in self.shortestPathLength.items():
                baconStr = ''
                count, sumB = 0, 0.0
                for bacon in bacons:
                    baconStr += str(bacon) + '\t'
                    # average over reachable stars only (-1 = unreachable)
                    if bacon > 0 and bacon < 10000:
                        sumB += bacon
                        count += 1
                avg = 0 if count == 0 else sumB * 1.0 / count
                fileWriter.write(author + '\t' + str(self.targets[author]) +
                                 '\t' + str(avg) + '\t' + baconStr + '\n')
        # release the big structures once the report is written
        self.shortestPathLength = {}
        self.G = None

    def loadStarsAndTargets(self):
        """Load star/target author -> conference-count maps from disk.

        Lines are "author<TAB>confNum"; the trailing newline is stripped
        so the stored count is clean (the original kept the '\\n').
        confNum stays a string, as written by extracStarsAndTargets.
        """
        with open(OUTPUT_STAR_AUTHORS) as fileReader:
            for line in fileReader:
                star, confNum = line.rstrip('\n').split('\t')[:2]
                self.stars[star] = confNum
        with open(OUTPUT_TARGET_AUTHORS) as fileReader:
            for line in fileReader:
                target, confNum = line.rstrip('\n').split('\t')[:2]
                self.targets[target] = confNum