Пример #1
0
def getLevelWithConfNum():
    """Write paper/coauthor statistics grouped by author conference count.

    Every author returned by ``RedisHelper.getAllAuthors`` is bucketed by the
    number of entries ``getAuConfs`` returns for that author. For each bucket
    the authors' paper counts and coauthor counts are each summarised with
    ``statisticOfList`` (whose result is unpacked as average, maximum,
    minimum) and one tab-separated line is written to
    ``OUTPUT_ACADEMIC_LEVEL_PAPER_COAU_NUM``::

        confNum  avgP  maxP  minP  avgC  maxC  minC
    """
    mRedis = RedisHelper()
    confPaperNumDict = dict()  # confNum -> list of paper counts
    confCoauNumDict = dict()   # confNum -> list of coauthor counts
    for index, author in enumerate(mRedis.getAllAuthors(), 1):
        if index % 1000 == 0:
            logging.info(index)  # progress marker every 1000 authors
        confNum = len(mRedis.getAuConfs(author))
        paperNum = len(mRedis.getAuPapers(author))
        coauNum = len(mRedis.getAuCoauthors(author))
        confPaperNumDict.setdefault(confNum, []).append(paperNum)
        confCoauNumDict.setdefault(confNum, []).append(coauNum)

    # The ``with`` block closes the file on exit; no explicit close() needed.
    with open(OUTPUT_ACADEMIC_LEVEL_PAPER_COAU_NUM, 'w') as fileWriter:
        for index, confNum in enumerate(confPaperNumDict, 1):
            if index % 1000 == 0:
                logging.info(index)
            avg_P, max_num_P, min_num_P = statisticOfList(
                confPaperNumDict[confNum])
            avg_C, max_num_C, min_num_C = statisticOfList(
                confCoauNumDict[confNum])
            fields = (confNum, avg_P, max_num_P, min_num_P,
                      avg_C, max_num_C, min_num_C)
            fileWriter.write('\t'.join(str(field) for field in fields) + '\n')
Пример #2
0
 def __init__(self):
     """Create the graph and lookup tables, load stars/targets, build the graph.

     The statement order matters: ``loadStarsAndTargets`` is called only after
     ``self.stars`` and ``self.targets`` exist, and ``buildGraph`` only after
     ``self.G`` and ``self.mRedis`` exist.
     """
     self.G = nx.Graph()
     self.stars = dict()
     self.targets = dict()
     # Presumably fills self.stars / self.targets -- confirm in the method.
     self.loadStarsAndTargets()
     logging.info('loadStarsAndTargets done---------------')
     self.shortestPathLength = dict()
     self.mRedis = RedisHelper()
     self.buildGraph()
     logging.info('---------------')
Пример #3
0
 def __init__(self):
     """Create the Redis helper and reset every parsing-state attribute."""
     self.mRedis = RedisHelper()

     # All tag flags start out cleared.
     self.isPaperTag = self.isTitleTag = False
     self.isAuthorTag = self.isYearTag = False

     # Record fields start out empty; the paper id starts at -1.
     self.paperId = -1
     self.conf, self.year = '', ''
     self.authors = []
Пример #4
0
class dblpHandler(handler.ContentHandler):
    """SAX content handler that stores paper records through ``RedisHelper``.

    While inside an element whose name is in ``paperLabels`` the handler
    collects the record's authors and year. When that element closes it
    passes them, together with the running paper id and the conference taken
    from the element's ``key`` attribute, to ``RedisHelper.addPaperItem``.
    """

    def __init__(self):
        """Initialise the base handler, the Redis helper and the parse state."""
        handler.ContentHandler.__init__(self)
        self.mRedis = RedisHelper()
        self.isPaperTag = False
        self.conf = ''
        self.isTitleTag = False
        self.paperId = -1
        self.isAuthorTag = False
        self.authors = list()
        self.isYearTag = False
        self.year = ''

    def startDocument(self):
        """Log the start of the document."""
        logging.info('Document start...')

    def endDocument(self):
        """Log the end of the document."""
        logging.info('Document End...')

    def startElement(self, name, attrs):
        """Open a paper record or arm the flag for one of its child tags.

        Raises whatever ``attrs.get('key').split('/')[1]`` raises when a
        paper element has no ``key`` attribute or the key has no ``/``.
        """
        if name in paperLabels:
            # The conference is the second '/'-separated part of the key.
            self.conf = attrs.get('key').split('/')[1]
            self.isPaperTag = True
        if self.isPaperTag:
            if name == 'title':
                self.isTitleTag = True
            if name == 'author':
                self.isAuthorTag = True
                # Open a fresh slot; characters() appends every chunk to it.
                self.authors.append('')
            if name == 'year':
                self.isYearTag = True
                self.year = ''

    def endElement(self, name):
        """Finish an author/year element, or flush a completed paper record."""
        if name == 'author' and self.isAuthorTag:
            self.isAuthorTag = False
            # An author element without text contributes no author.
            if self.authors and not self.authors[-1]:
                self.authors.pop()
        elif name == 'year':
            self.isYearTag = False
        if name in paperLabels:
            if self.isPaperTag:
                self.isPaperTag = False
                self.mRedis.addPaperItem(self.authors, self.paperId, self.conf,
                                         self.year)
                if self.paperId % 1000 == 0:
                    logging.info(self.paperId)
                self.conf = ''
                self.authors = []
                self.year = ''

    def characters(self, content):
        """Accumulate text for the element currently being read.

        A SAX parser may deliver one text node in several calls (for example
        around entity references), so author and year text is appended chunk
        by chunk; the flags are cleared in ``endElement``. The paper id is
        bumped once, on the first chunk seen after a title element opened.
        """
        if self.isTitleTag:
            self.paperId += 1
            self.isTitleTag = False
        if self.isYearTag:
            self.year += content
        if self.isAuthorTag:
            self.authors[-1] += content
Пример #5
0
class dblpHandler(handler.ContentHandler):
    """SAX content handler that stores paper records through ``RedisHelper``.

    While inside an element whose name is in ``paperLabels`` the handler
    collects the record's authors and year. When that element closes it
    passes them, together with the running paper id and the conference taken
    from the element's ``key`` attribute, to ``RedisHelper.addPaperItem``.
    """

    def __init__(self):
        """Initialise the base handler, the Redis helper and the parse state."""
        handler.ContentHandler.__init__(self)
        self.mRedis = RedisHelper()
        self.isPaperTag = False
        self.conf = ''
        self.isTitleTag = False
        self.paperId = -1
        self.isAuthorTag = False
        self.authors = list()
        self.isYearTag = False
        self.year = ''

    def startDocument(self):
        """Log the start of the document."""
        logging.info('Document start...')

    def endDocument(self):
        """Log the end of the document."""
        logging.info('Document End...')

    def startElement(self, name, attrs):
        """Open a paper record or arm the flag for one of its child tags.

        Raises whatever ``attrs.get('key').split('/')[1]`` raises when a
        paper element has no ``key`` attribute or the key has no ``/``.
        """
        if name in paperLabels:
            # The conference is the second '/'-separated part of the key.
            self.conf = attrs.get('key').split('/')[1]
            self.isPaperTag = True
        if self.isPaperTag:
            if name == 'title':
                self.isTitleTag = True
            if name == 'author':
                self.isAuthorTag = True
                # Open a fresh slot; characters() appends every chunk to it.
                self.authors.append('')
            if name == 'year':
                self.isYearTag = True
                self.year = ''

    def endElement(self, name):
        """Finish an author/year element, or flush a completed paper record."""
        if name == 'author' and self.isAuthorTag:
            self.isAuthorTag = False
            # An author element without text contributes no author.
            if self.authors and not self.authors[-1]:
                self.authors.pop()
        elif name == 'year':
            self.isYearTag = False
        if name in paperLabels:
            if self.isPaperTag:
                self.isPaperTag = False
                self.mRedis.addPaperItem(self.authors, self.paperId, self.conf,
                                         self.year)
                if self.paperId % 1000 == 0:
                    logging.info(self.paperId)
                self.conf = ''
                self.authors = []
                self.year = ''

    def characters(self, content):
        """Accumulate text for the element currently being read.

        A SAX parser may deliver one text node in several calls (for example
        around entity references), so author and year text is appended chunk
        by chunk; the flags are cleared in ``endElement``. The paper id is
        bumped once, on the first chunk seen after a title element opened.
        """
        if self.isTitleTag:
            self.paperId += 1
            self.isTitleTag = False
        if self.isYearTag:
            self.year += content
        if self.isAuthorTag:
            self.authors[-1] += content
Пример #6
0
 def __init__(self):
     """Create the Redis helper and reset every parsing-state attribute."""
     self.mRedis = RedisHelper()

     # Record fields start out empty; the paper id starts at -1.
     self.conf = self.year = ''
     self.authors = []
     self.paperId = -1

     # All tag flags start out cleared.
     self.isPaperTag = False
     self.isTitleTag = self.isAuthorTag = self.isYearTag = False
Пример #7
0
def getConfPaperNum():
    """Write the distribution of paper counts over conferences.

    For every conference from ``RedisHelper.getAllConfs`` the number of
    entries returned by ``getConfPapers`` is taken, and conferences are
    tallied by that number. Each line written to
    ``OUTPUT_STATISTIC_CONF_PAPER_NUM`` has the form::

        paperNum <TAB> number of conferences with that many papers
    """
    mRedis = RedisHelper()
    confPaperNum = dict()
    for index, conf in enumerate(mRedis.getAllConfs(), 1):
        logging.info(str(index) + '--' + conf)
        paperNum = len(mRedis.getConfPapers(conf))
        confPaperNum[paperNum] = confPaperNum.get(paperNum, 0) + 1

    # Tally first, open afterwards: a failure while reading Redis then leaves
    # an existing output file untouched instead of truncated. The ``with``
    # block closes the file, so no explicit close() is needed.
    with open(OUTPUT_STATISTIC_CONF_PAPER_NUM, 'w') as fileWriter:
        for paperNum, confNum in confPaperNum.items():
            fileWriter.write(str(paperNum) + '\t' + str(confNum) + '\n')
Пример #8
0
def extracStarsAndTargets():
    """Pick random "target" and "star" authors and write them to disk.

    Authors from ``RedisHelper.getAllAuthors`` are grouped by their
    conference count (``len(getAuConfs(author))``). Then:

    * targets: for each conference count from 1 to 100, up to 10 distinct
      random authors with exactly that count;
    * stars: up to 100 distinct random authors whose count exceeds 100.

    Each selection is written as ``author <TAB> confNum`` lines to
    ``OUTPUT_TARGET_AUTHORS`` and ``OUTPUT_STAR_AUTHORS`` respectively.
    When a group has fewer candidates than requested, all of them are taken
    rather than looping forever.
    """
    mRedis = RedisHelper()
    confNumAuthors = dict()  # confNum -> list of authors with that count
    # NOTE(review): assumes getAllAuthors yields each author once; a
    # duplicate would only reduce the number of distinct authors sampled.
    for index, author in enumerate(mRedis.getAllAuthors(), 1):
        if index % 1000 == 0:
            logging.info(index)
        confNum = len(mRedis.getAuConfs(author))
        confNumAuthors.setdefault(confNum, []).append(author)
    logging.info('-------')

    targets = dict()
    for i in range(1, 101):
        logging.info(i)
        # A conference count nobody has simply contributes no targets.
        candidates = confNumAuthors.get(i, [])
        for au in random.sample(candidates, min(10, len(candidates))):
            targets[au] = i

    # Star candidates are already known from the grouping above, so there is
    # no need to re-query Redis for randomly drawn authors.
    starCandidates = [(au, confNum)
                      for confNum, aus in confNumAuthors.items()
                      if confNum > 100
                      for au in aus]
    stars = dict(random.sample(starCandidates,
                               min(100, len(starCandidates))))
    logging.info(len(stars))
    confNumAuthors = {}  # release the grouping before writing

    # Each ``with`` block closes its file; no explicit close() is needed.
    with open(OUTPUT_STAR_AUTHORS, 'w') as fileWriter:
        for author, confNum in stars.items():
            fileWriter.write(author + '\t' + str(confNum) + '\n')
    with open(OUTPUT_TARGET_AUTHORS, 'w') as fileWriter:
        for author, confNum in targets.items():
            fileWriter.write(author + '\t' + str(confNum) + '\n')
Пример #9
0
def getConfPaperNum():
    """Write the distribution of paper counts over conferences.

    Conferences from ``RedisHelper.getAllConfs`` are tallied by the number
    of entries ``getConfPapers`` returns for them. Each line written to
    ``OUTPUT_STATISTIC_CONF_PAPER_NUM`` has the form::

        paperNum <TAB> number of conferences with that many papers
    """
    mRedis = RedisHelper()
    confPaperNum = dict()
    index = 0
    for conf in mRedis.getAllConfs():
        index += 1
        logging.info(str(index) + '--' + conf)
        paperNum = len(mRedis.getConfPapers(conf))
        confPaperNum[paperNum] = confPaperNum.get(paperNum, 0) + 1

    # The file is opened only after the tally succeeded, so a Redis failure
    # does not truncate an existing output file. ``with`` closes the file.
    with open(OUTPUT_STATISTIC_CONF_PAPER_NUM, 'w') as fileWriter:
        for paperNum, confNum in confPaperNum.items():
            fileWriter.write(str(paperNum) + '\t' + str(confNum) + '\n')
Пример #10
0
def getAuthorPaperNum():
    """Write the distribution of paper counts over authors.

    Authors from ``RedisHelper.getAllAuthors`` are tallied by the number of
    entries ``getAuPapers`` returns for them. Each line written to
    ``OUTPUT_STATISTIC_AUTHOR_PAPER_NUM`` has the form::

        paper count <TAB> number of authors with that many papers
    """
    mRedis = RedisHelper()
    authorPaperNum = dict()
    for index, author in enumerate(mRedis.getAllAuthors(), 1):
        if index % 1000 == 0:
            logging.info(index)  # progress marker every 1000 authors
        paperNum = len(mRedis.getAuPapers(author))
        authorPaperNum[paperNum] = authorPaperNum.get(paperNum, 0) + 1

    # Tally first, open afterwards: a failure while reading Redis then leaves
    # an existing output file untouched. ``with`` closes the file on exit.
    with open(OUTPUT_STATISTIC_AUTHOR_PAPER_NUM, 'w') as fileWriter:
        for k, v in authorPaperNum.items():
            fileWriter.write(str(k) + '\t' + str(v) + '\n')
Пример #11
0
def getAuthorConfNum():
    """Write the distribution of conference counts over authors.

    Authors from ``RedisHelper.getAllAuthors`` are tallied by the number of
    entries ``getAuConfs`` returns for them. Each line written to
    ``OUTPUT_STATISTIC_AUTHOR_CONF_NUM`` has the form::

        conference count <TAB> number of authors with that many conferences
    """
    mRedis = RedisHelper()
    authorConfNum = dict()
    for index, author in enumerate(mRedis.getAllAuthors(), 1):
        if index % 1000 == 0:
            logging.info(index)  # progress marker every 1000 authors
        confNum = len(mRedis.getAuConfs(author))
        authorConfNum[confNum] = authorConfNum.get(confNum, 0) + 1

    # Tally first, open afterwards: a failure while reading Redis then leaves
    # an existing output file untouched. ``with`` closes the file on exit.
    with open(OUTPUT_STATISTIC_AUTHOR_CONF_NUM, 'w') as fileWriter:
        for k, v in authorConfNum.items():
            fileWriter.write(str(k) + '\t' + str(v) + '\n')
Пример #12
0
def getAuthorPaperNum():
    """Write how many authors have each paper count.

    For every author from ``RedisHelper.getAllAuthors`` the number of
    entries returned by ``getAuPapers`` is taken; authors are then counted
    per value. Output lines in ``OUTPUT_STATISTIC_AUTHOR_PAPER_NUM``::

        paper count <TAB> number of authors
    """
    mRedis = RedisHelper()
    authorPaperNum = dict()
    index = 0
    for author in mRedis.getAllAuthors():
        index += 1
        if index % 1000 == 0:
            logging.info(index)
        paperNum = len(mRedis.getAuPapers(author))
        authorPaperNum[paperNum] = authorPaperNum.get(paperNum, 0) + 1

    # The file is opened only after counting succeeded, so a Redis failure
    # does not truncate an existing output file. ``with`` closes the file.
    with open(OUTPUT_STATISTIC_AUTHOR_PAPER_NUM, 'w') as fileWriter:
        for k, v in authorPaperNum.items():
            fileWriter.write(str(k) + '\t' + str(v) + '\n')
Пример #13
0
def getAuthorConfNum():
    """Write how many authors have each conference count.

    For every author from ``RedisHelper.getAllAuthors`` the number of
    entries returned by ``getAuConfs`` is taken; authors are then counted
    per value. Output lines in ``OUTPUT_STATISTIC_AUTHOR_CONF_NUM``::

        conference count <TAB> number of authors
    """
    mRedis = RedisHelper()
    authorConfNum = dict()
    index = 0
    for author in mRedis.getAllAuthors():
        index += 1
        if index % 1000 == 0:
            logging.info(index)
        confNum = len(mRedis.getAuConfs(author))
        authorConfNum[confNum] = authorConfNum.get(confNum, 0) + 1

    # The file is opened only after counting succeeded, so a Redis failure
    # does not truncate an existing output file. ``with`` closes the file.
    with open(OUTPUT_STATISTIC_AUTHOR_CONF_NUM, 'w') as fileWriter:
        for k, v in authorConfNum.items():
            fileWriter.write(str(k) + '\t' + str(v) + '\n')
Пример #14
0
def getLevelWithConfNum():
    """Write paper and coauthor statistics per author conference count.

    Authors from ``RedisHelper.getAllAuthors`` are grouped by
    ``len(getAuConfs(author))``. For each group, ``statisticOfList`` is
    applied to the group's paper counts and to its coauthor counts (each
    result is unpacked as average, maximum, minimum), and one tab-separated
    line is written to ``OUTPUT_ACADEMIC_LEVEL_PAPER_COAU_NUM``::

        confNum  avgP  maxP  minP  avgC  maxC  minC
    """
    mRedis = RedisHelper()
    confPaperNumDict = dict()  # confNum -> list of paper counts
    confCoauNumDict = dict()   # confNum -> list of coauthor counts
    index = 0
    for author in mRedis.getAllAuthors():
        index += 1
        if index % 1000 == 0:
            logging.info(index)
        confNum = len(mRedis.getAuConfs(author))
        confPaperNumDict.setdefault(confNum, []).append(
            len(mRedis.getAuPapers(author)))
        confCoauNumDict.setdefault(confNum, []).append(
            len(mRedis.getAuCoauthors(author)))

    index = 0
    # ``with`` closes the file on exit, so there is no trailing close().
    with open(OUTPUT_ACADEMIC_LEVEL_PAPER_COAU_NUM, 'w') as fileWriter:
        for confNum, paperNumList in confPaperNumDict.items():
            index += 1
            if index % 1000 == 0:
                logging.info(index)
            avg_P, max_num_P, min_num_P = statisticOfList(paperNumList)
            avg_C, max_num_C, min_num_C = statisticOfList(
                confCoauNumDict[confNum])
            row = [confNum, avg_P, max_num_P, min_num_P,
                   avg_C, max_num_C, min_num_C]
            fileWriter.write('\t'.join(map(str, row)) + '\n')
Пример #15
0
class CollaborationFeature(object):
    """Collaboration statistics computed from the data behind ``RedisHelper``."""

    def __init__(self):
        """Create the ``RedisHelper`` used by every method."""
        self.mRedis = RedisHelper()

    def isConfLeadCollab(self, author, conf):
        """Tell whether attending ``conf`` preceded a collaboration.

        Looks at every coauthor of ``author`` who also appears among the
        authors of ``conf`` and whose times for ``conf`` equal the author's.
        Returns True as soon as one such coauthor's earliest coauthoring
        time is later than the author's earliest time at ``conf``.

        Raises:
            ValueError: if a time list passed to ``min`` is empty.
        """
        authorConfTime = self.mRedis.getAuConfTimes(author, conf)
        confAuthors = self.mRedis.getConfAuthors(conf)
        coAuthors = self.mRedis.getAuCoauthors(author)
        for coau in set(confAuthors) & set(coAuthors):
            if self.mRedis.getAuConfTimes(coau, conf) != authorConfTime:
                continue
            coauTime = self.mRedis.getAuCoauTimes(author, coau)
            if min(coauTime) > min(authorConfTime):
                return True
        return False

    def isCoauLeadByConf(self, author, coau):
        """Tell whether shared conferences preceded coauthoring with ``coau``.

        Only applies when the two authors share more than one conference;
        then returns True if their earliest coauthoring time is later than
        the author's earliest time at any shared conference.

        Raises:
            ValueError: if a time list passed to ``min`` is empty.
        """
        auCoauTimes = self.mRedis.getAuCoauTimes(author, coau)
        authorConfs = self.mRedis.getAuConfs(author)
        coauConfs = self.mRedis.getAuConfs(coau)
        sameConfs = set(authorConfs) & set(coauConfs)
        if len(sameConfs) <= 1:
            return False
        confTimes = []
        for conf in sameConfs:
            confTimes.extend(self.mRedis.getAuConfTimes(author, conf))
        return min(auCoauTimes) > min(confTimes)

    def _sampleMultiConfAuthors(self, sampleSize):
        """Yield ``(author, confs)`` for random authors with >= 2 conferences.

        Visits all authors once in random order and stops after
        ``sampleSize`` qualifying authors or when the authors run out, so it
        always terminates. Logs progress every 1000 qualifying authors.
        """
        candidates = list(self.mRedis.getAllAuthors())
        random.shuffle(candidates)
        seen = set()  # guards against an author being listed twice
        picked = 0
        for author in candidates:
            if picked >= sampleSize:
                break
            if author in seen:
                continue
            seen.add(author)
            authorConfs = self.mRedis.getAuConfs(author)
            if len(authorConfs) < 2:
                continue
            picked += 1
            if picked % 1000 == 0:
                logging.info(picked)
            yield author, authorConfs

    @staticmethod
    def _writeGroupAverages(path, groups):
        """Write ``key <TAB> mean(values)`` for every entry of ``groups``."""
        with open(path, 'w') as fileWriter:
            for k, v in groups.items():
                avg = sum(v) * 1.0 / len(v) if v else 0
                fileWriter.write(str(k) + '\t' + str(avg) + '\n')

    def getConfLeadCollabProb(self, sampleSize=200000):
        """Write, per conference count, the mean share of "leading" conferences.

        For up to ``sampleSize`` random authors with at least two
        conferences, computes the fraction of their conferences for which
        ``isConfLeadCollab`` is True, groups the fractions by the author's
        conference count and writes the per-group mean to
        ``OUTPUT_COLLAB_CONF_LEAD_COLLAB_PROB``.
        """
        confCountCLCPDictList = dict()
        for author, authorConfs in self._sampleMultiConfAuthors(sampleSize):
            ConfCnt = len(authorConfs)
            CLCPNum = 0
            for conf in authorConfs:
                if self.isConfLeadCollab(author, conf):
                    CLCPNum += 1
            confCountCLCPDictList.setdefault(ConfCnt, []).append(
                CLCPNum * 1.0 / ConfCnt)
        self._writeGroupAverages(OUTPUT_COLLAB_CONF_LEAD_COLLAB_PROB,
                                 confCountCLCPDictList)

    def getCoauLeadByConf(self, sampleSize=200000):
        """Write, per conference count, the mean number of conference-led coauthors.

        For up to ``sampleSize`` random authors with at least two
        conferences, counts the coauthors for which ``isCoauLeadByConf`` is
        True, groups the counts by the author's conference count and writes
        the per-group mean to ``OUTPUT_COLLAB_COAU_NUM_LEAD_BY_CONF``.
        """
        ConfCountCoauDictList = dict()
        for author, authorConfs in self._sampleMultiConfAuthors(sampleSize):
            CLCsNum = 0
            for coau in self.mRedis.getAuCoauthors(author):
                if self.isCoauLeadByConf(author, coau):
                    CLCsNum += 1
            ConfCountCoauDictList.setdefault(len(authorConfs), []).append(
                CLCsNum)
        self._writeGroupAverages(OUTPUT_COLLAB_COAU_NUM_LEAD_BY_CONF,
                                 ConfCountCoauDictList)

    def getConfLeadPotentialCoaus(self):
        """Write, per conference count, the mean number of potential coauthors.

        An author's potential coauthors are the authors of all conferences
        the author appears in, minus the author's current coauthors. The
        counts are grouped by the author's conference count and the
        per-group mean is written to
        ``OUTPUT_COLLAB_CONF_LEAD_POTENRIAL_COAU``.
        """
        # Cache every conference's author list once instead of per author.
        confAuthorDict = dict()
        for conf in self.mRedis.getAllConfs():
            confAuthorDict[conf] = self.mRedis.getConfAuthors(conf)

        ConfCountPotentialCoausDict = dict()
        for index, author in enumerate(self.mRedis.getAllAuthors(), 1):
            if index % 1000 == 0:
                logging.info(index)
            confs = self.mRedis.getAuConfs(author)
            potentialCoaus = set()
            for conf in confs:
                potentialCoaus.update(confAuthorDict[conf])
            coAuthors = self.mRedis.getAuCoauthors(author)
            ConfCountPotentialCoausDict.setdefault(len(confs), []).append(
                len(potentialCoaus - set(coAuthors)))
        confAuthorDict = {}  # release the cache before writing
        self._writeGroupAverages(OUTPUT_COLLAB_CONF_LEAD_POTENRIAL_COAU,
                                 ConfCountPotentialCoausDict)
Пример #16
0
 def __init__(self):
     """Create the ``RedisHelper`` instance and keep it as ``self.mRedis``."""
     self.mRedis = RedisHelper()
Пример #17
0
class BaconNumber(object):
    """Shortest-path distances from target authors to star authors.

    The graph links every author to their coauthors and to their
    conferences, so a path may pass through either kind of node.
    """

    def __init__(self):
        """Load the star/target authors and build the graph from Redis.

        The order matters: ``loadStarsAndTargets`` fills ``self.stars`` and
        ``self.targets``, and ``buildGraph`` needs ``self.G`` and
        ``self.mRedis``.
        """
        self.G = nx.Graph()
        self.stars = dict()
        self.targets = dict()
        self.loadStarsAndTargets()
        logging.info('loadStarsAndTargets done---------------')
        self.shortestPathLength = dict()
        self.mRedis = RedisHelper()
        self.buildGraph()
        logging.info('---------------')

    def buildGraph(self):
        """Add an edge from every author to each coauthor and conference."""
        for index, author in enumerate(self.mRedis.getAllAuthors(), 1):
            if index % 1000 == 0:
                logging.info(index)  # progress marker every 1000 authors
            coaus = self.mRedis.getAuCoauthors(author)
            confs = self.mRedis.getAuConfs(author)
            for coau in coaus:
                self.G.add_edge(author, coau)
            for conf in confs:
                self.G.add_edge(author, conf)

    def getGraphNodeCount(self):
        """Return the number of nodes in the graph."""
        return self.G.number_of_nodes()

    def getGraphEdgeCount(self):
        """Return the number of edges in the graph."""
        return self.G.number_of_edges()

    def shortestPath(self, s, t):
        """Return the shortest path length between nodes ``s`` and ``t``."""
        return nx.shortest_path_length(self.G, s, t)

    def getShortestPathLength(self):
        """Compute target-to-star distances and write them to disk.

        For every target (visited in ascending conference count) the
        distance to every star is computed; an unreachable or unknown node
        is recorded as -1. ``OUTPUT_AUTHORS_BACOM_NUM`` receives a header
        line with the node and edge counts, then one line per target::

            author <TAB> confNum <TAB> avg <TAB> d1 <TAB> d2 <TAB> ... <TAB>

        where ``avg`` is the mean of the distances in the open interval
        (0, 10000), or 0 when there are none. Afterwards the distances and
        the graph are released, so the graph methods can no longer be used.
        """
        index = 0
        # Iterate the sorted pairs directly: wrapping them in dict() again
        # would not reliably keep the order.
        for author, _confNum in sorted(self.targets.items(),
                                       key=lambda d: d[1]):
            for star in self.stars:
                try:
                    length = self.shortestPath(author, star)
                except (nx.NetworkXException, KeyError):
                    # No path, or a node that is missing from the graph.
                    length = -1
                self.shortestPathLength.setdefault(author, []).append(length)
                index += 1
                logging.info(str(index) + '---' + str(length))

        # ``with`` closes the file on exit; no explicit close() is needed.
        with open(OUTPUT_AUTHORS_BACOM_NUM, 'w') as fileWriter:
            header = ('nodes:' + str(self.getGraphNodeCount()) + '\t' +
                      'edges:' + str(self.getGraphEdgeCount()) + '\n')
            fileWriter.write(header)
            logging.info(header)
            for author, bacons in self.shortestPathLength.items():
                # Every distance is followed by a tab (trailing tab kept).
                baconStr = ''.join(str(bacon) + '\t' for bacon in bacons)
                valid = [bacon for bacon in bacons if 0 < bacon < 10000]
                avg = sum(valid) * 1.0 / len(valid) if valid else 0
                fileWriter.write(author + '\t' + str(self.targets[author]) +
                                 '\t' + str(avg) + '\t' + baconStr + '\n')
        self.shortestPathLength = {}
        self.G = None

    @staticmethod
    def _readAuthorConfNums(path):
        """Read ``author <TAB> confNum`` lines from ``path`` into a dict.

        The line ending is stripped and ``confNum`` is converted to ``int``
        so that it sorts numerically and does not embed a newline when
        written out again. Blank lines are skipped.

        Raises:
            IndexError: if a non-blank line has no tab-separated second field.
            ValueError: if the second field is not an integer.
        """
        result = dict()
        with open(path) as fileReader:
            for line in fileReader:
                if not line.strip():
                    continue
                fields = line.rstrip('\r\n').split('\t')
                result[fields[0]] = int(fields[1])
        return result

    def loadStarsAndTargets(self):
        """Fill ``self.stars`` and ``self.targets`` from their files."""
        self.stars.update(self._readAuthorConfNums(OUTPUT_STAR_AUTHORS))
        self.targets.update(self._readAuthorConfNums(OUTPUT_TARGET_AUTHORS))