Пример #1
0
class newVec():
    """Recompute and store a reduced, time-weighted vector for every author.

    Reads each author's raw vector from Redis, keeps only the strongest
    recent entries (see ProcWithTimeAndTopN) and writes the result back
    through ``addAuthorNewVec``.
    """
    def __init__(self):
        self.redis = RedisHelper()

    def reSaveNewVec(self):
        """Rebuild the "new vector" of every author returned by Redis.

        For each author the raw vector is fetched, reduced with
        ProcWithTimeAndTopN, and every resulting entry is stored as the
        string ``"<key>:<weighted value>"``. The running author count is
        printed every 1000 authors as a progress indicator.
        """
        for index, author in enumerate(self.redis.getAuthorList()):
            if index % 1000 == 0:
                # Call form prints the same text under Python 2 and Python 3.
                print(index)
            vecSet = self.redis.getAuthorVec(author)
            weighted = self.ProcWithTimeAndTopN(vecSet)
            for key, value in weighted.items():
                self.redis.addAuthorNewVec(author, key + ':' + str(value))

    def ProcWithTimeAndTopN(self, VecSet, topN=5, startYear=None, timeFunc=None):
        """Keep the strongest recent entries of a vector and time-weight them.

        Args:
            VecSet: iterable of strings shaped ``"topic:year:value"``. Any
                fields after the third are ignored.
            topN: how many of the highest-valued entries to keep (default 5).
            startYear: entries with a year below this are dropped. Defaults
                to the module constant ``PAPER_START_YEAR``.
            timeFunc: callable taking the integer year and returning the
                multiplier applied to the entry's value. Defaults to the
                module-level ``TimeFunctionLog10``.

        Returns:
            dict mapping ``"topic:year"`` to ``timeFunc(year) * value`` for
            the ``topN`` entries with the largest raw value.

        Raises:
            IndexError: if an item has fewer than three ':'-separated fields.
            ValueError: if the year or value field is not numeric.
        """
        if startYear is None:
            startYear = PAPER_START_YEAR
        if timeFunc is None:
            timeFunc = TimeFunctionLog10

        recent = dict()
        for item in VecSet:
            parts = item.split(':')  # split once instead of once per field
            topic, year, value = parts[0], parts[1], parts[2]
            if int(year) >= startYear:
                # A repeated "topic:year" key keeps the last value seen.
                recent[topic + ':' + year] = float(value)

        # Rank by the raw value; the time weight is applied only to survivors.
        strongest = sorted(recent.items(), key=lambda pair: pair[1],
                           reverse=True)[:topN]
        return {key: timeFunc(int(key.split(':')[1])) * value
                for key, value in strongest}
Пример #2
0
class Docs():
    """Build one text document per (author, year) and register it in Redis."""
    def __init__(self):
        logging.info('conduct docs firstly')
        self.redis = RedisHelper()

    def conductDocs(self):
        """Write every author's per-year document to ``PATH_DOC_AUTHOR``.

        For each author, the abstracts of their papers are grouped by
        publication year (the title is used when the abstract is shorter
        than three characters). Only papers whose year is at most
        ``TEST_DATA_YEAR`` are included. Each (author, year) group is
        written as one line of the output file, and its zero-based line
        number is recorded through ``addDocAuthorYear(index, author, year)``.
        The running line number is printed every 10000 documents.
        """
        docIndex = 0
        # The context manager closes the file even if a Redis call raises.
        with open(PATH_DOC_AUTHOR, 'w') as writer:
            for author in self.redis.getAuthorList():
                textsByYear = {}  # year --> list of texts for this author
                for paper in self.redis.getAuthorPapers(author):
                    year = self.redis.getPaperYear(paper)
                    if int(year) > TEST_DATA_YEAR:
                        continue
                    content = self.redis.getPaperAbstract(paper)
                    if len(content) < 3:  # no usable abstract: fall back to the title
                        content = self.redis.getPaperTitle(paper)
                    textsByYear.setdefault(year, []).append(content)
                for year, texts in textsByYear.items():
                    if docIndex % 10000 == 0:
                        print(docIndex)
                    # NOTE(review): texts are joined with no separator, as the
                    # original concatenation did - confirm this is intended.
                    writer.write(''.join(texts) + '\n')
                    self.redis.addDocAuthorYear(docIndex, author, year)
                    docIndex += 1