示例#1
0
class newVec():
    """Re-weight each author's stored topic vector and save the top entries.

    Reads the raw vector items of every author from Redis, keeps the
    strongest entries from recent years, scales them with
    ``TimeFunctionLog10`` and writes them back through
    ``addAuthorNewVec``.
    """
    def __init__(self):
        self.redis = RedisHelper()

    def reSaveNewVec(self):
        """Recompute and store the new vector of every author.

        Each stored entry has the form ``'topic:year:weightedValue'``.
        The running author index is printed every 1000 authors as a
        progress indicator.
        """
        for index, author in enumerate(self.redis.getAuthorList()):
            if index % 1000 == 0:
                # Single-argument call form prints the same text on
                # Python 2 and Python 3.
                print(index)
            vecSet = self.redis.getAuthorVec(author)
            weighted = self.ProcWithTimeAndTopN(vecSet)
            for item, value in weighted.items():
                self.redis.addAuthorNewVec(author, item + ':' + str(value))

    def ProcWithTimeAndTopN(self, VecSet, topN=5):
        """Select the strongest recent entries and apply the time weight.

        Args:
            VecSet: iterable of strings shaped ``'topic:year:value'``.
            topN: how many of the highest-valued entries to keep
                (defaults to 5, the previously hard-coded limit).

        Returns:
            dict mapping ``'topic:year'`` to
            ``TimeFunctionLog10(year) * value`` for the ``topN`` largest
            values among entries whose year is at least
            ``PAPER_START_YEAR``.
        """
        scores = {}
        for item in VecSet:
            # Split once and reuse the pieces instead of re-splitting
            # the same string for every field.
            fields = item.split(':')
            topic, year, value = fields[0], fields[1], fields[2]
            if int(year) >= PAPER_START_YEAR:
                scores[topic + ':' + year] = float(value)
        # Highest raw value first; the time weight is applied only after
        # the cut so that it does not influence which entries are kept.
        strongest = sorted(scores.items(), key=lambda pair: pair[1],
                           reverse=True)[:topN]
        return {
            key: TimeFunctionLog10(int(key.split(':')[1])) * score
            for key, score in strongest
        }
示例#2
0
 def __init__(self):
     """Attach a RedisHelper and load the documents from PATH_DOC_AUTHOR.

     Every line of the file becomes one entry of ``self.docs``: the
     line with its trailing newline removed, split on whitespace.
     """
     logging.info("init starting...")
     self.redis = RedisHelper()
     # The context manager closes the file even if reading fails; the
     # bare open() in the loop header left the handle open.
     with open(PATH_DOC_AUTHOR, 'r') as docFile:
         self.docs = [line.strip('\n').split() for line in docFile]
     logging.info("init ending...")
示例#3
0
class Docs():
    """Build one text document per (author, year) from the stored papers."""
    def __init__(self):
        logging.info('conduct docs firstly')
        self.redis = RedisHelper()

    def conductDocs(self):
        """Write the author/year documents to PATH_DOC_AUTHOR.

        For every author, the abstracts of that author's papers (or the
        title when the abstract is shorter than 3 characters) are
        grouped by publication year; papers published after
        ``TEST_DATA_YEAR`` are ignored. Each group is written as one
        line, and the zero-based line index is registered together with
        the author and year through ``addDocAuthorYear``.
        """
        index = 0
        # `with` guarantees the output file is flushed and closed even
        # when a Redis call raises halfway through.
        with open(PATH_DOC_AUTHOR, 'w') as fileWriter:
            for author in self.redis.getAuthorList():
                # year --> list of texts of this author in that year
                authorDoc = {}
                for paper in self.redis.getAuthorPapers(author):
                    year = self.redis.getPaperYear(paper)
                    if int(year) > TEST_DATA_YEAR:
                        continue
                    content = self.redis.getPaperAbstract(paper)
                    if len(content) < 3:
                        # No usable abstract: fall back to the title.
                        content = self.redis.getPaperTitle(paper)
                    authorDoc.setdefault(year, []).append(content)
                for year, parts in authorDoc.items():
                    if index % 10000 == 0:
                        print(index)
                    # Join with a space so the last word of one paper
                    # does not fuse with the first word of the next
                    # (the file is tokenized on whitespace later).
                    fileWriter.write(' '.join(parts) + '\n')
                    self.redis.addDocAuthorYear(index, author, year)
                    index += 1
示例#4
0
class baselda():
    """Run LDA over the author documents and store the topic vectors.

    ``lda_step1`` builds and saves the dictionary and bag-of-words
    corpus; ``lda_step2`` trains the model on the saved corpus and hands
    the per-document topic distributions to ``saveVec``.
    """
    def __init__(self):
        """Attach a RedisHelper and load the documents from PATH_DOC_AUTHOR."""
        logging.info("init starting...")
        self.redis = RedisHelper()
        # Close the file deterministically instead of leaving the handle
        # to the garbage collector.
        with open(PATH_DOC_AUTHOR, 'r') as docFile:
            self.docs = [line.strip('\n').split() for line in docFile]
        logging.info("init ending...")

    def lda_step1(self):
        """Step1: build the dictionary and corpus and save both to disk.

        The dictionary is written to ``PATH_LDA_DIC`` and the serialized
        bag-of-words corpus to ``PATH_LDA_MM``.
        """
        dictionary = corpora.Dictionary(self.docs)
        logging.info("store the dictionary, for future reference.")
        dictionary.save_as_text(PATH_LDA_DIC)
        corpus = [dictionary.doc2bow(doc) for doc in self.docs]
        logging.info("store to disk, for later use.")
        corpora.MmCorpus.serialize(PATH_LDA_MM, corpus)

    # Original (misspelled) name, kept so existing callers continue to work.
    lda_setp1 = lda_step1

    def lda_step2(self):
        """Step2: train the LDA model on the saved corpus and store vectors."""
        logging.info("load Dictionary.")
        id2word = corpora.Dictionary.load_from_text(PATH_LDA_DIC)
        logging.info("load corpus iterator.")
        mm = corpora.MmCorpus(PATH_LDA_MM)
        logging.info('LDA Start.')
        lda = models.ldamodel.LdaModel(
            corpus=mm, id2word=id2word, num_topics=LDA_CLUSTER_NUM,
            update_every=1, chunksize=10000, passes=1)
        logging.info('LDA End')

        corpus_lda = list(lda[mm])
        self.saveVec(corpus_lda)

    def saveVec(self, corpus_lda):
        """Store every document's topic weights under its author.

        Args:
            corpus_lda: sequence indexed by document id; each element is
                an iterable of ``(topic, value)`` pairs.

        For each pair, ``'topic:year:value'`` is added to the author's
        vector, where author and year come from ``getDocAuthorYear``.
        ``self.docs`` is reset to an empty list afterwards.
        """
        print(len(corpus_lda))
        for docId, topics in enumerate(corpus_lda):
            # Split on the last colon only, so an author name that
            # itself contains ':' no longer breaks the unpacking.
            author, year = self.redis.getDocAuthorYear(docId).rsplit(':', 1)
            for topic, value in topics:
                self.redis.addAuthorVec(
                    author, str(topic) + ':' + str(year) + ':' + str(value))
        self.docs = []
示例#5
0
 def __init__(self):
     """Attach a RedisHelper and initialise the per-record fields.

     The text and collection fields start empty, ``year`` starts at
     1970, and ``paperID`` and ``count`` start at 0.
     """
     self.redis = RedisHelper()

     # Text fields of the record.
     self.title = ""
     self.venue = ""
     self.abstract = ""

     # Collection fields of the record.
     self.authors = []
     self.references = []
     self.authorCoauthor = {}

     # Numeric fields.
     self.year = 1970
     self.paperID = 0
     self.count = 0

     logging.info('starting.....')
示例#6
0
 def __init__(self):
     """Attach a new RedisHelper instance as ``self.redis``."""
     self.redis = RedisHelper()
示例#7
0
 def __init__(self):
     """Log the start-up message, then attach a RedisHelper as ``self.redis``."""
     logging.info('conduct docs firstly')
     self.redis = RedisHelper()
示例#8
0
class ReadData():
    """Store the data into redis.

    Parses the text file at ``PATH_DATASET_TXT``. Lines starting with
    ``#`` carry one field of the current record, selected by the second
    character; any other line ends the record, which is then saved and
    the per-record fields are reset.
    """
    def __init__(self):
        self.redis = RedisHelper()
        self.title = ""
        self.authors = []
        self.year = 1970
        self.venue = ""
        # '' is the "no ID" value that save2Redis checks for.
        self.paperID = ""
        self.references = []
        self.abstract = ""
        self.authorCoauthor = {}
        self.count = 0
        logging.info('starting.....')

    def clearVar(self):
        """Reset all per-record fields (``count`` is left untouched)."""
        self.title = ""
        self.venue = ""
        self.abstract = ""
        self.authors = []
        self.references = []
        self.authorCoauthor = {}
        self.year = 1970
        # Also forget the ID, so a record without an index line cannot
        # be stored under the previous record's ID.
        self.paperID = ""

    def _flushRecord(self):
        """Count, save and reset the record collected so far."""
        self.count += 1
        if self.count % 10000 == 0:
            logging.info(self.count)
        self.save2Redis()
        self.clearVar()

    def read(self):
        """Read the dataset file and save every record to Redis.

        Field lines (second character in parentheses): title (``*``),
        comma-separated authors (``@``), year (``t``), venue (``c``),
        paper ID (``i``, text after the first 6 characters), one
        reference per line (``%``) and abstract (``!``).
        """
        pending = False  # True while an unsaved record is being collected
        with open(PATH_DATASET_TXT) as fileReader:
            logging.info('reading.....')
            for line in fileReader:
                if line[0] != '#':
                    self._flushRecord()
                    pending = False
                    continue
                pending = True
                # Slice instead of index: a bare '#' at end of file has
                # no second character.
                tag = line[1:2]
                if tag == '*':  # title
                    self.title = line.strip('\n\r')[2:]
                elif tag == '@':  # authors
                    self.authors.extend(line.strip('\n\r')[2:].split(','))
                elif tag == 't':  # year
                    self.year = line.strip()[2:]
                elif tag == 'c':  # venue
                    self.venue = line.strip('\n\r')[2:]
                elif tag == 'i':  # paperID
                    self.paperID = line.strip()[6:]
                elif tag == '%':  # references
                    self.references.append(line.strip()[2:])
                elif tag == '!':  # abstract
                    self.abstract = line.strip('\n\r')[2:]
        if pending:
            # The file ended without a separator line; save the last
            # record instead of dropping it.
            self._flushRecord()

    def save2Redis(self):
        """Save the current record if it passes the venue/ID/year filters.

        The record is stored only when the venue matches ``rewords``,
        both paper ID and year are non-empty, and the year is at least
        ``PAPER_START_YEAR``.
        """
        if not re.search(rewords, self.venue):
            return
        if self.paperID == '' or self.year == '':
            return
        if int(self.year) < PAPER_START_YEAR:
            return

        paperID = self.paperID
        self.redis.addPaperYear(paperID, self.year)
        self.redis.addPaperVenue(paperID, self.venue)
        self.redis.addPaperTitle(paperID, self.title)
        self.redis.addPaperAbstract(paperID, self.abstract)

        # Skip empty reference entries. Indexing references[0] raised
        # IndexError for records that have no reference lines at all.
        for reference in self.references:
            if reference == '':
                continue
            self.redis.addPaperReferences(paperID, reference)
            self.redis.addPaperRefered(reference, paperID)

        authors = self.authors
        for author in authors:
            self.redis.addAuthorPapers(author, paperID)
            self.redis.addPaperAuthors(paperID, author)

        if len(authors) > 1:
            for position, author in enumerate(authors):
                # NOTE(review): repeats the addAuthorPapers call made in
                # the loop above; kept because RedisHelper's semantics
                # are not visible here - confirm it is redundant and
                # remove.
                self.redis.addAuthorPapers(author, paperID)
                # Register every unordered pair once, in both directions.
                for other in authors[position + 1:]:
                    self.redis.addAuthorCoauthor(author, other)
                    self.redis.addAuthorCoauthor(other, author)