class newVec(object):
    """Re-weight every author's stored topic vector and save the result.

    The stored vector of an author is a collection of ``topic:year:value``
    strings. For each author the entries are filtered by year, reduced to
    the strongest few, multiplied by ``TimeFunctionLog10(year)`` and written
    back through ``RedisHelper.addAuthorNewVec``.
    """

    def __init__(self):
        self.redis = RedisHelper()

    def reSaveNewVec(self):
        """Recompute and store the new vector for every author in Redis."""
        for index, author in enumerate(self.redis.getAuthorList()):
            if index % 1000 == 0:
                # Coarse progress indicator on stdout.
                print(index)
            vecSet = self.redis.getAuthorVec(author)
            weighted = self.ProcWithTimeAndTopN(vecSet)
            for item, value in weighted.items():
                self.redis.addAuthorNewVec(author, item + ':' + str(value))

    def ProcWithTimeAndTopN(self, VecSet, topN=5):
        """Keep the ``topN`` strongest recent entries and apply time weighting.

        Args:
            VecSet: iterable of ``topic:year:value`` strings.
            topN: how many of the highest-valued entries to keep
                (defaults to 5, the previously hard-coded limit).

        Returns:
            dict mapping ``"topic:year"`` to
            ``TimeFunctionLog10(year) * value`` for the retained entries.
            Entries whose year is below ``PAPER_START_YEAR`` are ignored.
        """
        scores = {}
        for item in VecSet:
            # Split once; fields beyond the third are ignored, as before.
            topic, year, value = item.split(':')[:3]
            yearNum = int(year)
            if yearNum >= PAPER_START_YEAR:
                # Keep the parsed year next to the value so it does not have
                # to be re-parsed out of the key later.
                scores[topic + ':' + year] = (yearNum, float(value))

        # ``items()`` works on both Python 2 and 3 (``iteritems`` does not).
        strongest = sorted(scores.items(),
                           key=lambda entry: entry[1][1],
                           reverse=True)[:topN]
        return {key: TimeFunctionLog10(yearNum) * value
                for key, (yearNum, value) in strongest}
def __init__(self):
    """Load the author documents from ``PATH_DOC_AUTHOR`` into ``self.docs``.

    Every line of the file is one document; it is split on whitespace and
    stored as a list of tokens.
    """
    logging.info("init starting...")
    self.redis = RedisHelper()
    # A context manager closes the file deterministically; the previous
    # bare ``open()`` in the loop header left the handle to the garbage
    # collector.
    with open(PATH_DOC_AUTHOR, 'r') as reader:
        self.docs = [line.strip('\n').split() for line in reader]
    logging.info("init ending...")
class Docs(object):
    """Build the per-author, per-year text documents file.

    The output file ``PATH_DOC_AUTHOR`` holds one document per line; the
    zero-based line number is the document id registered in Redis through
    ``addDocAuthorYear``.
    """

    def __init__(self):
        logging.info('conduct docs firstly')
        self.redis = RedisHelper()

    def conductDocs(self):
        """Write one document per (author, year) pair and index it in Redis.

        For every author, the abstracts of that author's papers are grouped
        by publication year (papers newer than ``TEST_DATA_YEAR`` are
        skipped). When an abstract is shorter than three characters the
        paper title is used instead. Each group is written as one line and
        registered with ``addDocAuthorYear(index, author, year)``.
        """
        index = 0
        # ``open`` + ``with`` replaces the Python-2-only ``file()`` builtin
        # and guarantees the file is closed even if a Redis call raises.
        with open(PATH_DOC_AUTHOR, 'w') as fileWriter:
            for author in self.redis.getAuthorList():
                authorDoc = {}  # year --> text pieces of this author
                for paper in self.redis.getAuthorPapers(author):
                    year = self.redis.getPaperYear(paper)
                    if int(year) > TEST_DATA_YEAR:
                        continue  # only data up to the test year is used
                    content = self.redis.getPaperAbstract(paper)
                    if len(content) < 3:
                        # No usable abstract: fall back to the title.
                        content = self.redis.getPaperTitle(paper)
                    authorDoc.setdefault(year, []).append(content)

                for year, pieces in authorDoc.items():
                    if index % 10000 == 0:
                        print(index)
                    # Join with a space so that the last word of one text
                    # and the first word of the next do not fuse into a
                    # single token when the file is split on whitespace.
                    fileWriter.write(' '.join(pieces) + '\n')
                    self.redis.addDocAuthorYear(index, author, year)
                    index += 1
class baselda(object):
    """Train an LDA model on the author documents and store topic vectors.

    Usage is two-phase: ``lda_setp1`` builds and persists the dictionary and
    bag-of-words corpus, ``lda_step2`` loads them, trains the model and
    saves every document's topic distribution to Redis.
    """

    def __init__(self):
        """Load the whitespace-tokenised documents from ``PATH_DOC_AUTHOR``."""
        logging.info("init starting...")
        self.redis = RedisHelper()
        # Context manager: the file handle is closed as soon as it is read.
        with open(PATH_DOC_AUTHOR, 'r') as reader:
            self.docs = [line.strip('\n').split() for line in reader]
        logging.info("init ending...")

    def lda_setp1(self):
        '''Step1: build the dictionary and corpus and write both to disk.'''
        dictionary = corpora.Dictionary(self.docs)
        logging.info("store the dictionary, for future reference.")
        dictionary.save_as_text(PATH_LDA_DIC)
        corpus = [dictionary.doc2bow(doc) for doc in self.docs]
        logging.info("store to disk, for later use.")
        corpora.MmCorpus.serialize(PATH_LDA_MM, corpus)

    # Correctly spelled alias; the original (misspelled) name stays available
    # for existing callers.
    lda_step1 = lda_setp1

    def lda_step2(self):
        '''Step2: train LDA on the stored corpus and save the topic vectors.'''
        logging.info("load Dictionary.")
        id2word = corpora.Dictionary.load_from_text(PATH_LDA_DIC)
        logging.info("load corpus iterator.")
        mm = corpora.MmCorpus(PATH_LDA_MM)
        logging.info('LDA Start.')
        lda = models.ldamodel.LdaModel(
            corpus=mm,
            id2word=id2word,
            num_topics=LDA_CLUSTER_NUM,
            update_every=1,
            chunksize=10000,
            passes=1,
        )
        logging.info('LDA End')
        corpus_lda = list(lda[mm])
        self.saveVec(corpus_lda)

    def saveVec(self, corpus_lda):
        """Store each document's topic weights under its author.

        Args:
            corpus_lda: sequence indexed by document id; each element is an
                iterable of ``(topic, value)`` pairs.

        Every pair is saved via ``addAuthorVec`` as ``"topic:year:value"``.
        ``self.docs`` is emptied afterwards to release the token lists.
        """
        print(len(corpus_lda))
        for DocId, topics in enumerate(corpus_lda):
            # Split on the LAST colon only, so an author name that itself
            # contains ':' no longer breaks the two-way unpacking.
            author, year = self.redis.getDocAuthorYear(DocId).rsplit(':', 1)
            for topic, value in topics:
                self.redis.addAuthorVec(
                    author, str(topic) + ':' + str(year) + ':' + str(value))
        self.docs = []
def __init__(self):
    """Create the Redis helper and the blank per-paper parsing fields."""
    self.redis = RedisHelper()

    # Fields describing the paper record currently being assembled.
    self.title = ''
    self.authors = []
    self.year = 1970
    self.venue = ''
    self.paperID = 0
    self.references = []
    self.abstract = ''
    self.authorCoauthor = {}

    # Running counter, starts at zero.
    self.count = 0

    logging.info('starting.....')
def __init__(self):
    """Bind a freshly constructed ``RedisHelper`` to ``self.redis``."""
    self.redis = RedisHelper()
def __init__(self):
    """Log the start-up notice, then create the ``RedisHelper`` instance."""
    logging.info('conduct docs firstly')
    # Created after the log call, so the notice is emitted even if the
    # helper's construction fails.
    self.redis = RedisHelper()
class ReadData(object):
    """Store the data into redis.

    The dataset is a text file in which every line starting with ``#``
    carries one field of the current paper (the second character selects
    the field) and any other line terminates the record. Each finished
    record is passed to ``save2Redis`` and the per-paper state is cleared.
    """

    def __init__(self):
        self.redis = RedisHelper()
        self.title = ""
        self.authors = []
        self.year = 1970
        self.venue = ""
        # Empty string means "no id seen", matching the check in save2Redis.
        self.paperID = ''
        self.references = []
        self.abstract = ""
        self.authorCoauthor = {}
        self.count = 0
        logging.info('starting.....')

    def clearVar(self):
        """Reset all per-paper fields before the next record is parsed."""
        self.title = ""
        self.venue = ""
        self.abstract = ""
        self.authors = []
        self.references = []
        self.authorCoauthor = {}
        self.year = 1970
        # Forget the id as well; otherwise a record lacking its own id line
        # would be stored under the previous paper's id.
        self.paperID = ''

    def read(self, path=None):
        """Parse the dataset file and store every record in Redis.

        Args:
            path: file to read; defaults to ``PATH_DATASET_TXT``.
        """
        if path is None:
            path = PATH_DATASET_TXT
        pending = False  # True while fields of an unsaved record are held
        with open(path) as fileReader:
            logging.info('reading.....')
            for line in fileReader:
                if line[0] != '#':
                    # Separator line: the current record is complete.
                    self._finishRecord()
                    pending = False
                else:
                    self._parseLine(line)
                    pending = True
        if pending:
            # The file did not end with a separator line; without this the
            # final record would be silently dropped.
            self._finishRecord()

    def _finishRecord(self):
        """Count, persist and clear the record that was just completed."""
        self.count += 1
        if self.count % 10000 == 0:
            logging.info(self.count)
        self.save2Redis()
        self.clearVar()

    def _parseLine(self, line):
        """Copy the field carried by one ``#``-prefixed line into ``self``."""
        # Slice instead of index so that a lone "#" cannot raise IndexError.
        tag = line[1:2]
        if tag == '*':    # title
            self.title = line.strip('\n\r')[2:]
        elif tag == '@':  # authors, comma separated
            names = line.strip('\n\r')[2:].split(',')
            # An empty authors line would otherwise yield the author "".
            self.authors.extend(name for name in names if name)
        elif tag == 't':  # year
            self.year = line.strip()[2:]
        elif tag == 'c':  # venue
            self.venue = line.strip('\n\r')[2:]
        elif tag == 'i':  # paperID (text after the 6-character prefix)
            self.paperID = line.strip()[6:]
        elif tag == '%':  # references, one per line
            self.references.append(line.strip()[2:])
        elif tag == '!':  # abstract
            self.abstract = line.strip('\n\r')[2:]

    def save2Redis(self):
        """Write the current paper to Redis if it passes the filters.

        A paper is stored only when its venue matches ``rewords``, it has a
        non-empty id and year, and the year is numeric and not earlier than
        ``PAPER_START_YEAR``.
        """
        if not re.search(rewords, self.venue):
            return
        if self.paperID == '' or self.year == '':
            return
        try:
            year = int(self.year)
        except ValueError:
            # One malformed record should not abort the whole import.
            logging.warning('skip paper %s: bad year %r',
                            self.paperID, self.year)
            return
        if year < PAPER_START_YEAR:
            return

        paperID = self.paperID
        self.redis.addPaperYear(paperID, self.year)
        self.redis.addPaperVenue(paperID, self.venue)
        self.redis.addPaperTitle(paperID, self.title)
        self.redis.addPaperAbstract(paperID, self.abstract)

        # Iterating directly is safe for papers without any reference line
        # (indexing ``references[0]`` raised IndexError for those); blank
        # reference entries are skipped.
        for reference in self.references:
            if reference:
                self.redis.addPaperReferences(paperID, reference)
                self.redis.addPaperRefered(reference, paperID)

        for author in self.authors:
            self.redis.addAuthorPapers(author, paperID)
            self.redis.addPaperAuthors(paperID, author)

        # Record every unordered author pair in both directions. The
        # author->paper link is already stored by the loop above, so it is
        # not repeated here.
        for i, author in enumerate(self.authors):
            for other in self.authors[i + 1:]:
                self.redis.addAuthorCoauthor(author, other)
                self.redis.addAuthorCoauthor(other, author)