def __init__(self, src_corp_dir, lda_doc, docsuffix):
    """Remember the corpus settings, create the token filters and run setup.

    Args:
        src_corp_dir: stored as-is on ``self.src_corp_dir``.
        lda_doc: stored as-is on ``self.lda_doc``.
        docsuffix: stored as-is on ``self.docsuffix``.

    After the attributes are in place, ``self.getdoclist()`` and then
    ``self.genmodel_inputfile()`` are invoked, so construction performs
    whatever work those two methods do.
    """
    # Settings handed in by the caller.
    self.src_corp_dir = src_corp_dir
    self.lda_doc = lda_doc
    self.docsuffix = docsuffix

    # Helpers used when filtering tokens.
    self.stopWords = Stopwords()
    self.stemmer = PorterStemmer()
    self.pattern = re.compile(REG_EXP)

    # Start with an empty document list, then let the two setup steps run
    # in this order (the second one runs after the first has finished).
    self.doclist = []
    self.getdoclist()
    self.genmodel_inputfile()
class GetTopicDis:
    """Prepare a corpus for, and invoke, the external Gibbs LDA tool.

    Calls ldaGibbs++, the open-source software, to get the topic
    distribution for words and documents.  Before calling it, the corpus
    is converted to the input format the tool requires: the first line
    holds the number of documents and every following line holds the
    space-separated words of one document.
    """

    def __init__(self, src_corp_dir, lda_doc, docsuffix):
        """Collect the corpus files and write the LDA input file.

        Args:
            src_corp_dir: iterable of directory paths to scan for documents.
            lda_doc: path of the LDA input file to generate.
            docsuffix: only file names containing this string are used;
                a falsy value disables the filter.
        """
        self.src_corp_dir = src_corp_dir
        self.lda_doc = lda_doc
        self.docsuffix = docsuffix
        self.stopWords = Stopwords()
        self.stemmer = PorterStemmer()
        self.pattern = re.compile(REG_EXP)
        self.doclist = []
        self.getdoclist()
        self.genmodel_inputfile()

    def getdoclist(self):
        """Append the path of every selected corpus file to ``self.doclist``.

        Each directory in ``self.src_corp_dir`` is listed (non-recursively);
        when ``self.docsuffix`` is set, names rejected by ``self.substr``
        are dropped.
        """
        for subdir in self.src_corp_dir:
            candi_files = os.listdir(subdir)
            # filter file by suffix if existed
            if self.docsuffix:
                candi_files = [name for name in candi_files
                               if self.substr(name)]
            # os.path.join gives the same result as plain concatenation when
            # subdir already ends with a separator, and a correct path when
            # it does not.
            self.doclist.extend(os.path.join(subdir, name)
                                for name in candi_files)

    def genmodel_inputfile(self):
        """Write ``self.lda_doc`` in the format required by ldaGibbs++.

        The first line is the document count; each following line is the
        sorted, space-joined list of words kept by ``filterwords`` for one
        document of ``self.doclist``.
        """
        # Context managers make sure every handle is closed, even when
        # reading or filtering a document raises.
        with open(self.lda_doc, "w") as wfd:
            wfd.write("%d\n" % len(self.doclist))
            for doc in self.doclist:
                docwordlist = []
                with open(doc) as rfd:
                    for line in rfd:
                        line = line.strip("\n\r ")
                        docwordlist.extend(self.filterwords(line))
                docwordlist.sort()
                wfd.write("%s\n" % " ".join(docwordlist))

    def call_lda(self, topicnum, maxiter):
        """Run Gibbs LDA on the generated input file.

        Args:
            topicnum: number of topics (passed as ``-ntopics``).
            maxiter: number of sampling iterations (passed as ``-niters``).

        Returns:
            The status value returned by ``os.system`` for the command.
        """
        # lda model parameter setting
        # NOTE(review): GibbsLDA++ documents its default as
        # alpha = 50 / ntopics; this computes ntopics / 50 -- confirm that
        # the ratio is intended.
        alpha = 1.0 * topicnum / 50
        # The command line is executed through the shell, so self.lda_doc
        # must not contain whitespace or shell metacharacters.
        cmd = "%s-alpha %s -ntopics %s -niters %s -dfile %s" % (
            CMD, alpha, topicnum, maxiter, self.lda_doc)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Calling Gibbs LDA")
        status = os.system(cmd)
        print("Finishing calling")
        return status

    def filterwords(self, textline):
        """Return the normalised words of *textline* that should be kept.

        Tokens are separated by single spaces and are expected to look like
        ``word_TAG`` (TAG is presumably a part-of-speech label -- confirm
        against the corpus format).  A token is kept when it has exactly
        two ``_``-separated parts, its tag is in ``TOPIC_POS``, and its
        lower-cased, stemmed word is not a stopword and matches
        ``self.pattern``.

        Args:
            textline: one line of document text.

        Returns:
            List of lower-cased, stemmed words in their original order.
        """
        save_words = []
        for word in textline.split(" "):
            # split(" ") yields empty strings for leading, trailing or
            # repeated spaces; they can never be kept, so skip them early.
            if not word:
                continue
            biparts = word.split("_")
            # Test the tag first so that tokens which are discarded anyway
            # are not stemmed.
            if len(biparts) != 2 or biparts[1] not in TOPIC_POS:
                continue
            # words processing (stopword, stemming, lower)
            stem = biparts[0].lower()
            stem = self.stemmer.stem(stem, 0, len(stem) - 1)
            if not self.stopWords.is_stopword(stem) \
                    and self.pattern.match(stem):
                save_words.append(stem)
        return save_words

    def substr(self, candi_file):
        """Return True when ``self.docsuffix`` occurs in *candi_file*.

        The match is a substring test: the suffix may appear anywhere in
        the name, not only at its end.
        """
        return self.docsuffix in candi_file