# NOTE(review): this top-level ``__init__`` has the same statements as
# ``GetTopicDis.__init__`` below but sits outside any class; it looks like a
# stray duplicate -- confirm nothing depends on it before removing.
def __init__(self, src_corp_dir, lda_doc, docsuffix):
        # Store the constructor arguments on the instance.
        self.src_corp_dir = src_corp_dir
        self.lda_doc = lda_doc
        self.docsuffix = docsuffix
        # Helper objects: stop-word list, Porter stemmer, and the compiled
        # REG_EXP pattern (all defined outside this block).
        self.stopWords = Stopwords()
        self.stemmer = PorterStemmer()
        self.pattern = re.compile(REG_EXP)

        # Start from an empty document list, then run both preparation
        # steps immediately; their order matters because the second step
        # is expected to consume the list filled by the first.
        self.doclist = []
        self.getdoclist()
        self.genmodel_inputfile()
class GetTopicDis:
    ''' Call ldaGibbs++, the opensource software, to get the topic
        distribution for words and documents. Before calling, we
        should first convert the file to the format which meets
        the requirements.

        Constructing an instance immediately scans the corpus
        directories and writes the LDA input file; call ``call_lda``
        afterwards to run the external tool.
    '''
    def __init__(self, src_corp_dir, lda_doc, docsuffix):
        ''' Collect the corpus documents and write the LDA input file.

            src_corp_dir -- iterable of corpus directory paths
            lda_doc      -- path of the LDA input file to generate
            docsuffix    -- only file names containing this string are
                            used; a false value disables the filter
        '''
        self.src_corp_dir = src_corp_dir
        self.lda_doc = lda_doc
        self.docsuffix = docsuffix
        self.stopWords = Stopwords()
        self.stemmer = PorterStemmer()
        self.pattern = re.compile(REG_EXP)

        self.doclist = []
        # Order matters: the input file is generated from self.doclist.
        self.getdoclist()
        self.genmodel_inputfile()

    def getdoclist(self):
        ''' Append the path of every selected file found directly inside
            each corpus directory to self.doclist.
        '''
        for subdir in self.src_corp_dir:
            candi_files = os.listdir(subdir)
            # filter file by suffix if existed
            if self.docsuffix:
                candi_files = [name for name in candi_files
                               if self.substr(name)]
            # os.path.join gives the same path as plain concatenation when
            # subdir already ends with a separator, and a correct one when
            # it does not.
            self.doclist.extend(
                [os.path.join(subdir, name) for name in candi_files])

    # generate the file meeting the requirements of the ldaGibbs++
    def genmodel_inputfile(self):
        ''' Write self.lda_doc: the first line is the number of documents,
            followed by one line per document holding its kept words,
            sorted ascending and separated by single spaces.
        '''
        # Context managers close every handle, including on errors.
        with open(self.lda_doc, "w") as wfd:
            wfd.write("%d\n" % len(self.doclist))
            for doc in self.doclist:
                docwordlist = []
                with open(doc) as rfd:
                    for line in rfd:
                        line = line.strip("\n\r ")
                        # extend() avoids copying the list for every line
                        docwordlist.extend(self.filterwords(line))
                docwordlist.sort()
                wfd.write("%s\n" % " ".join(docwordlist))

    # call Gibbs LDA
    def call_lda(self, topicnum, maxiter):
        ''' Run the external LDA command on self.lda_doc.

            topicnum -- number of topics (passed as -ntopics)
            maxiter  -- number of iterations (passed as -niters)
        '''
        # lda model parameter setting
        # NOTE(review): GibbsLDA++ documents its default alpha as 50/K;
        # this computes K/50 -- confirm that is intended.
        alpha = 1.0*topicnum / 50
        # NOTE(review): the command line goes through the shell and
        # self.lda_doc is not quoted; paths containing spaces or shell
        # metacharacters will break it.
        cmd = CMD + "-alpha " + str(alpha) + " -ntopics " + \
                str(topicnum) + " -niters " + str(maxiter) + \
                " -dfile " + self.lda_doc
        print("Calling Gibbs LDA")
        os.system(cmd)
        print("Finishing calling")

    # filter words based on stopwords list and character rule
    def filterwords(self, textline):
        ''' Return the normalised words of one text line that should be
            fed to LDA.

            Tokens are separated by single spaces and are expected to look
            like "word_POS". A token is kept when its POS tag is in
            TOPIC_POS and its lower-cased, stemmed word is not a stop word
            and matches self.pattern.
        '''
        save_words = []
        for word in textline.split(" "):
            # Leading, trailing or repeated spaces produce empty tokens.
            if not word:
                continue
            biparts = word.split("_")
            # Check the cheap shape/POS condition first so discarded
            # tokens are never stemmed.
            if len(biparts) != 2 or biparts[1] not in TOPIC_POS:
                continue
            # words processing (lower, stemming, stopword)
            stem = biparts[0].lower()
            stem = self.stemmer.stem(stem, 0, len(stem) - 1)
            if not self.stopWords.is_stopword(stem) \
                    and self.pattern.match(stem):
                save_words.append(stem)
        return save_words

    def substr(self, candi_file):
        ''' Return True when self.docsuffix occurs anywhere in candi_file.

            NOTE(review): this is a substring test, not a true suffix test
            ("a.txt.bak" matches ".txt") -- confirm whether endswith() was
            intended.
        '''
        return candi_file.find(self.docsuffix) != -1