Example #1
 def modelSelectionLSI(self):
     """
     Find the optimal LSI parameters (k, minDf and gamma) for all fields, using
     coverage of the training set of experts as the selection criterion.
     """
    
     coverages = numpy.zeros((len(self.ks), len(self.minDfs), len(self.gammas), len(self.fields)))
     logging.getLogger('gensim').setLevel(logging.INFO) 
     maxK = numpy.max(self.ks)
     
     logging.debug("Starting model selection for LSI")       
    
     for t, minDf in enumerate(self.minDfs): 
         logging.debug("Using minDf=" + str(minDf))
         self.minDf = minDf
         
         self.vectoriseDocuments()
         self.loadVectoriser()
         corpus = gensim.corpora.mmcorpus.MmCorpus(self.docTermMatrixFilename + ".mtx")
         id2WordDict = dict(zip(range(len(self.vectoriser.get_feature_names())), self.vectoriser.get_feature_names()))
         
         logging.debug("Running LSI with " + str(maxK) + " dimensions")
         lsi = LsiModel(corpus, num_topics=maxK, id2word=id2WordDict, chunksize=self.chunksize, distributed=False, onepass=False)    
         
         for i, k in enumerate(self.ks): 
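             # Reuse the single maxK-dimensional LSI model: lowering num_topics truncates the projection, so LSI is not retrained for each candidate k.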
             lsi.num_topics = k
             logging.debug("Creating index")
             index = gensim.similarities.docsim.Similarity(self.indexFilename, lsi[corpus], num_features=k)
             
             for j, field in enumerate(self.fields): 
                 logging.debug("k="+str(k) + " and field=" + str(field))                
                 newX = self.vectoriser.transform([field])
                 newX = [(s, newX[0, s]) for s in newX.nonzero()[1]]
                 result = lsi[newX]             
                 similarities = index[result]
                 
                 for u, gamma in enumerate(self.gammas): 
                     self.gamma = gamma 
                     expertsByDocSimilarity, expertsByCitations = self.expertsFromDocSimilarities(similarities, len(self.trainExpertDict[field]), field)
                     
                     expertMatches = self.matchExperts(expertsByDocSimilarity, set(self.trainExpertDict[field]))
                     coverages[i, t, u, j] = float(len(expertMatches))/len(self.trainExpertDict[field])
             
             for u, gamma in enumerate(self.gammas):
                 logging.debug("Mean coverage for gamma=" + str(gamma) + " " + str(numpy.mean(coverages[i, t, u, :])))
         
     meanCoverages = numpy.mean(coverages, 3)
     logging.debug(meanCoverages)
     
     bestInds = numpy.unravel_index(numpy.argmax(meanCoverages), meanCoverages.shape)
     
     self.k = self.ks[bestInds[0]]
     logging.debug("Chosen k=" + str(self.k))
     
     self.minDf = self.minDfs[bestInds[1]]
     logging.debug("Chosen minDf=" + str(self.minDf))
     
     self.gamma = self.gammas[bestInds[2]]
     logging.debug("Chosen gamma=" + str(self.gamma))   
     
     logging.debug("Coverage = " + str(numpy.max(meanCoverges)))
     
     return meanCoverages
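
 # Hypothetical usage sketch (not part of the original code): the enclosing class and its
 # constructor are not shown above, so the name ExpertFinder below is purely illustrative.
 #
 #     finder = ExpertFinder(fields, ks, minDfs, gammas)
 #     meanCoverages = finder.modelSelectionLSI()  # also sets finder.k, finder.minDf and finder.gamma
 #     finder.vectoriseDocuments()                 # re-vectorise with the chosen minDf before final indexing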
     
     
import sys
import glob
import re
import string
from os import path

import numpy as np
from scipy import stats

import Stemmer
from gensim import utils
from gensim.corpora import Dictionary
from gensim.models import LogEntropyModel, LsiModel
from gensim.similarities import MatrixSimilarity
from gensim.utils import SaveLoad

# 'tools' and 'TextFilesCorpus' are project-local modules; their import paths are not shown here.


def main(param_file=None):

    # setup
    p, base_path, output_dir = tools.setup(param_file)
    model_path = path.join(base_path,
                           p['result_path'],
                           p['model_label'])
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # train the model on the small marketing corpus
    preprocess = []

    if 'stoplist' in p.as_dict():
        stoplist = open(path.join(base_path, p['stoplist'])).readlines()
        stoplist = [unicode(s.strip(), encoding='utf-8').lower() for s in stoplist]
        def remove_stopwords(sentence):
            return [word for word in sentence if word not in stoplist]
        preprocess.append(remove_stopwords)

    if 'stemmer' in p.as_dict():
        stemmer = Stemmer.Stemmer(p['stemmer'])
        preprocess.append(stemmer.stemWords)

    if not p['model_label']:
        cor = TextFilesCorpus(path.join(base_path, p['corpus_path']),
                              no_below=p['no_below'],
                              no_above=p['no_above'],
                              preprocess=preprocess)
        dictionary = cor.dictionary

        pre = LogEntropyModel(cor, id2word=dictionary, normalize=True)
        lsi = LsiModel(pre[cor], id2word=dictionary, num_topics=p['num_topics'])
    else:
        dictionary = Dictionary.load(path.join(model_path, p['dict_name']))
        pre = SaveLoad.load(path.join(model_path, 'pre.model'))
        lsi = LsiModel.load(path.join(model_path, 'lsi.model'))
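        # Lowering num_topics truncates the loaded LSI projection without retraining.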
        lsi.num_topics = p['num_topics']

    test_cor_path = path.join(base_path, p['test_cor_path'])
    test_answers, gold_answers, ratings = [], [], []


    flist = glob.glob(path.join(test_cor_path, 'corpus_3', '*.txt'))
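    # File names encode the human rating as data3_<rating>_<id>.txt; each test document is projected into the LSI space.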
    for file in flist:
        match = re.search(r'data3_(\d)_\d+\.txt', file)
        ratings.append(int(match.group(1)))
        with open(file) as f:
            doc = string.join(map(string.strip, f.readlines()))
            doc = utils.tokenize(doc, lower=True)
            for func in preprocess:
                doc = func(doc)
            corpus = lsi[pre[dictionary.doc2bow(doc)]]
            test_answers.append(corpus)
    flist = glob.glob(path.join(test_cor_path, 'corpus_3_golden', '*.txt'))
    for file in flist:
        with open(file) as f:
            doc = string.join(map(string.strip, f.readlines()))
            doc = utils.tokenize(doc, lower=True)
            for func in preprocess:
                doc = func(doc)
            corpus = lsi[pre[dictionary.doc2bow(doc)]]
            gold_answers.append(corpus)


    sim = MatrixSimilarity(test_answers)[gold_answers]
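    # Average each test answer's similarity over the gold answers, then correlate the result with the human ratings.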
    mean_sim = np.mean(sim, axis=0)
    print 'Pearson correlation: %f' % np.corrcoef(ratings, mean_sim)[0, 1]
    print "Spearman's r: %f with p: %f" % stats.spearmanr(ratings, mean_sim)
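
# Standard entry point (not in the original listing); assumes the parameter file, if any,
# is given as the first command-line argument.
if __name__ == '__main__':
    main(sys.argv[1] if len(sys.argv) > 1 else None)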