def modelSelectionLSI(self):
    """
    Find the optimal LSI parameters across all fields by maximising
    expert coverage on the training set of experts.
    """
    # Assumed module-level imports: numpy, logging, gensim and
    # from gensim.models import LsiModel.
    coverages = numpy.zeros((len(self.ks), len(self.minDfs), len(self.gammas), len(self.fields)))
    logging.getLogger('gensim').setLevel(logging.INFO)
    maxK = numpy.max(self.ks)
    logging.debug("Starting model selection for LSI")

    for t, minDf in enumerate(self.minDfs):
        logging.debug("Using minDf=" + str(minDf))
        self.minDf = minDf

        self.vectoriseDocuments()
        self.loadVectoriser()
        corpus = gensim.corpora.mmcorpus.MmCorpus(self.docTermMatrixFilename + ".mtx")
        id2WordDict = dict(zip(range(len(self.vectoriser.get_feature_names())), self.vectoriser.get_feature_names()))

        logging.debug("Running LSI with " + str(maxK) + " dimensions")
        # Train a single model at the largest k; smaller values of k reuse it
        # below by truncating the projection via lsi.num_topics.
        lsi = LsiModel(corpus, num_topics=maxK, id2word=id2WordDict, chunksize=self.chunksize, distributed=False, onepass=False)

        for i, k in enumerate(self.ks):
            lsi.num_topics = k
            logging.debug("Creating index")
            index = gensim.similarities.docsim.Similarity(self.indexFilename, lsi[corpus], num_features=k)

            for j, field in enumerate(self.fields):
                logging.debug("k=" + str(k) + " and field=" + str(field))
                newX = self.vectoriser.transform([field])
                newX = [(s, newX[0, s]) for s in newX.nonzero()[1]]
                result = lsi[newX]
                similarities = index[result]

                for u, gamma in enumerate(self.gammas):
                    self.gamma = gamma
                    expertsByDocSimilarity, expertsByCitations = self.expertsFromDocSimilarities(similarities, len(self.trainExpertDict[field]), field)
                    expertMatches = self.matchExperts(expertsByDocSimilarity, set(self.trainExpertDict[field]))
                    coverages[i, t, u, j] = float(len(expertMatches)) / len(self.trainExpertDict[field])

            for u, gamma in enumerate(self.gammas):
                logging.debug("Mean coverage for gamma=" + str(gamma) + " " + str(numpy.mean(coverages[i, t, u, :])))

    # Average coverage over fields, then pick the (k, minDf, gamma) maximiser.
    meanCoverages = numpy.mean(coverages, 3)
    logging.debug(meanCoverages)
    bestInds = numpy.unravel_index(numpy.argmax(meanCoverages), meanCoverages.shape)

    self.k = self.ks[bestInds[0]]
    logging.debug("Chosen k=" + str(self.k))
    self.minDf = self.minDfs[bestInds[1]]
    logging.debug("Chosen minDf=" + str(self.minDf))
    self.gamma = self.gammas[bestInds[2]]
    logging.debug("Chosen gamma=" + str(self.gamma))
    logging.debug("Coverage = " + str(numpy.max(meanCoverages)))

    return meanCoverages
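
# The loop above trains a single LsiModel at the largest k and reuses it for
# every smaller k by lowering lsi.num_topics, which truncates the projection
# at query time instead of retraining. A self-contained sketch of that trick
# on a toy corpus (documents, names and values below are made up for
# illustration; behaviour assumes gensim's LSI slices its projection to
# num_topics when transforming a vector):
def _lsiTruncationSketch():
    from gensim import corpora
    from gensim.models import LsiModel

    toyDocs = [["expert", "ranking"], ["document", "similarity"], ["expert", "document"]]
    toyDict = corpora.Dictionary(toyDocs)
    toyBow = [toyDict.doc2bow(d) for d in toyDocs]
    toyLsi = LsiModel(toyBow, id2word=toyDict, num_topics=3)  # train once at max k
    for k in (3, 2, 1):
        toyLsi.num_topics = k       # truncate the projection, no retraining
        print(toyLsi[toyBow[0]])    # at most k (topic, weight) pairs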
# Python 2 script. Standard-library and third-party imports assumed here;
# tools and TextFilesCorpus are project-local (module locations assumed).
import glob
import re
import string
import sys
from os import path

import numpy as np
from scipy import stats
import Stemmer  # PyStemmer

from gensim import utils
from gensim.corpora import Dictionary
from gensim.models import LogEntropyModel, LsiModel
from gensim.similarities import MatrixSimilarity
from gensim.utils import SaveLoad

import tools                          # project-local helpers (assumed)
from corpus import TextFilesCorpus    # project-local corpus class (module name assumed)


def main(param_file=None):
    # setup
    p, base_path, output_dir = tools.setup(param_file)
    model_path = path.join(base_path, p['result_path'], p['model_label'])
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # optional preprocessing: stopword removal and stemming
    preprocess = []
    if 'stoplist' in p.as_dict():
        stoplist = open(path.join(base_path, p['stoplist'])).readlines()
        stoplist = [unicode(s.strip(), encoding='utf-8').lower() for s in stoplist]

        def remove_stopwords(sentence):
            return [word for word in sentence if word not in stoplist]
        preprocess.append(remove_stopwords)

    if 'stemmer' in p.as_dict():
        stemmer = Stemmer.Stemmer(p['stemmer'])
        preprocess.append(stemmer.stemWords)

    # train the model on the small marketing corpus, or load a saved one
    if not p['model_label']:
        cor = TextFilesCorpus(path.join(base_path, p['corpus_path']),
                              no_below=p['no_below'],
                              no_above=p['no_above'],
                              preprocess=preprocess)
        dictionary = cor.dictionary
        pre = LogEntropyModel(cor, id2word=dictionary, normalize=True)
        lsi = LsiModel(pre[cor], id2word=dictionary, num_topics=p['num_topics'])
    else:
        dictionary = Dictionary.load(path.join(model_path, p['dict_name']))
        pre = SaveLoad.load(path.join(model_path, 'pre.model'))
        lsi = LsiModel.load(path.join(model_path, 'lsi.model'))
        lsi.num_topics = p['num_topics']

    # project the test answers and the gold answers into LSI space
    test_cor_path = path.join(base_path, p['test_cor_path'])
    test_answers, gold_answers, ratings = [], [], []

    flist = glob.glob(path.join(test_cor_path, 'corpus_3', '*.txt'))
    for file in flist:
        match = re.search(r'data3_(\d)_\d+\.txt', file)
        ratings.append(int(match.group(1)))
        with open(file) as f:
            doc = string.join(map(string.strip, f.readlines()))
            doc = list(utils.tokenize(doc, lower=True))
            for func in preprocess:
                doc = func(doc)
            test_answers.append(lsi[pre[dictionary.doc2bow(doc)]])

    flist = glob.glob(path.join(test_cor_path, 'corpus_3_golden', '*.txt'))
    for file in flist:
        with open(file) as f:
            doc = string.join(map(string.strip, f.readlines()))
            doc = list(utils.tokenize(doc, lower=True))
            for func in preprocess:
                doc = func(doc)
            gold_answers.append(lsi[pre[dictionary.doc2bow(doc)]])

    # correlate mean similarity to the gold answers with the human ratings
    sim = MatrixSimilarity(test_answers)[gold_answers]
    mean_sim = np.mean(sim, axis=0)
    print 'pearsons corrcoef: %f' % np.corrcoef(ratings, mean_sim)[0, 1]
    print 'spearmans r: %f with p: %f' % stats.spearmanr(ratings, mean_sim)
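
# A minimal entry point for the script above, assuming tools.setup accepts an
# optional parameter-file path; this guard is an addition for illustration,
# not part of the original code.
if __name__ == '__main__':
    main(sys.argv[1] if len(sys.argv) > 1 else None)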