def train(self, corpus):
    """Fit the MALLET LDA model on `corpus`.

    The corpus is converted to MALLET's format via :meth:`convert_input`,
    the ``train-topics`` command is executed through the shell, and the
    result of :meth:`load_word_topics` is stored on the instance.

    Parameters
    ----------
    corpus : iterable of iterable of (int, int)
        Corpus in BoW format.

    """
    self.convert_input(corpus, infer=False)

    template = (
        self.mallet_path
        + ' train-topics --input %s --num-topics %s --alpha %s --optimize-interval %s '
        '--num-threads %s --output-state %s --output-doc-topics %s --output-topic-keys %s '
        '--num-iterations %s --inferencer-filename %s --doc-topics-threshold %s'
    )
    # Values are listed in the same order as the placeholders above.
    values = (
        self.fcorpusmallet(),
        self.num_topics,
        self.alpha,
        self.optimize_interval,
        self.workers,
        self.fstate(),
        self.fdoctopics(),
        self.ftopickeys(),
        self.iterations,
        self.finferencer(),
        self.topic_threshold,
    )
    cmd = template % values

    # NOTE "--keep-sequence-bigrams" / "--use-ngrams true" poorer results + runs out of memory
    logger.info("training MALLET LDA with %s", cmd)
    check_output(args=cmd, shell=True)

    self.word_topics = self.load_word_topics()
    # `wordtopics` is the legacy attribute name; it is kept as an alias of
    # `word_topics` so that older callers keep working.
    self.wordtopics = self.word_topics
def convert_input(self, corpus, infer=False):
    """Serialize `corpus` to a text file and convert it to MALLET format.

    Each document is written as one line of the form
    ``<docno> 0 <whitespace-separated tokens>`` (the ``0`` is an unused
    label), then MALLET's ``import-file`` command is run on that file.

    Parameters
    ----------
    corpus : iterable of iterable of (int, int)
        Documents as (token_id, count) pairs; each token is repeated
        ``int(count)`` times in the output line.
    infer : bool, optional
        If True, the output is written to ``fcorpusmallet() + '.infer'`` and
        the existing ``fcorpusmallet()`` file is passed as
        ``--use-pipe-from``.

    """
    logger.info("serializing temporary corpus to %s", self.fcorpustxt())
    # write out the corpus in a file format that MALLET understands: one document per line:
    # document id[SPACE]label (not used)[SPACE]whitespace delimited utf8-encoded tokens
    with utils.smart_open(self.fcorpustxt(), 'wb') as fout:
        for docno, doc in enumerate(corpus):
            # Extend one list in place; summing lists with sum(..., [])
            # copies the accumulator on every step (quadratic).
            tokens = []
            if self.id2word:
                for tokenid, cnt in doc:
                    tokens.extend([self.id2word[tokenid]] * int(cnt))
            else:
                for tokenid, cnt in doc:
                    tokens.extend([str(tokenid)] * int(cnt))
            fout.write(utils.to_utf8("%s 0 %s\n" % (docno, ' '.join(tokens))))

    # convert the text file above into MALLET's internal format
    # The backslash is doubled so Python does not see an invalid "\S"
    # escape; the command line still contains '\S+'.
    cmd = self.mallet_path + (
        " import-file --preserve-case --keep-sequence --remove-stopwords "
        "--token-regex '\\S+' --input %s --output %s"
    )
    if infer:
        cmd += ' --use-pipe-from ' + self.fcorpusmallet()
        cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet() + '.infer')
    else:
        cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet())
    logger.info("converting temporary corpus to MALLET format with %s", cmd)
    # Pass the command by keyword, matching the other call sites in this file.
    check_output(args=cmd, shell=True)
def convert_input(self, corpus, infer=False, serialize_corpus=True):
    """Convert `corpus` to MALLET's binary format via a temporary text file.

    Parameters
    ----------
    corpus : iterable of iterable of (int, int)
        Collection of texts in BoW format.
    infer : bool, optional
        If True, write the result to ``fcorpusmallet() + '.infer'`` and pass
        the existing ``fcorpusmallet()`` file to MALLET as
        ``--use-pipe-from``.
    serialize_corpus : bool, optional
        If True, first dump `corpus` to ``fcorpustxt()`` using
        :meth:`corpus2mallet`; if False, that step is skipped and MALLET
        reads whatever is already at ``fcorpustxt()``.

    """
    if serialize_corpus:
        logger.info("serializing temporary corpus to %s", self.fcorpustxt())
        with smart_open(self.fcorpustxt(), 'wb') as fout:
            self.corpus2mallet(corpus, fout)

    # convert the text file above into MALLET's internal format
    template = (
        self.mallet_path
        + " import-file --preserve-case --keep-sequence "
        "--remove-stopwords --token-regex \"\\S+\" --input %s --output %s"
    )
    if infer:
        template += ' --use-pipe-from ' + self.fcorpusmallet()
        destination = self.fcorpusmallet() + '.infer'
    else:
        destination = self.fcorpusmallet()
    cmd = template % (self.fcorpustxt(), destination)

    logger.info("converting temporary corpus to MALLET format with %s", cmd)
    check_output(args=cmd, shell=True)
def train(self, corpus):
    """Fit the MALLET LDA model on `corpus`.

    The corpus is converted to MALLET's format via :meth:`convert_input`,
    the ``train-topics`` command is executed through the shell, and the
    result of :meth:`load_word_topics` is stored on the instance.

    Parameters
    ----------
    corpus : iterable of iterable of (int, int)
        Corpus in BoW format.

    """
    self.convert_input(corpus, infer=False)

    template = (
        self.mallet_path
        + ' train-topics --input %s --num-topics %s --alpha %s --optimize-interval %s '
        '--num-threads %s --output-state %s --output-doc-topics %s --output-topic-keys %s '
        '--num-iterations %s --inferencer-filename %s --doc-topics-threshold %s'
    )
    # Values are listed in the same order as the placeholders above.
    values = (
        self.fcorpusmallet(),
        self.num_topics,
        self.alpha,
        self.optimize_interval,
        self.workers,
        self.fstate(),
        self.fdoctopics(),
        self.ftopickeys(),
        self.iterations,
        self.finferencer(),
        self.topic_threshold,
    )
    cmd = template % values

    # NOTE "--keep-sequence-bigrams" / "--use-ngrams true" poorer results + runs out of memory
    logger.info("training MALLET LDA with %s", cmd)
    check_output(args=cmd, shell=True)

    self.word_topics = self.load_word_topics()
    # `wordtopics` is the legacy attribute name; it is kept as an alias of
    # `word_topics` so that older callers keep working.
    self.wordtopics = self.word_topics
def convert_input(self, corpus, infer=False, serialize_corpus=True):
    """Serialize `corpus` to a text file and convert it to MALLET format.

    Parameters
    ----------
    corpus : iterable of iterable of (int, int)
        Collection of texts in BoW format.
    infer : bool, optional
        If True, write the result to ``fcorpusmallet() + '.infer'`` and pass
        the existing ``fcorpusmallet()`` file to MALLET as
        ``--use-pipe-from``.
    serialize_corpus : bool, optional
        If True, first dump `corpus` to ``fcorpustxt()`` using
        :meth:`corpus2mallet`; if False, that step is skipped.

    """
    if serialize_corpus:
        logger.info("serializing temporary corpus to %s", self.fcorpustxt())
        with smart_open(self.fcorpustxt(), 'wb') as fout:
            self.corpus2mallet(corpus, fout)

    # convert the text file above into MALLET's internal format
    # The backslash is doubled so Python does not see an invalid "\S"
    # escape; the command line still contains "\S+".
    cmd = (
        self.mallet_path
        + " import-file --preserve-case --keep-sequence "
        "--remove-stopwords --token-regex \"\\S+\" --input %s --output %s"
    )
    if infer:
        cmd += ' --use-pipe-from ' + self.fcorpusmallet()
        cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet() + '.infer')
    else:
        cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet())
    logger.info("converting temporary corpus to MALLET format with %s", cmd)
    check_output(args=cmd, shell=True)
def __getitem__(self, bow, iterations=100):
    """Infer topic vector(s) for a document or a corpus.

    The input is converted with ``convert_input(..., infer=True)``, MALLET's
    ``infer-topics`` command is run through the shell, and the resulting
    doc-topics file is parsed with :meth:`read_doctopics`.

    Parameters
    ----------
    bow : {list of (int, int), iterable of list of (int, int)}
        Document (or corpus) in BoW format.
    iterations : int, optional
        Value passed to MALLET as ``--num-iterations``.

    Returns
    -------
    list of (int, float)
        LDA vector for document as sequence of (topic_id, topic_probability) **OR**
    list of list of (int, float)
        LDA vectors for corpus in same format.

    """
    is_corpus, _ = utils.is_corpus(bow)
    # A single document is wrapped so that it looks like a one-item corpus.
    documents = bow if is_corpus else [bow]

    self.convert_input(documents, infer=True)
    template = (
        self.mallet_path + ' infer-topics --input %s --inferencer %s '
        '--output-doc-topics %s --num-iterations %s --doc-topics-threshold %s'
    )
    cmd = template % (
        self.fcorpusmallet() + '.infer',
        self.finferencer(),
        self.fdoctopics() + '.infer',
        iterations,
        self.topic_threshold,
    )
    logger.info("inferring topics with MALLET LDA '%s'", cmd)
    check_output(args=cmd, shell=True)

    vectors = list(self.read_doctopics(self.fdoctopics() + '.infer'))
    if is_corpus:
        return vectors
    return vectors[0]
def convert_input(self, corpus, infer=False, serialize_corpus=True):
    """Convert corpus to Mallet format and save it to a temporary text file.

    Parameters
    ----------
    corpus : iterable of iterable of (int, int)
        Collection of texts in BoW format.
    infer : bool, optional
        If True, write the result to ``fcorpusmallet() + '.infer'`` and pass
        the existing ``fcorpusmallet()`` file to MALLET as
        ``--use-pipe-from``.
    serialize_corpus : bool, optional
        If True, first dump `corpus` to ``fcorpustxt()`` using
        :meth:`corpus2mallet`; if False, that step is skipped.

    """
    if serialize_corpus:
        logger.info("serializing temporary corpus to %s", self.fcorpustxt())
        with smart_open(self.fcorpustxt(), 'wb') as fout:
            self.corpus2mallet(corpus, fout)

    # convert the text file above into MALLET's internal format
    # The backslash is doubled so Python does not see an invalid "\S"
    # escape; the command line still contains "\S+".
    cmd = (
        self.mallet_path
        + " import-file --preserve-case --keep-sequence "
        "--remove-stopwords --token-regex \"\\S+\" --input %s --output %s"
    )
    if infer:
        cmd += ' --use-pipe-from ' + self.fcorpusmallet()
        cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet() + '.infer')
    else:
        cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet())
    logger.info("converting temporary corpus to MALLET format with %s", cmd)
    check_output(args=cmd, shell=True)
def convert_input(self, corpus, infer=False):
    """Serialize `corpus` to a text file and convert it to MALLET format.

    Each document is written as one line of the form
    ``<docno> 0 <whitespace-separated tokens>`` (the ``0`` is an unused
    label), then MALLET's ``import-file`` command is run on that file.

    Parameters
    ----------
    corpus : iterable of iterable of (int, int)
        Documents as (token_id, count) pairs; each token is repeated
        ``int(count)`` times in the output line.
    infer : bool, optional
        If True, the output is written to ``fcorpusmallet() + '.infer'`` and
        the existing ``fcorpusmallet()`` file is passed as
        ``--use-pipe-from``.

    """
    logger.info("serializing temporary corpus to %s", self.fcorpustxt())
    # write out the corpus in a file format that MALLET understands: one document per line:
    # document id[SPACE]label (not used)[SPACE]whitespace delimited utf8-encoded tokens
    with utils.smart_open(self.fcorpustxt(), 'wb') as fout:
        for docno, doc in enumerate(corpus):
            # Extend one list in place; summing lists with sum(..., [])
            # copies the accumulator on every step (quadratic).
            tokens = []
            if self.id2word:
                for tokenid, cnt in doc:
                    tokens.extend([self.id2word[tokenid]] * int(cnt))
            else:
                for tokenid, cnt in doc:
                    tokens.extend([str(tokenid)] * int(cnt))
            fout.write(utils.to_utf8("%s 0 %s\n" % (docno, ' '.join(tokens))))

    # convert the text file above into MALLET's internal format
    # The backslash is doubled so Python does not see an invalid "\S"
    # escape; the command line still contains '\S+'.
    cmd = self.mallet_path + (
        " import-file --preserve-case --keep-sequence --remove-stopwords "
        "--token-regex '\\S+' --input %s --output %s"
    )
    if infer:
        cmd += ' --use-pipe-from ' + self.fcorpusmallet()
        cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet() + '.infer')
    else:
        cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet())
    logger.info("converting temporary corpus to MALLET format with %s", cmd)
    # Pass the command by keyword, matching the other call sites in this file.
    check_output(args=cmd, shell=True)
def train(self, corpus: Iterable[Iterable[tuple[int, int]]]):
    """Train STTM model.

    Converts `corpus` with :meth:`convert_input`, runs the STTM jar through
    the shell, then stores the result of :meth:`load_word_topics` on the
    instance.

    Parameters
    ----------
    corpus : iterable of iterable of (int, int)
        Corpus in BoW format

    """
    self.convert_input(corpus)

    # NOTE(review): this overwrites any `java_opts` set earlier on the
    # instance with a fixed 1 GB heap limit on every call - confirm intended.
    self.java_opts = '-Xmx1G'

    template = (
        'java {} -jar {} -model {} -corpus {} -ntopics {} -alpha {} -beta {} '
        '-niters {} -twords {} -name {} -sstep {}'
    )
    cmd = template.format(
        self.java_opts,
        self.sstm_jar_path,
        self.model,
        self.text_corpus_filename(),
        self.num_topics,
        self.alpha[0],  # only the first alpha entry is passed to STTM
        self.beta,
        self.iterations,
        self.twords,
        self.name,
        self.sstep,
    )
    if self.vectors is not None:
        cmd += ' -vectors {}'.format(self.vectors)

    logger.info("training STTM model with %s", cmd)
    check_output(args=cmd, shell=True)

    self.word_topics = self.load_word_topics()
    # legacy alias of `word_topics`
    self.wordtopics = self.word_topics
def __getitem__(self, bow, iterations=100):
    """Infer topic vector(s) for a document or a corpus.

    The input is converted with ``convert_input(..., infer=True)``, MALLET's
    ``infer-topics`` command is run through the shell, and the resulting
    doc-topics file is parsed with :meth:`read_doctopics`.

    Parameters
    ----------
    bow : {list of (int, int), iterable of list of (int, int)}
        Document (or corpus) in BoW format.
    iterations : int, optional
        Value passed to MALLET as ``--num-iterations``.

    Returns
    -------
    list of (int, float)
        LDA vector for document as sequence of (topic_id, topic_probability) **OR**
    list of list of (int, float)
        LDA vectors for corpus in same format.

    """
    is_corpus, _ = utils.is_corpus(bow)
    # A single document is wrapped so that it looks like a one-item corpus.
    documents = bow if is_corpus else [bow]

    self.convert_input(documents, infer=True)
    template = (
        self.mallet_path + ' infer-topics --input %s --inferencer %s '
        '--output-doc-topics %s --num-iterations %s --doc-topics-threshold %s'
    )
    cmd = template % (
        self.fcorpusmallet() + '.infer',
        self.finferencer(),
        self.fdoctopics() + '.infer',
        iterations,
        self.topic_threshold,
    )
    logger.info("inferring topics with MALLET LDA '%s'", cmd)
    check_output(args=cmd, shell=True)

    vectors = list(self.read_doctopics(self.fdoctopics() + '.infer'))
    if is_corpus:
        return vectors
    return vectors[0]
def train(self, corpus, time_slices, mode, model):
    """Train DTM model using specified corpus and time slices.

    Runs the DTM binary at ``self.dtm_path`` and then loads its output files
    into numpy arrays stored on the instance.

    Parameters
    ----------
    corpus
        Corpus passed through to :meth:`convert_input`.
    time_slices
        Time slices passed through to :meth:`convert_input`.
    mode
        Value of the binary's ``--mode`` option.
    model
        Value of the binary's ``--model`` option; when equal to ``'fixed'``
        the per-slice influence files are loaded as well.

    """
    self.convert_input(corpus, time_slices)

    # Build argv as a list instead of formatting one string and splitting it
    # on whitespace: a corpus prefix or output name that contains a space
    # would otherwise be torn into several arguments.
    options = [
        "--ntopics={}".format(self.num_topics),
        "--model={}".format(model),
        "--mode={}".format(mode),
        "--initialize_lda={}".format(self.initialize_lda),
        "--corpus_prefix={}".format(self.fcorpus()),
        "--outname={}".format(self.foutname()),
        "--alpha={}".format(self.alpha),
        "--lda_max_em_iter={}".format(self.lda_max_em_iter),
        "--lda_sequence_min_iter={}".format(self.lda_sequence_min_iter),
        "--lda_sequence_max_iter={}".format(self.lda_sequence_max_iter),
        "--top_chain_var={}".format(self.top_chain_var),
        "--rng_seed={}".format(self.rng_seed),
    ]
    logger.info("training DTM with args %s", " ".join(options))

    cmd = [self.dtm_path] + options
    logger.info("Running command %s", cmd)
    # Pass the command by keyword, matching the other call sites in this file.
    check_output(args=cmd, stderr=PIPE)

    self.em_steps = np.loadtxt(self.fem_steps())
    self.init_ss = np.loadtxt(self.flda_ss())

    if self.initialize_lda:
        self.init_alpha = np.loadtxt(self.finit_alpha())
        self.init_beta = np.loadtxt(self.finit_beta())

    self.lhood_ = np.loadtxt(self.fout_liklihoods())

    # document-topic proportions
    self.gamma_ = np.loadtxt(self.fout_gamma())
    # cast to correct shape, gamma[5,10] is the proportion of the 10th topic
    # in doc 5
    self.gamma_.shape = (self.lencorpus, self.num_topics)
    # normalize proportions so that each document's row sums to 1
    self.gamma_ /= self.gamma_.sum(axis=1)[:, np.newaxis]

    # One flat row per topic; reshaped to 3-D after loading.
    n_slices = len(self.time_slices)
    self.lambda_ = np.zeros((self.num_topics, self.num_terms * n_slices))
    self.obs_ = np.zeros((self.num_topics, self.num_terms * n_slices))

    for t in range(self.num_topics):
        topic = "%03d" % t
        self.lambda_[t, :] = np.loadtxt(self.fout_prob().format(i=topic))
        self.obs_[t, :] = np.loadtxt(self.fout_observations().format(i=topic))

    # cast to (num_topics, num_terms, number of time slices)
    self.lambda_.shape = (self.num_topics, self.num_terms, n_slices)
    self.obs_.shape = (self.num_topics, self.num_terms, n_slices)

    # extract document influence on topics for each time slice
    # influences_time[0] , influences at time 0
    if model == 'fixed':
        for k, t in enumerate(self.time_slices):
            stamp = "%03d" % k
            influence = np.loadtxt(self.fout_influence().format(i=stamp))
            # presumably `t` is the number of documents in slice k - verify
            influence.shape = (t, self.num_topics)
            # influence[2,5] influence of document 2 on topic 5
            self.influences_time.append(influence)
def train(self, corpus):
    """Train MALLET LDA on `corpus`.

    Converts the corpus with :meth:`convert_input`, runs MALLET's
    ``train-topics`` command through the shell and stores the result of
    :meth:`load_word_topics` in ``self.word_topics``.

    Parameters
    ----------
    corpus
        Corpus passed through to :meth:`convert_input`.

    """
    self.convert_input(corpus, infer=False)
    cmd = self.mallet_path + " train-topics --input %s --num-topics %s --optimize-interval %s "\
        "--num-threads %s --output-state %s --output-doc-topics %s --output-topic-keys %s "\
        "--num-iterations %s --inferencer-filename %s"
    cmd = cmd % (
        self.fcorpusmallet(), self.num_topics, self.optimize_interval, self.workers,
        self.fstate(), self.fdoctopics(), self.ftopickeys(), self.iterations, self.finferencer()
    )
    # NOTE "--keep-sequence-bigrams" / "--use-ngrams true" poorer results + runs out of memory
    logger.info("training MALLET LDA with %s", cmd)
    # Pass the command by keyword, matching the other call sites in this file.
    check_output(args=cmd, shell=True)
    self.word_topics = self.load_word_topics()
def train(self, corpus):
    """Train MALLET LDA on `corpus`.

    Converts the corpus with :meth:`convert_input`, runs MALLET's
    ``train-topics`` command through the shell and stores the result of
    :meth:`load_word_topics` in ``self.word_topics``.

    Parameters
    ----------
    corpus
        Corpus passed through to :meth:`convert_input`.

    """
    self.convert_input(corpus, infer=False)
    cmd = self.mallet_path + " train-topics --input %s --num-topics %s --alpha %s --optimize-interval %s "\
        "--num-threads %s --output-state %s --output-doc-topics %s --output-topic-keys %s "\
        "--num-iterations %s --inferencer-filename %s --doc-topics-threshold %s"
    cmd = cmd % (
        self.fcorpusmallet(), self.num_topics, self.alpha, self.optimize_interval,
        self.workers, self.fstate(), self.fdoctopics(), self.ftopickeys(), self.iterations,
        self.finferencer(), self.topic_threshold
    )
    # NOTE "--keep-sequence-bigrams" / "--use-ngrams true" poorer results + runs out of memory
    logger.info("training MALLET LDA with %s", cmd)
    # Pass the command by keyword, matching the other call sites in this file.
    check_output(args=cmd, shell=True)
    self.word_topics = self.load_word_topics()
def dim(dtm_path, input_dir, output_dir, num_topics=40):
    """Run the DTM binary in fixed-model (DIM) mode.

    Parameters
    ----------
    dtm_path
        Executable to run; becomes the first element of the command.
    input_dir : str
        Directory of the input data; the corpus prefix is
        ``input_dir + "/dim"``.
    output_dir : str
        Value of the ``--outname`` option.
    num_topics : int, optional
        Value of the ``--ntopics`` option.

    """
    print("Running DIM")
    # Only the corpus prefix, output name and topic count vary; every other
    # option is a fixed setting.
    argv = [
        dtm_path,
        "--mode=fit",
        "--rng_seed=0",
        "--model=fixed",
        "--initialize_lda=true",
        "--corpus_prefix=" + input_dir + "/dim",
        "--outname=" + output_dir,
        "--time_resolution=2",
        "--influence_flat_years=5",
        "--top_obs_var=0.5",
        "--top_chain_var=0.005",
        "--sigma_d=0.0001",
        "--sigma_l=0.0001",
        "--alpha=0.01",
        "--lda_sequence_min_iter=6",
        "--lda_sequence_max_iter=20",
        "--save_time=-1",
        "--ntopics=" + str(num_topics),
        "--lda_max_em_iter=10",
    ]
    check_output(argv)
    print("Done with DIM")
def __getitem__(self, bow, iterations=100):
    """Infer topic vector(s) for a document or a corpus.

    The input is converted with ``convert_input(..., infer=True)``, MALLET's
    ``infer-topics`` command is run through the shell, and the resulting
    doc-topics file is parsed with :meth:`read_doctopics`.

    Parameters
    ----------
    bow
        A single document or a corpus, as decided by ``utils.is_corpus``.
    iterations : int, optional
        Value passed to MALLET as ``--num-iterations``.

    Returns
    -------
    list
        All parsed vectors when `bow` is a corpus, otherwise only the first
        (and only) one.

    """
    is_corpus, _ = utils.is_corpus(bow)
    # A single document is wrapped so that it looks like a one-item corpus.
    documents = bow if is_corpus else [bow]

    self.convert_input(documents, infer=True)
    template = self.mallet_path + ' infer-topics --input %s --inferencer %s --output-doc-topics %s --num-iterations %s --doc-topics-threshold %s'
    cmd = template % (
        self.fcorpusmallet() + '.infer',
        self.finferencer(),
        self.fdoctopics() + '.infer',
        iterations,
        self.topic_threshold,
    )
    logger.info("inferring topics with MALLET LDA '%s'", cmd)
    check_output(args=cmd, shell=True)

    vectors = list(self.read_doctopics(self.fdoctopics() + '.infer'))
    if is_corpus:
        return vectors
    return vectors[0]
def testConversion(self):
    """Run the glove2word2vec script and check that its output loads.

    The script is executed in a subprocess with ``self.datapath`` as input
    and ``self.output_file`` as output. The converted file must load as
    word2vec-format vectors, and the similarity of two word sets containing
    the same words must be (close to) 1.0.
    """
    check_output(args=[
        sys.executable, '-m', 'gensim.scripts.glove2word2vec',
        '--input', self.datapath, '--output', self.output_file
    ])

    # test that the converted model loads successfully
    # Only loading is guarded: a failing similarity assertion must be
    # reported as such, not as "could not be loaded".
    try:
        self.test_model = gensim.models.KeyedVectors.load_word2vec_format(self.output_file)
    except Exception:
        if os.path.isfile(self.output_file):
            self.fail('model file %s was created but could not be loaded.' % self.output_file)
        else:
            self.fail(
                'model file %s creation failed, check the parameters and input file format.' % self.output_file
            )

    self.assertTrue(numpy.allclose(self.test_model.n_similarity(['the', 'and'], ['and', 'the']), 1.0))
def convert_input(self, corpus, infer=False, serialize_corpus=True):
    """Serialize `corpus` to a text file and convert it to MALLET format.

    Parameters
    ----------
    corpus
        Corpus handed to :meth:`corpus2mallet` for serialization.
    infer : bool, optional
        If True, write the result to ``fcorpusmallet() + '.infer'`` and pass
        the existing ``fcorpusmallet()`` file to MALLET as
        ``--use-pipe-from``.
    serialize_corpus : bool, optional
        If True, first dump `corpus` to ``fcorpustxt()``; if False, that
        step is skipped.

    """
    if serialize_corpus:
        logger.info("serializing temporary corpus to %s", self.fcorpustxt())
        with smart_open(self.fcorpustxt(), 'wb') as fout:
            self.corpus2mallet(corpus, fout)

    # convert the text file above into MALLET's internal format
    # The backslash is doubled so Python does not see an invalid "\S"
    # escape; the command line still contains "\S+".
    cmd = self.mallet_path + (
        ' import-file --preserve-case --keep-sequence --remove-stopwords '
        '--token-regex "\\S+" --input %s --output %s'
    )
    if infer:
        cmd += ' --use-pipe-from ' + self.fcorpusmallet()
        cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet() + '.infer')
    else:
        cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet())
    logger.info("converting temporary corpus to MALLET format with %s", cmd)
    check_output(args=cmd, shell=True)
def train(self, corpus: Iterable[Iterable[Tuple[int, int]]], **kwargs):
    """Fit the MALLET LDA model on `corpus`.

    Parameters
    ----------
    corpus : iterable of iterable of (int, int)
        Corpus in BoW format
    **kwargs
        ``use_existing_corpus`` (bool, default False): when true and the
        file named by :meth:`mallet_corpus_filename` already exists, the
        corpus conversion step is skipped and that file is reused.

    """
    use_existing_corpus: bool = kwargs.get('use_existing_corpus', False)
    if not (os.path.isfile(self.mallet_corpus_filename()) and use_existing_corpus):
        self.convert_input(corpus, infer=False)
    else:
        logger.warning("using EXISTING corpus.mallet!")

    # Each fragment ends with a space, so joining them with "" yields the
    # full command line (including a trailing space).
    fragments = [
        f"{self.mallet_path} train-topics ",
        f"--input {self.mallet_corpus_filename()} ",
        f"--num-topics {self.num_topics} ",
        f"--alpha {self.alpha} ",
        f"--optimize-interval {self.optimize_interval} ",
        f"--num-threads {self.workers} ",
        f"--output-state {self.mallet_state_filename()} ",
        f"--output-doc-topics {self.document_topics_filename()} ",
        f"--output-topic-keys {self.topic_keys_filename()} ",
        f"--num-top-words {self.num_top_words} ",
        f"--diagnostics-file {self.diagnostics_filename()} ",
        f"--num-iterations {self.iterations} ",
        f"--inferencer-filename {self.inferencer_filename()} ",
        f"--doc-topics-threshold {self.topic_threshold} ",
        f"--random-seed {str(self.random_seed)} ",
    ]
    cmd: str = "".join(fragments)

    logger.info(f"training MALLET LDA with {cmd}")
    check_output(args=cmd, shell=True)

    self.word_topics = self.load_word_topics()
    # legacy alias of `word_topics`
    self.wordtopics = self.word_topics
def fasttext_fit(train_file_path, param_dict, fasttext_path, thread=1, compress_model=False, model_path='/dev/shm/model', pretrained_vectors_path=None):
    """
    Trains a fastText supervised model. This is a wrapper around the fastText command line interface.

    :param train_file_path: path to the training dataset
    :param param_dict: dictionary mapping fasttext hyperparameters to their values
    :param fasttext_path: path to the fastText executable
    :param thread: int, the number of threads to use
    :param compress_model: indicates whether the fastText model should be compressed (using fastText's quantize).
    :param model_path: str, path to output model (without extension)
    :param pretrained_vectors_path: str, path to pre-trained `.vec` file with word embeddings
    :return str: path to trained model (`.ftz` if compressed, otherwise `.bin`)
    """
    train_call, compress_call = get_fasttext_train_calls(
        train_file_path,
        param_dict,
        fasttext_path,
        model_path,
        thread,
        pretrained_vectors_path=pretrained_vectors_path,
    )

    utils.check_output(args=train_call, stderr=subprocess.DEVNULL)
    if compress_model:
        utils.check_output(args=compress_call, stderr=subprocess.DEVNULL)

    # remove auxiliary vectors file
    os.remove(model_path + '.vec')

    uncompressed = model_path + '.bin'
    if not compress_model:
        return uncompressed

    # remove non-compressed model file if compression was performed
    os.remove(uncompressed)
    return model_path + '.ftz'
def fasttext_predict(trained_model_path, test_file_path, fasttext_path, probability_file_path):
    """
    Predicts class probabilities for a given dataset using a previously trained fastText model.

    :param trained_model_path: path to the trained fastText model
    :param test_file_path: path to the test dataset
    :param fasttext_path: path to the fastText executable
    :param probability_file_path: str, path to the output file with class probabilities for the test dataset;
        output written to this file will always be gzipped
    """
    command = get_fasttext_test_calls(test_file_path, fasttext_path, trained_model_path)
    raw_output = utils.check_output(args=command, stderr=subprocess.DEVNULL)

    # The captured output is stored as-is, gzip-compressed.
    with gzip.open(probability_file_path, 'wb') as gz_file:
        gz_file.write(raw_output)
def train(cls, wr_path, corpus_file, out_path, size=100, window=15, symmetric=1, min_count=5, max_vocab_size=0,
          sgd_num=100, lrate=0.001, period=10, iter=91, epsilon=0.75, dump_period=10, reg=0, alpha=100,
          beta=99, loss='hinge', memory=4.0, cleanup_files=True, sorted_vocab=1, ensemble=0):
    """
    Train a Wordrank model by running the glove preprocessing tools and the wordrank binary.

    NOTE: this function changes the process working directory with `os.chdir` (into the meta directory,
    then up one level, then up two more levels) and does not restore the directory that was current
    when it was called. All tool paths below are relative to those directories.

    `wr_path` is the path to the Wordrank directory.

    `corpus_file` is the filename of the text file to be used for training the Wordrank model.
    Expects file to contain space-separated tokens in a single line

    `out_path` is the path to directory which will be created to save embeddings and training data.

    `size` is the dimensionality of the feature vectors.

    `window` is the number of context words to the left (and to the right, if symmetric = 1).

    `symmetric` if 0, only use left context words, else use left and right both.

    `min_count` = ignore all words with total frequency lower than this.

    `max_vocab_size` upper bound on vocabulary size, i.e. keep the <int> most frequent words.
    Default is 0 for no limit.

    `sgd_num` number of SGD taken for each data point.

    `lrate` is the learning rate (too high diverges, give Nan).

    `period` is the period of xi variable updates

    `iter` = number of iterations (epochs) over the corpus.

    `epsilon` is the power scaling value for weighting function.

    `dump_period` is the period after which embeddings should be dumped.

    `reg` is the value of regularization parameter.

    `alpha` is the alpha parameter of gamma distribution.

    `beta` is the beta parameter of gamma distribution.

    `loss` = name of the loss (logistic, hinge).

    `memory` = soft limit for memory consumption, in GB.

    `cleanup_files` if True, delete directory and files used by this wrapper, setting to False can be useful
    for debugging

    `sorted_vocab` = if 1 (default), sort the vocabulary by descending frequency before assigning word indexes.

    `ensemble` = 0 (default), use ensemble of word and context vectors

    Returns the model produced by `cls.load_wordrank_model`.
    """
    # NOTE(review): `meta_data_path` is not used anywhere below.
    meta_data_path = 'matrix.meta'
    # Bare file names: they are resolved against the current working
    # directory, which is the meta directory after the chdir below.
    vocab_file = 'vocab.txt'
    temp_vocab_file = 'tempvocab.txt'
    cooccurrence_file = 'cooccurrence'
    cooccurrence_shuf_file = 'wiki.toy'
    meta_file = 'meta'

    # prepare training data (cooccurrence matrix and vocab)
    model_dir = os.path.join(wr_path, out_path)
    meta_dir = os.path.join(model_dir, 'meta')
    os.makedirs(meta_dir)
    logger.info("Dumped data will be stored in '%s'", model_dir)
    # The corpus is copied into the meta directory under its base name.
    copyfile(corpus_file, os.path.join(meta_dir, corpus_file.split('/')[-1]))
    os.chdir(meta_dir)

    # presumably '../../glove' resolves to <wr_path>/glove when `out_path`
    # is a single directory name - verify against callers
    cmd_vocab_count = ['../../glove/vocab_count', '-min-count', str(min_count), '-max-vocab', str(max_vocab_size)]
    cmd_cooccurence_count = ['../../glove/cooccur', '-memory', str(memory), '-vocab-file', temp_vocab_file,
                             '-window-size', str(window), '-symmetric', str(symmetric)]
    cmd_shuffle_cooccurences = ['../../glove/shuffle', '-memory', str(memory)]
    # `cut -d " " -f 1` keeps only the first space-separated field of each line.
    cmd_del_vocab_freq = ['cut', '-d', " ", '-f', '1', temp_vocab_file]

    commands = [cmd_vocab_count, cmd_cooccurence_count, cmd_shuffle_cooccurences]

    logger.info("Prepare training data using glove code '%s'", commands)
    # The i-th command reads input_fnames[i] on stdin and its output is
    # written to output_fnames[i].
    input_fnames = [corpus_file.split('/')[-1], corpus_file.split('/')[-1], cooccurrence_file]
    output_fnames = [temp_vocab_file, cooccurrence_file, cooccurrence_shuf_file]

    for command, input_fname, output_fname in zip(commands, input_fnames, output_fnames):
        with smart_open(input_fname, 'rb') as r:
            with smart_open(output_fname, 'wb') as w:
                utils.check_output(w, args=command, stdin=r)
    with smart_open(vocab_file, 'wb') as w:
        utils.check_output(w, args=cmd_del_vocab_freq)

    # Count lines of the vocab file and of the shuffled cooccurrence file.
    with smart_open(vocab_file, 'rb') as f:
        numwords = sum(1 for line in f)
    with smart_open(cooccurrence_shuf_file, 'rb') as f:
        numlines = sum(1 for line in f)
    # The meta file holds three lines built from those counts and file names.
    with smart_open(meta_file, 'wb') as f:
        meta_info = "{0} {1}\n{2} {3}\n{4} {5}".format(numwords, numwords, numlines, cooccurrence_shuf_file,
                                                       numwords, vocab_file)
        f.write(meta_info.encode('utf-8'))

    # Each key/value pair becomes a "--key value" option of the wordrank binary.
    wr_args = {
        'path': 'meta',
        'nthread': multiprocessing.cpu_count(),
        'sgd_num': sgd_num,
        'lrate': lrate,
        'period': period,
        'iter': iter,
        'epsilon': epsilon,
        'dump_prefix': 'model',
        'dump_period': dump_period,
        'dim': size,
        'reg': reg,
        'alpha': alpha,
        'beta': beta,
        'loss': loss
    }

    # Move up from the meta directory to the model directory.
    os.chdir('..')
    # run wordrank executable with wr_args
    cmd = ['mpirun', '-np', '1', '../wordrank']
    for option, value in wr_args.items():
        cmd.append("--%s" % option)
        cmd.append(str(value))
    logger.info("Running wordrank binary '%s'", cmd)
    # NOTE(review): `output` is assigned but never read.
    output = utils.check_output(args=cmd)

    # use embeddings from max. iteration's dump
    # NOTE(review): under Python 3 `/` is true division, so this is a float
    # (90.0 for the defaults) that `%d` truncates below; under Python 2
    # integer division gives 89 for the same inputs - confirm which is intended.
    max_iter_dump = iter / dump_period * dump_period - 1
    copyfile('model_word_%d.txt' % max_iter_dump, 'wordrank.words')
    copyfile('model_context_%d.txt' % max_iter_dump, 'wordrank.contexts')
    model = cls.load_wordrank_model('wordrank.words', os.path.join('meta', vocab_file), 'wordrank.contexts',
                                    sorted_vocab, ensemble)
    # Leaves the working directory two levels above the model directory.
    os.chdir('../..')

    if cleanup_files:
        # NOTE(review): if `wr_path` is relative, `model_dir` is resolved
        # against the new working directory here - verify.
        rmtree(model_dir)
    return model
def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, min_count=5, max_vocab_size=0,
          sgd_num=100, lrate=0.001, period=10, iter=90, epsilon=0.75, dump_period=10, reg=0, alpha=100,
          beta=99, loss='hinge', memory=4.0, np=1, cleanup_files=False, sorted_vocab=1, ensemble=0):
    """
    The word and context embedding files are generated by wordrank binary and are saved in "out_name" directory
    which is created inside wordrank directory. The vocab and cooccurence files are generated using glove code
    available inside the wordrank directory. These files are used by the wordrank binary for training.

    `wr_path` is the absolute path to the Wordrank directory.

    `corpus_file` is the filename of the text file to be used for training the Wordrank model.
    Expects file to contain space-separated tokens in a single line

    `out_name` is name of the directory which will be created (in wordrank folder) to save embeddings and
    training data. It will contain following contents:

        Word Embeddings saved after every dump_period and stored in a file model_word_<current iter>.txt

        Context Embeddings saved after every dump_period and stored in a file model_context_<current iter>.txt

        A meta directory which contain: 'vocab.txt' - vocab words, 'wiki.toy' - word-word coccurence values,
        'meta' - vocab and coccurence lengths

    `size` is the dimensionality of the feature vectors.

    `window` is the number of context words to the left (and to the right, if symmetric = 1).

    `symmetric` if 0, only use left context words, else use left and right both.

    `min_count` = ignore all words with total frequency lower than this.

    `max_vocab_size` upper bound on vocabulary size, i.e. keep the <int> most frequent words.
    Default is 0 for no limit.

    `sgd_num` number of SGD taken for each data point.

    `lrate` is the learning rate (too high diverges, give Nan).

    `period` is the period of xi variable updates

    `iter` = number of iterations (epochs) over the corpus. If it is a multiple of `dump_period` it is
    increased by one before being passed to wordrank; otherwise a warning is logged.

    `epsilon` is the power scaling value for weighting function.

    `dump_period` is the period after which embeddings should be dumped.

    `reg` is the value of regularization parameter.

    `alpha` is the alpha parameter of gamma distribution.

    `beta` is the beta parameter of gamma distribution.

    `loss` = name of the loss (logistic, hinge).

    `memory` = soft limit for memory consumption, in GB.

    `np` number of copies to execute. (mpirun option)

    `cleanup_files` if True, delete directory and files used by this wrapper, setting to False can be useful
    for debugging

    `sorted_vocab` = if 1 (default), sort the vocabulary by descending frequency before assigning word indexes.

    `ensemble` = 0 (default), use ensemble of word and context vectors

    Returns the model produced by `cls.load_wordrank_model`.
    """
    # prepare training data (cooccurrence matrix and vocab)
    model_dir = os.path.join(wr_path, out_name)
    meta_dir = os.path.join(model_dir, 'meta')
    os.makedirs(meta_dir)
    logger.info("Dumped data will be stored in '%s'", model_dir)

    # Derive the corpus base name once with os.path so that the copy
    # destination and the glove input path always agree (mixing
    # str.split('/') with os.path.split can diverge on non-POSIX paths).
    corpus_copy = os.path.join(meta_dir, os.path.basename(corpus_file))
    copyfile(corpus_file, corpus_copy)

    vocab_file = os.path.join(meta_dir, 'vocab.txt')
    temp_vocab_file = os.path.join(meta_dir, 'tempvocab.txt')
    cooccurrence_file = os.path.join(meta_dir, 'cooccurrence')
    cooccurrence_shuf_file = os.path.join(meta_dir, 'wiki.toy')
    meta_file = os.path.join(meta_dir, 'meta')

    cmd_vocab_count = [
        os.path.join(wr_path, 'glove', 'vocab_count'),
        '-min-count', str(min_count), '-max-vocab', str(max_vocab_size)
    ]
    cmd_cooccurence_count = [
        os.path.join(wr_path, 'glove', 'cooccur'), '-memory', str(memory),
        '-vocab-file', temp_vocab_file, '-window-size', str(window), '-symmetric', str(symmetric)
    ]
    cmd_shuffle_cooccurences = [os.path.join(wr_path, 'glove', 'shuffle'), '-memory', str(memory)]
    # `cut -d " " -f 1` keeps only the first space-separated field of each line.
    cmd_del_vocab_freq = ['cut', '-d', " ", '-f', '1', temp_vocab_file]

    # The i-th command reads input_fnames[i] on stdin and its output is
    # written to output_fnames[i].
    commands = [cmd_vocab_count, cmd_cooccurence_count, cmd_shuffle_cooccurences]
    input_fnames = [corpus_copy, corpus_copy, cooccurrence_file]
    output_fnames = [temp_vocab_file, cooccurrence_file, cooccurrence_shuf_file]

    logger.info("Prepare training data (%s) using glove code", ", ".join(input_fnames))
    for command, input_fname, output_fname in zip(commands, input_fnames, output_fnames):
        with smart_open(input_fname, 'rb') as r:
            with smart_open(output_fname, 'wb') as w:
                utils.check_output(w, args=command, stdin=r)

    logger.info("Deleting frequencies from vocab file")
    with smart_open(vocab_file, 'wb') as w:
        utils.check_output(w, args=cmd_del_vocab_freq)

    with smart_open(vocab_file, 'rb') as f:
        numwords = sum(1 for _ in f)
    with smart_open(cooccurrence_shuf_file, 'rb') as f:
        numlines = sum(1 for _ in f)
    # The meta file refers to its sibling files by base name only.
    with smart_open(meta_file, 'wb') as f:
        meta_info = "{0} {1}\n{2} {3}\n{4} {5}".format(
            numwords, numwords, numlines, os.path.basename(cooccurrence_shuf_file),
            numwords, os.path.basename(vocab_file)
        )
        f.write(meta_info.encode('utf-8'))

    if iter % dump_period == 0:
        # presumably one extra iteration is needed for wordrank to write
        # the dump numbered `iter` - verify against the wordrank binary
        iter += 1
    else:
        logger.warning(
            "Resultant embedding will be from %d iterations rather than the input %d iterations, as wordrank dumps the embedding only at dump_period intervals. "
            "Input an appropriate combination of parameters (iter, dump_period) such that \"iter mod dump_period\" is zero.",
            iter - (iter % dump_period), iter
        )

    # Each key/value pair becomes a "--key value" option of the wordrank binary.
    wr_args = {
        'path': meta_dir,
        'nthread': multiprocessing.cpu_count(),
        'sgd_num': sgd_num,
        'lrate': lrate,
        'period': period,
        'iter': iter,
        'epsilon': epsilon,
        'dump_prefix': 'model',
        'dump_period': dump_period,
        'dim': size,
        'reg': reg,
        'alpha': alpha,
        'beta': beta,
        'loss': loss
    }

    # run wordrank executable with wr_args
    cmd = ['mpirun', '-np', str(np), os.path.join(wr_path, 'wordrank')]
    for option, value in wr_args.items():
        cmd.append('--%s' % option)
        cmd.append(str(value))
    logger.info("Running wordrank binary")
    utils.check_output(args=cmd)

    # use embeddings from max. iteration's dump
    max_iter_dump = iter - (iter % dump_period)
    # The dump files are looked up relative to the current working
    # directory and moved into the model directory.
    os.rename('model_word_%d.txt' % max_iter_dump, os.path.join(model_dir, 'wordrank.words'))
    os.rename('model_context_%d.txt' % max_iter_dump, os.path.join(model_dir, 'wordrank.contexts'))
    model = cls.load_wordrank_model(
        os.path.join(model_dir, 'wordrank.words'), vocab_file,
        os.path.join(model_dir, 'wordrank.contexts'), sorted_vocab, ensemble
    )

    if cleanup_files:
        rmtree(model_dir)
    return model
def train(cls, ft_path, corpus_file, output_file=None, model='cbow', size=100, alpha=0.025, window=5,
          min_count=5, loss='ns', sample=1e-3, negative=5, iter=5, min_n=3, max_n=6, sorted_vocab=1,
          threads=12):
    """Train a FastText model with the external executable and load the result.

    Parameters
    ----------
    ft_path : str
        Path to the FastText executable, e.g. `/home/kofola/fastText/fasttext`.
    corpus_file : str
        Filename of the text file used for training. Expected to contain utf-8 encoded text.
    output_file : str, optional
        Output path passed to the executable as `-output`. If falsy, `ft_model` inside the
        system temporary directory is used.
    model : {'cbow', 'skipgram'}, optional
        Training algorithm; passed as the sub-command of the executable.
    size : int, optional
        Dimensionality of the feature vectors (`-dim`).
    alpha : float, optional
        Initial learning rate (`-lr`).
    window : int, optional
        Maximum distance between the current and predicted word within a sentence (`-ws`).
    min_count : int, optional
        Ignore all words with total occurrences lower than this (`-minCount`).
    loss : {'ns', 'hs', 'softmax'}, optional
        Training objective: negative sampling, hierarchical softmax or softmax (`-loss`).
    sample : float, optional
        Threshold for configuring which higher-frequency words are randomly downsampled (`-t`);
        useful range is (0, 1e-5).
    negative : int, optional
        How many "noise words" should be drawn (usually between 5-20); if 0, no negative
        sampling is used. Only relevant when `loss` is `ns` (`-neg`).
    iter : int, optional
        Number of iterations (epochs) over the corpus (`-epoch`).
    min_n : int, optional
        Min length of char ngrams used for training word representations (`-minn`).
    max_n : int, optional
        Max length of char ngrams used for training word representations (`-maxn`).
        Set `max_n` lower than `min_n` to avoid char ngrams being used.
    sorted_vocab : {0, 1}, optional
        Kept for interface compatibility. NOTE(review): this method neither reads it nor passes
        it to the executable or the loader.
    threads : int, optional
        Number of threads to use (`-thread`).

    Returns
    -------
    object
        Whatever `cls.load_fasttext_format(output_file)` returns.

    """
    output_file = output_file or os.path.join(tempfile.gettempdir(), 'ft_model')
    ft_args = {
        'input': corpus_file,
        'output': output_file,
        'lr': alpha,
        'dim': size,
        'ws': window,
        'epoch': iter,
        'minCount': min_count,
        'neg': negative,
        'loss': loss,
        'minn': min_n,
        'maxn': max_n,
        'thread': threads,
        't': sample,
    }

    # Command line: <executable> <model> -<option> <value> ...
    cmd = [ft_path, model]
    for option, value in ft_args.items():
        cmd.append("-%s" % option)
        cmd.append(str(value))

    # Only the files written under `output_file` are needed; the console output is discarded.
    utils.check_output(args=cmd)

    trained_model = cls.load_fasttext_format(output_file)
    cls.delete_training_files(output_file)
    return trained_model
def train(self, corpus, time_slices, mode, model):
    """Train DTM model.

    Runs the DTM binary at `self.dtm_path` and loads the files it writes into attributes
    of this object.

    Parameters
    ----------
    corpus : iterable of iterable of (int, int)
        Collection of texts in BoW format.
    time_slices : list of int
        Forwarded to `convert_input`. NOTE(review): entries of `self.time_slices` are used
        below as the number of rows of each influence matrix, so these look like per-slice
        document counts - confirm.
    mode : {'fit', 'time'}
        Controls the mode of the model: 'fit' is for training, 'time' for analyzing documents
        through time according to a DTM, basically a held out set.
    model : {'fixed', 'dtm'}
        Controls which model is run: 'fixed' is for DIM and 'dtm' for DTM.

    """
    self.convert_input(corpus, time_slices)

    # Build the argument vector directly instead of formatting one string and splitting it
    # on whitespace: a corpus prefix or output name containing spaces must stay one argument.
    cmd_args = [
        "--ntopics={}".format(self.num_topics),
        "--model={}".format(model),
        "--mode={}".format(mode),
        "--initialize_lda={}".format(self.initialize_lda),
        "--corpus_prefix={}".format(self.fcorpus()),
        "--outname={}".format(self.foutname()),
        "--alpha={}".format(self.alpha),
        "--lda_max_em_iter={}".format(self.lda_max_em_iter),
        "--lda_sequence_min_iter={}".format(self.lda_sequence_min_iter),
        "--lda_sequence_max_iter={}".format(self.lda_sequence_max_iter),
        "--top_chain_var={}".format(self.top_chain_var),
        "--rng_seed={}".format(self.rng_seed),
    ]
    logger.info("training DTM with args %s", " ".join(cmd_args))

    cmd = [self.dtm_path] + cmd_args
    logger.info("Running command %s", cmd)
    check_output(args=cmd, stderr=PIPE)

    self.em_steps = np.loadtxt(self.fem_steps())
    self.init_ss = np.loadtxt(self.flda_ss())

    if self.initialize_lda:
        self.init_alpha = np.loadtxt(self.finit_alpha())
        self.init_beta = np.loadtxt(self.finit_beta())

    self.lhood_ = np.loadtxt(self.fout_liklihoods())

    # document-topic proportions; gamma_[5, 10] is the proportion of topic 10 in doc 5
    self.gamma_ = np.loadtxt(self.fout_gamma()).reshape(self.lencorpus, self.num_topics)
    # normalize proportions so that every document row sums to one
    self.gamma_ /= self.gamma_.sum(axis=1)[:, np.newaxis]

    num_slices = len(self.time_slices)
    self.lambda_ = np.zeros((self.num_topics, self.num_terms * num_slices))
    self.obs_ = np.zeros((self.num_topics, self.num_terms * num_slices))

    # one flat row per topic, read from that topic's own output files
    for t in range(self.num_topics):
        topic = "%03d" % t
        self.lambda_[t, :] = np.loadtxt(self.fout_prob().format(i=topic))
        self.obs_[t, :] = np.loadtxt(self.fout_observations().format(i=topic))

    # cast to (topic, term, time slice): lambda_[5, 10, 0] is the entry for term 10
    # in topic 5 at time slice 0
    self.lambda_ = self.lambda_.reshape(self.num_topics, self.num_terms, num_slices)
    self.obs_ = self.obs_.reshape(self.num_topics, self.num_terms, num_slices)

    # extract document influence on topics for each time slice
    # influences_time[0], influences at time 0
    if model == 'fixed':
        for k, t in enumerate(self.time_slices):
            stamp = "%03d" % k
            influence = np.loadtxt(self.fout_influence().format(i=stamp))
            # influence[2, 5] is the influence of document 2 on topic 5
            influence = influence.reshape(t, self.num_topics)
            self.influences_time.append(influence)
# Write one MALLET input line per document: "<id> 0 <space-separated lemmas>".
# `fout` is the output handle opened before this fragment - TODO confirm against the
# enclosing code.
for el in id_lema:
    # Use the first id column when it is non-empty, otherwise fall back to the second.
    doc_id = el[0] if len(el[0]) else el[1]
    fout.write(doc_id + ' 0 ' + ' '.join(el[2]) + '\n')

token_regexp = cf.get('CorpusGeneration', 'token_regexp')

# NOTE(review): the command is assembled as a shell string from configuration values
# and run with shell=True; the paths and the regexp must come from trusted config.
cmd = str(mallet_path) + \
    ' import-file --preserve-case --keep-sequence ' + \
    '--remove-stopwords --token-regex "' + token_regexp + '" ' + \
    '--input %s --output %s'
cmd = cmd % (corpus_file, corpus_mallet)

try:
    print(f'-- -- Running command {cmd}')
    check_output(args=cmd, shell=True)
except Exception:
    # Best effort: report the failure and carry on. A bare `except:` would also
    # swallow KeyboardInterrupt and SystemExit, which must keep propagating.
    print('-- -- Mallet failed to import data. Revise command')

#############################################################
# Generate corpus with procedure data for BioProtocol
#############################################################
corpus_dir = Path2corpus.joinpath('lemasWithProcedure')
print('El corpus incluyendo procedimientos se guardará en el directorio', corpus_dir)
corpus_dir.mkdir()

id_lema = BIO_df[['ProtocolID', 'S2paperID', 'LEMAS', 'LEMASprocedures']].values.tolist()

# Record the settings used for this import next to the generated corpus.
import_config = corpus_dir.joinpath('import.config')
with import_config.open('w', encoding='utf8') as fout:
    fout.write('min_lemas = ' + str(min_lemas) + '\n')
def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, min_count=5, max_vocab_size=0,
          sgd_num=100, lrate=0.001, period=10, iter=90, epsilon=0.75, dump_period=10, reg=0, alpha=100,
          beta=99, loss='hinge', memory=4.0, np=1, cleanup_files=False, sorted_vocab=1, ensemble=0):
    """Train a Wordrank model with the external binaries and load the result.

    The vocab and cooccurrence files are generated using the glove code inside the wordrank
    directory; they are then used by the wordrank binary for training.

    Parameters
    ----------
    wr_path : str
        Absolute path to the Wordrank directory.
    corpus_file : str
        Filename of the text file used for training. Expected to contain space-separated tokens.
    out_name : str
        Name of the directory created inside `wr_path`. It receives a `meta` sub-directory
        (copy of the corpus, `vocab.txt`, `tempvocab.txt`, `cooccurrence`, `wiki.toy`, `meta`)
        and the final embeddings `wordrank.words` / `wordrank.contexts`.
    size : int, optional
        Dimensionality of the feature vectors.
    window : int, optional
        Number of context words to the left (and to the right, if `symmetric = 1`).
    symmetric : {0, 1}, optional
        If 0, only use left context words, else use left and right both.
    min_count : int, optional
        Ignore all words with total frequency lower than this.
    max_vocab_size : int, optional
        Upper bound on vocabulary size, i.e. keep the <int> most frequent words. 0 for no limit.
    sgd_num : int, optional
        Number of SGD taken for each data point.
    lrate : float, optional
        Learning rate (too high diverges, gives NaN).
    period : int, optional
        Period of xi variable updates.
    iter : int, optional
        Number of iterations (epochs) over the corpus.
    epsilon : float, optional
        Power scaling value for weighting function.
    dump_period : int, optional
        Period after which embeddings should be dumped.
    reg : int, optional
        Value of regularization parameter.
    alpha : int, optional
        Alpha parameter of gamma distribution.
    beta : int, optional
        Beta parameter of gamma distribution.
    loss : {'logistic', 'hinge'}, optional
        Name of the loss.
    memory : float, optional
        Soft limit for memory consumption, in GB.
    np : int, optional
        Number of copies to execute (mpirun `-np` option).
    cleanup_files : bool, optional
        If True, delete the directory and files used by this wrapper after loading;
        False can be useful for debugging.
    sorted_vocab : {0, 1}, optional
        Forwarded to `cls.load_wordrank_model`.
    ensemble : {0, 1}, optional
        Forwarded to `cls.load_wordrank_model`.

    Returns
    -------
    object
        Whatever `cls.load_wordrank_model` returns.

    """
    # prepare training data (cooccurrence matrix and vocab)
    model_dir = os.path.join(wr_path, out_name)
    meta_dir = os.path.join(model_dir, 'meta')
    os.makedirs(meta_dir)
    logger.info("Dumped data will be stored in '%s'", model_dir)

    # One basename computation serves both the copy target and the glove inputs, so they
    # cannot disagree when the path separator is not '/'.
    corpus_copy = os.path.join(meta_dir, os.path.basename(corpus_file))
    copyfile(corpus_file, corpus_copy)

    vocab_file = os.path.join(meta_dir, 'vocab.txt')
    temp_vocab_file = os.path.join(meta_dir, 'tempvocab.txt')
    cooccurrence_file = os.path.join(meta_dir, 'cooccurrence')
    cooccurrence_shuf_file = os.path.join(meta_dir, 'wiki.toy')
    meta_file = os.path.join(meta_dir, 'meta')

    cmd_vocab_count = [
        os.path.join(wr_path, 'glove', 'vocab_count'),
        '-min-count', str(min_count), '-max-vocab', str(max_vocab_size)
    ]
    cmd_cooccurence_count = [
        os.path.join(wr_path, 'glove', 'cooccur'), '-memory', str(memory),
        '-vocab-file', temp_vocab_file, '-window-size', str(window), '-symmetric', str(symmetric)
    ]
    cmd_shuffle_cooccurences = [os.path.join(wr_path, 'glove', 'shuffle'), '-memory', str(memory)]
    cmd_del_vocab_freq = ['cut', '-d', " ", '-f', '1', temp_vocab_file]

    # Each glove tool reads its input file on stdin and has its output written to a file.
    commands = [cmd_vocab_count, cmd_cooccurence_count, cmd_shuffle_cooccurences]
    input_fnames = [corpus_copy, corpus_copy, cooccurrence_file]
    output_fnames = [temp_vocab_file, cooccurrence_file, cooccurrence_shuf_file]

    logger.info("Prepare training data (%s) using glove code", ", ".join(input_fnames))
    for command, input_fname, output_fname in zip(commands, input_fnames, output_fnames):
        with smart_open(input_fname, 'rb') as r, smart_open(output_fname, 'wb') as w:
            utils.check_output(w, args=command, stdin=r)

    logger.info("Deleting frequencies from vocab file")
    with smart_open(vocab_file, 'wb') as w:
        utils.check_output(w, args=cmd_del_vocab_freq)

    with smart_open(vocab_file, 'rb') as f:
        numwords = sum(1 for _ in f)
    with smart_open(cooccurrence_shuf_file, 'rb') as f:
        numlines = sum(1 for _ in f)

    # The meta file lists counts followed by bare file names (relative to the meta directory).
    with smart_open(meta_file, 'wb') as f:
        meta_info = "{0} {1}\n{2} {3}\n{4} {5}".format(
            numwords, numwords, numlines, os.path.basename(cooccurrence_shuf_file),
            numwords, os.path.basename(vocab_file)
        )
        f.write(meta_info.encode('utf-8'))

    if iter % dump_period == 0:
        # NOTE(review): presumably one extra iteration is needed for the dump at the last
        # multiple of dump_period to be written - confirm against the wordrank binary.
        iter += 1
    else:
        logger.warning(
            "Resultant embedding will be from %d iterations rather than the input %d iterations, "
            "as wordrank dumps the embedding only at dump_period intervals. "
            "Input an appropriate combination of parameters (iter, dump_period) "
            "such that \"iter mod dump_period\" is zero.",
            iter - (iter % dump_period), iter
        )

    wr_args = {
        'path': meta_dir,
        'nthread': multiprocessing.cpu_count(),
        'sgd_num': sgd_num,
        'lrate': lrate,
        'period': period,
        'iter': iter,
        'epsilon': epsilon,
        'dump_prefix': 'model',
        'dump_period': dump_period,
        'dim': size,
        'reg': reg,
        'alpha': alpha,
        'beta': beta,
        'loss': loss
    }

    # run wordrank executable with wr_args
    cmd = ['mpirun', '-np', str(np), os.path.join(wr_path, 'wordrank')]
    for option, value in wr_args.items():
        cmd.append('--%s' % option)
        cmd.append(str(value))
    logger.info("Running wordrank binary")
    utils.check_output(args=cmd)

    # use embeddings from max. iteration's dump; the dump files are looked up relative to
    # the current working directory
    max_iter_dump = iter - (iter % dump_period)
    words_file = os.path.join(model_dir, 'wordrank.words')
    contexts_file = os.path.join(model_dir, 'wordrank.contexts')
    os.rename('model_word_%d.txt' % max_iter_dump, words_file)
    os.rename('model_context_%d.txt' % max_iter_dump, contexts_file)
    model = cls.load_wordrank_model(words_file, vocab_file, contexts_file, sorted_vocab, ensemble)

    if cleanup_files:
        rmtree(model_dir)
    return model
def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, min_count=5, max_vocab_size=0,
          sgd_num=100, lrate=0.001, period=10, iter=90, epsilon=0.75, dump_period=10, reg=0, alpha=100,
          beta=99, loss='hinge', memory=4.0, np=1, cleanup_files=False, sorted_vocab=1, ensemble=0):
    """Train model.

    Builds the vocab and cooccurrence files with the glove tools inside the wordrank
    directory, runs the wordrank binary through `mpirun`, and loads the dumped embeddings.

    Parameters
    ----------
    wr_path : str
        Absolute path to the Wordrank directory.
    corpus_file : str
        Path to corpus file, expected space-separated tokens in each line.
    out_name : str
        Name of the directory which will be created (in wordrank folder) to save embeddings
        and training data:

            * ``wordrank.words`` - word embeddings from the last dump.
            * ``wordrank.contexts`` - context embeddings from the last dump.
            * ``meta/vocab.txt`` - vocab file.
            * ``meta/wiki.toy`` - shuffled word-word cooccurrence values.
            * ``meta/meta`` - vocab and cooccurrence lengths.

    size : int, optional
        Dimensionality of the feature vectors.
    window : int, optional
        Number of context words to the left (and to the right, if `symmetric = 1`).
    symmetric : {0, 1}, optional
        If 1 - using symmetric windows, if 0 - will use only left context words.
    min_count : int, optional
        Ignore all words with total frequency lower than `min_count`.
    max_vocab_size : int, optional
        Upper bound on vocabulary size, i.e. keep the <int> most frequent words. If 0 - no limit.
    sgd_num : int, optional
        Number of SGD taken for each data point.
    lrate : float, optional
        Learning rate (attention: too high diverges, gives NaN).
    period : int, optional
        Period of xi variable updates.
    iter : int, optional
        Number of iterations (epochs) over the corpus.
    epsilon : float, optional
        Power scaling value for weighting function.
    dump_period : int, optional
        Period after which embeddings should be dumped.
    reg : int, optional
        Value of regularization parameter.
    alpha : int, optional
        Alpha parameter of gamma distribution.
    beta : int, optional
        Beta parameter of gamma distribution.
    loss : {"logistic", "hinge"}, optional
        Name of the loss function.
    memory : float, optional
        Soft limit for memory consumption, in GB.
    np : int, optional
        Number of processes to execute (mpirun option).
    cleanup_files : bool, optional
        If True, delete directory and files used by this wrapper.
    sorted_vocab : {0, 1}, optional
        Forwarded to `cls.load_wordrank_model`.
    ensemble : {0, 1}, optional
        Forwarded to `cls.load_wordrank_model`.

    Returns
    -------
    object
        Whatever `cls.load_wordrank_model` returns.

    """
    # prepare training data (cooccurrence matrix and vocab)
    model_dir = os.path.join(wr_path, out_name)
    meta_dir = os.path.join(model_dir, 'meta')
    os.makedirs(meta_dir)
    logger.info("Dumped data will be stored in '%s'", model_dir)

    # One basename computation serves both the copy target and the glove inputs, so they
    # cannot disagree when the path separator is not '/'.
    corpus_copy = os.path.join(meta_dir, os.path.basename(corpus_file))
    copyfile(corpus_file, corpus_copy)

    vocab_file = os.path.join(meta_dir, 'vocab.txt')
    temp_vocab_file = os.path.join(meta_dir, 'tempvocab.txt')
    cooccurrence_file = os.path.join(meta_dir, 'cooccurrence')
    cooccurrence_shuf_file = os.path.join(meta_dir, 'wiki.toy')
    meta_file = os.path.join(meta_dir, 'meta')

    cmd_vocab_count = [
        os.path.join(wr_path, 'glove', 'vocab_count'),
        '-min-count', str(min_count), '-max-vocab', str(max_vocab_size)
    ]
    cmd_cooccurence_count = [
        os.path.join(wr_path, 'glove', 'cooccur'), '-memory', str(memory),
        '-vocab-file', temp_vocab_file, '-window-size', str(window), '-symmetric', str(symmetric)
    ]
    cmd_shuffle_cooccurences = [os.path.join(wr_path, 'glove', 'shuffle'), '-memory', str(memory)]
    cmd_del_vocab_freq = ['cut', '-d', " ", '-f', '1', temp_vocab_file]

    # Each glove tool reads its input file on stdin and has its output written to a file.
    commands = [cmd_vocab_count, cmd_cooccurence_count, cmd_shuffle_cooccurences]
    input_fnames = [corpus_copy, corpus_copy, cooccurrence_file]
    output_fnames = [temp_vocab_file, cooccurrence_file, cooccurrence_shuf_file]

    logger.info("Prepare training data (%s) using glove code", ", ".join(input_fnames))
    for command, input_fname, output_fname in zip(commands, input_fnames, output_fnames):
        with smart_open(input_fname, 'rb') as r, smart_open(output_fname, 'wb') as w:
            utils.check_output(w, args=command, stdin=r)

    logger.info("Deleting frequencies from vocab file")
    with smart_open(vocab_file, 'wb') as w:
        utils.check_output(w, args=cmd_del_vocab_freq)

    with smart_open(vocab_file, 'rb') as f:
        numwords = sum(1 for _ in f)
    with smart_open(cooccurrence_shuf_file, 'rb') as f:
        numlines = sum(1 for _ in f)

    # The meta file lists counts followed by bare file names (relative to the meta directory).
    with smart_open(meta_file, 'wb') as f:
        meta_info = "{0} {1}\n{2} {3}\n{4} {5}".format(
            numwords, numwords, numlines, os.path.basename(cooccurrence_shuf_file),
            numwords, os.path.basename(vocab_file)
        )
        f.write(meta_info.encode('utf-8'))

    if iter % dump_period == 0:
        # NOTE(review): presumably one extra iteration is needed for the dump at the last
        # multiple of dump_period to be written - confirm against the wordrank binary.
        iter += 1
    else:
        logger.warning(
            "Resultant embedding will be from %d iterations rather than the input %d iterations, "
            "as wordrank dumps the embedding only at dump_period intervals. "
            "Input an appropriate combination of parameters (iter, dump_period) "
            "such that \"iter mod dump_period\" is zero.",
            iter - (iter % dump_period), iter
        )

    wr_args = {
        'path': meta_dir,
        'nthread': multiprocessing.cpu_count(),
        'sgd_num': sgd_num,
        'lrate': lrate,
        'period': period,
        'iter': iter,
        'epsilon': epsilon,
        'dump_prefix': 'model',
        'dump_period': dump_period,
        'dim': size,
        'reg': reg,
        'alpha': alpha,
        'beta': beta,
        'loss': loss
    }

    # run wordrank executable with wr_args
    cmd = ['mpirun', '-np', str(np), os.path.join(wr_path, 'wordrank')]
    for option, value in wr_args.items():
        cmd.append('--%s' % option)
        cmd.append(str(value))
    logger.info("Running wordrank binary")
    utils.check_output(args=cmd)

    # use embeddings from max. iteration's dump; the dump files are looked up relative to
    # the current working directory
    max_iter_dump = iter - (iter % dump_period)
    words_file = os.path.join(model_dir, 'wordrank.words')
    contexts_file = os.path.join(model_dir, 'wordrank.contexts')
    os.rename('model_word_%d.txt' % max_iter_dump, words_file)
    os.rename('model_context_%d.txt' % max_iter_dump, contexts_file)
    model = cls.load_wordrank_model(words_file, vocab_file, contexts_file, sorted_vocab, ensemble)

    if cleanup_files:
        rmtree(model_dir)
    return model
def train(cls, ft_path, corpus_file, output_file=None, model='cbow', size=100, alpha=0.025, window=5,
          min_count=5, word_ngrams=1, loss='ns', sample=1e-3, negative=5, iter=5, min_n=3, max_n=6,
          sorted_vocab=1, threads=12):
    """Train a FastText model with the external executable and load the result.

    Parameters
    ----------
    ft_path : str
        Path to the FastText executable, e.g. `/home/kofola/fastText/fasttext`.
    corpus_file : str
        Filename of the text file used for training. Expected to contain utf-8 encoded text.
    output_file : str, optional
        Output path passed to the executable as `-output`. If falsy, `ft_model` inside the
        system temporary directory is used.
    model : {'cbow', 'skipgram'}, optional
        Training algorithm; passed as the sub-command of the executable.
    size : int, optional
        Dimensionality of the feature vectors (`-dim`).
    alpha : float, optional
        Initial learning rate (`-lr`).
    window : int, optional
        Maximum distance between the current and predicted word within a sentence (`-ws`).
    min_count : int, optional
        Ignore all words with total occurrences lower than this (`-minCount`).
    word_ngrams : int, optional
        Max length of word ngram (`-wordNgrams`).
    loss : {'ns', 'hs', 'softmax'}, optional
        Training objective: negative sampling, hierarchical softmax or softmax (`-loss`).
    sample : float, optional
        Threshold for configuring which higher-frequency words are randomly downsampled (`-t`);
        useful range is (0, 1e-5).
    negative : int, optional
        How many "noise words" should be drawn (usually between 5-20); if 0, no negative
        sampling is used. Only relevant when `loss` is `ns` (`-neg`).
    iter : int, optional
        Number of iterations (epochs) over the corpus (`-epoch`).
    min_n : int, optional
        Min length of char ngrams used for training word representations (`-minn`).
    max_n : int, optional
        Max length of char ngrams used for training word representations (`-maxn`).
        Set `max_n` lower than `min_n` to avoid char ngrams being used.
    sorted_vocab : {0, 1}, optional
        Kept for interface compatibility. NOTE(review): this method neither reads it nor passes
        it to the executable or the loader.
    threads : int, optional
        Number of threads to use (`-thread`).

    Returns
    -------
    object
        Whatever `cls.load_fasttext_format(output_file)` returns.

    """
    output_file = output_file or os.path.join(tempfile.gettempdir(), 'ft_model')
    ft_args = {
        'input': corpus_file,
        'output': output_file,
        'lr': alpha,
        'dim': size,
        'ws': window,
        'epoch': iter,
        'minCount': min_count,
        'wordNgrams': word_ngrams,
        'neg': negative,
        'loss': loss,
        'minn': min_n,
        'maxn': max_n,
        'thread': threads,
        't': sample,
    }

    # Command line: <executable> <model> -<option> <value> ...
    cmd = [ft_path, model]
    for option, value in ft_args.items():
        cmd.append("-%s" % option)
        cmd.append(str(value))

    # Only the files written under `output_file` are needed; the console output is discarded.
    utils.check_output(args=cmd)

    trained_model = cls.load_fasttext_format(output_file)
    cls.delete_training_files(output_file)
    return trained_model
def train(self, corpus, time_slices, mode, model):
    """Train DTM model.

    Runs the DTM binary at `self.dtm_path` and loads the files it writes into attributes
    of this object.

    Parameters
    ----------
    corpus : iterable of iterable of (int, int)
        Collection of texts in BoW format.
    time_slices : list of int
        Forwarded to `convert_input`. NOTE(review): entries of `self.time_slices` are used
        below as the number of rows of each influence matrix, so these look like per-slice
        document counts - confirm.
    mode : {'fit', 'time'}
        Controls the mode of the model: 'fit' is for training, 'time' for analyzing documents
        through time according to a DTM, basically a held out set.
    model : {'fixed', 'dtm'}
        Controls which model is run: 'fixed' is for DIM and 'dtm' for DTM.

    """
    self.convert_input(corpus, time_slices)

    # Build the argument vector directly instead of formatting one string and splitting it
    # on whitespace: a corpus prefix or output name containing spaces must stay one argument.
    cmd_args = [
        "--ntopics={}".format(self.num_topics),
        "--model={}".format(model),
        "--mode={}".format(mode),
        "--initialize_lda={}".format(self.initialize_lda),
        "--corpus_prefix={}".format(self.fcorpus()),
        "--outname={}".format(self.foutname()),
        "--alpha={}".format(self.alpha),
        "--lda_max_em_iter={}".format(self.lda_max_em_iter),
        "--lda_sequence_min_iter={}".format(self.lda_sequence_min_iter),
        "--lda_sequence_max_iter={}".format(self.lda_sequence_max_iter),
        "--top_chain_var={}".format(self.top_chain_var),
        "--rng_seed={}".format(self.rng_seed),
    ]
    logger.info("training DTM with args %s", " ".join(cmd_args))

    cmd = [self.dtm_path] + cmd_args
    logger.info("Running command %s", cmd)
    check_output(args=cmd, stderr=PIPE)

    self.em_steps = np.loadtxt(self.fem_steps())
    self.init_ss = np.loadtxt(self.flda_ss())

    if self.initialize_lda:
        self.init_alpha = np.loadtxt(self.finit_alpha())
        self.init_beta = np.loadtxt(self.finit_beta())

    self.lhood_ = np.loadtxt(self.fout_liklihoods())

    # document-topic proportions; gamma_[5, 10] is the proportion of topic 10 in doc 5
    self.gamma_ = np.loadtxt(self.fout_gamma()).reshape(self.lencorpus, self.num_topics)
    # normalize proportions so that every document row sums to one
    self.gamma_ /= self.gamma_.sum(axis=1)[:, np.newaxis]

    num_slices = len(self.time_slices)
    self.lambda_ = np.zeros((self.num_topics, self.num_terms * num_slices))
    self.obs_ = np.zeros((self.num_topics, self.num_terms * num_slices))

    # one flat row per topic, read from that topic's own output files
    for t in range(self.num_topics):
        topic = "%03d" % t
        self.lambda_[t, :] = np.loadtxt(self.fout_prob().format(i=topic))
        self.obs_[t, :] = np.loadtxt(self.fout_observations().format(i=topic))

    # cast to (topic, term, time slice): lambda_[5, 10, 0] is the entry for term 10
    # in topic 5 at time slice 0
    self.lambda_ = self.lambda_.reshape(self.num_topics, self.num_terms, num_slices)
    self.obs_ = self.obs_.reshape(self.num_topics, self.num_terms, num_slices)

    # extract document influence on topics for each time slice
    # influences_time[0], influences at time 0
    if model == 'fixed':
        for k, t in enumerate(self.time_slices):
            stamp = "%03d" % k
            influence = np.loadtxt(self.fout_influence().format(i=stamp))
            # influence[2, 5] is the influence of document 2 on topic 5
            influence = influence.reshape(t, self.num_topics)
            self.influences_time.append(influence)
def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, min_count=5, max_vocab_size=0,
          sgd_num=100, lrate=0.001, period=10, iter=90, epsilon=0.75, dump_period=10, reg=0, alpha=100,
          beta=99, loss='hinge', memory=4.0, np=1, cleanup_files=False, sorted_vocab=1, ensemble=0):
    """Train model.

    Builds the vocab and cooccurrence files with the glove tools inside the wordrank
    directory, runs the wordrank binary through `mpirun`, and loads the dumped embeddings.

    Parameters
    ----------
    wr_path : str
        Absolute path to the Wordrank directory.
    corpus_file : str
        Path to corpus file, expected space-separated tokens in each line.
    out_name : str
        Name of the directory which will be created (in wordrank folder) to save embeddings
        and training data:

            * ``wordrank.words`` - word embeddings from the last dump.
            * ``wordrank.contexts`` - context embeddings from the last dump.
            * ``meta/vocab.txt`` - vocab file.
            * ``meta/wiki.toy`` - shuffled word-word cooccurrence values.
            * ``meta/meta`` - vocab and cooccurrence lengths.

    size : int, optional
        Dimensionality of the feature vectors.
    window : int, optional
        Number of context words to the left (and to the right, if `symmetric = 1`).
    symmetric : {0, 1}, optional
        If 1 - using symmetric windows, if 0 - will use only left context words.
    min_count : int, optional
        Ignore all words with total frequency lower than `min_count`.
    max_vocab_size : int, optional
        Upper bound on vocabulary size, i.e. keep the <int> most frequent words. If 0 - no limit.
    sgd_num : int, optional
        Number of SGD taken for each data point.
    lrate : float, optional
        Learning rate (attention: too high diverges, gives NaN).
    period : int, optional
        Period of xi variable updates.
    iter : int, optional
        Number of iterations (epochs) over the corpus.
    epsilon : float, optional
        Power scaling value for weighting function.
    dump_period : int, optional
        Period after which embeddings should be dumped.
    reg : int, optional
        Value of regularization parameter.
    alpha : int, optional
        Alpha parameter of gamma distribution.
    beta : int, optional
        Beta parameter of gamma distribution.
    loss : {"logistic", "hinge"}, optional
        Name of the loss function.
    memory : float, optional
        Soft limit for memory consumption, in GB.
    np : int, optional
        Number of processes to execute (mpirun option).
    cleanup_files : bool, optional
        If True, delete directory and files used by this wrapper.
    sorted_vocab : {0, 1}, optional
        Forwarded to `cls.load_wordrank_model`.
    ensemble : {0, 1}, optional
        Forwarded to `cls.load_wordrank_model`.

    Returns
    -------
    object
        Whatever `cls.load_wordrank_model` returns.

    """
    # prepare training data (cooccurrence matrix and vocab)
    model_dir = os.path.join(wr_path, out_name)
    meta_dir = os.path.join(model_dir, 'meta')
    os.makedirs(meta_dir)
    logger.info("Dumped data will be stored in '%s'", model_dir)

    # One basename computation serves both the copy target and the glove inputs, so they
    # cannot disagree when the path separator is not '/'.
    corpus_copy = os.path.join(meta_dir, os.path.basename(corpus_file))
    copyfile(corpus_file, corpus_copy)

    vocab_file = os.path.join(meta_dir, 'vocab.txt')
    temp_vocab_file = os.path.join(meta_dir, 'tempvocab.txt')
    cooccurrence_file = os.path.join(meta_dir, 'cooccurrence')
    cooccurrence_shuf_file = os.path.join(meta_dir, 'wiki.toy')
    meta_file = os.path.join(meta_dir, 'meta')

    cmd_vocab_count = [
        os.path.join(wr_path, 'glove', 'vocab_count'),
        '-min-count', str(min_count), '-max-vocab', str(max_vocab_size)
    ]
    cmd_cooccurence_count = [
        os.path.join(wr_path, 'glove', 'cooccur'), '-memory', str(memory),
        '-vocab-file', temp_vocab_file, '-window-size', str(window), '-symmetric', str(symmetric)
    ]
    cmd_shuffle_cooccurences = [os.path.join(wr_path, 'glove', 'shuffle'), '-memory', str(memory)]
    cmd_del_vocab_freq = ['cut', '-d', " ", '-f', '1', temp_vocab_file]

    # Each glove tool reads its input file on stdin and has its output written to a file.
    commands = [cmd_vocab_count, cmd_cooccurence_count, cmd_shuffle_cooccurences]
    input_fnames = [corpus_copy, corpus_copy, cooccurrence_file]
    output_fnames = [temp_vocab_file, cooccurrence_file, cooccurrence_shuf_file]

    logger.info("Prepare training data (%s) using glove code", ", ".join(input_fnames))
    for command, input_fname, output_fname in zip(commands, input_fnames, output_fnames):
        with smart_open(input_fname, 'rb') as r, smart_open(output_fname, 'wb') as w:
            utils.check_output(w, args=command, stdin=r)

    logger.info("Deleting frequencies from vocab file")
    with smart_open(vocab_file, 'wb') as w:
        utils.check_output(w, args=cmd_del_vocab_freq)

    with smart_open(vocab_file, 'rb') as f:
        numwords = sum(1 for _ in f)
    with smart_open(cooccurrence_shuf_file, 'rb') as f:
        numlines = sum(1 for _ in f)

    # The meta file lists counts followed by bare file names (relative to the meta directory).
    with smart_open(meta_file, 'wb') as f:
        meta_info = "{0} {1}\n{2} {3}\n{4} {5}".format(
            numwords, numwords, numlines, os.path.basename(cooccurrence_shuf_file),
            numwords, os.path.basename(vocab_file)
        )
        f.write(meta_info.encode('utf-8'))

    if iter % dump_period == 0:
        # NOTE(review): presumably one extra iteration is needed for the dump at the last
        # multiple of dump_period to be written - confirm against the wordrank binary.
        iter += 1
    else:
        logger.warning(
            "Resultant embedding will be from %d iterations rather than the input %d iterations, "
            "as wordrank dumps the embedding only at dump_period intervals. "
            "Input an appropriate combination of parameters (iter, dump_period) "
            "such that \"iter mod dump_period\" is zero.",
            iter - (iter % dump_period), iter
        )

    wr_args = {
        'path': meta_dir,
        'nthread': multiprocessing.cpu_count(),
        'sgd_num': sgd_num,
        'lrate': lrate,
        'period': period,
        'iter': iter,
        'epsilon': epsilon,
        'dump_prefix': 'model',
        'dump_period': dump_period,
        'dim': size,
        'reg': reg,
        'alpha': alpha,
        'beta': beta,
        'loss': loss
    }

    # run wordrank executable with wr_args
    cmd = ['mpirun', '-np', str(np), os.path.join(wr_path, 'wordrank')]
    for option, value in wr_args.items():
        cmd.append('--%s' % option)
        cmd.append(str(value))
    logger.info("Running wordrank binary")
    utils.check_output(args=cmd)

    # use embeddings from max. iteration's dump; the dump files are looked up relative to
    # the current working directory
    max_iter_dump = iter - (iter % dump_period)
    words_file = os.path.join(model_dir, 'wordrank.words')
    contexts_file = os.path.join(model_dir, 'wordrank.contexts')
    os.rename('model_word_%d.txt' % max_iter_dump, words_file)
    os.rename('model_context_%d.txt' % max_iter_dump, contexts_file)
    model = cls.load_wordrank_model(words_file, vocab_file, contexts_file, sorted_vocab, ensemble)

    if cleanup_files:
        rmtree(model_dir)
    return model
def train(self, corpus, **_):
    """Train Mallet LDA.

    Parameters
    ----------
    corpus : iterable of iterable of (int, int)
        Corpus in BoW format.
    **_
        Extra keyword arguments are accepted and ignored.

    Notes
    -----
    The corpus is first serialized with ``convert_input(corpus, infer=False)``,
    then MALLET's ``train-topics`` tool is run through the shell. Once it
    finishes, the result of ``load_word_topics()`` is stored in
    ``self.word_topics`` and mirrored in the legacy ``self.wordtopics``.

    Options passed to ``train-topics`` (descriptions from MALLET's ``--help``):

    * ``--input`` : file from which to read the list of training instances.
    * ``--num-topics`` : the number of topics to fit.
    * ``--alpha`` : SumAlpha parameter, the sum over topics of smoothing over
      doc-topic distributions (``alpha_k = value / num_topics``).
    * ``--optimize-interval`` : number of iterations between re-estimating
      Dirichlet hyperparameters.
    * ``--num-threads`` : number of threads for parallel training.
    * ``--output-state`` : file in which to write the Gibbs sampling state at
      the end of the iterations.
    * ``--output-doc-topics`` : file in which to write the topic proportions
      per document.
    * ``--output-topic-keys`` : file in which to write the top words for each
      topic and any Dirichlet parameters.
    * ``--num-top-words`` : number of most probable words to print per topic.
    * ``--num-iterations`` : number of iterations of Gibbs sampling.
    * ``--inferencer-filename`` : file in which to write the topic inferencer,
      which applies a trained topic model to new documents.
    * ``--doc-topics-threshold`` : topics with a per-document proportion below
      this value are not printed.
    * ``--random-seed`` : seed for the Gibbs sampler (0 uses the clock).

    """
    self.convert_input(corpus, infer=False)

    # One flag per fragment; adjacent literals are joined at compile time, so
    # the resulting template is a single space-separated command line.
    template = self.mallet_path + (
        ' train-topics'
        ' --input %s'
        ' --num-topics %s'
        ' --alpha %s'
        ' --optimize-interval %s'
        ' --num-threads %s'
        ' --output-state %s'
        ' --output-doc-topics %s'
        ' --output-topic-keys %s'
        ' --num-top-words %s'
        ' --num-iterations %s'
        ' --inferencer-filename %s'
        ' --doc-topics-threshold %s'
        ' --random-seed %s'
    )
    # Values are listed in exactly the same order as the placeholders above.
    values = (
        self.mallet_corpus_filename(),      # --input
        self.num_topics,                    # --num-topics
        self.alpha,                         # --alpha
        self.optimize_interval,             # --optimize-interval
        self.workers,                       # --num-threads
        self.mallet_state_filename(),       # --output-state
        self.document_topics_filename(),    # --output-doc-topics
        self.topic_keys_filename(),         # --output-topic-keys
        self.num_top_words,                 # --num-top-words
        self.iterations,                    # --num-iterations
        self.inferencer_filename(),         # --inferencer-filename
        self.topic_threshold,               # --doc-topics-threshold
        str(self.random_seed),              # --random-seed
    )
    command = template % values

    # NOTE "--keep-sequence-bigrams" / "--use-ngrams true" poorer results + runs out of memory
    logger.info("training MALLET LDA with %s", command)
    check_output(args=command, shell=True)

    self.word_topics = self.load_word_topics()
    # NOTE: `wordtopics` is the pre-rename attribute, kept only for backward
    # compatibility. The rest of the code uses `word_topics`; `wordtopics`
    # simply holds the same value as of the moment train() was called.
    self.wordtopics = self.word_topics