from collections import defaultdict

from topicmod.util import flags
from topicmod.corpora.ontology_writer import OntologyWriter
from topicmod.corpora.ontology_writer import orderedTraversal
from topicmod.corpora.ml_vocab import MultilingualVocab
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.ling.dictionary import DingEntries

flags.define_int("limit", 250, "How many items in our MuTo matching")
flags.define_bool("dictionary", False, "Use a dictionary")
flags.define_string("dic_dir", "../../data/dictionaries/", "Use a dictionary")
flags.define_bool("translation", False, "Use translation matrix")
flags.define_bool("greedy_matching", False, "Use a matching from dictionary")
flags.define_bool("wordnet", False, "Use WordNet as scaffold")
flags.define_bool("german", False, "Include German")
flags.define_bool("chinese", False, "Include Chinese")
flags.define_string("output", "", "Where we write ontology")
flags.define_float("trans_cutoff", 0.5, "Min value for using the translation")
flags.define_string("wn_version", "3.0", "Which version of WN we use")
flags.define_string("filter_vocab", "", "Filter entries based on vocabulary")
flags.define_bool("stem", False, "Stem words")
flags.define_bool("id_strings", False, "Add identical strings")
# Generate an updated vocab: not all of the words in the original vocab will
# be included in the generated wordnet, so write out a new vocab containing
# only the words that appear in the wordnet.
flags.define_string("updated_vocab", "", "Generate a new vocab")


def greedy_german_matching(filter_list, limit, stem):
    stemmer = Snowball()
    translations = defaultdict(set)
    for en, de in DingEntries("en", "de"):
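# The fragment above stops at the dictionary loop (and the import for the
# Snowball stemmer is not shown).  As orientation, here is a self-contained
# sketch of the greedy one-to-one matching idea: collect candidate (English,
# German) translation pairs, keep entries that pass the filter, and greedily
# accept each pair whose German word is still unmatched, up to `limit` pairs.
# The name and the selection heuristic below are illustrative assumptions,
# not the script's actual logic.
def greedy_match_sketch(pairs, filter_list, limit):
    translations = {}
    for en, de in pairs:
        if filter_list is None or en in filter_list:
            translations.setdefault(en, set()).add(de)

    matched_de = set()
    matching = []
    for en in sorted(translations):
        for de in sorted(translations[en]):
            if de not in matched_de:
                matching.append((en, de))
                matched_de.add(de)
                break
        if len(matching) >= limit:
            break
    return matching

# Example: greedy_match_sketch([("dog", "hund"), ("cat", "katze")], None, 250)
# returns [("cat", "katze"), ("dog", "hund")].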
import numpy
from numpy import zeros
from numpy.random.mtrand import dirichlet
from numpy.random import multinomial
from numpy.random import normal
from math import isnan, isinf

from topicmod.util import flags
from topicmod.corpora.proto.corpus_pb2 import *

flags.define_int("num_docs", 500, "Number of documents")
flags.define_int("num_topics", 5, "Number of topics")
flags.define_int("doc_length", 5, "Length of every document")
flags.define_int("num_langs", 2, "Number of languages")
flags.define_float("variance", 0.5, "Variance of distribution")
flags.define_float("gamma", 1.0, "Vocabulary hyperparameter")
flags.define_float("alpha", 0.1, "Document topic hyperparameter")
flags.define_string("output_base", "data/synthetic", "Where we write the data")
flags.define_string("doc_proportion", "synthetic.theta", "Where we write doc thetas")
flags.define_int("num_groups", 2, "Number of splits")
flags.define_string("vocab_output", "vocab/synthetic.voc", "Where we write vocab")
flags.define_int("topic_output_size", 15,
                 "Number of words to display when we output topics")

# Synthetic multilingual vocabulary: one entry per topic, mapping a language
# id (0 = English, 1 = German, 2 = Spanish) to that topic's word list.
ml_vocab = [{0: ["dog", "cat", "moose", "butterfly"],
             1: ["hund", "katze", "spinner", "pferd", "maultier", "kuh"],
             2: ["toro", "mariposa", "gato", "vaca", "donkey", "burro",
                 "caballo", "mosquito", "arana", "pavo"]},
            {0: ["monday", "tuesday", "thursday", "friday", "saturday"],
             1: ["montag", "dienstag", "mitwoch", "donnerstag", "freitag",
                 "samstag", "sontag"],
             2: ["lunes", "martes", "miercoles", "jueves", "viernes",
                 "sabado", "domingo"]},
            {0: ["mop", "broom", "bucket", "rake"],
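# A minimal sketch of how a synthetic document could be drawn from a
# vocabulary shaped like ml_vocab with the hyperparameters above: draw
# theta ~ Dirichlet(alpha), a per-topic count vector ~ Multinomial(doc_length,
# theta), then pick words uniformly from each sampled topic's word list for
# the chosen language.  This is an assumed generative recipe for illustration,
# not the script's actual sampler (which is elided here).
def sample_synthetic_doc_sketch(vocab, lang, doc_length, alpha,
                                rng=numpy.random):
    num_topics = len(vocab)
    theta = rng.dirichlet([alpha] * num_topics)
    topic_counts = rng.multinomial(doc_length, theta)
    words = []
    for topic, count in enumerate(topic_counts):
        topic_words = vocab[topic][lang]
        for _ in range(int(count)):
            words.append(topic_words[rng.randint(len(topic_words))])
    return words

# Example with a hypothetical two-topic English-only vocabulary:
#   sample_synthetic_doc_sketch([{0: ["dog", "cat"]},
#                                {0: ["monday", "tuesday"]}], 0, 5, 0.1)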
    # Write the merged words as a single tab-separated MERGE_ constraint line.
    tmp = 'MERGE_'
    for word in merge2:
        tmp += '\t' + word
    tmp += '\n'
    output_file.write(tmp)
    output_file.close()

flags.define_string("vocab", "vocab/20_news.voc", "Where we find the vocab")
flags.define_string("stats", None, "Where we find the stat_file")
flags.define_string("model", "", "The model files folder of topic models")
flags.define_string("output", "constraints/tmp", "Output filename")
flags.define_int("topics_cutoff", 30, "Number of topic words")
flags.define_float("tfidf_thresh", 0, "Threshold for tfidf")

if __name__ == "__main__":
    flags.InitFlags()

    # Getting statistics: slower version, full statistics, higher memory cost
    # [cooccur, wordcount, tfidf, vocab_index_word, topics, doc_num] \
    #     = get_statistics_all(flags.corpus, flags.num_topics, flags.model, \
    #                          flags.topics_cutoff, flags.window_size, flags.train_only)

    # Getting statistics: faster version, partial statistics, memory efficient
    print "Reading vocab"
    [vocab_word_index, vocab_index_word] = readVocab(flags.vocab)
    vocab_size = len(vocab_word_index)
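# readVocab is used above but its definition is elided from this fragment.
# A plausible stand-in, assuming the vocab file stores one
# "language<TAB>word" entry per line (the format read by the dictionary
# script later in this collection) and a single-language corpus; the exact
# return structure is an assumption.
import codecs

def read_vocab_sketch(vocab_file):
    word_index = {}
    index_word = {}
    for line in codecs.open(vocab_file, encoding="utf-8"):
        word = line.strip().split("\t")[-1]
        idx = len(word_index)
        word_index[word] = idx
        index_word[idx] = word
    return word_index, index_word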
from topicmod.corpora.nyt_reader import *
from topicmod.util import flags

flags.define_string("nyt_base", "../../data/new_york_times/",
                    "Where we find the nyt corpus")
flags.define_int("doc_limit", -1, "How many documents")
flags.define_string("output", "/tmp/jbg/nyt/", "Where we write data")
flags.define_float("bigram_limit", 0.9, "p-value for bigrams")

if __name__ == "__main__":
    flags.InitFlags()

    nyt = NewYorkTimesReader(flags.nyt_base, flags.doc_limit, flags.bigram_limit)
    nyt.add_language_list("../../data/new_york_times/editorial_file_list")
    nyt.write_proto(flags.output + "numeric", "nyt", 1000)
import codecs
from collections import defaultdict

from topicmod.util import flags
from topicmod.ling.dictionary import DingEntries

flags.define_string("vocab", "", "Where we read vocab")
flags.define_float("smoothing", 0.001, "Smoothing amount")
flags.define_float("hit", 1.0, "Value if there's a hit")
flags.define_string("output", "lda/lambda", "Lambda output")

if __name__ == "__main__":
    flags.InitFlags()

    # Read the vocabulary: one "language \t word" entry per line; assign each
    # word an index within its language.
    vocab = defaultdict(dict)
    index = defaultdict(int)
    for ii in codecs.open(flags.vocab):
        lang, word = ii.split("\t")
        lang = int(lang)
        vocab[lang][word.strip()] = index[lang]
        index[lang] += 1

    trans = defaultdict(set)
    sum = defaultdict(float)
    for ii in vocab[0]:
        for jj in vocab[1]:
            if ii == jj:
                if vocab[1][jj] % 100 == 0:
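# The loop above is collecting identical-string matches between language 0
# and language 1.  A sketch of what the smoothing / hit flags appear to be
# for: a translation-prior ("lambda") matrix where matched word pairs get
# flags.hit on top of a flags.smoothing floor, normalized per row.  This is
# an interpretation for illustration; the script's actual output code is
# elided above.
import numpy

def build_lambda_sketch(size_0, size_1, matches, hit=1.0, smoothing=0.001):
    lam = numpy.zeros((size_0, size_1)) + smoothing
    for ii, jj in matches:
        lam[ii, jj] += hit
    # Normalize each language-0 row into a distribution over language-1 words.
    return lam / lam.sum(axis=1)[:, numpy.newaxis]

# Example: build_lambda_sketch(2, 2, [(0, 0), (1, 1)]) gives a near-identity
# row-normalized matrix.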
    output_file.close()

flags.define_string("vocab", "vocab/20_news.voc", "Where we find the vocab")
flags.define_string("stats", "", "Where we find the stat_file")
flags.define_string("model", "", "The model files folder of topic models")
flags.define_string("output", "constraints/tmp", "Output filename")
flags.define_int("topics_cutoff", 30, "Number of topic words")
flags.define_int("cannot_links", 0, "Number of cannot links that we want")
flags.define_int("must_links", 0, "Number of must links that we want")
flags.define_int("num_topics", 20, "Number of topics")
flags.define_bool("train_only", False,
                  "Using only train data to generate the constraints")
flags.define_int("window_size", 10, "Size of window for computing cooccurrence")
flags.define_float("tfidf_thresh", 0.0, "Threshold for tfidf")

if __name__ == "__main__":
    flags.InitFlags()

    # Getting statistics: slower version, full statistics, higher memory cost
    # [cooccur, wordcount, tfidf, vocab_index_word, topics, doc_num] \
    #     = get_statistics_all(flags.corpus, flags.num_topics, flags.model, \
    #                          flags.topics_cutoff, flags.window_size, flags.train_only)

    # Getting statistics: faster version, partial statistics, memory efficient
    print "Reading vocab"
    [vocab_word_index, vocab_index_word] = readVocab(flags.vocab)
    vocab_size = len(vocab_word_index)
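# The constraint lines written by these scripts are a tag followed by
# tab-separated words (see the MERGE_ line in the earlier fragment).  A
# minimal helper in that same shape; the must-link / cannot-link tag strings
# expected by the downstream model are not shown here, so any tag other than
# MERGE_ is an assumption.
def write_link_line_sketch(out, tag, words):
    out.write(tag + '\t' + '\t'.join(words) + '\n')

# Example: write_link_line_sketch(open("constraints/tmp", "w"), "MERGE_",
#                                 ["weapon", "gun"])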
from numpy import *

from topicmod.external.moremath import *
from topicmod.util import flags

flags.define_string("alpha", None, "The current value of alpha")
flags.define_string("gamma", None, "The current gamma matrix")
flags.define_float("tolerance", 0.001, "Tolerance for convergence")

NEGATIVE_INFINITY = -float("inf")


def l_alpha(alpha, M, K, gamma_grad):
    # Alpha-dependent part of the variational bound: M documents contribute
    # the Dirichlet log-normalizer, plus the per-topic terms accumulated in
    # gamma_grad.
    val = lngamma(sum(alpha)) - sum(lngamma(x) for x in alpha)
    val *= M
    for ii in xrange(K):
        val += alpha[ii] * gamma_grad[ii]
    return val


def compute_gamma_gradient(gamma, K):
    """
    Compute the components of the derivative that gamma contributes to.
    """
    grad = zeros(K)
    for gamma_d in gamma:
        digam_gamma_sum = digamma(sum(gamma_d))
        for ii in xrange(K):
            grad[ii] += digamma(gamma_d[ii]) - digam_gamma_sum
    return grad
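# The two helpers above carry the alpha-dependent part of LDA's variational
# bound: compute_gamma_gradient accumulates the digamma statistics of the
# per-document gamma parameters, and l_alpha evaluates the bound for a given
# alpha.  Below is a sketch of the gradient of l_alpha (using the same
# digamma the functions above rely on) and a simple fixed-step ascent; the
# script's actual optimizer is elided, so this update rule is an illustrative
# assumption, not the author's method.
def alpha_gradient_sketch(alpha, M, K, gamma_grad):
    # d l_alpha / d alpha[ii]
    dig_sum = digamma(sum(alpha))
    return [M * (dig_sum - digamma(alpha[ii])) + gamma_grad[ii]
            for ii in range(K)]


def ascend_alpha_sketch(alpha, M, K, gamma_grad, step=1.0e-3, iterations=100):
    alpha = list(alpha)
    for _ in range(iterations):
        grad = alpha_gradient_sketch(alpha, M, K, gamma_grad)
        # Keep every component of alpha positive after each step.
        alpha = [max(1.0e-10, a + step * g) for a, g in zip(alpha, grad)]
    return alpha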