    else:
        topic_assignments_out.write(topic_assign)
        path_assignments_out.write(path_assign)

    topic_assignments_in.close()
    topic_assignments_out.close()
    path_assignments_in.close()
    path_assignments_out.close()
    docs_in.close()
    docs_out.close()

    return new_topics + 1


flags.define_string("corpus", None, "Where we find the input corpora")
flags.define_string("mapping", None, "Filename of mapping")
flags.define_string("cons_file", "", "Constraints filename")
flags.define_glob("wordnet", "wn/output.0", "constraint source")
flags.define_string("input_base", "output/nih", "Input filename")
flags.define_string("output_base", "output/nih_ned", "Output filename")
flags.define_string("resume_type", "clear", "resume type: clear or split")
flags.define_string("update_strategy", "doc", "update strategy: term or doc")
flags.define_int("doc_limit", -1, "Number of documents to process")
flags.define_int("num_topics", 0, "Current number of topics")

if __name__ == "__main__":
    flags.InitFlags()

    if re.search("doc", flags.update_strategy):
        update_strategy = 1
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags
from topicmod.corpora.amazon import AmazonCorpus

flags.define_int("doc_limit", -1, "How many documents we add")
flags.define_list("langs", ["en"], "Which languages do we add")
flags.define_string("base", "../../data/multiling-sent/", \
                    "Where we look for data")
flags.define_string("output", "/tmp/", "Where we write output")

LANGUAGE_CONSTANTS = {"en": ENGLISH, "de": GERMAN, "zh": CHINESE,
                      "fr": FRENCH, "es": SPANISH, "ar": ARABIC}

if __name__ == "__main__":
    flags.InitFlags()

    corpus = AmazonCorpus(flags.base, flags.doc_limit)
    for ll in flags.langs:
        corpus.add_language("amzn-%s/*/*" % ll, LANGUAGE_CONSTANTS[ll])

    corpus.write_proto(flags.output + "numeric", "amazon")
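# A minimal usage sketch (the script filename and every path below are
# hypothetical placeholders, not taken from the repository).  The define_*
# calls above register command-line options that flags.InitFlags() parses
# from sys.argv, so a run would look something like:
#
#   python build_amazon_corpus.py --base=../../data/multiling-sent/ \
#       --langs=en,de --doc_limit=1000 --output=/tmp/amazon_
#
# which writes the protocol-buffer corpus under the /tmp/amazon_numeric prefix.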
from collections import defaultdict

from topicmod.util import flags
from topicmod.util.wordnet import load_wn
from topicmod.ling.dictionary import *
from topicmod.ling.snowball_wrapper import Snowball
from topicmod.corpora.ontology_writer import OntologyWriter
from topicmod.corpora.ontology_writer import orderedTraversal
from topicmod.corpora.ml_vocab import MultilingualVocab
from topicmod.corpora.proto.corpus_pb2 import *

flags.define_int("limit", 250, "How many items in our MuTo matching")
flags.define_bool("dictionary", False, "Use a dictionary")
flags.define_string("dic_dir", "../../data/dictionaries/", "Use a dictionary")
flags.define_bool("translation", False, "Use translation matrix")
flags.define_bool("greedy_matching", False, "Use a matching from dictionary")
flags.define_bool("wordnet", False, "Use WordNet as scaffold")
flags.define_bool("german", False, "Include German")
flags.define_bool("chinese", False, "Include Chinese")
flags.define_string("output", "", "Where we write ontology")
flags.define_float("trans_cutoff", 0.5, "Min value for using the translation")
flags.define_string("wn_version", "3.0", "Which version of WN we use")
flags.define_string("filter_vocab", "", "Filter entries based on vocabulary")
flags.define_bool("stem", False, "Stem words")
flags.define_bool("id_strings", False, "Add identical strings")

# Generate an updated vocab: not all the words in the original vocab will be
# included in the generated wordnet, so generate a new vocab that contains
# only the words in the wordnet.
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags
from topicmod.corpora.flat import FlatCorpus
# from topicmod.corpora.flat import FlatEmailCorpus

flags.define_int("doc_limit", -1, "How many documents we add")
flags.define_string("base", "../../data/yn_toy/", "Where we look for data")
flags.define_string("output", "../../data/yn_toy/numeric",
                    "Where we write output")

if __name__ == "__main__":
    flags.InitFlags()

    corpus = FlatCorpus(flags.base, flags.doc_limit)
    corpus.add_language("*", ENGLISH)
    print flags.output
    corpus.write_proto(flags.output, "yn_toy")
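# A sketch of the directory layout this script assumes (hypothetical file
# names; FlatCorpus presumably treats every file matched by the "*" pattern
# under --base as a single English document, an assumption, not something
# documented here):
#
#   ../../data/yn_toy/
#       doc_0001.txt
#       doc_0002.txt
#       ...
#
# The matched documents are then serialized to the proto index at --output.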
        pmi_score /= len(word_pairs)
        tmp = str(tt) + "\t" + str(len(word_pairs)) + "\t" + str(pmi_score) + "\n"
        infile.write(tmp)
        total_pmi_score += pmi_score

    total_pmi_score /= len(topics.keys())
    tmp = "total" + "\t" + str(len(topics.keys())) + "\t" \
        + str(total_pmi_score) + "\n"
    infile.write(tmp)
    infile.close()


flags.define_string("vocab", "", "Where we find the vocab")
flags.define_string("model", "", "The model files folder of topic models")
flags.define_string("stats", None, "Where we find the stat_file")
flags.define_int("topics_cutoff", 30, "Number of topics")
flags.define_int("window_size", 10, "Size of window for computing cooccurrence")
flags.define_string("output", "output/PMI_score", "PMI Output filename")

if __name__ == "__main__":
    flags.InitFlags()

    print "Reading vocab"
    [vocab_word_index, vocab_index_word] = readVocab(flags.vocab)
    vocab_size = len(vocab_word_index)

    print "Reading topic words"
from topicmod.corpora.semcor import SemcorCorpus
from topicmod.util import flags

flags.define_string("semcor_base", "../../data/semcor-%s/", \
                    "Where we find the semcor corpus")
flags.define_string("wordnet_base", "../../data/wordnet/", \
                    "Where we find the wordnet corpus")
flags.define_string("version", "3.0", "Version of WordNet used")
flags.define_string("semcor_output", None, "Where we write the output")

if __name__ == "__main__":
    flags.InitFlags()

    semcor = SemcorCorpus(flags.semcor_base % flags.version)
    semcor.load_wn(flags.wordnet_base, flags.version)
    semcor.add_language("brown1/tagfiles/*")
    semcor.add_language("brown2/tagfiles/*")
    # semcor.add_language("brownv/tagfiles/br-e*")
    semcor.write_proto(flags.semcor_output, "semcor", 80)
from topicmod.util import flags
import re
from numpy import zeros

flags.define_string("folder_base", "output/20_news/", \
                    "Input file folder")
flags.define_string("output_base", "output/20_news/results_compare", \
                    "Output file name")

if __name__ == "__main__":
    flags.InitFlags()

    # comparing 1
    filename = flags.output_base + 'results_compare_1.csv'
    outputfile = open(filename, 'w')

    folders = open(flags.folder_base + 'folders.txt', 'r')
    for folder in folders:
        folder = folder.strip()
        tmp = folder + '\n\n'
        outputfile.write(tmp)

        filename = folder.replace('results_', 'iter_100_')
        filename = flags.folder_base + folder + '/' + filename + '.txt'
        inputfile = open(filename, 'r')
        for line in inputfile:
            outputfile.write(line)
        inputfile.close()

        outputfile.write('\n\n\n')
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags
from topicmod.corpora.europarl import EuroparlCorpus

flags.define_string("base", "../../data/europarl/", "Where we look for data")
flags.define_string("output", "/tmp/", "Where we write output")
flags.define_int("doc_limit", -1, "How many documents we add")

if __name__ == "__main__":
    flags.InitFlags()

    # Two-digit session years: 96 ... 99, then 00 ... 06
    for ii in xrange(96, 107):
        year = ii % 100
        print " *** YEAR %i *** " % year
        corpus = EuroparlCorpus(flags.base, flags.doc_limit)
        corpus.add_language("english/ep-%02i-*.en" % year, ENGLISH)
        corpus.add_language("german/ep-%02i-*.de" % year, GERMAN)
        corpus.write_proto(flags.output + "numeric", "europarl%02i" % year, 1000)
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags
# from topicmod.corpora.flat import FlatCorpus
from topicmod.corpora.crossfire import CrossfireCorpus

flags.define_int("doc_limit", -1, "How many documents we add")
flags.define_string("base", "../../data/crossfire/cf/clean/",
                    "Where we look for data")
flags.define_string("output", "/tmp/", "Where we write output")

if __name__ == "__main__":
    flags.InitFlags()

    # corpus = FlatCorpus(flags.base, flags.doc_limit)
    corpus = CrossfireCorpus(flags.base, flags.doc_limit)
    corpus.add_language("*.txt")
    corpus.write_proto(flags.output + "numeric", "crossfire")
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags
from topicmod.corpora.pang_lee_movie import PangLeeMovieCorpus

flags.define_string("base", "../../data/movies/", "Where we look for data")
flags.define_string("output", "/tmp/", "Where we write output")
flags.define_string("response", "rating", "Which rating format we use")
flags.define_int("doc_limit", -1, "How many documents we add")

if __name__ == "__main__":
    flags.InitFlags()

    corpus = PangLeeMovieCorpus(flags.base, flags.doc_limit)
    corpus.add_language("pang_lee/*/subj.*", flags.response, ENGLISH)
    corpus.write_proto(flags.output + "numeric", "movies", 1000)

    corpus = PangLeeMovieCorpus(flags.base, flags.doc_limit)
    corpus.add_language("filmrezension.de_lines/*/subj.*", flags.response, \
                        GERMAN)
    corpus.write_proto(flags.output + "numeric", "movies", 100)
    print path2

    common = set(path1).intersection(set(path2))
    first = min(common)
    assert(first >= len(word_list))
    first -= len(word_list)

    cluster_root = Z[first][0]
    merge1 = findCluster(Z, cluster_root, word_list)
    cluster_root = Z[first][1]
    merge2 = findCluster(Z, cluster_root, word_list)

    print merge1
    print merge2


flags.define_string("vocab", "vocab/20_news.voc", "Where we find the vocab")
flags.define_string("stats", None, "Where we find the stat_file")
flags.define_string("model", "", "The model files folder of topic models")
flags.define_string("constraint", "constraints/tmp",
                    "Original constraint file")
flags.define_int("topics_cutoff", 30, "Number of topic words")

if __name__ == "__main__":
    # test()
    flags.InitFlags()

    # getting statistics: faster version, partial statistics, memory efficient
    print "Reading vocab"
    [vocab_word_index, vocab_index_word] = readVocab(flags.vocab)
    vocab_size = len(vocab_word_index)
from numpy import *

from topicmod.external.moremath import *
from topicmod.util import flags

flags.define_string("alpha", None, "The current value of alpha")
flags.define_string("gamma", None, "The current gamma matrix")
flags.define_float("tolerance", 0.001, "Tolerance for convergence")

NEGATIVE_INFINITY = -float("inf")


def l_alpha(alpha, M, K, gamma_grad):
    val = lngamma(sum(alpha)) - sum(lngamma(x) for x in alpha)
    val *= M
    for ii in xrange(K):
        val += alpha[ii] * gamma_grad[ii]
    return val


def compute_gamma_gradient(gamma, K):
    """
    Compute the components of the derivative that gamma contributes to.
    """
    grad = zeros(K)
    for gamma_d in gamma:
        digam_gamma_sum = digamma(sum(gamma_d))
        for ii in xrange(K):
            grad[ii] += digamma(gamma_d[ii]) - digam_gamma_sum
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags
from topicmod.corpora.flat import FlatCorpus

flags.define_int("doc_limit", -1, "How many documents we add")
flags.define_string("base", "../../data/values_turk/", \
                    "Where we look for data")
flags.define_string("output", "/tmp/", "Where we write output")

if __name__ == "__main__":
    flags.InitFlags()

    corpus = FlatCorpus(flags.base, flags.doc_limit)
    corpus.add_language("1/*")
    corpus.add_language("2/*")
    corpus.write_proto(flags.output + "numeric", "values_turk")
from collections import defaultdict

from topicmod.corpora.proto.wordnet_file_pb2 import *
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags

import re

flags.define_string("vocab", "vocab/semcor.voc", \
                    "the vocabulary used for building the tree")
flags.define_string("wnname", "wn/wordnet.wn", "Where we write output")
flags.define_string("constraints", "", "where we get the constraints, " +
                    "one tab-delimited constraint per line")
flags.define_bool("write_constraints", False, "Write out example constraint")
flags.define_bool("write_wordnet", False, "Write out wordnet")
flags.define_bool("write_toy", False, "Write out a toy wordnet")
flags.define_bool("merge_constraints", True, "Put duplicate constraints into" +
                  " a single constraint")


def orderedTraversal(wn, pos='n', limit_depth=-1, reverse_depth=False):
    """
    Given a wordnet object, give the synsets in order of internal nodes
    first, followed by leaves.

    @param pos Which part of speech we search
    @param limit_depth Don't consider nodes deeper than this
    @param reverse_depth Reverse the order of the search (leaves first)
    """

    # Find the max depth synset
# defaultdict is needed for the vocab/roles lookups below
from collections import defaultdict

from nltk import FreqDist

from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.corpora.corpus_vocab_wrapper import CorpusVocabWrapper
from topicmod.util import flags

from parse_reader import *

flags.define_int("docs_per_file", 100, "Number of documents per file")
flags.define_int("vocab_size", 5000, "Maximum vocabulary size")
flags.define_bool("remove_stop", False, "remove stopwords")
flags.define_bool("use_lemma", False, "Use lemmas instead of raw tokens")
flags.define_bool("use_relation", False, "Use relation (synset) instead of pos")
flags.define_glob("vocab_source", None, "Where we get initial vocabulary")
flags.define_string("output_path", None, "Where we write the translated corpus")
flags.define_string("output_filename", "wacky_en_reduced.index",
                    "Filename of index")
flags.define_int("min_length", 100, "Number of characters in document line")


class CorpusTranslator:

    def __init__(self, output_path, use_lemma, docs_per_file):
        self.output_base = output_path
        self.document_list = []
        self.use_lemma = use_lemma

        # A lookup for each language
        self.vocab = defaultdict(dict)
        self.roles = defaultdict(dict)

        self.output_corpus = Corpus()
from topicmod.corpora.proto.corpus_pb2 import Corpus
from topicmod.util import flags

flags.define_glob("corpus_parts", None, "Where we look for vocab")
flags.define_string("output", None, "Where we write the mapping")

if __name__ == "__main__":
    flags.InitFlags()

    mapping = {}
    for ii in flags.corpus_parts:
        print ii
        cp = Corpus()
        cp.ParseFromString(open(ii, 'r').read())

        for jj in cp.authors.terms:
            if jj.id in mapping:
                assert mapping[jj.id] == jj.original
            mapping[jj.id] = jj.original

    o = open(flags.output, 'w')
    # Author ids are assumed dense, starting at zero; write one original
    # string per line, including the largest id.
    for ii in xrange(max(mapping) + 1):
        o.write("%s\n" % mapping[ii])
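# Usage sketch (hypothetical script name and paths, not taken from the
# repository).  Point --corpus_parts at the proto index shards that carry the
# author vocabulary and collect the id -> string table into one flat file:
#
#   python build_author_mapping.py \
#       --corpus_parts="output/nih/*.index" --output=output/nih/authors.map
#
# Line n of the resulting file then holds the original string for author id n.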
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags
from topicmod.corpora.pang_lee_movie import PangLeeMovieCorpus

flags.define_string("base", "../../data/rdd/moviestyledata/",
                    "Where we look for data")
flags.define_string("output", "../../data/rdd/moviestyleproto/numeric/",
                    "Where we write output")
flags.define_string("response", "rating", "Which rating format we use")
flags.define_int("doc_limit", -1, "How many documents we add")

if __name__ == "__main__":
    flags.InitFlags()

    corpus = PangLeeMovieCorpus(flags.base, flags.doc_limit)
    corpus.add_language("*/subj.*", flags.response, DIXIE)
    corpus.write_proto(flags.output + "numeric", "richmond", 1000)

    # corpus = PangLeeMovieCorpus(flags.base, flags.doc_limit)
    # corpus.add_language("filmrezension.de_lines/*/subj.*", flags.response, \
    #                     GERMAN)
    # corpus.write_proto(flags.output + "numeric", "richmond", 100)
from topicmod.corpora.wacky import *
from topicmod.util import flags

flags.define_string("wackypedia_base", "../../data/wackypedia/compressed/",
                    "Where we find the wackypedia corpus")
flags.define_string("output", "/tmp/jbg/wackypedia/", "Where we write output")
flags.define_int("doc_limit", 10, "Max number of docs")
flags.define_list("langs", ["en"], "Which languages")

if __name__ == "__main__":
    flags.InitFlags()

    wacky = WackyCorpus(flags.wackypedia_base, flags.doc_limit)
    for ii in flags.langs:
        wacky.add_language("wackypedia_%s*.gz" % ii)

    wacky.write_proto(flags.output + "numeric", "wpdia", 10000)
from topicmod.util import flags

flags.define_string("input_base", "output/20_news/iter_100_PMI_", \
                    "Input file folder")
flags.define_string("output_base", "output/20_news/iter_100_PMI", \
                    "Output file name")
flags.define_string("PMI_file", "PMI_score", \
                    "Output file name")
flags.define_int("round_num", 5, "Number of interactive rounds")

if __name__ == "__main__":
    flags.InitFlags()

    results = dict()
    rounds = flags.round_num + 1
    for ii in range(0, rounds):
        filename = flags.input_base + str(ii) + "/" + flags.PMI_file
        inputfile = open(filename, 'r')
        for line in inputfile:
            line = line.strip()
            words = line.split('\t')
            if words[0].find('total') >= 0:
                word_key = -1
            else:
                word_key = int(words[0])
            if word_key not in results.keys():
                results[word_key] = []
            results[word_key].append(words[2])

    outputfile = open(flags.output_base, 'w')
    for tt in results.keys():
        word_senses_count[word] = 0
        count_word += 1

        tmp = word
        for pos in multipaths[word]:
            tmp += '\t' + pos
            for index in multipaths[word][pos]:
                word_senses_count[word] += 1
                count_sense += 1
                tmp += '\t' + str(index)

        if word_senses_count[word] > 1:
            im_words += word + " "
        outfile.write(tmp + '\n')

    outfile.write("\nThe total number of cons words: " + str(count_word) + "\n")
    outfile.write("\nThe total number of cons words senses: "
                  + str(count_sense) + "\n")
    outfile.write("\nInteresting words: " + im_words + "\n")
    outfile.close()


flags.define_string("vocab", None, "The input vocab")
flags.define_string("output", None, "The output constraint file")
flags.define_int("num_cons", 0, "The number of constraints we want")

if __name__ == "__main__":
    flags.InitFlags()

    wordnet_path = "../../../data/wordnet/"
    eng_wn = load_wn("3.0", wordnet_path, "wn")
    vocab = readVocab(flags.vocab)
    generateCons(vocab, eng_wn, flags.output, flags.num_cons)
import sys
import os
# defaultdict is needed for the per-row storage in Array below
from collections import defaultdict

from topicmod.util import flags

from syntop_parameters_pb2 import *

flags.define_int("num_iterations", 1, "Number of iterations")
flags.define_string("model_name", "output/model", "Where we find data")
flags.define_string("corpus", None, "The source corpus")
flags.define_bool("hadoop", False, "Do we use hadoop or local batch")
flags.define_bool("doc_step", True, "Do we call the document-centric parts")
flags.define_bool("merge_step", True,
                  "Do we merge doc step results (and compute new topics)")
flags.define_bool("update_global", True,
                  "Do we compute new transition and DP variational parameters")


class Array:

    def __init__(self, name):
        self._rows = {}
        self._name = name

    def __getitem__(self, index):
        if index not in self._rows:
            self._rows[index] = defaultdict(float)
        return self._rows[index]

    def __iter__(self):
        for ii in self._rows:
            yield self._rows[ii]

    def parse(self, key, val):
from topicmod.util import flags
from topicmod.util.sets import count_line
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.corpora.proto.wordnet_file_pb2 import *
from topicmod.corpora.ml_vocab import MultilingualVocab
from topicmod.corpora.ml_vocab import Vocab

from collections import defaultdict
import codecs

flags.define_string("output", "", "Where we write output")
flags.define_glob("doc_roots", "", "The document vocab")
flags.define_string("vocab", "", "The vocab file")
flags.define_string("location", "", "Where the data live")
flags.define_int("min_length", 50, "Minimum number of tokens")
flags.define_int("num_docs", -1, "Number of documents we write")
flags.define_string("language", "en", "What language this is")

kLANGUAGE_ID = {"en": ENGLISH, "de": GERMAN, "zh": CHINESE}


def lda_line(filename, full_vocab, filtered_vocab):
    d = defaultdict(int)

    doc = Document()
    doc.ParseFromString(open(filename, 'rb').read())

    num_words = 0
    for sent in doc.sentences:
        for word in sent.words:
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags
# from topicmod.corpora.flat import FlatCorpus
from topicmod.corpora.flat import FlatEmailCorpus

flags.define_int("doc_limit", -1, "How many documents we add")
flags.define_string("base", "../../data/20_news_date/", \
                    "Where we look for data")
flags.define_string("output", "/tmp/", "Where we write output")

if __name__ == "__main__":
    flags.InitFlags()

    # corpus = FlatCorpus(flags.base, flags.doc_limit)
    corpus = FlatEmailCorpus(flags.base, flags.doc_limit)
    corpus.add_language("train/*/*")
    corpus.add_language("test/*/*")
    corpus.write_proto(flags.output + "numeric", "20_news_date")
            pmi_score += pmi

        pmi_score /= len(word_pairs)
        tmp = str(tt) + "\t" + str(len(word_pairs)) + "\t" + str(pmi_score) + "\n"
        infile.write(tmp)
        total_pmi_score += pmi_score

    total_pmi_score /= len(topics.keys())
    tmp = "total" + "\t" + str(len(topics.keys())) + "\t" \
        + str(total_pmi_score) + "\n"
    infile.write(tmp)
    infile.close()


flags.define_string("vocab", "", "Where we find the vocab")
flags.define_string("model", "", "The model files folder of topic models")
flags.define_string("stats", None, "Where we find the stat_file")
flags.define_int("topics_cutoff", 30, "Number of topics")
flags.define_int("window_size", 10, "Size of window for computing cooccurrence")
flags.define_string("output", "output/PMI_score", "PMI Output filename")

if __name__ == "__main__":
    flags.InitFlags()

    print "Reading vocab"
    [vocab_word_index, vocab_index_word] = readVocab(flags.vocab)
    vocab_size = len(vocab_word_index)

    print "Reading topic words"
from numpy.random.mtrand import dirichlet
from numpy.random import multinomial
from numpy.random import normal
from math import isnan, isinf

from topicmod.util import flags
from topicmod.corpora.proto.corpus_pb2 import *

flags.define_int("num_docs", 500, "Number of documents")
flags.define_int("num_topics", 5, "Number of topics")
flags.define_int("doc_length", 5, "Length of every document")
flags.define_int("num_langs", 2, "Number of languages")
flags.define_float("variance", 0.5, "Variance of distribution")
flags.define_float("gamma", 1.0, "Vocabulary hyperparameter")
flags.define_float("alpha", 0.1, "Document topic hyperparameter")
flags.define_string("output_base", "data/synthetic", "Where we write the data")
flags.define_string("doc_proportion", "synthetic.theta",
                    "Where we write doc thetas")
flags.define_int("num_groups", 2, "Number of splits")
flags.define_string("vocab_output", "vocab/synthetic.voc",
                    "Where we write vocab")
flags.define_int("topic_output_size", 15,
                 "Number of words to display when we output topics")

ml_vocab = [{0: ["dog", "cat", "moose", "butterfly"],
             1: ["hund", "katze", "spinner", "pferd", "maultier", "kuh"],
             2: ["toro", "mariposa", "gato", "vaca", "donkey", "burro",
                 "caballo", "mosquito", "arana", "pavo"]
            tmp = 'SPLIT_\t' + w1 + '\t' + w2 + '\n'
            output_file.write(tmp)
            count += 1

    count = 0
    for (w1, w2) in must.keys():
        if count < must_links_num:
            pmi = must[(w1, w2)]
            tmp = 'MERGE_\t' + w1 + '\t' + w2 + '\n'
            output_file.write(tmp)
            count += 1

    output_file.close()


flags.define_string("vocab", "vocab/20_news.voc", "Where we find the vocab")
flags.define_string("stats", "", "Where we find the stat_file")
flags.define_string("model", "", "The model files folder of topic models")
flags.define_string("output", "constraints/tmp", "Output filename")
flags.define_int("topics_cutoff", 30, "Number of topic words")
flags.define_int("cannot_links", 0, "Number of cannot links that we want")
flags.define_int("must_links", 0, "Number of must links that we want")
flags.define_int("num_topics", 20, "Number of topics")
flags.define_bool("train_only", False, "Using only train data to \
generate the constraints")
flags.define_int("window_size", 10, "Size of window for computing cooccurrence")
flags.define_float("tfidf_thresh", 0.0, "threshold for tfidf")

if __name__ == "__main__":
from collections import defaultdict

from nltk import FreqDist

from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.corpora.corpus_vocab_wrapper import CorpusVocabWrapper
from topicmod.util import flags

from parse_reader import *

flags.define_int("docs_per_file", 100, "Number of documents per file")
flags.define_int("vocab_size", 5000, "Maximum vocabulary size")
flags.define_bool("remove_stop", False, "remove stopwords")
flags.define_bool("use_lemma", False, "Use lemmas instead of raw tokens")
flags.define_bool("use_relation", False, "Use relation (synset) instead of pos")
flags.define_glob("vocab_source", None, "Where we get initial vocabulary")
flags.define_string("output_path", None, "Where we write the translated corpus")
flags.define_string("output_filename", "wacky_en_reduced.index",
                    "Filename of index")
flags.define_int("min_length", 100, "Number of characters in document line")


class CorpusTranslator:

    def __init__(self, output_path, use_lemma, docs_per_file):
        self.output_base = output_path
        self.document_list = []
        self.use_lemma = use_lemma

        # A lookup for each language
        self.vocab = defaultdict(dict)
        self.roles = defaultdict(dict)

        self.output_corpus = Corpus()
import codecs
from collections import defaultdict

from topicmod.util import flags
from topicmod.ling.dictionary import DingEntries

flags.define_string("vocab", "", "Where we read vocab")
flags.define_float("smoothing", 0.001, "Smoothing amount")
flags.define_float("hit", 1.0, "Value if there's a hit")
flags.define_string("output", "lda/lambda", "Lambda output")

if __name__ == "__main__":
    flags.InitFlags()

    vocab = defaultdict(dict)
    index = defaultdict(int)
    for ii in codecs.open(flags.vocab):
        lang, word = ii.split("\t")
        lang = int(lang)
        vocab[lang][word.strip()] = index[lang]
        index[lang] += 1

    trans = defaultdict(set)
    sum = defaultdict(float)
    for ii in vocab[0]:
        for jj in vocab[1]:
            if ii == jj:
                if vocab[1][jj] % 100 == 0:
print "Submitting ...", filename, qsub(filename, max_jobs) if delete: print "deleted" os.remove(filename) print(filename) print "" if __name__ == "__main__": from topicmod.util import flags mkdir("/tmp/%s" % USER) mkdir("/tmp/%s/qsub-scripts" % USER) flags.define_string("template", "", "Where we read the template file from") flags.define_dict("args", {}, "Substitute values for the template") flags.define_dict("defaults", {}, "Default args") flags.define_string("wall", "24:00:00", "The wall time") flags.define_string("name", "", "Name given to job on cluster") flags.define_string("mem", "4gb", "How much memory we give") flags.define_string("queue", "shallow", "Which queue do we submit to") flags.define_int("max_jobs", 16, "Number of simultaneous jobs on cluster") flags.define_bool("delete_scripts", True, "Do we delete after we're done?") flags.define_bool("submit", True, "Do we submit") flags.InitFlags() template = open(flags.template).read() d = flags.defaults d["wall"] = flags.wall
from math import log
from random import random

from topicmod.util import flags

from syntop_parameters_pb2 import *

flags.define_string("vocab", None, "Size of vocabulary")
flags.define_int("num_docs", None, "Number of documents")
flags.define_int("num_topics", 128, "Number topics")
flags.define_string("model_name", "output/model", "Name of model")
flags.define_bool("finite", False, "Use finite model")
flags.define_bool("ignore_trans", False, "Use only documents")
flags.define_bool("ignore_docs", False, "Use only syntax")
flags.define_bool("shortcut_gsl", False, "Use closed form updates when possible")
flags.define_int("max_doc_iterations", 5, "Number of e-step rounds per-document")
flags.define_int("alpha_doc", 1.0, "DP parameter for doc distributions")
flags.define_int("alpha_trans", 1.0, "DP parameter for transition distributions")
flags.define_int("alpha_top", 1.0,
                 "DP parameter for top-level stick-breaking distribution")
flags.define_int("vocab_sigma", 0.1, "Vocab hyperparameters")

if __name__ == "__main__":
    flags.InitFlags()

    params = SyntopParameters()
    params.finite = flags.finite
    params.ignore_trans = flags.ignore_trans
    params.ignore_docs = flags.ignore_docs
    params.shortcut_gsl = flags.shortcut_gsl
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags
# from topicmod.corpora.flat import FlatCorpus
from topicmod.corpora.crossfire import CrossfireCorpus

flags.define_int("doc_limit", -1, "How many documents we add")
flags.define_string("base", "../../data/crossfire/cf/clean/", \
                    "Where we look for data")
flags.define_string("output", "/tmp/", "Where we write output")

if __name__ == "__main__":
    flags.InitFlags()

    # corpus = FlatCorpus(flags.base, flags.doc_limit)
    corpus = CrossfireCorpus(flags.base, flags.doc_limit)
    corpus.add_language("*.txt")
    corpus.write_proto(flags.output + "numeric", "crossfire")
from topicmod.util import flags
from topicmod.util.sets import count_line
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.corpora.proto.wordnet_file_pb2 import *
from topicmod.corpora.ml_vocab import MultilingualVocab
from topicmod.corpora.ml_vocab import Vocab

from collections import defaultdict
import codecs

flags.define_string("output", "", "Where we write output")
flags.define_glob("doc_roots", "", "The document vocab")
flags.define_string("vocab", "", "The vocab file")
flags.define_string("location", "", "Where the data live")
flags.define_int("min_length", 50, "Minimum number of tokens")
flags.define_int("num_docs", -1, "Number of documents we write")
flags.define_string("language", "en", "What language this is")

kLANGUAGE_ID = {"en": ENGLISH, "de": GERMAN, "zh": CHINESE}


def lda_line(filename, full_vocab, filtered_vocab):
    d = defaultdict(int)

    doc = Document()
    doc.ParseFromString(open(filename, 'rb').read())

    num_words = 0
    for sent in doc.sentences:
        for word in sent.words:
            new_word = full_vocab.get_word(doc.language, word.token)
import sys
import os
# defaultdict is needed for the per-row storage in Array below
from collections import defaultdict

from topicmod.util import flags

from syntop_parameters_pb2 import *

flags.define_int("num_iterations", 1, "Number of iterations")
flags.define_string("model_name", "output/model", "Where we find data")
flags.define_string("corpus", None, "The source corpus")
flags.define_bool("hadoop", False, "Do we use hadoop or local batch")
flags.define_bool("doc_step", True, "Do we call the document-centric parts")
flags.define_bool("merge_step", True,
                  "Do we merge doc step results (and compute new topics)")
flags.define_bool("update_global", True,
                  "Do we compute new transition and DP variational parameters")


class Array:

    def __init__(self, name):
        self._rows = {}
        self._name = name

    def __getitem__(self, index):
        if index not in self._rows:
            self._rows[index] = defaultdict(float)
        return self._rows[index]

    def __iter__(self):
import os
import gzip

from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags

# Input
flags.define_string("doc_filter", None, "Files to filter out")
flags.define_string("vocab", None, "The file that defines the vocab")
flags.define_string("state_file", None, \
                    "The state file that creates the corpus")

# Output
flags.define_string("state_output", None, "Where we write state file")
flags.define_string("corpus_output_path", None, "Where we write the corpus")
flags.define_string("corpus_name", "NIH", "Name of the corpus")

# Options
flags.define_int("docs_per_index", 5000, "Number of docs per section")
flags.define_int("doc_limit", -1, "Cap on number of documents")


class MalletAssignment:

    def __init__(self, line, debug=False):
        if debug:
            for ii in xrange(len(line.split())):
                print ii, line.split()[ii]

        self.doc, foo, self.index, self.term_id, self.term, self.assignment = \
            line.split()
        self.doc = int(self.doc)
        rank = tfidf
    else:
        rank = frequency

    o = codecs.open(outputname, 'w', 'utf-8')
    for ii in rank:
        count = 0
        for jj in rank[ii]:
            count += 1
            if count <= vocab_limit and frequency[ii][jj] >= freq_limit:
                word = vocab[ii][jj]
                o.write(u"%i\t%s\t%f\t%i\n" % (ii, word, tfidf[ii][jj],
                                               frequency[ii][jj]))
    o.close()


flags.define_string("proto_corpus", None, "The proto files")
flags.define_bool("lemma", False, "Use lemma or tokens")
flags.define_bool("select_tfidf", False,
                  "select the vocab by tfidf or frequency")
flags.define_string("output", "", "Where we output the preprocessed data")
flags.define_string("vocab", None, "Where we output the vocab")
flags.define_int("vocab_limit", 10000, "The vocab size")
flags.define_int("freq_limit", 20, "The minimum frequency of each word")

if __name__ == "__main__":
    flags.InitFlags()

    [vocab, tfidf, frequency] = gen_files(flags.proto_corpus, flags.output,
                                          flags.lemma)
    gen_vocab(vocab, tfidf, frequency, flags.select_tfidf, flags.vocab,
              flags.vocab_limit, flags.freq_limit)
from topicmod.corpora.nyt_reader import *
from topicmod.util import flags

flags.define_string("nyt_base", "../../data/new_york_times/",
                    "Where we find the nyt corpus")
flags.define_int("doc_limit", -1, "How many documents")
flags.define_string("output", "/tmp/jbg/nyt/", "Where we write data")
flags.define_float("bigram_limit", 0.9, "p-value for bigrams")

if __name__ == "__main__":
    flags.InitFlags()

    nyt = NewYorkTimesReader(flags.nyt_base, flags.doc_limit,
                             flags.bigram_limit)
    nyt.add_language_list("../../data/new_york_times/editorial_file_list")
    nyt.write_proto(flags.output + "numeric", "nyt", 1000)
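# Usage sketch (hypothetical script name; the paths are just the flag
# defaults).  The trailing 1000 passed to write_proto above is presumably the
# number of documents per output shard, matching the docs_per_index convention
# used elsewhere in the repository, though that is an assumption:
#
#   python build_nyt_corpus.py --nyt_base=../../data/new_york_times/ \
#       --doc_limit=-1 --output=/tmp/jbg/nyt/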
import re
import os.path

from proto.corpus_pb2 import *
from proto.wordnet_file_pb2 import *

from topicmod.util import flags
from topicmod.util.sets import read_pickle, write_pickle

flags.define_int("option", 0, \
                 "whether to change whole documents or just the topics of the affected words")
flags.define_string("ldawnoutput", "output/nsf", "ldawn output directory")
flags.define_string("maps", "output/nsf", "mapping files directory")
flags.define_string("wordnet", "wn/output.0", "constraint source")
flags.define_string("assignment_path", None, "Where the assignments live")


def checkSame(cons, old_cons):
    if len(cons) != len(old_cons):
        return False
    for key in cons:
        if key not in old_cons:
            return False
    return True


def getMappingDicts_reGen(corpusdir, mapsdir, cons):
    # check whether the old constraint.set exists
    cons_file = corpusdir + "/constraint.set"
    if (not os.path.exists(cons_file)):
        # Regenerate
        (word_wid_dic, wid_did_dic, did_doc_dic) = \
            getNewMappingDicts(corpusdir, mapsdir)
    else:
    qsub(filename, max_jobs)

    if delete:
        print "deleted"
        os.remove(filename)

    print(filename)
    print ""


if __name__ == "__main__":
    from topicmod.util import flags

    mkdir("/tmp/%s" % USER)
    mkdir("/tmp/%s/qsub-scripts" % USER)

    flags.define_string("template", "", "Where we read the template file from")
    flags.define_dict("args", {}, "Substitute values for the template")
    flags.define_dict("defaults", {}, "Default args")
    flags.define_string("wall", "24:00:00", "The wall time")
    flags.define_string("name", "", "Name given to job on cluster")
    flags.define_string("mem", "4gb", "How much memory we give")
    flags.define_string("queue", "shallow", "Which queue do we submit to")
    flags.define_int("max_jobs", 16, "Number of simultaneous jobs on cluster")
    flags.define_bool("delete_scripts", True, "Do we delete after we're done?")
    flags.define_bool("submit", True, "Do we submit")

    flags.InitFlags()

    template = open(flags.template).read()

    d = flags.defaults
    d["wall"] = flags.wall
    tmp = 'MERGE_'
    for word in merge1:
        tmp += '\t' + word
    tmp += '\n'
    output_file.write(tmp)

    tmp = 'MERGE_'
    for word in merge2:
        tmp += '\t' + word
    tmp += '\n'
    output_file.write(tmp)

    output_file.close()


flags.define_string("vocab", "vocab/20_news.voc", "Where we find the vocab")
flags.define_string("stats", None, "Where we find the stat_file")
flags.define_string("model", "", "The model files folder of topic models")
flags.define_string("output", "constraints/tmp", "Output filename")
flags.define_int("topics_cutoff", 30, "Number of topic words")
flags.define_float("tfidf_thresh", 0, "threshold for tfidf")

if __name__ == "__main__":
    flags.InitFlags()

    # getting statistics: slower version, full statistics, memory cost
    # [cooccur, wordcount, tfidf, vocab_index_word, topics, doc_num] \
    #     = get_statistics_all(flags.corpus, flags.num_topics, flags.model, \
    #                          flags.topics_cutoff, flags.window_size, \
    #                          flags.train_only)
from collections import defaultdict

from topicmod.util import flags
from topicmod.util.wordnet import load_wn
from topicmod.ling.dictionary import *
from topicmod.ling.snowball_wrapper import Snowball
from topicmod.corpora.ontology_writer import OntologyWriter
from topicmod.corpora.ontology_writer import orderedTraversal
from topicmod.corpora.ml_vocab import MultilingualVocab
from topicmod.corpora.proto.corpus_pb2 import *

flags.define_int("limit", 250, "How many items in our MuTo matching")
flags.define_bool("dictionary", False, "Use a dictionary")
flags.define_string("dic_dir", "../../data/dictionaries/", "Use a dictionary")
flags.define_bool("translation", False, "Use translation matrix")
flags.define_bool("greedy_matching", False, "Use a matching from dictionary")
flags.define_bool("wordnet", False, "Use WordNet as scaffold")
flags.define_bool("german", False, "Include German")
flags.define_bool("chinese", False, "Include Chinese")
flags.define_string("output", "", "Where we write ontology")
flags.define_float("trans_cutoff", 0.5, "Min value for using the translation")
flags.define_string("wn_version", "3.0", "Which version of WN we use")
flags.define_string("filter_vocab", "", "Filter entries based on vocabulary")
flags.define_bool("stem", False, "Stem words")
flags.define_bool("id_strings", False, "Add identical strings")

# Generate an updated vocab: not all the words in the original vocab will be
# included in the generated wordnet, so generate a new vocab that contains
# only the words in the wordnet.
flags.define_string("updated_vocab", "", "generate a new vocab")
from topicmod.util import flags
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.corpora.proto.wordnet_file_pb2 import *
from topicmod.corpora.ml_vocab import Vocab
from topicmod.corpora.ml_vocab import MultilingualVocab

import codecs
from collections import defaultdict

flags.define_glob("wordnet", "", "The wordnet files")
flags.define_string("vocab", "", "The vocab file")
flags.define_glob("docs", "", "The documents we want to view")
flags.define_glob("doc_roots", "", "The document vocab")


def print_doc(filename, full, flat, wn):
    doc = Document()
    doc.ParseFromString(open(filename, 'rb').read())

    print "-------------------------------"
    print "Original document:"
    for sent in doc.sentences:
        for word in sent.words:
            print "|%i:%i:%s|" % (doc.language, word.token, \
                                  full.get_word(doc.language, \
                                                word.token).encode("ascii", \
                                                                   'ignore')),
        print ""
    infile = open(infilename, 'r')
    vocab = defaultdict(FreqDist)
    for line in infile:
        line = line.strip()
        ww = line.split('\t')
        lang = ww[0]
        if source[lang][ww[1]] == 0:
            print source[lang][ww[1]], ww[1]
        vocab[lang].inc(ww[1], source[lang][ww[1]])
    infile.close()

    outfile = codecs.open(outfilename, 'w', 'utf-8')
    for ii in vocab:
        for jj in vocab[ii]:
            outfile.write(u"%s\t%s\n" % (ii, jj))
            # outfile.write(u"%s\t%s\t%f\t%i\n"
            #               % (ii, jj, tfidf[ii][jj], frequency[ii][jj]))
    outfile.close()


flags.define_string("stats_vocab", None, "The proto files")
flags.define_string("input_vocab", None, "Where we get the original vocab")
flags.define_int("option", 0, "1: tfidf; others: frequency")
flags.define_string("sorted_vocab", None, "Where we output the vocab")

if __name__ == "__main__":
    flags.InitFlags()

    [tfidf, frequency] = readStats(flags.stats_vocab)
    sortVocab(flags.input_vocab, tfidf, frequency, flags.option,
              flags.sorted_vocab)
from numpy.random.mtrand import dirichlet
from numpy.random import multinomial
from numpy.random import normal
from math import isnan, isinf

from topicmod.util import flags
from topicmod.corpora.proto.corpus_pb2 import *

flags.define_int("num_docs", 500, "Number of documents")
flags.define_int("num_topics", 5, "Number of topics")
flags.define_int("doc_length", 5, "Length of every document")
flags.define_int("num_langs", 2, "Number of languages")
flags.define_float("variance", 0.5, "Variance of distribution")
flags.define_float("gamma", 1.0, "Vocabulary hyperparameter")
flags.define_float("alpha", 0.1, "Document topic hyperparameter")
flags.define_string("output_base", "data/synthetic", "Where we write the data")
flags.define_string("doc_proportion", "synthetic.theta",
                    "Where we write doc thetas")
flags.define_int("num_groups", 2, "Number of splits")
flags.define_string("vocab_output", "vocab/synthetic.voc",
                    "Where we write vocab")
flags.define_int("topic_output_size", 15,
                 "Number of words to display when we output topics")

# Hand-built multilingual topics: each entry is one topic, mapping a language
# id (0: English, 1: German, 2: Spanish) to that topic's word list.
ml_vocab = [{0: ["dog", "cat", "moose", "butterfly"],
             1: ["hund", "katze", "spinner", "pferd", "maultier", "kuh"],
             2: ["toro", "mariposa", "gato", "vaca", "donkey", "burro",
                 "caballo", "mosquito", "arana", "pavo"]},
            {0: ["monday", "tuesday", "thursday", "friday", "saturday"],
             1: ["montag", "dienstag", "mitwoch", "donnerstag", "freitag",
                 "samstag", "sontag"],
             2: ["lunes", "martes", "miercoles", "jueves", "viernes",
                 "sabado", "domingo"]},
            {0: ["mop", "broom", "bucket", "rake"],
             1: ["mopp", "besen", "eimer", "moebelpolitur"],
             2: ["trapeador", "escoba", "cubeta", "rastrillo"]},
            {0: ["north", "east", "south", "west"],
import os
import gzip

from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags

# Input
flags.define_string("doc_filter", None, "Files to filter out")
flags.define_string("vocab", None, "The file that defines the vocab")
flags.define_string("state_file", None, \
                    "The state file that creates the corpus")

# Output
flags.define_string("state_output", None, "Where we write state file")
flags.define_string("corpus_output_path", None, "Where we write the corpus")
flags.define_string("corpus_name", "NIH", "Name of the corpus")

# Options
flags.define_int("docs_per_index", 5000, "Number of docs per section")
flags.define_int("doc_limit", -1, "Cap on number of documents")


class MalletAssignment:

    def __init__(self, line, debug=False):
        if debug:
            for ii in xrange(len(line.split())):
                print ii, line.split()[ii]

        self.doc, foo, self.index, self.term_id, self.term, self.assignment = \
            line.split()
        self.doc = int(self.doc)
        self.index = int(self.index)
            tmp = word + "\t" + str(self._wordcount[word]) + "\n"
            outfile.write(tmp)
        outfile.close()

        # Write cooccurrence counts:
        outputfile = self._output_dir + "/cooccurance.txt"
        outfile = open(outputfile, 'w')
        for w1 in self._cooccur.keys():
            for w2 in self._cooccur[w1].keys():
                if self._cooccur[w1][w2] != 0:
                    tmp = w1 + "\t" + w2 + "\t" + str(self._cooccur[w1][w2]) + "\n"
                    outfile.write(tmp)
        outfile.close()


flags.define_string("corpus", None, "Where we find the input corpora")
flags.define_string("proto_corpus", None,
                    "Where we find the input proto corpora")
flags.define_string("vocab", "", "The model files folder of topic models")
flags.define_int("window_size", 10, "Size of window for computing cooccurrence")
flags.define_string("output", "PMI_stat/20_news", "PMI stat output filename")
flags.define_int("option", 2, "0: 20_news; 1: wikipedia")

if __name__ == "__main__":
    flags.InitFlags()

    # {0: 'english', 1: 'german'}
    lang = 0
    cp = corpusParser(lang, flags.vocab, flags.corpus, flags.window_size,
                      flags.output)
    if flags.option == 0:
        cp.parseCorpus20news()
        get_tfidf(flags.proto_corpus, flags.vocab, flags.output)
    elif flags.option == 1: