from collections import defaultdict

from topicmod.util import flags
from topicmod.util.wordnet import load_wn
from topicmod.ling.dictionary import *
from topicmod.ling.snowball_wrapper import Snowball
from topicmod.corpora.ontology_writer import OntologyWriter
from topicmod.corpora.ontology_writer import orderedTraversal
from topicmod.corpora.ml_vocab import MultilingualVocab
from topicmod.corpora.proto.corpus_pb2 import *

flags.define_int("limit", 250, "How many items in our MuTo matching")
flags.define_bool("dictionary", False, "Use a dictionary")
flags.define_string("dic_dir", "../../data/dictionaries/",
                    "Directory where dictionaries are stored")
flags.define_bool("translation", False, "Use translation matrix")
flags.define_bool("greedy_matching", False, "Use a matching from dictionary")
flags.define_bool("wordnet", False, "Use WordNet as scaffold")
flags.define_bool("german", False, "Include German")
flags.define_bool("chinese", False, "Include Chinese")
flags.define_string("output", "", "Where we write ontology")
flags.define_float("trans_cutoff", 0.5, "Min value for using the translation")
flags.define_string("wn_version", "3.0", "Which version of WN we use")
flags.define_string("filter_vocab", "", "Filter entries based on vocabulary")
flags.define_bool("stem", False, "Stem words")
flags.define_bool("id_strings", False, "Add identical strings")
# Generate an updated vocab: not all the words in the original vocab will be
# included in the generated wordnet, so generate a new vocab that contains
# only the words in the wordnet.
flags.define_string("updated_vocab", "", "Generate a new vocab")
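# Illustrative sketch of the idea in the comment above (a hypothetical helper,
# not part of the original file): the updated vocabulary keeps only the words
# that actually appear in the generated wordnet.
def _filter_vocab_sketch(original_vocab, wordnet_words):
    # original_vocab: iterable of words; wordnet_words: words that made it
    # into the wordnet.  Returns the entries to write to the updated vocab.
    wordnet_words = set(wordnet_words)
    return [word for word in original_vocab if word in wordnet_words]
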
import sys
import os
from collections import defaultdict

from topicmod.util import flags
from syntop_parameters_pb2 import *

flags.define_int("num_iterations", 1, "Number of iterations")
flags.define_string("model_name", "output/model", "Where we find data")
flags.define_string("corpus", None, "The source corpus")
flags.define_bool("hadoop", False, "Do we use hadoop or local batch")
flags.define_bool("doc_step", True, "Do we call the document-centric parts")
flags.define_bool("merge_step", True,
                  "Do we merge doc step results (and compute new topics)")
flags.define_bool("update_global", True,
                  "Do we compute new transition and DP variational parameters")


# Sparse collection of float-valued rows: each row index maps to a
# defaultdict(float), so missing cells read as 0.0.
class Array:

    def __init__(self, name):
        self._rows = {}
        self._name = name

    def __getitem__(self, index):
        if index not in self._rows:
            self._rows[index] = defaultdict(float)
        return self._rows[index]

    def __iter__(self):
        for ii in self._rows:
            yield self._rows[ii]

    def parse(self, key, val):
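# Illustrative sketch, not part of the original file: how the Array class
# above behaves as a sparse accumulator of float-valued rows.
#
#   counts = Array("transition_counts")
#   counts[0]["dog"] += 1.0   # missing cells default to 0.0
#   counts[0]["dog"] += 0.5
#   assert counts[0]["dog"] == 1.5
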
        rank = tfidf
    else:
        rank = frequency

    o = codecs.open(outputname, 'w', 'utf-8')
    for ii in rank:
        count = 0
        for jj in rank[ii]:
            count += 1
            if count <= vocab_limit and frequency[ii][jj] >= freq_limit:
                word = vocab[ii][jj]
                o.write(u"%i\t%s\t%f\t%i\n"
                        % (ii, word, tfidf[ii][jj], frequency[ii][jj]))
    o.close()


flags.define_string("proto_corpus", None, "The proto files")
flags.define_bool("lemma", False, "Use lemma or tokens")
flags.define_bool("select_tfidf", False, "select the vocab by tfidf or frequency")
flags.define_string("output", "", "Where we output the preprocessed data")
flags.define_string("vocab", None, "Where we output the vocab")
flags.define_int("vocab_limit", 10000, "The vocab size")
flags.define_int("freq_limit", 20, "The minimum frequency of each word")

if __name__ == "__main__":
    flags.InitFlags()
    [vocab, tfidf, frequency] = gen_files(flags.proto_corpus, flags.output,
                                          flags.lemma)
    gen_vocab(vocab, tfidf, frequency, flags.select_tfidf, flags.vocab,
              flags.vocab_limit, flags.freq_limit)
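# Note (inferred from the write format string above, not stated in the
# original file): the vocab file is tab-delimited, one entry per line:
#   <outer index ii>\t<word>\t<tfidf score>\t<frequency>
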
#
# File to turn protocol buffers into a test-only input file readable by the
# mapreduce implementation of the syntactic topic model.

from collections import defaultdict

from nltk import FreqDist

from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.corpora.corpus_vocab_wrapper import CorpusVocabWrapper
from topicmod.util import flags
from parse_reader import *

flags.define_int("docs_per_file", 100, "Number of documents per file")
flags.define_int("vocab_size", 5000, "Maximum vocabulary size")
flags.define_bool("remove_stop", False, "Remove stopwords")
flags.define_bool("use_lemma", False, "Use lemmas instead of raw tokens")
flags.define_bool("use_relation", False, "Use relation (synset) instead of pos")
flags.define_glob("vocab_source", None, "Where we get initial vocabulary")
flags.define_string("output_path", None, "Where we write the translated corpus")
flags.define_string("output_filename", "wacky_en_reduced.index",
                    "Filename of index")
flags.define_int("min_length", 100, "Number of characters in document line")


class CorpusTranslator:

    def __init__(self, output_path, use_lemma, docs_per_file):
        self.output_base = output_path
        self.document_list = []
        self.use_lemma = use_lemma

        # A lookup for each language
if __name__ == "__main__":
    from topicmod.util import flags

    mkdir("/tmp/%s" % USER)
    mkdir("/tmp/%s/qsub-scripts" % USER)

    flags.define_string("template", "", "Where we read the template file from")
    flags.define_dict("args", {}, "Substitute values for the template")
    flags.define_dict("defaults", {}, "Default args")
    flags.define_string("wall", "24:00:00", "The wall time")
    flags.define_string("name", "", "Name given to job on cluster")
    flags.define_string("mem", "4gb", "How much memory we give")
    flags.define_string("queue", "shallow", "Which queue do we submit to")
    flags.define_int("max_jobs", 16, "Number of simultaneous jobs on cluster")
    flags.define_bool("delete_scripts", True, "Do we delete after we're done?")
    flags.define_bool("submit", True, "Do we submit")

    flags.InitFlags()
    template = open(flags.template).read()

    d = flags.defaults
    d["wall"] = flags.wall
    d["mem"] = flags.mem
    for ii in flags.args:
        d[ii] = flags.args[ii]
    if flags.name:
        d["name"] = flags.name
    if "name" not in d:
from topicmod.util import flags
from topicmod.corpora.vocab_compiler import VocabCompiler

flags.define_glob("corpus_parts", None, "Where we look for vocab")
flags.define_filename("output", None, "Where we write the new vocab")
flags.define_int("min_freq", 10, "Minimum frequency for inclusion")
flags.define_int("vocab_limit", 5000, "Maximum vocab size")
flags.define_bool("exclude_stop", True, "Do we throw out stop words")
flags.define_bool("exclude_punc", True, "Do we exclude punctuation")
flags.define_bool("exclude_digits", True, "Do we exclude digits")
flags.define_list("special_stop", [], "Special stop words")
flags.define_int("min_length", 3, "Minimum length for tokens")
flags.define_bool("stem", False, "Stem words")
flags.define_bool("bigram", False, "Use bigrams")

if __name__ == "__main__":
    flags.InitFlags()
    assert not (flags.stem and flags.bigram), "Can't use stem and bigram"
    v = VocabCompiler()
    for ii in flags.corpus_parts:
        print ii
        v.addVocab(ii, flags.exclude_stop, flags.special_stop,
                   flags.exclude_punc, flags.exclude_digits,
                   flags.stem, flags.bigram, flags.min_length)
    v.writeVocab(flags.output, flags.vocab_limit, flags.min_freq)
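# Hypothetical invocation of the script above (the script name and the exact
# command-line syntax accepted by topicmod.util.flags are assumptions, not
# taken from this file):
#
#   python compile_vocab.py --corpus_parts="parts/*.index" \
#       --output=vocab/corpus.voc --vocab_limit=5000 --min_freq=10
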
from math import log
from random import random

from topicmod.util import flags
from syntop_parameters_pb2 import *

flags.define_string("vocab", None, "The vocabulary file")
flags.define_int("num_docs", None, "Number of documents")
flags.define_int("num_topics", 128, "Number of topics")
flags.define_string("model_name", "output/model", "Name of model")
flags.define_bool("finite", False, "Use finite model")
flags.define_bool("ignore_trans", False, "Use only documents")
flags.define_bool("ignore_docs", False, "Use only syntax")
flags.define_bool("shortcut_gsl", False, "Use closed form updates when possible")
flags.define_int("max_doc_iterations", 5, "Number of e-step rounds per-document")
flags.define_float("alpha_doc", 1.0, "DP parameter for doc distributions")
flags.define_float("alpha_trans", 1.0, "DP parameter for transition distributions")
flags.define_float("alpha_top", 1.0,
                   "DP parameter for top-level stick-breaking distribution")
flags.define_float("vocab_sigma", 0.1, "Vocab hyperparameter")

if __name__ == "__main__":
    flags.InitFlags()
    params = SyntopParameters()
    params.finite = flags.finite
    params.ignore_trans = flags.ignore_trans
    params.ignore_docs = flags.ignore_docs
    params.shortcut_gsl = flags.shortcut_gsl
        output_file.write(tmp)
        count += 1

    output_file.close()


flags.define_string("vocab", "vocab/20_news.voc", "Where we find the vocab")
flags.define_string("stats", "", "Where we find the stat_file")
flags.define_string("model", "", "The model files folder of topic models")
flags.define_string("output", "constraints/tmp", "Output filename")
flags.define_int("topics_cutoff", 30, "Number of topic words")
flags.define_int("cannot_links", 0, "Number of cannot links that we want")
flags.define_int("must_links", 0, "Number of must links that we want")
flags.define_int("num_topics", 20, "Number of topics")
flags.define_bool("train_only", False,
                  "Use only train data to generate the constraints")
flags.define_int("window_size", 10, "Size of window for computing cooccurrence")
flags.define_float("tfidf_thresh", 0.0, "Threshold for tfidf")

if __name__ == "__main__":
    flags.InitFlags()

    # Getting statistics: slower version, full statistics, higher memory cost
    # [cooccur, wordcount, tfidf, vocab_index_word, topics, doc_num] \
    #     = get_statistics_all(flags.corpus, flags.num_topics, flags.model,
    #                          flags.topics_cutoff, flags.window_size,
    #                          flags.train_only)

    # Getting statistics: faster version, partial statistics, memory efficient
    print "Reading vocab"
from collections import defaultdict

from topicmod.corpora.proto.wordnet_file_pb2 import *
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags

import re

flags.define_string("vocab", "vocab/semcor.voc",
                    "The vocabulary used for building the tree")
flags.define_string("wnname", "wn/wordnet.wn", "Where we write output")
flags.define_string("constraints", "",
                    "Where we get the constraints, one tab-delimited "
                    "constraint per line")
flags.define_bool("write_constraints", False, "Write out example constraint")
flags.define_bool("write_wordnet", False, "Write out wordnet")
flags.define_bool("write_toy", False, "Write out a toy wordnet")
flags.define_bool("merge_constraints", True,
                  "Put duplicate constraints into a single constraint")


def orderedTraversal(wn, pos='n', limit_depth=-1, reverse_depth=False):
    """
    Given a wordnet object, give the synsets in order of internal nodes first,
    followed by leaves.

    @param pos Which part of speech we search
    @param limit_depth Don't consider nodes deeper than this
    @param reverse_depth Reverse the order of the search (leaves first)
    """

    # Find the max depth synset