from topicmod.util import flags
from topicmod.corpora.vocab_compiler import VocabCompiler

flags.define_glob("corpus_parts", None, "Where we look for vocab")
flags.define_filename("output", None, "Where we write the new vocab")
flags.define_int("min_freq", 10, "Minimum frequency for inclusion")
flags.define_int("vocab_limit", 5000, "Maximum vocab size")
flags.define_bool("exclude_stop", True, "Do we throw out stop words")
flags.define_bool("exclude_punc", True, "Do we exclude punctuation")
flags.define_bool("exclude_digits", True, "Do we exclude digits")
flags.define_list("special_stop", [], "Special stop words")
flags.define_int("min_length", 3, "Minimum length for tokens")
flags.define_bool("stem", False, "Stem words")
flags.define_bool("bigram", False, "Use bigrams")

if __name__ == "__main__":
    flags.InitFlags()
    assert not (flags.stem and flags.bigram), "Can't use stem and bigram"
    v = VocabCompiler()
    for ii in flags.corpus_parts:
        print(ii)
        v.addVocab(ii, flags.exclude_stop, flags.special_stop,
                   flags.exclude_punc, flags.exclude_digits,
                   flags.stem, flags.bigram, flags.min_length)
    v.writeVocab(flags.output, flags.vocab_limit, flags.min_freq)
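# A minimal sketch of driving VocabCompiler directly instead of through the
# flags machinery.  The addVocab/writeVocab signatures are taken from the
# script above; the paths here are hypothetical.
#
#   from topicmod.corpora.vocab_compiler import VocabCompiler
#
#   v = VocabCompiler()
#   # Argument order mirrors the call above: (filename, exclude_stop,
#   # special_stop, exclude_punc, exclude_digits, stem, bigram, min_length)
#   v.addVocab("numeric/part_00.index", True, [], True, True, False, False, 3)
#   v.writeVocab("vocab/corpus.voc", 5000, 10)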
from collections import defaultdict
import codecs
from xml.dom import minidom
from glob import glob

import nltk
from nltk.corpus.reader.wordnet import WordNetError

from topicmod.util.wordnet import load_wn
from topicmod.util import flags
from topicmod.util.sets import flatten

flags.define_list("gn_valid_relations",
                  ["hyperonym", "synonym", "near_synonym"],
                  "What relationships we view as equivalent in GermaNet")

kVALID_POS = ['n', 'v', 'a']

# TODO(JBG): Relationships aren't being read; this is just a
# collection of synsets currently.


def find_equiv(pos, word, offset, old_wn, new_wn):
    """
    Given a pos, word, and offset in an old version of wordnet, finds the
    current version.  Uses the sense key when it can, still tries to find
    unambiguous answers when it can't.
    """
    word_matches = new_wn.synsets(word, pos)
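# A hedged sketch (not the author's implementation) of the strategy that
# find_equiv's docstring describes, using standard WordNetCorpusReader calls
# from recent NLTK versions: synset_from_pos_and_offset, Lemma.key, and
# lemma_from_key.
#
#   def find_equiv_sketch(pos, word, offset, old_wn, new_wn):
#       word_matches = new_wn.synsets(word, pos)
#       try:
#           old_synset = old_wn.synset_from_pos_and_offset(pos, offset)
#           for lemma in old_synset.lemmas():
#               try:
#                   # Sense keys are stable across versions when they resolve
#                   return new_wn.lemma_from_key(lemma.key()).synset()
#               except WordNetError:
#                   continue
#       except WordNetError:
#           pass
#       # Otherwise fall back to an unambiguous surface match, if any
#       if len(word_matches) == 1:
#           return word_matches[0]
#       return None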
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags
from topicmod.corpora.amazon import AmazonCorpus

flags.define_int("doc_limit", -1, "How many documents we add")
flags.define_list("langs", ["en"], "Which languages do we add")
flags.define_string("base", "../../data/multiling-sent/",
                    "Where we look for data")
flags.define_string("output", "/tmp/", "Where we write output")

LANGUAGE_CONSTANTS = {"en": ENGLISH, "de": GERMAN, "zh": CHINESE,
                      "fr": FRENCH, "es": SPANISH, "ar": ARABIC}

if __name__ == "__main__":
    flags.InitFlags()
    corpus = AmazonCorpus(flags.base, flags.doc_limit)
    for ll in flags.langs:
        corpus.add_language("amzn-%s/*/*" % ll, LANGUAGE_CONSTANTS[ll])
    corpus.write_proto(flags.output + "numeric", "amazon")
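# A hypothetical invocation: the script name and the --name=value syntax are
# assumed behavior of topicmod.util.flags, and the paths are illustrative.
#
#   python extract_amazon.py --langs=en,de --doc_limit=1000 \
#       --base=../../data/multiling-sent/ --output=../../data/amazon/
#
# The write_proto call above appends "numeric" to the output path, so the
# default /tmp/ output lands at /tmp/numeric.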
from topicmod.corpora.wacky import *
from topicmod.util import flags

flags.define_string("wackypedia_base", "../../data/wackypedia/compressed/",
                    "Where we find the wackypedia corpus")
flags.define_string("output", "/tmp/jbg/wackypedia/",
                    "Where we write output")
flags.define_int("doc_limit", 10, "Max number of docs")
flags.define_list("langs", ["en"], "Which languages")

if __name__ == "__main__":
    flags.InitFlags()
    wacky = WackyCorpus(flags.wackypedia_base, flags.doc_limit)
    for ii in flags.langs:
        wacky.add_language("wackypedia_%s*.gz" % ii)
    wacky.write_proto(flags.output + "numeric", "wpdia", 10000)
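# The doc_limit default of 10 looks like a debugging value; by analogy with
# the AmazonCorpus script's -1 default, a negative limit presumably means
# "no limit" (an assumption, defined by WackyCorpus).  A hypothetical full
# run, with the same assumed flag syntax as above:
#
#   python extract_wackypedia.py --langs=en,fr --doc_limit=-1 \
#       --output=/tmp/jbg/wackypedia/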