# Example #1
from topicmod.util import flags
from topicmod.corpora.vocab_compiler import VocabCompiler

# Command-line flag definitions for the vocabulary compiler.
flags.define_glob("corpus_parts", None, "Where we look for vocab")
flags.define_filename("output", None, "Where we write the new vocab")
flags.define_int("min_freq", 10, "Minimum frequency for inclusion")
flags.define_int("vocab_limit", 5000, "Maximum vocab size")
flags.define_bool("exclude_stop", True, "Do we throw out stop words")
flags.define_bool("exclude_punc", True, "Do we exclude punctuation")
flags.define_bool("exclude_digits", True, "Do we exclude digits")
flags.define_list("special_stop", [], "Special stop words")
flags.define_int("min_length", 3, "Minimum length for tokens")
flags.define_bool("stem", False, "Stem words")
flags.define_bool("bigram", False, "Use bigrams")

if __name__ == "__main__":
  flags.InitFlags()

  # Stemming and bigram extraction are mutually exclusive options.
  assert not (flags.stem and flags.bigram), "Can't use stem and bigram"

  v = VocabCompiler()
  for ii in flags.corpus_parts:
    # Single-argument print() behaves identically under Python 2 and 3;
    # the original "print ii" statement is Python 2 only.
    print(ii)
    # Backslash continuations are unnecessary inside the call parentheses.
    v.addVocab(ii, flags.exclude_stop, flags.special_stop,
               flags.exclude_punc, flags.exclude_digits,
               flags.stem, flags.bigram, flags.min_length)
  v.writeVocab(flags.output, flags.vocab_limit, flags.min_freq)
# Example #2
from collections import defaultdict
import codecs
from xml.dom import minidom
from glob import glob

import nltk

from nltk.corpus.reader.wordnet import WordNetError

from topicmod.util.wordnet import load_wn
from topicmod.util import flags
from topicmod.util.sets import flatten

# GermaNet relation types that this tool treats as expressing equivalence.
flags.define_list(
    "gn_valid_relations",
    ["hyperonym", "synonym", "near_synonym"],
    "What relationships we view as equivalent in GermaNet")

# Part-of-speech tags accepted downstream -- presumably the WordNet
# noun/verb/adjective codes (TODO confirm against callers).
kVALID_POS = ['n', 'v', 'a']

# TODO(JBG): Relationships aren't being read; this is just a
# collection of synsets currently.


def find_equiv(pos, word, offset, old_wn, new_wn):
    """
    Given a pos, word, and offset in an old version of wordnet, finds
    the current version.  Uses the sense key when it can, still tries to
    find unambiguous answers when it can't.
    """
    # Candidate synsets for this surface form in the new WordNet version.
    # NOTE(review): the remainder of this function is not visible in this
    # excerpt; only the first statement of the body appears here.
    word_matches = new_wn.synsets(word, pos)
# Example #3
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags
from topicmod.corpora.amazon import AmazonCorpus

# Flags controlling which slice of the Amazon corpus gets converted.
flags.define_int("doc_limit", -1, "How many documents we add")
# Fixed typo in the user-facing help string ("langauges").
flags.define_list("langs", ["en"], "Which languages do we add")
flags.define_string("base", "../../data/multiling-sent/",
                    "Where we look for data")
flags.define_string("output", "/tmp/", "Where we write output")

# Map ISO language codes onto the language constants pulled in by the
# star import from corpus_pb2.
LANGUAGE_CONSTANTS = {"en": ENGLISH, "de": GERMAN, "zh": CHINESE,
                      "fr": FRENCH, "es": SPANISH, "ar": ARABIC}

if __name__ == "__main__":
    flags.InitFlags()
    corpus = AmazonCorpus(flags.base, flags.doc_limit)
    for ll in flags.langs:
        # Each language's documents live under a per-language glob.
        corpus.add_language("amzn-%s/*/*" % ll, LANGUAGE_CONSTANTS[ll])

    corpus.write_proto(flags.output + "numeric", "amazon")
# Example #4
from topicmod.corpora.wacky import *
from topicmod.util import flags

# Command-line configuration for converting Wackypedia into proto output.
flags.define_string("wackypedia_base", "../../data/wackypedia/compressed/",
                    "Where we find the wackypedia corpus")
flags.define_string("output", "/tmp/jbg/wackypedia/", "Where we write output")
flags.define_int("doc_limit", 10, "Max number of docs")
flags.define_list("langs", ["en"], "Which languages")

if __name__ == "__main__":
  flags.InitFlags()

  # Load up to doc_limit documents from the compressed corpus dump.
  corpus = WackyCorpus(flags.wackypedia_base, flags.doc_limit)

  # Register one compressed-file glob per requested language.
  for lang in flags.langs:
    corpus.add_language("wackypedia_%s*.gz" % lang)

  corpus.write_proto(flags.output + "numeric", "wpdia", 10000)
# Example #5
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags
from topicmod.corpora.amazon import AmazonCorpus

# Flags controlling which slice of the Amazon corpus gets converted.
flags.define_int("doc_limit", -1, "How many documents we add")
# Fixed typo in the user-facing help string ("langauges").
flags.define_list("langs", ["en"], "Which languages do we add")
flags.define_string("base", "../../data/multiling-sent/",
                    "Where we look for data")
flags.define_string("output", "/tmp/", "Where we write output")

# Map ISO language codes onto the language constants pulled in by the
# star import from corpus_pb2.  Backslash continuations are redundant
# inside brackets and have been removed.
LANGUAGE_CONSTANTS = {"en": ENGLISH, "de": GERMAN, "zh": CHINESE,
                      "fr": FRENCH, "es": SPANISH, "ar": ARABIC}

if __name__ == "__main__":
    flags.InitFlags()
    corpus = AmazonCorpus(flags.base, flags.doc_limit)
    for ll in flags.langs:
        # Each language's documents live under a per-language glob.
        corpus.add_language("amzn-%s/*/*" % ll, LANGUAGE_CONSTANTS[ll])

    corpus.write_proto(flags.output + "numeric", "amazon")