Example #1
from collections import defaultdict

from topicmod.util import flags
from topicmod.util.wordnet import load_wn
from topicmod.ling.dictionary import *
from topicmod.ling.snowball_wrapper import Snowball
from topicmod.corpora.ontology_writer import OntologyWriter
from topicmod.corpora.ontology_writer import orderedTraversal
from topicmod.corpora.ml_vocab import MultilingualVocab

from topicmod.corpora.proto.corpus_pb2 import *

flags.define_int("limit", 250, "How many items in our MuTo matching")
flags.define_bool("dictionary", False, "Use a dictionary")
flags.define_string("dic_dir", "../../data/dictionaries/", "Use a dictionary")
flags.define_bool("translation", False, "Use translation matrix")
flags.define_bool("greedy_matching", False, "Use a matching from dictionary")
flags.define_bool("wordnet", False, "Use WordNet as scaffold")
flags.define_bool("german", False, "Include German")
flags.define_bool("chinese", False, "Include Chinese")
flags.define_string("output", "", "Where we write ontology")
flags.define_float("trans_cutoff", 0.5, "Min value for using the translation")
flags.define_string("wn_version", "3.0", "Which version of WN we use")
flags.define_string("filter_vocab", "", "Filter entries based on vocabulary")
flags.define_bool("stem", False, "Stem words")
flags.define_bool("id_strings", False, "Add identical strings")

# Generate an updated vocab. Note that not all of the words in the original
# vocab will be included in the generated wordnet, so generate a new vocab
# that contains only the words in the wordnet.
flags.define_string("updated_vocab", "", "Generate a new vocab")
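
All of these examples follow the same flag lifecycle: flags are declared at
module level with the define_* functions (name, default, help string), parsed
once via flags.InitFlags(), and then read back as attributes of the flags
module. A minimal sketch of that pattern, using only the topicmod.util.flags
calls already visible in these listings:

from topicmod.util import flags

flags.define_string("output", "", "Where we write ontology")
flags.define_int("limit", 250, "How many items in our MuTo matching")

if __name__ == "__main__":
    flags.InitFlags()
    # After InitFlags(), each flag is readable as a module attribute.
    print(flags.output)
    print(flags.limit)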
Example #2
import sys
import os
from collections import defaultdict

from topicmod.util import flags
from syntop_parameters_pb2 import *

flags.define_int("num_iterations", 1, "Number of iterations")
flags.define_string("model_name", "output/model", "Where we find data")

flags.define_string("corpus", None, "The source corpus")

flags.define_bool("hadoop", False, "Do we use hadoop or local batch")
flags.define_bool("doc_step", True, "Do we call the document-centric parts")
flags.define_bool("merge_step", True, "Do we merge doc step results (and compute new topics)")
flags.define_bool("update_global", True, "Do we compute new transition and DP variational parameters")

class Array:
  def __init__(self, name):
    self._rows = {}
    self._name = name

  def __getitem__(self, index):
    if index not in self._rows:
      self._rows[index] = defaultdict(float)
    return self._rows[index]

  def __iter__(self):
    for ii in self._rows:
      yield self._rows[ii]

  def parse(self, key, val):
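
Array, as completed in the next example, is a lazily-grown two-level sparse
accumulator: rows are created on first access, and each row is a
defaultdict(float) mapping keys to running totals. A hypothetical usage
sketch (the row index and keys are illustrative):

counts = Array("transition")
counts[0]["dog"] += 1.0   # row 0 is created on demand by __getitem__
counts[0]["cat"] += 2.5
for row in counts:        # __iter__ yields each row's defaultdict
    print(sorted(row.items()))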
Example #3
import sys
import os
from collections import defaultdict

from topicmod.util import flags
from syntop_parameters_pb2 import *

flags.define_int("num_iterations", 1, "Number of iterations")
flags.define_string("model_name", "output/model", "Where we find data")

flags.define_string("corpus", None, "The source corpus")

flags.define_bool("hadoop", False, "Do we use hadoop or local batch")
flags.define_bool("doc_step", True, "Do we call the document-centric parts")
flags.define_bool("merge_step", True,
                  "Do we merge doc step results (and compute new topics)")
flags.define_bool(
    "update_global", True,
    "Do we compute new transition and DP variational parameters")


class Array:
    def __init__(self, name):
        self._rows = {}
        self._name = name

    def __getitem__(self, index):
        if index not in self._rows:
            self._rows[index] = defaultdict(float)
        return self._rows[index]

    def __iter__(self):
        for ii in self._rows:
            yield self._rows[ii]
Example #4
import codecs

from topicmod.util import flags


def gen_vocab(vocab, tfidf, frequency, select_tfidf, outputname,
              vocab_limit, freq_limit):
  # Rank the vocabulary by tf-idf or by raw frequency, as requested.
  if select_tfidf:
    rank = tfidf
  else:
    rank = frequency

  o = codecs.open(outputname, 'w', 'utf-8')
  for ii in rank:
    count = 0
    for jj in rank[ii]:
      count += 1
      if count <= vocab_limit and frequency[ii][jj] >= freq_limit:
        word = vocab[ii][jj]
        o.write(u"%i\t%s\t%f\t%i\n" % (ii, word, tfidf[ii][jj], frequency[ii][jj]))
        
  o.close()


flags.define_string("proto_corpus", None, "The proto files")
flags.define_bool("lemma", False, "Use lemma or tokens")
flags.define_bool("select_tfidf", False, "select the vocab by tfidf or frequency")
flags.define_string("output", "", "Where we output the preprocessed data")
flags.define_string("vocab", None, "Where we output the vocab")
flags.define_int("vocab_limit", 10000, "The vocab size")
flags.define_int("freq_limit", 20, "The minimum frequency of each word")

if __name__ == "__main__":

  flags.InitFlags()  
  [vocab, tfidf, frequency] = gen_files(flags.proto_corpus, flags.output, flags.lemma)
  gen_vocab(vocab, tfidf, frequency, flags.select_tfidf, flags.vocab, flags.vocab_limit, flags.freq_limit)

Example #5
from math import log
from random import random

from topicmod.util import flags
from syntop_parameters_pb2 import *

flags.define_string("vocab", None, "Size of vocabulary")
flags.define_int("num_docs", None, "Number of documents")
flags.define_int("num_topics", 128, "Number of topics")
flags.define_string("model_name", "output/model", "Name of model")

flags.define_bool("finite", False, "Use finite model")
flags.define_bool("ignore_trans", False, "Use only documents")
flags.define_bool("ignore_docs", False, "Use only syntax")
flags.define_bool("shortcut_gsl", False,
                  "Use closed form updates when possible")

flags.define_int("max_doc_iterations", 5,
                 "Number of e-step rounds per-document")
flags.define_float("alpha_doc", 1.0, "DP parameter for doc distributions")
flags.define_float("alpha_trans", 1.0,
                   "DP parameter for transition distributions")
flags.define_float("alpha_top", 1.0,
                   "DP parameter for top-level stick-breaking distribution")
flags.define_float("vocab_sigma", 0.1, "Vocab hyperparameters")

if __name__ == "__main__":
    flags.InitFlags()

    params = SyntopParameters()
Example #6
#
# File to turn protocol buffers into a test-only input file readable by
# the mapreduce implementation of the syntactic topic model.

from collections import defaultdict

from nltk import FreqDist

from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.corpora.corpus_vocab_wrapper import CorpusVocabWrapper
from topicmod.util import flags
from parse_reader import *

flags.define_int("docs_per_file", 100, "Number of documents per file")
flags.define_int("vocab_size", 5000, "Maximum vocabulary size")
flags.define_bool("remove_stop", False, "remove stopwords")
flags.define_bool("use_lemma", False, "Use lemmas instead of raw tokens")
flags.define_bool("use_relation", False,
                  "Use relation (synset) instead of pos")
flags.define_glob("vocab_source", None, "Where we get initial vocabulary")
flags.define_string("output_path", None,
                    "Where we write the translated corpus")
flags.define_string("output_filename", "wacky_en_reduced.index",
                    "Filename of index")
flags.define_int("min_length", 100, "Number of characters in document line")


class CorpusTranslator:
    def __init__(self, output_path, use_lemma, docs_per_file):
        self.output_base = output_path
        self.document_list = []
Example #7
#
# File to turn protocol buffers into a test-only input file readable by
# the mapreduce implementation of the syntactic topic model.

from collections import defaultdict

from nltk import FreqDist

from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.corpora.corpus_vocab_wrapper import CorpusVocabWrapper
from topicmod.util import flags
from parse_reader import *

flags.define_int("docs_per_file", 100, "Number of documents per file")
flags.define_int("vocab_size", 5000, "Maximum vocabulary size")
flags.define_bool("remove_stop", False, "remove stopwords")
flags.define_bool("use_lemma", False, "Use lemmas instead of raw tokens")
flags.define_bool("use_relation", False, "Use relation (synset) instead of pos")
flags.define_glob("vocab_source", None, "Where we get initial vocabulary")
flags.define_string("output_path", None, "Where we write the translated corpus")
flags.define_string("output_filename", "wacky_en_reduced.index", "Filename of index")
flags.define_int("min_length", 100, "Number of characters in document line")


class CorpusTranslator:
    def __init__(self, output_path, use_lemma, docs_per_file):
        self.output_base = output_path
        self.document_list = []
        self.use_lemma = use_lemma

        # A lookup for each language
Example #8
if __name__ == "__main__":
    from topicmod.util import flags
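    # NOTE: "mkdir" and "USER" are not defined in this excerpt; they are
    # assumed to be provided by the enclosing module.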

    mkdir("/tmp/%s" % USER)
    mkdir("/tmp/%s/qsub-scripts" % USER)

    flags.define_string("template", "", "Where we read the template file from")
    flags.define_dict("args", {}, "Substitute values for the template")
    flags.define_dict("defaults", {}, "Default args")
    flags.define_string("wall", "24:00:00", "The wall time")
    flags.define_string("name", "", "Name given to job on cluster")
    flags.define_string("mem", "4gb", "How much memory we give")
    flags.define_string("queue", "shallow", "Which queue do we submit to")
    flags.define_int("max_jobs", 16, "Number of simultaneous jobs on cluster")
    flags.define_bool("delete_scripts", True, "Do we delete after we're done?")
    flags.define_bool("submit", True, "Do we submit")

    flags.InitFlags()
    template = open(flags.template).read()
    d = flags.defaults

    d["wall"] = flags.wall
    d["mem"] = flags.mem
    for ii in flags.args:
        d[ii] = flags.args[ii]

    if flags.name:
        d["name"] = flags.name

    if "name" not in d:
Example #9
from topicmod.util import flags
from topicmod.corpora.vocab_compiler import VocabCompiler

flags.define_glob("corpus_parts", None, "Where we look for vocab")
flags.define_filename("output", None, "Where we write the new vocab")
flags.define_int("min_freq", 10, "Minimum frequency for inclusion")
flags.define_int("vocab_limit", 5000, "Maximum vocab size")
flags.define_bool("exclude_stop", True, "Do we throw out stop words")
flags.define_bool("exclude_punc", True, "Do we exclude punctuation")
flags.define_bool("exclude_digits", True, "Do we exclude digits")
flags.define_list("special_stop", [], "Special stop words")
flags.define_int("min_length", 3, "Minimum length for tokens")
flags.define_bool("stem", False, "Stem words")
flags.define_bool("bigram", False, "Use bigrams")

if __name__ == "__main__":
  flags.InitFlags()

  assert not (flags.stem and flags.bigram), "Can't use stem and bigram"

  v = VocabCompiler()
  for ii in flags.corpus_parts:
    print ii
  v.addVocab(ii, flags.exclude_stop, flags.special_stop,
             flags.exclude_punc, flags.exclude_digits,
             flags.stem, flags.bigram, flags.min_length)
  v.writeVocab(flags.output, flags.vocab_limit, flags.min_freq)
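
A hypothetical invocation of this script (the script name and the
--flag=value syntax are assumptions, not shown in the listing):

python compile_vocab.py --corpus_parts="corpus/parts/*.voc" \
    --output=vocab/combined.voc --vocab_limit=5000 --min_freq=10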
Example #10
from collections import defaultdict

from topicmod.util import flags
from topicmod.util.wordnet import load_wn
from topicmod.ling.dictionary import *
from topicmod.ling.snowball_wrapper import Snowball
from topicmod.corpora.ontology_writer import OntologyWriter
from topicmod.corpora.ontology_writer import orderedTraversal
from topicmod.corpora.ml_vocab import MultilingualVocab

from topicmod.corpora.proto.corpus_pb2 import *

flags.define_int("limit", 250, "How many items in our MuTo matching")
flags.define_bool("dictionary", False, "Use a dictionary")
flags.define_string("dic_dir", "../../data/dictionaries/", "Use a dictionary")
flags.define_bool("translation", False, "Use translation matrix")
flags.define_bool("greedy_matching", False, "Use a matching from dictionary")
flags.define_bool("wordnet", False, "Use WordNet as scaffold")
flags.define_bool("german", False, "Include German")
flags.define_bool("chinese", False, "Include Chinese")
flags.define_string("output", "", "Where we write ontology")
flags.define_float("trans_cutoff", 0.5, "Min value for using the translation")
flags.define_string("wn_version", "3.0", "Which version of WN we use")
flags.define_string("filter_vocab", "", "Filter entries based on vocabulary")
flags.define_bool("stem", False, "Stem words")
flags.define_bool("id_strings", False, "Add identical strings")

# Generate an updated vocab. Note that not all of the words in the original
# vocab will be included in the generated wordnet, so generate a new vocab
# that contains only the words in the wordnet.
Example #11
if __name__ == "__main__":
    from topicmod.util import flags

    mkdir("/tmp/%s" % USER)
    mkdir("/tmp/%s/qsub-scripts" % USER)

    flags.define_string("template", "", "Where we read the template file from")
    flags.define_dict("args", {}, "Substitute values for the template")
    flags.define_dict("defaults", {}, "Default args")
    flags.define_string("wall", "24:00:00", "The wall time")
    flags.define_string("name", "", "Name given to job on cluster")
    flags.define_string("mem", "4gb", "How much memory we give")
    flags.define_string("queue", "shallow", "Which queue do we submit to")
    flags.define_int("max_jobs", 16, "Number of simultaneous jobs on cluster")
    flags.define_bool("delete_scripts", True, "Do we delete after we're done?")
    flags.define_bool("submit", True, "Do we submit")

    flags.InitFlags()
    template = open(flags.template).read()
    d = flags.defaults

    d["wall"] = flags.wall
    d["mem"] = flags.mem
    for ii in flags.args:
        d[ii] = flags.args[ii]

    if flags.name:
        d["name"] = flags.name

    if "name" not in d:
Example #12
from math import log
from random import random

from topicmod.util import flags
from syntop_parameters_pb2 import *

flags.define_string("vocab", None, "Size of vocabulary")
flags.define_int("num_docs", None, "Number of documents")
flags.define_int("num_topics", 128, "Number of topics")
flags.define_string("model_name", "output/model", "Name of model")

flags.define_bool("finite", False, "Use finite model")
flags.define_bool("ignore_trans", False, "Use only documents")
flags.define_bool("ignore_docs", False, "Use only syntax")
flags.define_bool("shortcut_gsl", False, "Use closed form updates when possible")

flags.define_int("max_doc_iterations", 5, "Number of e-step rounds per-document")
flags.define_float("alpha_doc", 1.0, "DP parameter for doc distributions")
flags.define_float("alpha_trans", 1.0, "DP parameter for transition distributions")
flags.define_float("alpha_top", 1.0, "DP parameter for top-level stick-breaking distribution")
flags.define_float("vocab_sigma", 0.1, "Vocab hyperparameters")

if __name__ == "__main__":
  flags.InitFlags()

  params = SyntopParameters()

  params.finite = flags.finite
  params.ignore_trans = flags.ignore_trans
  params.ignore_docs = flags.ignore_docs
  params.shortcut_gsl = flags.shortcut_gsl
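
  # The listing truncates here. A sketch of how the populated message could
  # be written out, assuming standard protobuf serialization (the ".params"
  # filename suffix is illustrative, not taken from the source):
  with open(flags.model_name + ".params", "wb") as outfile:
    outfile.write(params.SerializeToString())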
Example #13
      output_file.write(tmp)
      count += 1

  output_file.close()


flags.define_string("vocab", "vocab/20_news.voc", "Where we find the vocab")
flags.define_string("stats", "", "Where we find the stat_file")
flags.define_string("model", "", "The model files folder of topic models")
flags.define_string("output", "constraints/tmp", "Output filename")
flags.define_int("topics_cutoff", 30, "Number of topic words")
flags.define_int("cannot_links", 0, "Number of cannot links that we want")
flags.define_int("must_links", 0, "Number of must links that we want")

flags.define_int("num_topics", 20, "Number of topics")
flags.define_bool("train_only", False,
                  "Using only train data to generate the constraints")
flags.define_int("window_size", 10, "Size of window for computing co-occurrence")
flags.define_float("tfidf_thresh", 0.0, "Threshold for tf-idf")

if __name__ == "__main__":

  flags.InitFlags()

  # getting statistics: slower version, full statistics, memory cost
  #[cooccur, wordcount, tfidf, vocab_index_word, topics, doc_num] \
  #      = get_statistics_all(flags.corpus, flags.num_topics, flags.model, \
  #             flags.topics_cutoff, flags.window_size, flags.train_only)


  # getting statistics: faster version, partial statistics, memory efficient
  print "Reading vocab"
Example #14
from collections import defaultdict
from topicmod.corpora.proto.wordnet_file_pb2 import *
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags
import re

flags.define_string("vocab", "vocab/semcor.voc",
                    "The vocabulary used for building the tree")
flags.define_string("wnname", "wn/wordnet.wn", "Where we write output")
flags.define_string("constraints", "",
                    "Where we get the constraints, "
                    "one tab-delimited constraint per line")

flags.define_bool("write_constraints", False, "Write out example constraint")
flags.define_bool("write_wordnet", False, "Write out wordnet")
flags.define_bool("write_toy", False, "Write out a toy wordnet")
flags.define_bool("merge_constraints", True,
                  "Put duplicate constraints into a single constraint")


def orderedTraversal(wn, pos='n', limit_depth=-1, reverse_depth=False):
    """
    Given a wordnet object, give the synsets in order of internal nodes first,
    followed by leaves.

    @param pos Which part of speech we search
    @param limit_depth Don't consider nodes deeper than this
    @param reverse_depth Reverse the order of the search (leaves first)
    """

    # Find the max depth synset
Example #15
from collections import defaultdict
from topicmod.corpora.proto.wordnet_file_pb2 import *
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags
import re

flags.define_string("vocab", "vocab/semcor.voc",
                    "The vocabulary used for building the tree")
flags.define_string("wnname", "wn/wordnet.wn", "Where we write output")
flags.define_string("constraints", "",
                    "Where we get the constraints, "
                    "one tab-delimited constraint per line")

flags.define_bool("write_constraints", False, "Write out example constraint")
flags.define_bool("write_wordnet", False, "Write out wordnet")
flags.define_bool("write_toy", False, "Write out a toy wordnet")
flags.define_bool("merge_constraints", True,
                  "Put duplicate constraints into a single constraint")


def orderedTraversal(wn, pos='n', limit_depth=-1, reverse_depth=False):
    """
    Given a wordnet object, give the synsets in order of internal nodes first,
    followed by leaves.

    @param pos Which part of speech we search
    @param limit_depth Don't consider nodes deeper than this
    @param reverse_depth Reverse the order of the search (leaves first)
    """

    # Find the max depth synset
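
The listing breaks off inside orderedTraversal in both excerpts. As a rough
illustration of the traversal order the docstring describes (internal nodes
first, then leaves), here is a minimal sketch against NLTK's WordNet corpus
reader; the real function operates on topicmod's own wordnet object, whose
API is not shown here:

from nltk.corpus import wordnet as nltk_wn

def ordered_synsets(pos='n', reverse_depth=False):
    # Internal nodes are synsets with hyponyms; everything else is a leaf.
    internal, leaves = [], []
    for ss in nltk_wn.all_synsets(pos):
        (internal if ss.hyponyms() else leaves).append(ss)
    # Internal nodes come first, then leaves; reversed when requested.
    return leaves + internal if reverse_depth else internal + leaves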