Example #1
from collections import defaultdict

from topicmod.corpora.ontology_writer import OntologyWriter
from topicmod.corpora.ontology_writer import orderedTraversal
from topicmod.corpora.ml_vocab import MultilingualVocab
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.ling.dictionary import DingEntries
from topicmod.util import flags

flags.define_int("limit", 250, "How many items in our MuTo matching")
flags.define_bool("dictionary", False, "Use a dictionary")
flags.define_string("dic_dir", "../../data/dictionaries/", "Use a dictionary")
flags.define_bool("translation", False, "Use translation matrix")
flags.define_bool("greedy_matching", False, "Use a matching from dictionary")
flags.define_bool("wordnet", False, "Use WordNet as scaffold")
flags.define_bool("german", False, "Include German")
flags.define_bool("chinese", False, "Include Chinese")
flags.define_string("output", "", "Where we write ontology")
flags.define_float("trans_cutoff", 0.5, "Min value for using the translation")
flags.define_string("wn_version", "3.0", "Which version of WN we use")
flags.define_string("filter_vocab", "", "Filter entries based on vocabulary")
flags.define_bool("stem", False, "Stem words")
flags.define_bool("id_strings", False, "Add identical strings")

# Generate an updated vocab: not all words in the original vocab will be
# included in the generated WordNet, so write a new vocab that contains only
# the words that appear in the WordNet.
flags.define_string("updated_vocab", "", "generate a new vocab")


def greedy_german_matching(filter_list, limit, stem):
    stemmer = Snowball()
    translations = defaultdict(set)
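    # translations presumably maps each English word to the set of candidate
    # German translations read from the Ding en->de dictionary (the loop body
    # is truncated in this snippet).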

    for en, de in DingEntries("en", "de"):
Example #2
import numpy
from numpy import zeros
from numpy.random.mtrand import dirichlet
from numpy.random import multinomial
from numpy.random import normal
from math import isnan, isinf

from topicmod.util import flags
from topicmod.corpora.proto.corpus_pb2 import *

flags.define_int("num_docs", 500, "Number of documents")
flags.define_int("num_topics", 5, "Number of topics")
flags.define_int("doc_length", 5, "Length of every document")
flags.define_int("num_langs", 2, "Number of languages")
flags.define_float("variance", 0.5, "Variance of distribution")
flags.define_float("gamma", 1.0, "Vocabulary hyperparameter")
flags.define_float("alpha", 0.1, "Document topic hyperparameter")
flags.define_string("output_base", "data/synthetic", "Where we write the data")
flags.define_string("doc_proportion", "synthetic.theta",
                    "Where we write doc thetas")
flags.define_int("num_groups", 2, "Number of splits")
flags.define_string("vocab_output", "vocab/synthetic.voc",
                    "Where we write vocab")
flags.define_int("topic_output_size", 15,
                 "Number of words to display when we output topics")

ml_vocab = [{
    0: ["dog", "cat", "moose", "butterfly"],
    1: ["hund", "katze", "spinner", "pferd", "maultier", "kuh"],
    2: [
Example #3
from numpy import *

from topicmod.external.moremath import *

from topicmod.util import flags

flags.define_string("alpha", None, "The current value of alpha")
flags.define_string("gamma", None, "The current gamma matrix")
flags.define_float("tolerance", 0.001, "Toleranceg for convergence")

NEGATIVE_INFINITY = -float("inf")


def l_alpha(alpha, M, K, gamma_grad):
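    # Alpha-dependent part of the LDA variational objective:
    #   M * [lnGamma(sum_k alpha_k) - sum_k lnGamma(alpha_k)] + sum_k alpha_k * gamma_grad[k]
    # where gamma_grad[k] accumulates sum_d (Psi(gamma_dk) - Psi(sum_j gamma_dj))
    # (see compute_gamma_gradient below).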
    val = lngamma(sum(alpha)) - sum(lngamma(x) for x in alpha)
    val *= M
    for ii in xrange(K):
        val += alpha[ii] * gamma_grad[ii]
    return val


def compute_gamma_gradient(gamma, K):
    """
  Compute the components of the derivative that gamma contributes to. 
  """

    grad = zeros(K)

    for gamma_d in gamma:
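        # For each document's gamma vector, add Psi(gamma_dk) - Psi(sum_j gamma_dj)
        # to the running gradient for every topic k.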
        digam_gamma_sum = digamma(sum(gamma_d))
        for ii in xrange(K):
            grad[ii] += digamma(gamma_d[ii]) - digam_gamma_sum
Example #4
import numpy
from numpy import zeros
from numpy.random.mtrand import dirichlet
from numpy.random import multinomial
from numpy.random import normal
from math import isnan, isinf

from topicmod.util import flags
from topicmod.corpora.proto.corpus_pb2 import *

flags.define_int("num_docs", 500, "Number of documents")
flags.define_int("num_topics", 5, "Number of topics")
flags.define_int("doc_length", 5, "Length of every document")
flags.define_int("num_langs", 2, "Number of languages")
flags.define_float("variance", 0.5, "Variance of distribution")
flags.define_float("gamma", 1.0, "Vocabulary hyperparameter")
flags.define_float("alpha", 0.1, "Document topic hyperparameter")
flags.define_string("output_base", "data/synthetic", "Where we write the data")
flags.define_string("doc_proportion", "synthetic.theta", "Where we write doc thetas")
flags.define_int("num_groups", 2, "Number of splits")
flags.define_string("vocab_output", "vocab/synthetic.voc", "Where we write vocab")
flags.define_int("topic_output_size", 15, "Number of words to display when we output topics")

ml_vocab = [{0: ["dog", "cat", "moose", "butterfly"],
             1: ["hund", "katze", "spinner", "pferd", "maultier", "kuh"],
             2: ["toro", "mariposa", "gato", "vaca", "donkey", "burro", "caballo", "mosquito", "arana", "pavo"]},
            {0: ["monday", "tuesday", "thursday", "friday", "saturday"],
             1: ["montag", "dienstag", "mitwoch", "donnerstag", "freitag", "samstag", "sontag"],
             2: ["lunes", "martes", "miercoles", "jueves", "viernes", "sabado", "domingo"]},
            {0: ["mop", "broom", "bucket", "rake"],
Example #5
from collections import defaultdict

from topicmod.corpora.ontology_writer import OntologyWriter
from topicmod.corpora.ontology_writer import orderedTraversal
from topicmod.corpora.ml_vocab import MultilingualVocab
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.ling.dictionary import DingEntries
from topicmod.util import flags

flags.define_int("limit", 250, "How many items in our MuTo matching")
flags.define_bool("dictionary", False, "Use a dictionary")
flags.define_string("dic_dir", "../../data/dictionaries/", "Use a dictionary")
flags.define_bool("translation", False, "Use translation matrix")
flags.define_bool("greedy_matching", False, "Use a matching from dictionary")
flags.define_bool("wordnet", False, "Use WordNet as scaffold")
flags.define_bool("german", False, "Include German")
flags.define_bool("chinese", False, "Include Chinese")
flags.define_string("output", "", "Where we write ontology")
flags.define_float("trans_cutoff", 0.5, "Min value for using the translation")
flags.define_string("wn_version", "3.0", "Which version of WN we use")
flags.define_string("filter_vocab", "", "Filter entries based on vocabulary")
flags.define_bool("stem", False, "Stem words")
flags.define_bool("id_strings", False, "Add identical strings")

# Generate an updated vocab: not all words in the original vocab will be
# included in the generated WordNet, so write a new vocab that contains only
# the words that appear in the WordNet.
flags.define_string("updated_vocab", "", "generate a new vocab")


def greedy_german_matching(filter_list, limit, stem):
  stemmer = Snowball()
  translations = defaultdict(set)
Example #6
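    # Write one constraint line of the form "MERGE_\tword1\tword2\t..." for the
    # words in merge2, then close the constraint output file.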
    tmp = 'MERGE_'
    for word in merge2:
        tmp += '\t' + word
    tmp += '\n'
    output_file.write(tmp)

    output_file.close()


flags.define_string("vocab", "vocab/20_news.voc", "Where we find the vocab")
flags.define_string("stats", None, "Where we find the stat_file")
flags.define_string("model", "", "The model files folder of topic models")
flags.define_string("output", "constraints/tmp", "Output filename")
flags.define_int("topics_cutoff", 30, "Number of topic words")
flags.define_float("tfidf_thresh", 0, "threshold for tfidf")

if __name__ == "__main__":

    flags.InitFlags()

    # getting statistics: slower version, full statistics, memory cost
    #[cooccur, wordcount, tfidf, vocab_index_word, topics, doc_num] \
    #      = get_statistics_all(flags.corpus, flags.num_topics, flags.model, \
    #             flags.topics_cutoff, flags.window_size, flags.train_only)

    # getting statistics: faster version, partial statistics, memory efficient
    print "Reading vocab"
    [vocab_word_index, vocab_index_word] = readVocab(flags.vocab)
    vocab_size = len(vocab_word_index)
Example #7
from topicmod.corpora.nyt_reader import *
from topicmod.util import flags

flags.define_string("nyt_base", "../../data/new_york_times/", "Where we find the nyt corpus")
flags.define_int("doc_limit", -1, "How many documents")
flags.define_string("output", "/tmp/jbg/nyt/", "Where we write data")
flags.define_float("bigram_limit", 0.9, "p-value for bigrams")

if __name__ == "__main__":
    flags.InitFlags()
    nyt = NewYorkTimesReader(flags.nyt_base, flags.doc_limit, flags.bigram_limit)
    nyt.add_language_list("../../data/new_york_times/editorial_file_list")

    nyt.write_proto(flags.output + "numeric", "nyt", 1000)
Example #8
import codecs

from collections import defaultdict

from topicmod.util import flags
from topicmod.ling.dictionary import DingEntries

flags.define_string("vocab", "", "Where we read vocab")
flags.define_float("smoothing", 0.001, "Smoothing amount")
flags.define_float("hit", 1.0, "Value if there's a hit")
flags.define_string("output", "lda/lambda", "Lambda output")

if __name__ == "__main__":
  flags.InitFlags()

  vocab = defaultdict(dict)
  index = defaultdict(int)
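  # vocab[lang][word] = integer index of the word within its language;
  # index[lang] tracks the next unused index for each language.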

  for ii in codecs.open(flags.vocab):
    lang, word = ii.split("\t")
    lang = int(lang)
    vocab[lang][word.strip()] = index[lang]
    index[lang] += 1

  trans = defaultdict(set)
  sum = defaultdict(float)
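  # Presumably builds the translation table: identical surface strings across
  # language 0 and language 1 count as a match (a "hit").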
  for ii in vocab[0]:
    for jj in vocab[1]:
      if ii == jj:
        if vocab[1][jj] % 100 == 0:
Example #9
  output_file.close()


flags.define_string("vocab", "vocab/20_news.voc", "Where we find the vocab")
flags.define_string("stats", "", "Where we find the stat_file")
flags.define_string("model", "", "The model files folder of topic models")
flags.define_string("output", "constraints/tmp", "Output filename")
flags.define_int("topics_cutoff", 30, "Number of topic words")
flags.define_int("cannot_links", 0, "Number of cannot links that we want")
flags.define_int("must_links", 0, "Number of must links that we want")

flags.define_int("num_topics", 20, "Number of topics")
flags.define_bool("train_only", False, "Using only train data to \
                                        generate the constraints")
flags.define_int("window_size", 10, "Size of window for computing coocurrance")
flags.define_float("tfidf_thresh", 0.0, "threshold for tfidf")

if __name__ == "__main__":

  flags.InitFlags()

  # getting statistics: slower version, full statistics, memory cost
  #[cooccur, wordcount, tfidf, vocab_index_word, topics, doc_num] \
  #      = get_statistics_all(flags.corpus, flags.num_topics, flags.model, \
  #             flags.topics_cutoff, flags.window_size, flags.train_only)


  # getting statistics: faster version, partial statistics, memory efficient
  print "Reading vocab"
  [vocab_word_index, vocab_index_word] = readVocab(flags.vocab)
  vocab_size = len(vocab_word_index)
Example #10
from numpy import *

from topicmod.external.moremath import *

from topicmod.util import flags

flags.define_string("alpha", None, "The current value of alpha")
flags.define_string("gamma", None, "The current gamma matrix")
flags.define_float("tolerance", 0.001, "Toleranceg for convergence")

NEGATIVE_INFINITY = -float("inf")

def l_alpha(alpha, M, K, gamma_grad):
  val = lngamma(sum(alpha)) - sum(lngamma(x) for x in alpha)
  val *= M
  for ii in xrange(K):
    val += alpha[ii] * gamma_grad[ii]
  return val

def compute_gamma_gradient(gamma, K):
  """
  Compute the components of the derivative that gamma contributes to. 
  """

  grad = zeros(K)

  for gamma_d in gamma:
    digam_gamma_sum = digamma(sum(gamma_d))
    for ii in xrange(K):
      grad[ii] += digamma(gamma_d[ii]) - digam_gamma_sum