Example #1
from collections import defaultdict

from topicmod.corpora.ontology_writer import OntologyWriter
from topicmod.corpora.ontology_writer import orderedTraversal
from topicmod.corpora.ml_vocab import MultilingualVocab
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.ling.dictionary import DingEntries
from topicmod.util import flags

flags.define_int("limit", 250, "How many items in our MuTo matching")
flags.define_bool("dictionary", False, "Use a dictionary")
flags.define_string("dic_dir", "../../data/dictionaries/", "Use a dictionary")
flags.define_bool("translation", False, "Use translation matrix")
flags.define_bool("greedy_matching", False, "Use a matching from dictionary")
flags.define_bool("wordnet", False, "Use WordNet as scaffold")
flags.define_bool("german", False, "Include German")
flags.define_bool("chinese", False, "Include Chinese")
flags.define_string("output", "", "Where we write ontology")
flags.define_float("trans_cutoff", 0.5, "Min value for using the translation")
flags.define_string("wn_version", "3.0", "Which version of WN we use")
flags.define_string("filter_vocab", "", "Filter entries based on vocabulary")
flags.define_bool("stem", False, "Stem words")
flags.define_bool("id_strings", False, "Add identical strings")

# Generate an updated vocab: not all words in the original vocab will be
# included in the generated WordNet, so write a new vocab that contains only
# the words that appear in the WordNet.
flags.define_string("updated_vocab", "", "generate a new vocab")


def greedy_german_matching(filter_list, limit, stem):
    stemmer = Snowball()
    translations = defaultdict(set)
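    # translations presumably maps each English word to the set of candidate
    # German translations read from the Ding en->de dictionary (the loop body
    # is truncated in this snippet).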

    for en, de in DingEntries("en", "de"):
Example #2
import numpy
from numpy import zeros
from numpy.random.mtrand import dirichlet
from numpy.random import multinomial
from numpy.random import normal
from math import isnan, isinf

from topicmod.util import flags
from topicmod.corpora.proto.corpus_pb2 import *

flags.define_int("num_docs", 500, "Number of documents")
flags.define_int("num_topics", 5, "Number of topics")
flags.define_int("doc_length", 5, "Length of every document")
flags.define_int("num_langs", 2, "Number of languages")
flags.define_float("variance", 0.5, "Variance of distribution")
flags.define_float("gamma", 1.0, "Vocabulary hyperparameter")
flags.define_float("alpha", 0.1, "Document topic hyperparameter")
flags.define_string("output_base", "data/synthetic", "Where we write the data")
flags.define_string("doc_proportion", "synthetic.theta",
                    "Where we write doc thetas")
flags.define_int("num_groups", 2, "Number of splits")
flags.define_string("vocab_output", "vocab/synthetic.voc",
                    "Where we write vocab")
flags.define_int("topic_output_size", 15,
                 "Number of words to display when we output topics")

ml_vocab = [{
    0: ["dog", "cat", "moose", "butterfly"],
    1: ["hund", "katze", "spinner", "pferd", "maultier", "kuh"],
    2: [
Example #3
from numpy import *

from topicmod.external.moremath import *

from topicmod.util import flags

flags.define_string("alpha", None, "The current value of alpha")
flags.define_string("gamma", None, "The current gamma matrix")
flags.define_float("tolerance", 0.001, "Toleranceg for convergence")

NEGATIVE_INFINITY = -float("inf")


def l_alpha(alpha, M, K, gamma_grad):
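    # Alpha-dependent part of the LDA variational objective:
    #   M * [lnGamma(sum_k alpha_k) - sum_k lnGamma(alpha_k)] + sum_k alpha_k * gamma_grad[k]
    # where gamma_grad[k] accumulates sum_d (Psi(gamma_dk) - Psi(sum_j gamma_dj))
    # (see compute_gamma_gradient below).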
    val = lngamma(sum(alpha)) - sum(lngamma(x) for x in alpha)
    val *= M
    for ii in xrange(K):
        val += alpha[ii] * gamma_grad[ii]
    return val


def compute_gamma_gradient(gamma, K):
    """
  Compute the components of the derivative that gamma contributes to. 
  """

    grad = zeros(K)

    for gamma_d in gamma:
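        # For each document's gamma vector, add Psi(gamma_dk) - Psi(sum_j gamma_dj)
        # to the running gradient for every topic k.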
        digam_gamma_sum = digamma(sum(gamma_d))
        for ii in xrange(K):
            grad[ii] += digamma(gamma_d[ii]) - digam_gamma_sum
Example #4
import numpy
from numpy import zeros
from numpy.random.mtrand import dirichlet
from numpy.random import multinomial
from numpy.random import normal
from math import isnan, isinf

from topicmod.util import flags
from topicmod.corpora.proto.corpus_pb2 import *

flags.define_int("num_docs", 500, "Number of documents")
flags.define_int("num_topics", 5, "Number of topics")
flags.define_int("doc_length", 5, "Length of every document")
flags.define_int("num_langs", 2, "Number of languages")
flags.define_float("variance", 0.5, "Variance of distribution")
flags.define_float("gamma", 1.0, "Vocabulary hyperparameter")
flags.define_float("alpha", 0.1, "Document topic hyperparameter")
flags.define_string("output_base", "data/synthetic", "Where we write the data")
flags.define_string("doc_proportion", "synthetic.theta", "Where we write doc thetas")
flags.define_int("num_groups", 2, "Number of splits")
flags.define_string("vocab_output", "vocab/synthetic.voc", "Where we write vocab")
flags.define_int("topic_output_size", 15, "Number of words to display when we output topics")

ml_vocab = [{0: ["dog", "cat", "moose", "butterfly"],
             1: ["hund", "katze", "spinner", "pferd", "maultier", "kuh"],
             2: ["toro", "mariposa", "gato", "vaca", "donkey", "burro", "caballo", "mosquito", "arana", "pavo"]},
            {0: ["monday", "tuesday", "thursday", "friday", "saturday"],
             1: ["montag", "dienstag", "mitwoch", "donnerstag", "freitag", "samstag", "sontag"],
             2: ["lunes", "martes", "miercoles", "jueves", "viernes", "sabado", "domingo"]},
            {0: ["mop", "broom", "bucket", "rake"],
Example #5
from collections import defaultdict

from topicmod.corpora.ontology_writer import OntologyWriter
from topicmod.corpora.ontology_writer import orderedTraversal
from topicmod.corpora.ml_vocab import MultilingualVocab
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.ling.dictionary import DingEntries
from topicmod.util import flags

flags.define_int("limit", 250, "How many items in our MuTo matching")
flags.define_bool("dictionary", False, "Use a dictionary")
flags.define_string("dic_dir", "../../data/dictionaries/", "Use a dictionary")
flags.define_bool("translation", False, "Use translation matrix")
flags.define_bool("greedy_matching", False, "Use a matching from dictionary")
flags.define_bool("wordnet", False, "Use WordNet as scaffold")
flags.define_bool("german", False, "Include German")
flags.define_bool("chinese", False, "Include Chinese")
flags.define_string("output", "", "Where we write ontology")
flags.define_float("trans_cutoff", 0.5, "Min value for using the translation")
flags.define_string("wn_version", "3.0", "Which version of WN we use")
flags.define_string("filter_vocab", "", "Filter entries based on vocabulary")
flags.define_bool("stem", False, "Stem words")
flags.define_bool("id_strings", False, "Add identical strings")

# Generate an updated vocab: not all words in the original vocab will be
# included in the generated WordNet, so write a new vocab that contains only
# the words that appear in the WordNet.
flags.define_string("updated_vocab", "", "generate a new vocab")


def greedy_german_matching(filter_list, limit, stem):
  stemmer = Snowball()
  translations = defaultdict(set)
Example #6
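    # Write one constraint line of the form "MERGE_\tword1\tword2\t..." for the
    # words in merge2, then close the constraint output file.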
    tmp = 'MERGE_'
    for word in merge2:
        tmp += '\t' + word
    tmp += '\n'
    output_file.write(tmp)

    output_file.close()


flags.define_string("vocab", "vocab/20_news.voc", "Where we find the vocab")
flags.define_string("stats", None, "Where we find the stat_file")
flags.define_string("model", "", "The model files folder of topic models")
flags.define_string("output", "constraints/tmp", "Output filename")
flags.define_int("topics_cutoff", 30, "Number of topic words")
flags.define_float("tfidf_thresh", 0, "threshold for tfidf")

if __name__ == "__main__":

    flags.InitFlags()

    # getting statistics: slower version, full statistics, memory cost
    #[cooccur, wordcount, tfidf, vocab_index_word, topics, doc_num] \
    #      = get_statistics_all(flags.corpus, flags.num_topics, flags.model, \
    #             flags.topics_cutoff, flags.window_size, flags.train_only)

    # getting statistics: faster version, partial statistics, memory efficient
    print "Reading vocab"
    [vocab_word_index, vocab_index_word] = readVocab(flags.vocab)
    vocab_size = len(vocab_word_index)
Example #7
from topicmod.corpora.nyt_reader import *
from topicmod.util import flags

flags.define_string("nyt_base", "../../data/new_york_times/", "Where we find the nyt corpus")
flags.define_int("doc_limit", -1, "How many documents")
flags.define_string("output", "/tmp/jbg/nyt/", "Where we write data")
flags.define_float("bigram_limit", 0.9, "p-value for bigrams")

if __name__ == "__main__":
    flags.InitFlags()
    nyt = NewYorkTimesReader(flags.nyt_base, flags.doc_limit, flags.bigram_limit)
    nyt.add_language_list("../../data/new_york_times/editorial_file_list")

    nyt.write_proto(flags.output + "numeric", "nyt", 1000)
Example #8
import codecs

from collections import defaultdict

from topicmod.util import flags
from topicmod.ling.dictionary import DingEntries

flags.define_string("vocab", "", "Where we read vocab")
flags.define_float("smoothing", 0.001, "Smoothing amount")
flags.define_float("hit", 1.0, "Value if there's a hit")
flags.define_string("output", "lda/lambda", "Lambda output")

if __name__ == "__main__":
  flags.InitFlags()

  vocab = defaultdict(dict)
  index = defaultdict(int)
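  # vocab[lang][word] = integer index of the word within its language;
  # index[lang] tracks the next unused index for each language.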

  for ii in codecs.open(flags.vocab):
    lang, word = ii.split("\t")
    lang = int(lang)
    vocab[lang][word.strip()] = index[lang]
    index[lang] += 1

  trans = defaultdict(set)
  sum = defaultdict(float)
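  # Presumably builds the translation table: identical surface strings across
  # language 0 and language 1 count as a match (a "hit").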
  for ii in vocab[0]:
    for jj in vocab[1]:
      if ii == jj:
        if vocab[1][jj] % 100 == 0:
Example #9
  output_file.close()


flags.define_string("vocab", "vocab/20_news.voc", "Where we find the vocab")
flags.define_string("stats", "", "Where we find the stat_file")
flags.define_string("model", "", "The model files folder of topic models")
flags.define_string("output", "constraints/tmp", "Output filename")
flags.define_int("topics_cutoff", 30, "Number of topic words")
flags.define_int("cannot_links", 0, "Number of cannot links that we want")
flags.define_int("must_links", 0, "Number of must links that we want")

flags.define_int("num_topics", 20, "Number of topics")
flags.define_bool("train_only", False, "Using only train data to \
                                        generate the constraints")
flags.define_int("window_size", 10, "Size of window for computing coocurrance")
flags.define_float("tfidf_thresh", 0.0, "threshold for tfidf")

if __name__ == "__main__":

  flags.InitFlags()

  # getting statistics: slower version, full statistics, memory cost
  #[cooccur, wordcount, tfidf, vocab_index_word, topics, doc_num] \
  #      = get_statistics_all(flags.corpus, flags.num_topics, flags.model, \
  #             flags.topics_cutoff, flags.window_size, flags.train_only)


  # getting statistics: faster version, partial statistics, memory efficient
  print "Reading vocab"
  [vocab_word_index, vocab_index_word] = readVocab(flags.vocab)
  vocab_size = len(vocab_word_index)
Example #10
from numpy import *

from topicmod.external.moremath import *

from topicmod.util import flags

flags.define_string("alpha", None, "The current value of alpha")
flags.define_string("gamma", None, "The current gamma matrix")
flags.define_float("tolerance", 0.001, "Toleranceg for convergence")

NEGATIVE_INFINITY = -float("inf")

def l_alpha(alpha, M, K, gamma_grad):
  val = lngamma(sum(alpha)) - sum(lngamma(x) for x in alpha)
  val *= M
  for ii in xrange(K):
    val += alpha[ii] * gamma_grad[ii]
  return val

def compute_gamma_gradient(gamma, K):
  """
  Compute the components of the derivative that gamma contributes to. 
  """

  grad = zeros(K)

  for gamma_d in gamma:
    digam_gamma_sum = digamma(sum(gamma_d))
    for ii in xrange(K):
      grad[ii] += digamma(gamma_d[ii]) - digam_gamma_sum