Example #1
from topicmod.util import flags

flags.define_string("input_base", "output/20_news/iter_100_PMI_", \
                                  "Input file folder")
flags.define_string("output_base", "output/20_news/iter_100_PMI", \
                                  "Output file name")
flags.define_string("PMI_file", "PMI_score", \
                                  "Output file name")
flags.define_int("round_num", "5", "Number of iteractive rounds")

if __name__ == "__main__":
  flags.InitFlags()

  results = dict()
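  # Maps topic id (-1 for the per-round "total" line) to the PMI scores collected across rounds.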
  rounds = flags.round_num + 1
  for ii in range(0, rounds):
    filename = flags.input_base + str(ii) + "/" + flags.PMI_file
    inputfile = open(filename, 'r')
    for line in inputfile:
      line = line.strip()
      words = line.split('\t')
      if words[0].find('total') >= 0:
        word_key = -1
      else:
        word_key = int(words[0])
      if word_key not in results:
        results[word_key] = []
      results[word_key].append(words[2])
    inputfile.close()

  outputfile = open(flags.output_base, 'w')
  for tt in results.keys():
Example #2
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags
from topicmod.corpora.flat import FlatCorpus
#from topicmod.corpora.flat import FlatEmailCorpus

flags.define_int("doc_limit", -1, "How many documents \
                                   we add")
flags.define_string("base", "../../data/yn_toy/", \
                      "Where we look for data")
flags.define_string("output", "../../data/yn_toy/numeric", \
                      "Where we write output")

if __name__ == "__main__":
  flags.InitFlags()
  corpus = FlatCorpus(flags.base, flags.doc_limit)
  corpus.add_language("*", ENGLISH)
  print flags.output

  corpus.write_proto(flags.output, "yn_toy")
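All of the examples on this page follow the same pattern with topicmod.util.flags: declare each flag at module level with a flags.define_* call, parse the command line with flags.InitFlags() inside the __main__ block, and then read every value back as an attribute of the flags module. A minimal self-contained sketch of that pattern (the flag names below are illustrative and not taken from topicmod):

from topicmod.util import flags

# Each define_* call registers a flag name, its default value, and a help string.
flags.define_string("input", "data/corpus", "Where we read the input")
flags.define_int("doc_limit", -1, "How many documents we process (-1 for all)")
flags.define_bool("verbose", False, "Print progress information")

if __name__ == "__main__":
  # Parse the command line and fill in the module-level flag values.
  flags.InitFlags()

  if flags.verbose:
    print "Reading", flags.doc_limit, "documents from", flags.input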
Example #3
import sys
import os
from collections import defaultdict

from topicmod.util import flags
from syntop_parameters_pb2 import *

flags.define_int("num_iterations", 1, "Number of iterations")
flags.define_string("model_name", "output/model", "Where we find data")

flags.define_string("corpus", None, "The source corpus")

flags.define_bool("hadoop", False, "Do we use hadoop or local batch")
flags.define_bool("doc_step", True, "Do we call the document-centric parts")
flags.define_bool("merge_step", True,
                  "Do we merge doc step results (and compute new topics)")
flags.define_bool(
    "update_global", True,
    "Do we compute new transition and DP variational parameters")


class Array:
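    # Sparse table of float values: each row index maps to a defaultdict(float) of entries.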
    def __init__(self, name):
        self._rows = {}
        self._name = name

    def __getitem__(self, index):
        if index not in self._rows:
            self._rows[index] = defaultdict(float)
        return self._rows[index]

    def __iter__(self):
Example #4
import re
import os.path
from proto.corpus_pb2 import *
from proto.wordnet_file_pb2 import *
from topicmod.util import flags
from topicmod.util.sets import read_pickle, write_pickle

flags.define_int("option", 0, \
   "change the whole documents or just the topics of just the word")
flags.define_string("ldawnoutput", "output/nsf", "ldawn output directory")
flags.define_string("maps", "output/nsf", "mapping files directory")
flags.define_string("wordnet", "wn/output.0", "contraint source")
flags.define_string("assignment_path", None, "Where the assignments live")

def checkSame(cons, old_cons):
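    """Return True if cons and old_cons contain exactly the same keys."""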
    if len(cons) != len(old_cons):
        return False
    for key in cons:
        if key not in old_cons:
            return False
    return True
  
  
def getMappingDicts_reGen(corpusdir, mapsdir, cons):
    # check whether the old constraint.set file exists
    cons_file = corpusdir + "/constraint.set"
    if not os.path.exists(cons_file):
        # Regenerate
        (word_wid_dic, wid_did_dic, did_doc_dic) = \
            getNewMappingDicts(corpusdir, mapsdir)
    else:
Example #5
    print ""

if __name__ == "__main__":
    from topicmod.util import flags

    mkdir("/tmp/%s" % USER)
    mkdir("/tmp/%s/qsub-scripts" % USER)

    flags.define_string("template", "", "Where we read the template file from")
    flags.define_dict("args", {}, "Substitute values for the template")
    flags.define_dict("defaults", {}, "Default args")
    flags.define_string("wall", "24:00:00", "The wall time")
    flags.define_string("name", "", "Name given to job on cluster")
    flags.define_string("mem", "4gb", "How much memory we give")
    flags.define_string("queue", "shallow", "Which queue do we submit to")
    flags.define_int("max_jobs", 16, "Number of simultaneous jobs on cluster")
    flags.define_bool("delete_scripts", True, "Do we delete after we're done?")
    flags.define_bool("submit", True, "Do we submit")

    flags.InitFlags()
    template = open(flags.template).read()
    d = flags.defaults

    d["wall"] = flags.wall
    d["mem"] = flags.mem
    for ii in flags.args:
        d[ii] = flags.args[ii]

    if flags.name:
        d["name"] = flags.name
Example #6
            len(word_pairs)) + "\t" + str(pmi_score) + "\n"
        infile.write(tmp)

        total_pmi_score += pmi_score

    total_pmi_score /= len(topics.keys())
    tmp = "total" + "\t" + str(len(
        topics.keys())) + "\t" + str(total_pmi_score) + "\n"
    infile.write(tmp)
    infile.close()


flags.define_string("vocab", "", "Where we find the vocab")
flags.define_string("model", "", "The model files folder of topic models")
flags.define_string("stats", None, "Where we find the stat_file")
flags.define_int("topics_cutoff", 30, "Number of topics")
flags.define_int("window_size", 10, "Size of window for computing coocurrance")
flags.define_string("output", "output/PMI_score", "PMI Output filename")

if __name__ == "__main__":

    flags.InitFlags()

    print "Reading vocab"
    [vocab_word_index, vocab_index_word] = readVocab(flags.vocab)
    vocab_size = len(vocab_word_index)

    print "Reading topic words"
    [topics, topic_word_set] = readTopics(flags.model, flags.topics_cutoff)

    #print "Get statistics"
Example #7
from math import log
from random import random

from topicmod.util import flags
from syntop_parameters_pb2 import *

flags.define_string("vocab", None, "Size of vocabulary")
flags.define_int("num_docs", None, "Numer of documents")
flags.define_int("num_topics", 128, "Number topics")
flags.define_string("model_name", "output/model", "Name of model")

flags.define_bool("finite", False, "Use finite model")
flags.define_bool("ignore_trans", False, "Use only documents")
flags.define_bool("ignore_docs", False, "Use only syntax")
flags.define_bool("shortcut_gsl", False, "Use closed form updates when possible")

flags.define_int("max_doc_iterations", 5, "Number of e-step rounds per-document")
flags.define_int("alpha_doc", 1.0, "DP parameter for doc distributions")
flags.define_int("alpha_trans",  1.0, "DP parameter for transition distributions")
flags.define_int("alpha_top", 1.0, "DP parameter for top-level stick-breaking distribution")
flags.define_int("vocab_sigma", 0.1, "Vocab hyperparametersx")

if __name__ == "__main__":
  flags.InitFlags()

  params = SyntopParameters()

  params.finite = flags.finite
  params.ignore_trans = flags.ignore_trans
  params.ignore_docs = flags.ignore_docs
  params.shortcut_gsl = flags.shortcut_gsl
Example #8
  path_assignments_out.close()
  docs_in.close()
  docs_out.close()

  return new_topics + 1


flags.define_string("corpus", None, "Where we find the input corpora")
flags.define_string("mapping", None, "Filename of mapping")
flags.define_string("cons_file", "", "Constraints filename")
flags.define_glob("wordnet", "wn/output.0", "contraint source")
flags.define_string("input_base", "output/nih", "Input filename")
flags.define_string("output_base", "output/nih_ned", "Output filename")
flags.define_string("resume_type", "clear", "resume type: clear or split")
flags.define_string("update_strategy", "doc", "update strategy: term or doc")
flags.define_int("doc_limit", -1, "Number of documents to process")
flags.define_int("num_topics", 0, "Current number of topics")

if __name__ == "__main__":
  flags.InitFlags()

  if re.search("doc", flags.update_strategy):
    update_strategy = 1
  elif re.search("term", flags.update_strategy):
    update_strategy = 0
  else:
    print "Wrong update strategy!"
    exit()

  # Build index if it doesn't already exist
  if os.path.exists(flags.mapping):
Example #9
    tmp = str(tt) + "\t" + str(len(word_pairs)) + "\t" + str(pmi_score) + "\n"
    infile.write(tmp)

    total_pmi_score += pmi_score

  total_pmi_score /= len(topics.keys())
  tmp = "total" + "\t" + str(len(topics.keys())) + "\t" + str(total_pmi_score) + "\n"
  infile.write(tmp)
  infile.close()


flags.define_string("vocab", "", "Where we find the vocab")
flags.define_string("model", "", "The model files folder of topic models")
flags.define_string("stats", None, "Where we find the stat_file")
flags.define_int("topics_cutoff", 30, "Number of topics")
flags.define_int("window_size", 10, "Size of window for computing coocurrance")
flags.define_string("output", "output/PMI_score", "PMI Output filename")

if __name__ == "__main__":

  flags.InitFlags()

  print "Reading vocab"
  [vocab_word_index, vocab_index_word] = readVocab(flags.vocab)
  vocab_size = len(vocab_word_index)

  print "Reading topic words"
  [topics, topic_word_set] = readTopics(flags.model, flags.topics_cutoff)

  #print "Get statistics"
Example #10
  count = 0
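  # Emit up to must_links_num must-link ("MERGE_") constraints, one word pair per line.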
  for (w1, w2) in must.keys():
    if count < must_links_num:
      pmi = must[(w1, w2)]
      tmp = 'MERGE_\t' + w1 + '\t' + w2 + '\n'
      output_file.write(tmp)
      count += 1

  output_file.close()


flags.define_string("vocab", "vocab/20_news.voc", "Where we find the vocab")
flags.define_string("stats", "", "Where we find the stat_file")
flags.define_string("model", "", "The model files folder of topic models")
flags.define_string("output", "constraints/tmp", "Output filename")
flags.define_int("topics_cutoff", 30, "Number of topic words")
flags.define_int("cannot_links", 0, "Number of cannot links that we want")
flags.define_int("must_links", 0, "Number of must links that we want")

flags.define_int("num_topics", 20, "Number of topics")
flags.define_bool("train_only", False, "Using only train data to \
                                        generate the constraints")
flags.define_int("window_size", 10, "Size of window for computing coocurrance")
flags.define_float("tfidf_thresh", 0.0, "threshold for tfidf")

if __name__ == "__main__":

  flags.InitFlags()

  # getting statistics: slower version, full statistics, memory cost
  #[cooccur, wordcount, tfidf, vocab_index_word, topics, doc_num] \
Example #11
from topicmod.util import flags
from topicmod.util.sets import count_line
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.corpora.proto.wordnet_file_pb2 import *
from topicmod.corpora.ml_vocab import MultilingualVocab
from topicmod.corpora.ml_vocab import Vocab

from collections import defaultdict
import codecs

flags.define_string("output", "", "Where we write output")
flags.define_glob("doc_roots", "", "The document vocab")
flags.define_string("vocab", "", "The vocab file")
flags.define_string("location", "", "Where the data live")
flags.define_int("min_length", 50, "Minimum number of tokens")
flags.define_int("num_docs", -1, "Number of documents we write")
flags.define_string("language", "en", "What language this is")

kLANGUAGE_ID = {"en": ENGLISH, "de": GERMAN, "zh": CHINESE}


def lda_line(filename, full_vocab, filtered_vocab):
  d = defaultdict(int)

  doc = Document()
  doc.ParseFromString(open(filename, 'rb').read())

  num_words = 0
  for sent in doc.sentences:
    for word in sent.words:
Example #12
    word_senses_count[word] = 0
    count_word += 1
    tmp = word
    for pos in multipaths[word]:
      tmp += '\t' + pos
      for index in multipaths[word][pos]:
        word_senses_count[word] += 1
        count_sense += 1
        tmp += '\t' + str(index)
    if word_senses_count[word] > 1:
      im_words += word + " "
    outfile.write(tmp + '\n')
  outfile.write("\nThe total number of cons words: " + str(count_word) + "\n")
  outfile.write("\nThe total number of cons words senses: " + str(count_sense) + "\n")
  outfile.write("\nInteresting words: " + im_words + "\n")
  outfile.close()


flags.define_string("vocab", None, "The input vocab")
flags.define_string("output", None, "The output constraint file")
flags.define_int("num_cons", 0, "The number of constraints we want")

if __name__ == "__main__":

  flags.InitFlags()
  wordnet_path = "../../../data/wordnet/" 
  eng_wn = load_wn("3.0", wordnet_path, "wn")
  vocab = readVocab(flags.vocab)
  generateCons(vocab, eng_wn, flags.output, flags.num_cons)
  
Example #13
from topicmod.corpora.wacky import *
from topicmod.util import flags

flags.define_string("wackypedia_base", "../../data/wackypedia/compressed/",
                    "Where we find the wackypedia corpus")
flags.define_string("output", "/tmp/jbg/wackypedia/", "Where we write output")
flags.define_int("doc_limit", 10, "Max number of docs")
flags.define_list("langs", ["en"], "Which languages")

if __name__ == "__main__":
  flags.InitFlags()
  wacky = WackyCorpus(flags.wackypedia_base, flags.doc_limit)
  for ii in flags.langs:
    wacky.add_language("wackypedia_%s*.gz" % ii)

  wacky.write_proto(flags.output + "numeric",
                    "wpdia", 10000)
Example #14
import sys
import os
from collections import defaultdict

from topicmod.util import flags
from syntop_parameters_pb2 import *

flags.define_int("num_iterations", 1, "Number of iterations")
flags.define_string("model_name", "output/model", "Where we find data")

flags.define_string("corpus", None, "The source corpus")

flags.define_bool("hadoop", False, "Do we use hadoop or local batch")
flags.define_bool("doc_step", True, "Do we call the document-centric parts")
flags.define_bool("merge_step", True, "Do we merge doc step results (and compute new topics)")
flags.define_bool("update_global", True, "Do we compute new transition and DP variational parameters")

class Array:
  def __init__(self, name):
    self._rows = {}
    self._name = name

  def __getitem__(self, index):
    if index not in self._rows:
      self._rows[index] = defaultdict(float)
    return self._rows[index]

  def __iter__(self):
    for ii in self._rows:
      yield self._rows[ii]

  def parse(self, key, val):
Example #15
from topicmod.util import flags
from topicmod.util.sets import count_line
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.corpora.proto.wordnet_file_pb2 import *
from topicmod.corpora.ml_vocab import MultilingualVocab
from topicmod.corpora.ml_vocab import Vocab

from collections import defaultdict
import codecs

flags.define_string("output", "", "Where we write output")
flags.define_glob("doc_roots", "", "The document vocab")
flags.define_string("vocab", "", "The vocab file")
flags.define_string("location", "", "Where the data live")
flags.define_int("min_length", 50, "Minimum number of tokens")
flags.define_int("num_docs", -1, "Number of documents we write")
flags.define_string("language", "en", "What language this is")

kLANGUAGE_ID = {"en": ENGLISH, "de": GERMAN, "zh": CHINESE}


def lda_line(filename, full_vocab, filtered_vocab):
    d = defaultdict(int)

    doc = Document()
    doc.ParseFromString(open(filename, 'rb').read())

    num_words = 0
    for sent in doc.sentences:
        for word in sent.words:
            new_word = full_vocab.get_word(doc.language, word.token)
Example #16
  output_file.write(tmp)

  tmp = 'MERGE_'
  for word in merge2:
    tmp += '\t' + word
  tmp += '\n'
  output_file.write(tmp)

  output_file.close()


flags.define_string("vocab", "vocab/20_news.voc", "Where we find the vocab")
flags.define_string("stats", None, "Where we find the stat_file")
flags.define_string("model", "", "The model files folder of topic models")
flags.define_string("output", "constraints/tmp", "Output filename")
flags.define_int("topics_cutoff", 30, "Number of topic words")
flags.define_float("tfidf_thresh", 0, "threshold for tfidf")

if __name__ == "__main__":

  flags.InitFlags()

  # getting statistics: slower version, full statistics, memory cost
  #[cooccur, wordcount, tfidf, vocab_index_word, topics, doc_num] \
  #      = get_statistics_all(flags.corpus, flags.num_topics, flags.model, \
  #             flags.topics_cutoff, flags.window_size, flags.train_only)


  # getting statistics: faster version, partial statistics, memory efficient
  print "Reading vocab"
  [vocab_word_index, vocab_index_word] = readVocab(flags.vocab)
Example #17
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags

# Input
flags.define_string("doc_filter", None, "Files to filter out")
flags.define_string("vocab", None, "The file that defines the vocab")
flags.define_string("state_file", None, \
                    "The state file that create the corpus")

# Output
flags.define_string("state_output", None, "Where we write state file")
flags.define_string("corpus_output_path", None, "Where we write the corpus")
flags.define_string("corpus_name", "NIH", "Name of the corpus")

# Options
flags.define_int("docs_per_index", 5000, "Number of docs per section")
flags.define_int("doc_limit", -1, "Cap on number of documents")


class MalletAssignment:

  def __init__(self, line, debug=False):
    if debug:
      for ii in xrange(len(line.split())):
        print ii, line.split()[ii]
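    # Each state-file line carries: document id, an unused field, position index,
    # term id, term string, and topic assignment.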
    self.doc, foo, self.index, self.term_id, self.term, self.assignment = \
      line.split()
    self.doc = int(self.doc)
    self.index = int(self.index)
    self.term_id = int(self.term_id)
    self.assignment = int(self.assignment)
Example #18
#
# wacky_reducer.py
#
# File to turn protocol buffers into a test-only input file readable by the
# mapreduce implementation of the syntactic topic model.

from collections import defaultdict

from nltk import FreqDist

from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.corpora.corpus_vocab_wrapper import CorpusVocabWrapper
from topicmod.util import flags
from parse_reader import *

flags.define_int("docs_per_file", 100, "Number of documents per file")
flags.define_int("vocab_size", 5000, "Maximum vocabulary size")
flags.define_bool("remove_stop", False, "remove stopwords")
flags.define_bool("use_lemma", False, "Use lemmas instead of raw tokens")
flags.define_bool("use_relation", False, "Use relation (synset) instead of pos")
flags.define_glob("vocab_source", None, "Where we get initial vocabulary")
flags.define_string("output_path", None, "Where we write the translated corpuss")
flags.define_string("output_filename", "wacky_en_reduced.index", "Filename of index")
flags.define_int("min_length", 100, "Number of characters in document line")


class CorpusTranslator:
    def __init__(self, output_path, use_lemma, docs_per_file):
        self.output_base = output_path
        self.document_list = []
        self.use_lemma = use_lemma
Example #19
from topicmod.corpora.nyt_reader import *
from topicmod.util import flags

flags.define_string("nyt_base", "../../data/new_york_times/", "Where we find the nyt corpus")
flags.define_int("doc_limit", -1, "How many documents")
flags.define_string("output", "/tmp/jbg/nyt/", "Where we write data")
flags.define_float("bigram_limit", 0.9, "p-value for bigrams")

if __name__ == "__main__":
    flags.InitFlags()
    nyt = NewYorkTimesReader(flags.nyt_base, flags.doc_limit, flags.bigram_limit)
    nyt.add_language_list("../../data/new_york_times/editorial_file_list")

    nyt.write_proto(flags.output + "numeric", "nyt", 1000)
Example #20
from math import log
from random import random

from topicmod.util import flags
from syntop_parameters_pb2 import *

flags.define_string("vocab", None, "Size of vocabulary")
flags.define_int("num_docs", None, "Numer of documents")
flags.define_int("num_topics", 128, "Number topics")
flags.define_string("model_name", "output/model", "Name of model")

flags.define_bool("finite", False, "Use finite model")
flags.define_bool("ignore_trans", False, "Use only documents")
flags.define_bool("ignore_docs", False, "Use only syntax")
flags.define_bool("shortcut_gsl", False,
                  "Use closed form updates when possible")

flags.define_int("max_doc_iterations", 5,
                 "Number of e-step rounds per-document")
flags.define_int("alpha_doc", 1.0, "DP parameter for doc distributions")
flags.define_int("alpha_trans", 1.0,
                 "DP parameter for transition distributions")
flags.define_int("alpha_top", 1.0,
                 "DP parameter for top-level stick-breaking distribution")
flags.define_int("vocab_sigma", 0.1, "Vocab hyperparametersx")

if __name__ == "__main__":
    flags.InitFlags()

    params = SyntopParameters()
Example #21

if __name__ == "__main__":
    from topicmod.util import flags

    mkdir("/tmp/%s" % USER)
    mkdir("/tmp/%s/qsub-scripts" % USER)

    flags.define_string("template", "", "Where we read the template file from")
    flags.define_dict("args", {}, "Substitute values for the template")
    flags.define_dict("defaults", {}, "Default args")
    flags.define_string("wall", "24:00:00", "The wall time")
    flags.define_string("name", "", "Name given to job on cluster")
    flags.define_string("mem", "4gb", "How much memory we give")
    flags.define_string("queue", "shallow", "Which queue do we submit to")
    flags.define_int("max_jobs", 16, "Number of simultaneous jobs on cluster")
    flags.define_bool("delete_scripts", True, "Do we delete after we're done?")
    flags.define_bool("submit", True, "Do we submit")

    flags.InitFlags()
    template = open(flags.template).read()
    d = flags.defaults

    d["wall"] = flags.wall
    d["mem"] = flags.mem
    for ii in flags.args:
        d[ii] = flags.args[ii]

    if flags.name:
        d["name"] = flags.name
Example #22
#
# wacky_reducer.py
#
# File to turn protocol buffers into a test-only input file readable by the
# mapreduce implementation of the syntactic topic model.

from collections import defaultdict

from nltk import FreqDist

from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.corpora.corpus_vocab_wrapper import CorpusVocabWrapper
from topicmod.util import flags
from parse_reader import *

flags.define_int("docs_per_file", 100, "Number of documents per file")
flags.define_int("vocab_size", 5000, "Maximum vocabulary size")
flags.define_bool("remove_stop", False, "remove stopwords")
flags.define_bool("use_lemma", False, "Use lemmas instead of raw tokens")
flags.define_bool("use_relation", False,
                  "Use relation (synset) instead of pos")
flags.define_glob("vocab_source", None, "Where we get initial vocabulary")
flags.define_string("output_path", None,
                    "Where we write the translated corpuss")
flags.define_string("output_filename", "wacky_en_reduced.index",
                    "Filename of index")
flags.define_int("min_length", 100, "Number of characters in document line")


class CorpusTranslator:
    def __init__(self, output_path, use_lemma, docs_per_file):
Example #23
    output_file.write(tmp)

    tmp = 'MERGE_'
    for word in merge2:
        tmp += '\t' + word
    tmp += '\n'
    output_file.write(tmp)

    output_file.close()


flags.define_string("vocab", "vocab/20_news.voc", "Where we find the vocab")
flags.define_string("stats", None, "Where we find the stat_file")
flags.define_string("model", "", "The model files folder of topic models")
flags.define_string("output", "constraints/tmp", "Output filename")
flags.define_int("topics_cutoff", 30, "Number of topic words")
flags.define_float("tfidf_thresh", 0, "threshold for tfidf")

if __name__ == "__main__":

    flags.InitFlags()

    # getting statistics: slower version, full statistics, memory cost
    #[cooccur, wordcount, tfidf, vocab_index_word, topics, doc_num] \
    #      = get_statistics_all(flags.corpus, flags.num_topics, flags.model, \
    #             flags.topics_cutoff, flags.window_size, flags.train_only)

    # getting statistics: faster version, partial statistics, memory efficient
    print "Reading vocab"
    [vocab_word_index, vocab_index_word] = readVocab(flags.vocab)
    vocab_size = len(vocab_word_index)
Example #24
    rank = tfidf
  else:
    rank = frequency

  o = codecs.open(outputname, 'w', 'utf-8')
  for ii in rank:
    count = 0
    for jj in rank[ii]:
      count += 1
      if count <= vocab_limit and frequency[ii][jj] >= freq_limit:
        word = vocab[ii][jj]
        o.write(u"%i\t%s\t%f\t%i\n" % (ii, word, tfidf[ii][jj], frequency[ii][jj]))
        
  o.close()


flags.define_string("proto_corpus", None, "The proto files")
flags.define_bool("lemma", False, "Use lemma or tokens")
flags.define_bool("select_tfidf", False, "select the vocab by tfidf or frequency")
flags.define_string("output", "", "Where we output the preprocessed data")
flags.define_string("vocab", None, "Where we output the vocab")
flags.define_int("vocab_limit", 10000, "The vocab size")
flags.define_int("freq_limit", 20, "The minimum frequency of each word")

if __name__ == "__main__":

  flags.InitFlags()  
  [vocab, tfidf, frequency] = gen_files(flags.proto_corpus, flags.output, flags.lemma)
  gen_vocab(vocab, tfidf, frequency, flags.select_tfidf, flags.vocab, flags.vocab_limit, flags.freq_limit)

Example #25
from topicmod.util import flags
from topicmod.corpora.vocab_compiler import VocabCompiler

flags.define_glob("corpus_parts", None, "Where we look for vocab")
flags.define_filename("output", None, "Where we write the new vocab")
flags.define_int("min_freq", 10, "Minimum frequency for inclusion")
flags.define_int("vocab_limit", 5000, "Maximum vocab size")
flags.define_bool("exclude_stop", True, "Do we throw out stop words")
flags.define_bool("exclude_punc", True, "Do we exclude punctuation")
flags.define_bool("exclude_digits", True, "Do we exclude digits")
flags.define_list("special_stop", [], "Special stop words")
flags.define_int("min_length", 3, "Minimum length for tokens")
flags.define_bool("stem", False, "Stem words")
flags.define_bool("bigram", False, "Use bigrams")

if __name__ == "__main__":
  flags.InitFlags()

  assert not (flags.stem and flags.bigram), "Can't use stem and bigram"

  v = VocabCompiler()
  for ii in flags.corpus_parts:
    print ii
    v.addVocab(ii, flags.exclude_stop, flags.special_stop, \
                 flags.exclude_punc, flags.exclude_digits, \
                 flags.stem, flags.bigram, flags.min_length)
  v.writeVocab(flags.output, flags.vocab_limit, flags.min_freq)
Example #26
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags
from topicmod.corpora.flat import FlatCorpus

# from topicmod.corpora.flat import FlatEmailCorpus

flags.define_int("doc_limit", -1, "How many documents we add")
flags.define_string("base", "../../data/yn_toy/", "Where we look for data")
flags.define_string("output", "../../data/yn_toy/numeric", "Where we write output")

if __name__ == "__main__":
    flags.InitFlags()
    corpus = FlatCorpus(flags.base, flags.doc_limit)
    corpus.add_language("*", ENGLISH)
    print flags.output

    corpus.write_proto(flags.output, "yn_toy")
Example #27
from random import random
from collections import defaultdict
import os

import numpy
from numpy import zeros
from numpy.random.mtrand import dirichlet
from numpy.random import multinomial
from numpy.random import normal
from math import isnan, isinf

from topicmod.util import flags
from topicmod.corpora.proto.corpus_pb2 import *

flags.define_int("num_docs", 500, "Number of documents")
flags.define_int("num_topics", 5, "Number of topics")
flags.define_int("doc_length", 5, "Length of every document")
flags.define_int("num_langs", 2, "Number of languages")
flags.define_float("variance", 0.5, "Variance of distribution")
flags.define_float("gamma", 1.0, "Vocabulary hyperparameter")
flags.define_float("alpha", 0.1, "Document topic hyperparameter")
flags.define_string("output_base", "data/synthetic", "Where we write the data")
flags.define_string("doc_proportion", "synthetic.theta", "Where we write doc thetas")
flags.define_int("num_groups", 2, "Number of splits")
flags.define_string("vocab_output", "vocab/synthetic.voc", "Where we write vocab")
flags.define_int("topic_output_size", 15, "Number of words to display when we output topics")

ml_vocab = [{0: ["dog", "cat", "moose", "butterfly"], 
 1: ["hund", "katze", "spinner", "pferd", "maultier", "kuh"], 
 2:["toro", "mariposa", "gato", "vaca", "donkey", "burro", "caballo", "mosquito", "arana", "pavo"]},
Example #28
from collections import defaultdict

from topicmod.util import flags
from topicmod.util.wordnet import load_wn
from topicmod.ling.dictionary import *
from topicmod.ling.snowball_wrapper import Snowball
from topicmod.corpora.ontology_writer import OntologyWriter
from topicmod.corpora.ontology_writer import orderedTraversal
from topicmod.corpora.ml_vocab import MultilingualVocab

from topicmod.corpora.proto.corpus_pb2 import *

flags.define_int("limit", 250, "How many items in our MuTo matching")
flags.define_bool("dictionary", False, "Use a dictionary")
flags.define_string("dic_dir", "../../data/dictionaries/", "Use a dictionary")
flags.define_bool("translation", False, "Use translation matrix")
flags.define_bool("greedy_matching", False, "Use a matching from dictionary")
flags.define_bool("wordnet", False, "Use WordNet as scaffold")
flags.define_bool("german", False, "Include German")
flags.define_bool("chinese", False, "Include Chinese")
flags.define_string("output", "", "Where we write ontology")
flags.define_float("trans_cutoff", 0.5, "Min value for using the translation")
flags.define_string("wn_version", "3.0", "Which version of WN we use")
flags.define_string("filter_vocab", "", "Filter entries based on vocabulary")
flags.define_bool("stem", False, "Stem words")
flags.define_bool("id_strings", False, "Add identical strings")

# Generate an updated vocab: not all of the words in the original vocab will be included
# in the generated wordnet, so generate a new vocab that only contains the words in the wordnet.
flags.define_string("updated_vocab", "", "generate a new vocab")
Example #29
from collections import defaultdict

from topicmod.util import flags
from topicmod.util.wordnet import load_wn
from topicmod.ling.dictionary import *
from topicmod.ling.snowball_wrapper import Snowball
from topicmod.corpora.ontology_writer import OntologyWriter
from topicmod.corpora.ontology_writer import orderedTraversal
from topicmod.corpora.ml_vocab import MultilingualVocab

from topicmod.corpora.proto.corpus_pb2 import *

flags.define_int("limit", 250, "How many items in our MuTo matching")
flags.define_bool("dictionary", False, "Use a dictionary")
flags.define_string("dic_dir", "../../data/dictionaries/", "Use a dictionary")
flags.define_bool("translation", False, "Use translation matrix")
flags.define_bool("greedy_matching", False, "Use a matching from dictionary")
flags.define_bool("wordnet", False, "Use WordNet as scaffold")
flags.define_bool("german", False, "Include German")
flags.define_bool("chinese", False, "Include Chinese")
flags.define_string("output", "", "Where we write ontology")
flags.define_float("trans_cutoff", 0.5, "Min value for using the translation")
flags.define_string("wn_version", "3.0", "Which version of WN we use")
flags.define_string("filter_vocab", "", "Filter entries based on vocabulary")
flags.define_bool("stem", False, "Stem words")
flags.define_bool("id_strings", False, "Add identical strings")

# Generate an updated vocab: not all of the words in the original
# vocab will be included in the generated wordnet, so generate a new
# vocab that only contains the words in the wordnet.
Example #30
  infile = open(infilename, 'r')
  vocab = defaultdict(FreqDist)
  for line in infile:
    line = line.strip()
    ww = line.split('\t')
    lang = ww[0]
    if source[lang][ww[1]] == 0:
      print source[lang][ww[1]], ww[1]
    vocab[lang].inc(ww[1], source[lang][ww[1]])
  infile.close()

  outfile = codecs.open(outfilename, 'w', 'utf-8')
  for ii in vocab:
    for jj in vocab[ii]:
      outfile.write(u"%s\t%s\n" % (ii, jj))
      #outfile.write(u"%s\t%s\t%f\t%i\n" % (ii, jj, tfidf[ii][jj], frequency[ii][jj]))
  outfile.close()


flags.define_string("stats_vocab", None, "The proto files")
flags.define_string("input_vocab", None, "Where we get the original vocab")
flags.define_int("option", 0, "1: tfidf; others: frequency")
flags.define_string("sorted_vocab", None, "Where we output the vocab")

if __name__ == "__main__":

  flags.InitFlags()  
  [tfidf, frequency] = readStats(flags.stats_vocab)
  
  sortVocab(flags.input_vocab, tfidf, frequency, flags.option, flags.sorted_vocab)
Example #31
from random import random
from collections import defaultdict
import os

import numpy
from numpy import zeros
from numpy.random.mtrand import dirichlet
from numpy.random import multinomial
from numpy.random import normal
from math import isnan, isinf

from topicmod.util import flags
from topicmod.corpora.proto.corpus_pb2 import *

flags.define_int("num_docs", 500, "Number of documents")
flags.define_int("num_topics", 5, "Number of topics")
flags.define_int("doc_length", 5, "Length of every document")
flags.define_int("num_langs", 2, "Number of languages")
flags.define_float("variance", 0.5, "Variance of distribution")
flags.define_float("gamma", 1.0, "Vocabulary hyperparameter")
flags.define_float("alpha", 0.1, "Document topic hyperparameter")
flags.define_string("output_base", "data/synthetic", "Where we write the data")
flags.define_string("doc_proportion", "synthetic.theta",
                    "Where we write doc thetas")
flags.define_int("num_groups", 2, "Number of splits")
flags.define_string("vocab_output", "vocab/synthetic.voc",
                    "Where we write vocab")
flags.define_int("topic_output_size", 15,
                 "Number of words to display when we output topics")

ml_vocab = [{
Example #32
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags

# Input
flags.define_string("doc_filter", None, "Files to filter out")
flags.define_string("vocab", None, "The file that defines the vocab")
flags.define_string("state_file", None, \
                    "The state file that create the corpus")

# Output
flags.define_string("state_output", None, "Where we write state file")
flags.define_string("corpus_output_path", None, "Where we write the corpus")
flags.define_string("corpus_name", "NIH", "Name of the corpus")

# Options
flags.define_int("docs_per_index", 5000, "Number of docs per section")
flags.define_int("doc_limit", -1, "Cap on number of documents")


class MalletAssignment:
    def __init__(self, line, debug=False):
        if debug:
            for ii in xrange(len(line.split())):
                print ii, line.split()[ii]
        self.doc, foo, self.index, self.term_id, self.term, self.assignment = \
          line.split()
        self.doc = int(self.doc)
        self.index = int(self.index)
        self.term_id = int(self.term_id)
        self.assignment = int(self.assignment)
Example #33
    # write cooccurrence counts:
    outputfile = self._output_dir + "/cooccurance.txt"
    outfile = open(outputfile, 'w')
    for w1 in self._cooccur.keys():
      for w2 in self._cooccur[w1].keys():
        if self._cooccur[w1][w2] != 0:
          tmp = w1 + "\t" + w2 + "\t" + str(self._cooccur[w1][w2]) + "\n"
          outfile.write(tmp)
    outfile.close()


flags.define_string("corpus", None, "Where we find the input corpora")
flags.define_string("proto_corpus", None, "Where we find the input proto corpora")
flags.define_string("vocab", "", "The model files folder of topic models")
flags.define_int("window_size", 10, "Size of window for computing coocurrance")
flags.define_string("output", "PMI_stat/20_news", "PMI stat output filename")
flags.define_int("option", "2", "0: 20_news; 1: wikipedia")

if __name__ == "__main__":
  flags.InitFlags()
  # {0: 'english', 1: 'german'}
  lang = 0
  cp = corpusParser(lang, flags.vocab, flags.corpus, flags.window_size, flags.output)
  if flags.option == 0:
    cp.parseCorpus20news()
    get_tfidf(flags.proto_corpus, flags.vocab, flags.output)
  elif flags.option == 1:
    cp.parseCorpusWiki()
    get_tfidf(flags.proto_corpus, flags.vocab, flags.output)
  elif flags.option == 2: