Exemplo n.º 1
0
def nyt_tfidf():
    """Call get_tfidf with the hard-coded NYT paths.

    Arguments are passed positionally in the order get_tfidf is called
    elsewhere in this file: proto corpus directory, vocabulary file,
    output directory.
    """
    get_tfidf(
        "output/nyt/iter_1_all/model_topic_assign/",  # proto corpus directory
        "vocab/nyt.voc",                              # vocabulary file
        "PMI_stat/nyt/",                              # output directory
    )
Exemplo n.º 2
0
def get_20news_tfidf():
    """Call get_tfidf with the hard-coded 20 Newsgroups (stemmed) paths.

    Arguments are passed positionally: proto corpus directory, vocabulary
    file, output directory.
    """
    # An alternative proto corpus location that was previously tried:
    #   "../../data/20_news_date/numeric"
    get_tfidf(
        "output/20_news_stem_tfidf/iter_1_all/model_topic_assign/",
        "vocab/20_news_stem_all.voc",
        "PMI_stat/20_news_stem_tfidf/",
    )
Exemplo n.º 3
0
def nyt_tfidf():
  """Call get_tfidf with the hard-coded NYT paths.

  Arguments are passed positionally: proto corpus directory, vocabulary
  file, output directory.
  """
  paths = (
    "output/nyt/iter_1_all/model_topic_assign/",  # proto corpus directory
    "vocab/nyt.voc",                              # vocabulary file
    "PMI_stat/nyt/",                              # output directory
  )
  get_tfidf(*paths)
Exemplo n.º 4
0
    for w1 in self._cooccur.keys():
      for w2 in self._cooccur[w1].keys():
        if self._cooccur[w1][w2] != 0:
          tmp = w1 + "\t" + w2 + "\t" + str(self._cooccur[w1][w2]) + "\n"
          outfile.write(tmp)
    outfile.close()


# Command-line flags read by the __main__ entry point of this script.
flags.define_string("corpus", None, "Where we find the input corpora")
flags.define_string("proto_corpus", None, "Where we find the input proto corpora")
flags.define_string("vocab", "", "The model files folder of topic models")
flags.define_int("window_size", 10, "Size of window for computing coocurrance")
flags.define_string("output", "PMI_stat/20_news", "PMI stat output filename")
# The default must be an int, not the string "2": the entry point compares
# flags.option against the ints 0, 1 and 2, and "2" == 2 is False in Python.
# Option 2 selects the NYT parser (parseCorpusNyt), so the help lists it too.
flags.define_int("option", 2, "0: 20_news; 1: wikipedia; 2: nyt")

if __name__ == "__main__":
  flags.InitFlags()

  # Language code handed to corpusParser: {0: 'english', 1: 'german'}
  lang = 0
  cp = corpusParser(lang, flags.vocab, flags.corpus, flags.window_size,
                    flags.output)

  # Name of the corpusParser method to run for each supported option value.
  parser_by_option = {
    0: "parseCorpus20news",
    1: "parseCorpusWiki",
    2: "parseCorpusNyt",
  }
  selected = parser_by_option.get(flags.option)

  # Any other option value matches nothing, so neither the parser nor
  # get_tfidf runs.
  if selected is not None:
    getattr(cp, selected)()
    get_tfidf(flags.proto_corpus, flags.vocab, flags.output)

Exemplo n.º 5
0
def get_20news_tfidf():
  """Call get_tfidf with the hard-coded 20 Newsgroups (stemmed) paths.

  Arguments are passed positionally: proto corpus directory, vocabulary
  file, output directory.
  """
  # An alternative proto corpus location that was previously tried:
  #   "../../data/20_news_date/numeric"
  paths = (
    "output/20_news_stem_tfidf/iter_1_all/model_topic_assign/",
    "vocab/20_news_stem_all.voc",
    "PMI_stat/20_news_stem_tfidf/",
  )
  get_tfidf(*paths)