Пример #1
0
def build_vocab(raw_data):
  """Build the IMDB vocabulary from raw examples.

  Counts term frequencies and document frequencies, drops terms that
  appear in too few documents, keeps the most frequent terms up to
  MAX_VOCAB_SIZE, persists the vocabulary, and returns it as a set.
  """
  term_freqs = defaultdict(int)
  term_doc_counts = defaultdict(int)

  for example in raw_data:
    seen_in_doc = set()
    for tok in example.sentence:
      term_freqs[tok] += 1
      # Count each term at most once per document.
      if tok in seen_in_doc:
        continue
      seen_in_doc.add(tok)
      term_doc_counts[tok] += 1

  # Keep only terms that occur in enough distinct documents.
  kept = {
      term: freq for term, freq in term_freqs.items()
      if term_doc_counts[term] > FLAGS.doc_count_threshold
  }

  # Most frequent first, then cap the vocabulary size.
  by_freq = sorted(kept.items(), key=lambda kv: kv[1], reverse=True)
  by_freq = by_freq[:MAX_VOCAB_SIZE]

  vocab = {term for term, _ in by_freq}
  util.write_vocab(vocab, FLAGS.imdb_vocab_file)
  return vocab
Пример #2
0
def build_vocab(raw_data):
    """Collect the set of all words appearing in example sentences,
    persist it to the SemEval vocab file, and return it."""
    vocab = {word for example in raw_data for word in example.sentence}

    util.write_vocab(vocab, FLAGS.semeval_vocab_file)
    return vocab
Пример #3
0
    def _build_vocab(all_data):
        """Merge the train/test splits of every task, build the shared
        vocabulary, persist it, and report sentence-length statistics."""
        print('build vocab')
        merged = []
        for train_data, test_data in all_data:
            merged.extend(train_data)
            merged.extend(test_data)
        vocab = fudan.build_vocab(merged)
        util.write_vocab(vocab)

        util.stat_length(merged)
Пример #4
0
    def _build_vocab(dbpedia_data, semeval_data):
        """Build vocabularies for both corpora, print their sizes, and
        persist the union of the two as a list."""
        print('build vocab')
        dbpedia_vocab = dbpedia.build_vocab(dbpedia_data)
        print('dbpedia vocab: %d' % len(dbpedia_vocab))
        semeval_vocab = semeval.build_vocab(semeval_data)
        print('semeval vocab: %d' % len(semeval_vocab))

        # Union of both vocabularies, written out as a list.
        combined = set(dbpedia_vocab) | set(semeval_vocab)
        util.write_vocab(list(combined))
Пример #5
0
def build_vocab(raw_data, min_freq=5):
  """Build the DBpedia vocabulary from sentence and entity tokens.

  Counts how often each token occurs across all examples, drops tokens
  occurring `min_freq` times or fewer, persists the remaining tokens
  sorted by descending frequency, and returns them.

  Args:
    raw_data: iterable of examples exposing `sentence` and `entity`
      token lists.
    min_freq: tokens must occur strictly more than this many times to
      be kept. Defaults to 5, the previously hard-coded threshold, so
      existing callers are unaffected.

  Returns:
    List of vocabulary tokens, most frequent first.
  """
  vocab_freqs = defaultdict(int)

  for example in raw_data:
    for token in example.sentence + example.entity:
      vocab_freqs[token] += 1

  # Filter out low-occurring terms. Use the already-bound `freq` rather
  # than re-looking the term up in the dict being filtered.
  vocab_freqs = {term: freq for term, freq in vocab_freqs.items()
                 if freq > min_freq}

  # Sort by frequency, most frequent first.
  ordered_vocab_freqs = sorted(
      vocab_freqs.items(), key=lambda item: item[1], reverse=True)

  vocab = [token for token, _ in ordered_vocab_freqs]
  util.write_vocab(vocab, DBPEDIA_VOCAB_FILE)
  return vocab
Пример #6
0
    def _build_vocab(data):
        """Build the vocabulary from `data`, persist it, and report
        sentence-length statistics."""
        print('build vocab')
        util.write_vocab(fudan.build_vocab(data))

        util.stat_length(data)