def build_vocab(raw_data):
    """Build the IMDB vocabulary.

    Keeps tokens whose document frequency exceeds
    FLAGS.doc_count_threshold, ranks them by total corpus frequency,
    caps the result at MAX_VOCAB_SIZE, writes it to
    FLAGS.imdb_vocab_file, and returns it as a set.
    """
    term_freq = defaultdict(int)
    term_doc_count = defaultdict(int)

    for example in raw_data:
        seen_in_doc = set()
        for token in example.sentence:
            term_freq[token] += 1
            # Document frequency: count each token at most once per doc.
            if token not in seen_in_doc:
                seen_in_doc.add(token)
                term_doc_count[token] += 1

    # Drop terms that appear in too few documents.
    kept = {
        term: freq
        for term, freq in term_freq.items()
        if term_doc_count[term] > FLAGS.doc_count_threshold
    }

    # Rank by corpus frequency (most frequent first) and cap the size.
    ranked = sorted(kept.items(), key=lambda kv: kv[1], reverse=True)
    vocab = {term for term, _ in ranked[:MAX_VOCAB_SIZE]}

    util.write_vocab(vocab, FLAGS.imdb_vocab_file)
    return vocab
def build_vocab(raw_data):
    '''collect words in sentence'''
    vocab = set()
    for example in raw_data:
        # Fold every sentence's tokens into the running set.
        vocab.update(example.sentence)
    util.write_vocab(vocab, FLAGS.semeval_vocab_file)
    return vocab
def _build_vocab(all_data):
    """Pool every task's train/test examples, then build and persist
    the shared vocabulary and record length statistics."""
    print('build vocab')
    pooled = []
    for train_data, test_data in all_data:
        pooled.extend(train_data)
        pooled.extend(test_data)
    vocab = fudan.build_vocab(pooled)
    util.write_vocab(vocab)
    util.stat_length(pooled)
def _build_vocab(dbpedia_data, semeval_data):
    """Build the DBpedia and SemEval vocabularies and persist their union."""
    print('build vocab')
    dbpedia_vocab = dbpedia.build_vocab(dbpedia_data)
    print('dbpedia vocab: %d' % len(dbpedia_vocab))
    semeval_vocab = semeval.build_vocab(semeval_data)
    print('semeval vocab: %d' % len(semeval_vocab))
    # Union of both vocabularies, written out as a list.
    merged = set(dbpedia_vocab) | set(semeval_vocab)
    util.write_vocab(list(merged))
def build_vocab(raw_data, min_freq=5):
    """Build the DBpedia vocabulary from sentence and entity tokens.

    Tokens occurring strictly more than ``min_freq`` times are kept,
    ordered by descending frequency, written to DBPEDIA_VOCAB_FILE,
    and returned.

    Args:
        raw_data: iterable of examples with ``sentence`` and ``entity``
            token lists.
        min_freq: frequency cutoff; tokens must occur more than this
            many times to be kept (default 5, matching the previous
            hard-coded threshold).

    Returns:
        List of tokens sorted by frequency, most frequent first.
    """
    vocab_freqs = defaultdict(int)
    for example in raw_data:
        # Entity tokens count toward the vocabulary as well.
        for token in example.sentence + example.entity:
            vocab_freqs[token] += 1

    # Filter out low-occurring terms. (The bound `freq` already holds
    # the count; no need to look the term up in the dict again.)
    vocab_freqs = {term: freq for term, freq in vocab_freqs.items()
                   if freq > min_freq}

    # Sort by frequency, most frequent first.
    ordered_vocab_freqs = sorted(
        vocab_freqs.items(), key=lambda item: item[1], reverse=True)

    vocab = [token for token, _ in ordered_vocab_freqs]
    util.write_vocab(vocab, DBPEDIA_VOCAB_FILE)
    return vocab
def _build_vocab(data):
    """Build and persist the fudan vocabulary, then record length stats."""
    print('build vocab')
    util.write_vocab(fudan.build_vocab(data))
    util.stat_length(data)