from collections import Counter, defaultdict

import numpy as np

# SquadCorpus, BasicWordFeatures, WordNormalizer, NltkPlusStopWords,
# SquadTfIdfRanker, SquadVectorTfIdfRanker, preprocess_par, write_output and
# OPTS (the parsed command-line options) are provided by the surrounding
# project and are assumed to be imported/defined elsewhere in this module.


def show_unk(corpus: SquadCorpus, vec_name: str, context: bool = True, question: bool = True):
    """Count train-set words that are missing from the pruned word vectors."""
    vecs = corpus.get_pruned_word_vecs(vec_name)
    docs = corpus.get_train()
    lower_unk = Counter()
    unk = Counter()
    for doc in docs:
        for para in doc.paragraphs:
            if context:
                for sent in para.text:
                    for word in sent:
                        if word not in vecs:
                            unk[word] += 1
                            word = word.lower()
                            if word not in vecs:
                                lower_unk[word] += 1
            if question:
                # use `q`, not `question`, so the boolean parameter is not shadowed
                for q in para.questions:
                    for word in q.words:
                        if word not in vecs:
                            unk[word] += 1
                            word = word.lower()
                            if word not in vecs:
                                lower_unk[word] += 1
    print("\n".join("%s: %d" % (k, v) for k, v in lower_unk.most_common()))
def show_in_context_unks(corpus: SquadCorpus, vec_name):
    """Print unknown words highlighted inside a window of their surrounding text."""
    data = corpus.get_train()
    np.random.shuffle(data)
    vecs = corpus.get_pruned_word_vecs(vec_name)
    for doc in data:
        paragraphs = list(doc.paragraphs)
        np.random.shuffle(paragraphs)
        for para in paragraphs:
            sentences = list(para.text) + [x.words for x in para.questions]
            np.random.shuffle(sentences)
            for words in sentences:
                for i, word in enumerate(words):
                    if word.lower() not in vecs:
                        # temporarily mark the unknown word, print a +/-10 word window,
                        # then restore the original token
                        words[i] = "{{{" + word + "}}}"
                        print(" ".join(words[max(0, i - 10):min(len(words), i + 10)]))
                        words[i] = word
def main():
    """Re-rank SQuAD paragraphs with TF-IDF and report paragraph lengths before/after."""
    corpus = SquadCorpus()
    if OPTS.normalize_before_ranking:
        normalizer = WordNormalizer()
    else:
        normalizer = None
    if OPTS.use_vec_dist:
        word_vecs = corpus.get_pruned_word_vecs('glove.840B.300d')
        prepro = SquadVectorTfIdfRanker(NltkPlusStopWords(True), OPTS.num_per_orig, True,
                                        word_vecs, word_normalizer=normalizer)
    else:
        prepro = SquadTfIdfRanker(NltkPlusStopWords(True), OPTS.num_per_orig, True,
                                  word_normalizer=normalizer)
    orig_data = corpus.get_train() if OPTS.split == 'train' else corpus.get_dev()
    orig_lens = [len(p.text[0]) for doc in orig_data for p in doc.paragraphs for q in p.questions]
    new_data = preprocess_par(orig_data, corpus.evidence, prepro, n_processes=1)
    new_lens = [len(p.text) for q in new_data for p in q.paragraphs]
    print('%d original, mean %.2f words' % (len(orig_lens), np.mean(orig_lens)))
    print('%d new, mean %.2f words' % (len(new_lens), np.mean(new_lens)))
    if OPTS.out_file:
        write_output(OPTS.split, new_data, OPTS.out_file)
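# NOTE: a minimal sketch, not part of the original module. main() reads
# OPTS.normalize_before_ranking, OPTS.use_vec_dist, OPTS.num_per_orig, OPTS.split
# and OPTS.out_file; one plausible way to build OPTS is an argparse parser whose
# flag names mirror those attributes (names/defaults here are assumptions).
def _parse_opts_sketch():
    import argparse
    parser = argparse.ArgumentParser(description="Re-rank SQuAD paragraphs with TF-IDF")
    parser.add_argument("--split", choices=["train", "dev"], default="dev")
    parser.add_argument("--num_per_orig", type=int, default=1,
                        help="number of ranked paragraphs to keep per original paragraph")
    parser.add_argument("--use_vec_dist", action="store_true",
                        help="use SquadVectorTfIdfRanker (word-vector distances) instead of plain TF-IDF")
    parser.add_argument("--normalize_before_ranking", action="store_true",
                        help="apply WordNormalizer to words before ranking")
    parser.add_argument("--out_file", default=None,
                        help="optional output path passed to write_output()")
    return parser.parse_args()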
def show_features(corpus: SquadCorpus, vec_name):
    """Group out-of-vocabulary words by which BasicWordFeatures fire on them."""
    print("Loading train docs")
    data = corpus.get_train()
    np.random.shuffle(data)
    data = data[:100]
    print("Loading vectors")
    vecs = corpus.get_pruned_word_vecs(vec_name)
    fe = BasicWordFeatures()
    grouped_by_features = defaultdict(Counter)
    print("start")
    for doc in data:
        paragraphs = list(doc.paragraphs)
        np.random.shuffle(paragraphs)
        for para in paragraphs:
            sentences = list(para.text) + [x.words for x in para.questions]
            np.random.shuffle(sentences)
            for words in sentences:
                for i, word in enumerate(words):
                    if word.lower() not in vecs:
                        x = fe.get_word_features(word)
                        # use a separate index so the word index `i` is not shadowed
                        for feature_ix, val in enumerate(x):
                            if val > 0:
                                grouped_by_features[feature_ix][word] += 1
    for i in sorted(grouped_by_features.keys()):
        name = BasicWordFeatures.features_names[i]
        if name in ["Len"]:
            continue
        vals = grouped_by_features[i]
        print()
        print("*" * 30)
        print("%s-%d %d (%d)" % (name, i, len(vals), sum(vals.values())))
        for k, v in vals.most_common(30):
            print("%s: %d" % (k, v))
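# NOTE: hypothetical usage of the inspection helpers above; the vector name matches
# the one hard-coded in main(). The module's real entry point may differ.
if __name__ == "__main__":
    _corpus = SquadCorpus()
    show_unk(_corpus, "glove.840B.300d")
    show_in_context_unks(_corpus, "glove.840B.300d")
    show_features(_corpus, "glove.840B.300d")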