def how_many_nouns(self, passage):
    """Build a counting sample asking how many nouns appear in a scope.

    Args:
        passage: The full passage text handed to ``randomize_scope``.

    Returns:
        A dict with keys ``question``, ``answer`` (the number of extracted
        spans), ``spans`` and ``span_type`` (always ``'SPAN'``).
    """
    # passage_prob is presumably the chance of scoping to the whole
    # passage rather than a single sentence -- TODO confirm in randomize_scope.
    context_scope, _, scoped_text = randomize_scope(passage, passage_prob=0.25)

    # Only the first element of the extract_pos result is used here.
    # NOTE(review): sibling methods shift spans by passage.index(text); these
    # are passed through as returned -- confirm extract_pos' coordinates.
    noun_spans, _ = extract_pos(scoped_text, pos_to_extract=['NN'])
    noun_count = len(noun_spans)

    question = randomize_total_question(
        string_category='noun',
        context_scope=context_scope,
    )
    return {
        'question': question,
        'answer': noun_count,
        'spans': noun_spans,
        'span_type': 'SPAN',
    }
def how_many_sentences_in_total(self, passage):
    """Build a counting sample asking how many sentences the passage has.

    Args:
        passage: The full passage text handed to ``randomize_scope``.

    Returns:
        A dict with keys ``question``, ``answer`` (the number of sentences),
        ``spans`` (one ``(start, end)`` character range per sentence) and
        ``span_type`` (always ``'SENTENCE'``).

    Raises:
        ValueError: If an extracted sentence does not occur in the text.
    """
    # The third return value replaces `passage`, so all offsets below are
    # relative to the text that randomize_scope handed back.
    context_scope, _, passage = randomize_scope(passage, passage_prob=1.0)
    sentences = extract_sentences(passage)

    # Search forward from the end of the previous sentence so that a sentence
    # that occurs more than once is mapped to its own position instead of
    # always resolving to the first occurrence.
    spans = []
    cursor = 0
    for sentence in sentences:
        start = passage.find(sentence, cursor)
        if start == -1:
            # Not found ahead of the cursor (e.g. sentences out of order):
            # fall back to a whole-text lookup, which raises ValueError when
            # the sentence is absent altogether.
            start = passage.index(sentence)
        end = start + len(sentence)
        spans.append((start, end))
        cursor = max(cursor, end)

    answer = len(sentences)
    question = randomize_total_question(
        string_category='sentence',
        context_scope=context_scope,
    )
    return dict(question=question, answer=answer, spans=spans, span_type='SENTENCE')
def how_many_words_in_total(self, passage):
    """Build a counting sample asking how many words appear in a scope.

    Args:
        passage: The full passage text handed to ``randomize_scope``.

    Returns:
        A dict with keys ``question``, ``answer`` (the number of extracted
        words), ``spans`` (one ``(start, end)`` character range per word,
        in passage coordinates) and ``span_type`` (always ``'WORD'``).

    Raises:
        ValueError: If the scoped text is not a substring of ``passage``.
    """
    context_scope, _, scoped_text = randomize_scope(passage, passage_prob=0.0)
    tokens = extract_words(scoped_text)

    # Token offsets (`.idx`) are relative to the scoped text; shift them by
    # where that text starts inside the passage.
    offset = passage.index(scoped_text)
    spans = []
    for token in tokens:
        begin = offset + token.idx
        spans.append((begin, begin + len(str(token))))

    word_count = len(tokens)
    question = randomize_total_question(
        string_category='word',
        context_scope=context_scope,
    )
    return {
        'question': question,
        'answer': word_count,
        'spans': spans,
        'span_type': 'WORD',
    }
def how_many_numbers(self, passage):
    """Build a counting sample asking how many numbers appear in a scope.

    Args:
        passage: The full passage text handed to ``randomize_scope``.

    Returns:
        A dict with keys ``question``, ``answer`` (the number of extracted
        numbers), ``spans`` (one ``(start, end)`` character range per number,
        in passage coordinates) and ``span_type`` (always ``'WORD'``).

    Raises:
        ValueError: If the scoped text is not a substring of ``passage``.
    """
    context_scope, _, scoped_text = randomize_scope(passage, passage_prob=0.5)

    # Position of the scoped text inside the passage, used to shift the
    # scope-relative indices returned by extract_passage_numbers.
    offset = passage.index(scoped_text)
    numbers, positions = extract_passage_numbers(scoped_text)

    spans = []
    for number, position in zip(numbers, positions):
        begin = offset + position
        spans.append((begin, begin + len(str(number))))

    number_count = len(numbers)
    question = randomize_total_question(
        string_category='number',
        context_scope=context_scope,
    )
    return {
        'question': question,
        'answer': number_count,
        'spans': spans,
        'span_type': 'WORD',
    }
def how_many_words_shorter_than(self, passage):
    """Build a counting sample asking how many words are below a length.

    The length threshold is drawn uniformly from 1..5 (inclusive) and the
    words whose text is strictly shorter than it are counted.

    Args:
        passage: The full passage text handed to ``randomize_scope``.

    Returns:
        A dict with keys ``question``, ``answer`` (the number of matching
        words), ``spans`` (one ``(start, end)`` character range per matching
        word, in passage coordinates) and ``span_type`` (always ``'WORD'``).

    Raises:
        ValueError: If the scoped text is not a substring of ``passage``.
    """
    context_scope, _, text = randomize_scope(passage, passage_prob=0.2)
    words = extract_words(text)

    # np.random.random_integers is deprecated; randint's upper bound is
    # exclusive, so (1, 6) keeps the same inclusive 1..5 range.
    target_len = np.random.randint(1, 6)

    # Measure the token's own text (str(tok)), the same length the span
    # computation below uses, rather than indexing into the token.
    target_words = [tok for tok in words if len(str(tok)) < target_len]

    # Token offsets (`.idx`) are relative to the scoped text; shift them by
    # where that text starts inside the passage.
    start_idx = passage.index(text)
    spans = [
        (start_idx + w.idx, start_idx + w.idx + len(str(w)))
        for w in target_words
    ]
    answer = len(target_words)
    question = randomize_total_question(
        string_category=f'words shorter than {target_len} character',
        context_scope=context_scope,
    )
    return dict(question=question, answer=answer, spans=spans, span_type='WORD')
def how_many_title_case_words_in_total(self, passage):
    """Build a counting sample asking how many capitalised words appear.

    A word counts when its text is non-empty and its first character is
    upper case.

    Args:
        passage: The full passage text handed to ``randomize_scope``.

    Returns:
        A dict with keys ``question``, ``answer`` (the number of matching
        words), ``spans`` (one ``(start, end)`` character range per matching
        word, in passage coordinates) and ``span_type`` (always ``'WORD'``).

    Raises:
        ValueError: If the scoped text is not a substring of ``passage``.
    """
    context_scope, _, scoped_text = randomize_scope(passage, passage_prob=0.5)
    tokens = extract_words(scoped_text)

    capitalised = []
    for token in tokens:
        # Guard against empty text before looking at the first character.
        if len(token.text) > 0 and token.text[0].isupper():
            capitalised.append(token)

    # Token offsets (`.idx`) are relative to the scoped text; shift them by
    # where that text starts inside the passage.
    offset = passage.index(scoped_text)
    spans = []
    for token in capitalised:
        begin = offset + token.idx
        spans.append((begin, begin + len(str(token))))

    match_count = len(capitalised)
    question = randomize_total_question(
        string_category='title case word',
        context_scope=context_scope,
    )
    return {
        'question': question,
        'answer': match_count,
        'spans': spans,
        'span_type': 'WORD',
    }
def how_many_times_vowels_appears(self, passage):
    """Build a counting sample asking how many vowel characters appear.

    Args:
        passage: The full passage text handed to ``randomize_scope``.

    Returns:
        A dict with keys ``question``, ``answer`` (the summed frequency of
        the retained vowel entries), ``spans`` (one single-character
        ``(start, end)`` range per matching character, in passage
        coordinates) and ``span_type`` (always ``'CHAR'``).

    Raises:
        ValueError: If the scoped text is not a substring of ``passage``.
    """
    vowels = ('a', 'e', 'i', 'u', 'o')

    context_scope, _, scoped_text = randomize_scope(passage, passage_prob=0.0)
    frequencies = extract_letters_frequency(scoped_text)

    if self.limit_classes is not None:
        # Drop every letter whose frequency reaches the configured limit.
        frequencies = {
            letter: count
            for letter, count in frequencies.items()
            if count < self.limit_classes
        }

    # Keep only entries whose key is a vowel, compared case-insensitively.
    vowel_counts = {
        letter: count
        for letter, count in frequencies.items()
        if letter.lower() in vowels
    }
    total = sum(vowel_counts.values())

    offset = passage.index(scoped_text)
    spans = []
    for position, char in enumerate(scoped_text):
        # Membership is tested against the retained keys exactly as they
        # are spelled, so a character only matches if it is itself a key.
        if char in vowel_counts:
            spans.append((offset + position, offset + position + 1))

    question = randomize_total_question(
        string_category='vowel character',
        context_scope=context_scope,
    )
    return {
        'question': question,
        'answer': total,
        'spans': spans,
        'span_type': 'CHAR',
    }