def how_many_times_character_appears(self, passage):
    """Build a counting sample about one randomly chosen character.

    Args:
        passage: Text to sample from. The scope text returned by
            ``randomize_scope`` is located in it with ``passage.index``.

    Returns:
        dict with keys ``question``, ``answer`` (the frequency recorded for
        the chosen character), ``spans`` (one ``(start, end)`` pair per
        occurrence in the scope text, shifted by the scope's position in
        ``passage``) and ``span_type`` (``'CHAR'``).
    """
    scope, _sentence_idx, scoped_text = randomize_scope(passage, passage_prob=0.5)
    frequencies = extract_letters_frequency(scoped_text)
    if self.limit_classes is not None:
        # Only characters counted fewer than ``limit_classes`` times stay eligible.
        frequencies = {
            char: count
            for char, count in frequencies.items()
            if count < self.limit_classes
        }

    chosen = np.random.choice(list(frequencies))
    occurrences = frequencies[chosen]

    offset = passage.index(scoped_text)
    char_spans = []
    for position, char in enumerate(scoped_text):
        if char == chosen:
            char_spans.append((offset + position, offset + position + 1))

    prompt = randomize_instances_occurance_question(
        string_asked_about=f"'{chosen}'",
        string_category='character',
        context_scope=scope,
    )
    return {
        'question': prompt,
        'answer': occurrences,
        'spans': char_spans,
        'span_type': 'CHAR',
    }
def _how_many_common_template(self, passage, ner_categories, singular_string_category, plural_string_category):
    """Build a sample asking how often the n-th most common entity occurs.

    A rank (most / second most / third most common) is drawn at random. If
    the text holds fewer distinct entities than the drawn rank needs, the
    rank is lowered to the last one available instead of failing.

    Args:
        passage: Text handed to ``randomize_scope``.
        ner_categories: Forwarded unchanged to ``extract_ner``.
        singular_string_category: Category wording used in the question.
        plural_string_category: Accepted for signature compatibility; unused.

    Returns:
        dict with keys ``question``, ``answer`` (occurrence count of the
        selected entity text), ``spans`` and ``span_type`` (``'SPAN'``).

    Raises:
        IndexError: If ``extract_ner`` yields no entities at all.
    """
    rank_names = [
        'the most common',
        'the second most common',
        'the third most common',
    ]
    # Drawn before randomize_scope so the order of calls is unchanged.
    order_of_common = np.random.randint(0, 3)
    context_scope, sen_index, text = randomize_scope(passage, passage_prob=1.00)

    ner_and_spans = extract_ner(text, ner_categories)
    # NOTE(review): spans cover every extracted entity, not only occurrences
    # of the selected one, so len(spans) can differ from the answer — confirm
    # this is intended.
    spans = [(entry[1], entry[2]) for entry in ner_and_spans]
    tokens_text = [entry[0] for entry in ner_and_spans]

    sorted_tokens_by_frequency = Counter(tokens_text).most_common()
    if not sorted_tokens_by_frequency:
        # Same exception type the unguarded list lookup used to produce.
        raise IndexError('no entities found for the requested categories')

    # Fewer than three distinct entities: fall back to the last available rank
    # and keep the wording of the question in sync with it.
    order_of_common = min(order_of_common, len(sorted_tokens_by_frequency) - 1)
    order_name = rank_names[order_of_common]

    _, answer = sorted_tokens_by_frequency[order_of_common]
    question = randomize_common_question(
        string_category=f'{order_name} {singular_string_category}',
        context_scope=context_scope)
    return dict(question=question, answer=answer, spans=spans, span_type='SPAN')
def _select_template(self, passage, pos_to_extract, string_category):
    """Build a sample asking how often the n-th most common POS token occurs.

    A rank (most / second most / third most common) is drawn at random. If
    the text holds fewer distinct tokens than the drawn rank needs, the rank
    is lowered to the last one available instead of failing.

    Args:
        passage: Text handed to ``randomize_scope``.
        pos_to_extract: Forwarded unchanged to ``extract_pos``.
        string_category: Category wording used in the question.

    Returns:
        dict with keys ``question``, ``answer`` (occurrence count of the
        selected token text), ``spans`` (as returned by ``extract_pos``) and
        ``span_type`` (``'SPAN'``).

    Raises:
        IndexError: If ``extract_pos`` yields no tokens at all.
    """
    rank_names = [
        'the most common',
        'the second most common',
        'the third most common',
    ]
    # Drawn before randomize_scope so the order of calls is unchanged.
    order_of_common = np.random.randint(0, 3)
    context_scope, sen_index, text = randomize_scope(passage, passage_prob=1.00)

    # NOTE(review): spans cover every extracted token, not only occurrences
    # of the selected one — confirm this is intended.
    spans, tokens_text = extract_pos(text, pos_to_extract=pos_to_extract)

    sorted_tokens_by_frequency = Counter(tokens_text).most_common()
    if not sorted_tokens_by_frequency:
        # Same exception type the unguarded list lookup used to produce.
        raise IndexError('no tokens found for the requested part-of-speech tags')

    # Fewer than three distinct tokens: fall back to the last available rank
    # and keep the wording of the question in sync with it.
    order_of_common = min(order_of_common, len(sorted_tokens_by_frequency) - 1)
    order_name = rank_names[order_of_common]

    _, answer = sorted_tokens_by_frequency[order_of_common]
    question = randomize_common_question(
        string_category=f'{order_name} {string_category}',
        context_scope=context_scope)
    return dict(question=question, answer=answer, spans=spans, span_type='SPAN')
def how_many_nouns(self, passage):
    """Build a sample asking how many nouns the chosen scope contains.

    Args:
        passage: Text handed to ``randomize_scope``.

    Returns:
        dict with keys ``question``, ``answer`` (number of spans returned by
        ``extract_pos`` for the ``'NN'`` tag), ``spans`` and ``span_type``
        (``'SPAN'``).
    """
    scope, _sentence_idx, scoped_text = randomize_scope(passage, passage_prob=0.25)
    # NOTE(review): these spans are passed through as returned for the scope
    # text; no passage offset is added here — confirm whether that is intended
    # when the scope is not the whole passage.
    noun_spans, _noun_texts = extract_pos(scoped_text, pos_to_extract=['NN'])
    noun_count = len(noun_spans)
    prompt = randomize_total_question(string_category='noun', context_scope=scope)
    return {
        'question': prompt,
        'answer': noun_count,
        'spans': noun_spans,
        'span_type': 'SPAN',
    }
def how_many_sentences_in_total(self, passage):
    """Build a sample asking how many sentences the text contains.

    Args:
        passage: Text handed to ``randomize_scope``; the scope text it returns
            replaces ``passage`` for the rest of the method.

    Returns:
        dict with keys ``question``, ``answer`` (number of sentences),
        ``spans`` (one ``(start, end)`` pair per sentence, relative to the
        scope text) and ``span_type`` (``'SENTENCE'``).

    Raises:
        ValueError: If a sentence returned by ``extract_sentences`` does not
            occur in the scope text.
    """
    context_scope, sen_index, passage = randomize_scope(passage, passage_prob=1.0)
    sentences = extract_sentences(passage)

    spans = []
    cursor = 0
    for sentence in sentences:
        # Search forward from the end of the previous sentence so a repeated
        # sentence (or one that also occurs inside an earlier sentence) gets
        # its own position instead of the first match in the text.
        start = passage.find(sentence, cursor)
        if start == -1:
            # Sentences not in document order: fall back to a global lookup,
            # which raises ValueError when the sentence is absent altogether.
            start = passage.index(sentence)
        else:
            cursor = start + len(sentence)
        spans.append((start, start + len(sentence)))

    answer = len(sentences)
    question = randomize_total_question(string_category='sentence', context_scope=context_scope)
    return dict(question=question, answer=answer, spans=spans, span_type='SENTENCE')
def how_many_words_in_total(self, passage):
    """Build a sample asking how many words the chosen scope contains.

    Args:
        passage: Text to sample from. The scope text returned by
            ``randomize_scope`` is located in it with ``passage.index``.

    Returns:
        dict with keys ``question``, ``answer`` (number of items returned by
        ``extract_words``), ``spans`` (one ``(start, end)`` pair per word,
        shifted by the scope's position in ``passage``) and ``span_type``
        (``'WORD'``).
    """
    scope, _sentence_idx, scoped_text = randomize_scope(passage, passage_prob=0.0)
    tokens = extract_words(scoped_text)
    offset = passage.index(scoped_text)

    word_spans = []
    for token in tokens:
        begin = offset + token.idx
        word_spans.append((begin, begin + len(str(token))))

    prompt = randomize_total_question(string_category='word', context_scope=scope)
    return {
        'question': prompt,
        'answer': len(tokens),
        'spans': word_spans,
        'span_type': 'WORD',
    }
def how_many_numbers(self, passage):
    """Build a sample asking how many numbers the chosen scope contains.

    Args:
        passage: Text to sample from. The scope text returned by
            ``randomize_scope`` is located in it with ``passage.index``.

    Returns:
        dict with keys ``question``, ``answer`` (number of items returned by
        ``extract_passage_numbers``), ``spans`` (one ``(start, end)`` pair per
        number, shifted by the scope's position in ``passage``) and
        ``span_type`` (``'WORD'``).
    """
    scope, _sentence_idx, scoped_text = randomize_scope(passage, passage_prob=0.5)
    offset = passage.index(scoped_text)
    numbers, positions = extract_passage_numbers(scoped_text)

    number_spans = []
    for number, position in zip(numbers, positions):
        begin = offset + position
        # The span length is the length of the number's string form.
        number_spans.append((begin, begin + len(str(number))))

    prompt = randomize_total_question(string_category='number', context_scope=scope)
    return {
        'question': prompt,
        'answer': len(numbers),
        'spans': number_spans,
        'span_type': 'WORD',
    }
def _how_many_template(self, passage, ner_categories, singular_string_category, plural_string_category):
    """Build a sample asking how many entities of given categories occur.

    Args:
        passage: Text handed to ``randomize_scope``.
        ner_categories: Forwarded unchanged to ``extract_ner``.
        singular_string_category: Singular category wording for the question.
        plural_string_category: Plural category wording for the question.

    Returns:
        dict with keys ``question``, ``answer`` (number of extracted
        entities), ``spans`` (the second and third field of each
        ``extract_ner`` entry) and ``span_type`` (``'SPAN'``).
    """
    scope, _sentence_idx, scoped_text = randomize_scope(passage, passage_prob=0.85)
    entities = extract_ner(scoped_text, ner_categories)

    # NOTE(review): these values are taken as returned for the scope text; no
    # passage offset is added here — confirm whether that is intended when the
    # scope is not the whole passage.
    entity_spans = []
    for entity in entities:
        entity_spans.append((entity[1], entity[2]))

    prompt = randomize_ner_question(
        singular_string_category=singular_string_category,
        plural_string_category=plural_string_category,
        context_scope=scope,
    )
    return {
        'question': prompt,
        'answer': len(entity_spans),
        'spans': entity_spans,
        'span_type': 'SPAN',
    }
def how_many_words_shorter_than(self, passage):
    """Build a sample asking how many words are shorter than a random length.

    The length threshold is drawn uniformly from 1 to 5 inclusive.

    Args:
        passage: Text to sample from. The scope text returned by
            ``randomize_scope`` is located in it with ``passage.index``.

    Returns:
        dict with keys ``question``, ``answer`` (number of words whose text is
        strictly shorter than the threshold), ``spans`` (one ``(start, end)``
        pair per matching word, shifted by the scope's position in
        ``passage``) and ``span_type`` (``'WORD'``).
    """
    context_scope, sen_index, text = randomize_scope(passage, passage_prob=0.2)
    words = extract_words(text)

    # randint's upper bound is exclusive, so (1, 6) yields 1..5 — the same
    # range as the deprecated np.random.random_integers(5).
    target_len = np.random.randint(1, 6)

    # Measure the word's text via ``.text`` rather than indexing the token
    # with ``[0]``, which fails when tokens are not subscriptable.
    target_words = [tok for tok in words if len(tok.text) < target_len]

    start_idx = passage.index(text)
    spans = [(start_idx + w.idx, start_idx + w.idx + len(str(w))) for w in target_words]
    answer = len(target_words)
    question = randomize_total_question(
        string_category=f'words shorter than {target_len} character',
        context_scope=context_scope)
    return dict(question=question, answer=answer, spans=spans, span_type='WORD')
def how_many_title_case_words_in_total(self, passage):
    """Build a sample asking how many words start with an uppercase letter.

    Args:
        passage: Text to sample from. The scope text returned by
            ``randomize_scope`` is located in it with ``passage.index``.

    Returns:
        dict with keys ``question``, ``answer`` (number of words whose first
        character is uppercase), ``spans`` (one ``(start, end)`` pair per such
        word, shifted by the scope's position in ``passage``) and
        ``span_type`` (``'WORD'``).
    """
    scope, _sentence_idx, scoped_text = randomize_scope(passage, passage_prob=0.5)
    tokens = extract_words(scoped_text)

    # Slicing keeps empty token text safe: ''.isupper() is False.
    capitalised = [token for token in tokens if token.text[:1].isupper()]

    offset = passage.index(scoped_text)
    word_spans = []
    for token in capitalised:
        begin = offset + token.idx
        word_spans.append((begin, begin + len(str(token))))

    prompt = randomize_total_question(string_category='title case word', context_scope=scope)
    return {
        'question': prompt,
        'answer': len(capitalised),
        'spans': word_spans,
        'span_type': 'WORD',
    }
def how_many_times_vowels_appears(self, passage):
    """Build a sample asking how many vowel characters the scope contains.

    Args:
        passage: Text to sample from. The scope text returned by
            ``randomize_scope`` is located in it with ``passage.index``.

    Returns:
        dict with keys ``question``, ``answer`` (sum of the frequencies of the
        retained vowel entries), ``spans`` (one ``(start, end)`` pair per
        character of the scope text that is a retained vowel key, shifted by
        the scope's position in ``passage``) and ``span_type`` (``'CHAR'``).
    """
    scope, _sentence_idx, scoped_text = randomize_scope(passage, passage_prob=0.0)
    frequencies = extract_letters_frequency(scoped_text)
    if self.limit_classes is not None:
        # NOTE(review): a vowel whose own count is >= limit_classes is dropped
        # here, so it is excluded from both the answer and the spans — confirm
        # this is intended.
        frequencies = {
            char: count
            for char, count in frequencies.items()
            if count < self.limit_classes
        }

    vowels = ('a', 'e', 'i', 'u', 'o')
    vowel_counts = {
        char: count
        for char, count in frequencies.items()
        if char.lower() in vowels
    }
    total = sum(vowel_counts.values())

    offset = passage.index(scoped_text)
    vowel_spans = []
    for position, char in enumerate(scoped_text):
        if char in vowel_counts:
            vowel_spans.append((offset + position, offset + position + 1))

    prompt = randomize_total_question(string_category='vowel character', context_scope=scope)
    return {
        'question': prompt,
        'answer': total,
        'spans': vowel_spans,
        'span_type': 'CHAR',
    }