def extract(args):
    # Unpack a single (title, text) pair; the one-argument signature keeps
    # the function mappable (e.g., over a multiprocessing pool).
    # n_wiki_sentences and replace_title_mentions are free variables taken
    # from the enclosing scope.
    title, text = args
    sentences = extract_wiki_sentences(
        title, text, n_wiki_sentences,
        replace_title_mentions=replace_title_mentions)
    return title, sentences
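# Usage sketch (illustrative, not from the repo): because extract takes one
# tuple argument, it can be fed straight to Pool.map. This assumes extract
# and its free variables live at module scope so the function pickles; the
# page titles and texts below are placeholders.
def demo_extract_pages():
    from multiprocessing import Pool
    pages = [
        ('Albert Einstein', 'Albert Einstein was a theoretical physicist.'),
        ('Japan', 'Japan is an island country in East Asia.'),
    ]
    with Pool() as pool:
        # Each call returns (title, sentences); collect the pairs into a dict.
        return dict(pool.map(extract, pages))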
def training_data(self) -> TrainingData:
    # Build (sentence, answer) training pairs from the Wikipedia pages of
    # the known answers, skipping answers with no page or an empty page.
    wiki_lookup = Wikipedia()
    wiki_content = []
    wiki_answers = []
    for ans in self.answers:
        if ans not in wiki_lookup:
            continue
        wiki_page = wiki_lookup[ans]
        if len(wiki_page.text) != 0:
            sentences = extract_wiki_sentences(
                ans, wiki_page.text, self.n_sentences,
                replace_title_mentions=self.replace_title_mentions
            )
            for sent in sentences:
                wiki_content.append([sent])
                wiki_answers.append(ans)
    return wiki_content, wiki_answers, None
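# Usage sketch (illustrative): wiki_content and wiki_answers are parallel
# lists, each example being a one-element list holding a single Wikipedia
# sentence; the third element of the returned tuple is unused here. `guesser`
# is a hypothetical instance of the class defining training_data.
def demo_training_data(guesser):
    wiki_content, wiki_answers, _ = guesser.training_data()
    for sents, page in zip(wiki_content, wiki_answers):
        print(page, '->', sents[0][:60])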
def __init__(self, path, qnum_field, sent_field, page_field, text_field,
             unigram_field, bigram_field, trigram_field,
             example_mode='sentence', use_wiki=False, n_wiki_sentences=3,
             replace_title_mentions='', **kwargs):
    from unidecode import unidecode

    # For training splits, download the cached Wikipedia dump from S3 (if
    # needed) and load it as a page -> content lookup.
    if use_wiki and 'train' in path:
        base_path = os.path.dirname(path)
        filename = os.path.basename(s3_wiki)
        output_file = os.path.join(base_path, filename)
        if not os.path.exists(output_file):
            download_from_url(s3_wiki, output_file)
        with open(output_file) as f:
            self.wiki_lookup = json.load(f)
    else:
        self.wiki_lookup = {}
    self.path = path
    self.example_mode = example_mode

    # Only wire up the text representations whose fields were provided.
    text_dependent_fields = []
    if text_field is not None:
        text_dependent_fields.append(('text', text_field))
    if unigram_field is not None:
        text_dependent_fields.append(('unigram', unigram_field))
    if bigram_field is not None:
        text_dependent_fields.append(('bigram', bigram_field))
    if trigram_field is not None:
        text_dependent_fields.append(('trigram', trigram_field))

    example_fields = {
        'qnum': [('qnum', qnum_field)],
        'sent': [('sent', sent_field)],
        'page': [('page', page_field)],
        'text': text_dependent_fields
    }

    examples = []
    answer_set = set()
    with open(path) as f:
        for ex in json.load(f)['questions']:
            if example_mode == 'sentence':
                # One Example per sentence, indexed by position within the
                # question.
                for i, s in enumerate(ex['sentences']):
                    examples.append(Example.fromdict({
                        'qnum': ex['qnum'],
                        'sent': i,
                        'text': unidecode(s),
                        'page': ex['page']
                    }, example_fields))
                answer_set.add(ex['page'])
            elif example_mode == 'question':
                raise NotImplementedError(
                    'Question tokenization is not implemented yet, submit a PR!')
            elif example_mode == 'runs':
                raise NotImplementedError(
                    'Run tokenization is not implemented yet, submit a PR!')
            else:
                raise ValueError(
                    f"Valid modes are 'sentence', 'question', and 'runs', "
                    f"but '{example_mode}' was given")

    # Augment training data with Wikipedia sentences for each answer page,
    # marking them with qnum = -1 to distinguish them from real questions.
    if use_wiki and n_wiki_sentences > 0 and 'train' in path:
        for page in answer_set:
            if page in self.wiki_lookup:
                sentences = extract_wiki_sentences(
                    page, self.wiki_lookup[page]['text'], n_wiki_sentences,
                    replace_title_mentions=replace_title_mentions)
                for i, s in enumerate(sentences):
                    examples.append(Example.fromdict({
                        'qnum': -1,
                        'sent': i,
                        'text': s,
                        'page': page
                    }, example_fields))

    dataset_fields = {
        'qnum': qnum_field,
        'sent': sent_field,
        'page': page_field,
    }
    if text_field is not None:
        dataset_fields['text'] = text_field
    if unigram_field is not None:
        dataset_fields['unigram'] = unigram_field
    if bigram_field is not None:
        dataset_fields['bigram'] = bigram_field
    if trigram_field is not None:
        dataset_fields['trigram'] = trigram_field

    super(QuizBowl, self).__init__(examples, dataset_fields, **kwargs)
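# Usage sketch (illustrative, assuming the legacy torchtext Field API; the
# path and Field settings are placeholders, not the repo's configuration).
def demo_quizbowl_dataset():
    from torchtext.data import Field
    qnum_field = Field(sequential=False, use_vocab=False)
    sent_field = Field(sequential=False, use_vocab=False)
    page_field = Field(sequential=False)
    text_field = Field(lower=True)
    return QuizBowl(
        'data/quizbowl.train.json',  # placeholder path to a questions JSON
        qnum_field, sent_field, page_field, text_field,
        unigram_field=None, bigram_field=None, trigram_field=None,
        example_mode='sentence',
        use_wiki=True, n_wiki_sentences=3,
    )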