def _read(self, file_path: str):
    logger.info("Opening base tarball file at %s", self._base_tarball_path)
    base_tarball = tarfile.open(cached_path(self._base_tarball_path), 'r')
    if 'unfiltered' in file_path:
        logger.info("Opening unfiltered tarball file at %s", self._unfiltered_tarball_path)
        unfiltered_tarball = tarfile.open(cached_path(self._unfiltered_tarball_path), 'r')
        logger.info("Loading question file from tarball")
        data_json = json.loads(unfiltered_tarball.extractfile(file_path).read().decode('utf-8'))
    else:
        logger.info("Loading question file from tarball")
        path = os.path.join('qa', file_path)
        data_json = json.loads(base_tarball.extractfile(path).read().decode('utf-8'))

    logger.info("Reading the dataset")
    for question_json in data_json['Data']:
        question_text = question_json['Question']
        question_tokens = self._tokenizer.tokenize(question_text)

        evidence_files: List[List[str]] = []  # contains lines from each evidence file
        if 'web' in file_path:
            for result in question_json['SearchResults']:
                filename = result['Filename']
                evidence_file = base_tarball.extractfile(os.path.join("evidence", "web", filename))
                evidence_files.append([line.decode('utf-8') for line in evidence_file.readlines()])
        else:
            for result in question_json['EntityPages']:
                filename = result['Filename']
                evidence_file = base_tarball.extractfile(os.path.join("evidence", "wikipedia", filename))
                evidence_files.append([line.decode('utf-8') for line in evidence_file.readlines()])

        answer_json = question_json['Answer']
        human_answers = [util.normalize_text(answer)
                         for answer in answer_json.get('HumanAnswers', [])]
        answer_texts = answer_json['NormalizedAliases'] + human_answers
        for paragraph in self.pick_paragraphs(evidence_files, question_text, answer_texts):
            paragraph_tokens = self._tokenizer.tokenize(paragraph)
            token_spans = util.find_valid_answer_spans(paragraph_tokens, answer_texts)
            if not token_spans:
                # For now, we'll just ignore instances that we can't find answer spans for.
                # Maybe we can do something smarter here later, but this will do for now.
                continue
            instance = self.text_to_instance(question_text,
                                             paragraph,
                                             token_spans,
                                             answer_texts,
                                             question_tokens,
                                             paragraph_tokens)
            yield instance
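
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the reader): the minimal shape of one entry
# in data_json['Data'] that _read consumes, derived from the field accesses
# above. The values and filenames are made up for demonstration; the real
# TriviaQA question files contain many more fields per entry.
# ---------------------------------------------------------------------------
_EXAMPLE_QUESTION_JSON = {
    'Question': 'Which river flows through Paris?',
    # 'SearchResults' is used when reading a "web" question file,
    # 'EntityPages' otherwise (the wikipedia setting).
    'SearchResults': [{'Filename': 'example_web_document.txt'}],
    'EntityPages': [{'Filename': 'Seine.txt'}],
    'Answer': {
        'NormalizedAliases': ['seine', 'the seine'],
        'HumanAnswers': ['The Seine'],  # optional; treated as [] when absent
    },
}

# Because _read is a generator, instances can be consumed lazily, for example:
#
#     reader = ...  # an instance of the dataset reader class that defines _read
#     for instance in reader._read('web-train.json'):  # illustrative file name
#         ...  # one instance per (question, evidence paragraph) pair with a found answer span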