def test_open_compressed(self):
    uncompressed_file = self.FIXTURES_ROOT / "embeddings/fake_embeddings.5d.txt"
    with open_compressed(uncompressed_file) as f:
        uncompressed_lines = [line.strip() for line in f]

    for suffix in ["bz2", "gz"]:
        compressed_file = f"{uncompressed_file}.{suffix}"
        with open_compressed(compressed_file) as f:
            compressed_lines = [line.strip() for line in f]
        assert compressed_lines == uncompressed_lines

def shard_dataset(dataset_path: str, output_directory: str):
    print(f"Reading data from {dataset_path}")
    with open_compressed(cached_path(dataset_path)) as dataset_file:
        dataset_json = json.load(dataset_file)
    dataset = dataset_json["data"]

    # Distribute question/answer pairs across the shards round-robin, so every
    # shard ends up with roughly the same number of questions.
    shards = [{"data": []} for _ in range(NUM_SHARDS)]
    i = 0
    for article in dataset:
        for paragraph_json in article["paragraphs"]:
            context = paragraph_json["context"]
            for question_answer in paragraph_json["qas"]:
                shard_num = i % NUM_SHARDS
                shard = shards[shard_num]
                shard["data"].append(
                    {"paragraphs": [{"context": context, "qas": [question_answer]}]}
                )
                i += 1

    for i, shard in enumerate(shards):
        print(f"Shard {i}: {len(shard['data'])} question answers")

    for i, shard in enumerate(shards):
        shard_file_path = os.path.join(output_directory, f"shard_{i}.json")
        print(f"Writing shard {i} to {shard_file_path}")
        with open(shard_file_path, "w") as shard_file:
            json.dump(shard, shard_file)
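
# A minimal usage sketch for `shard_dataset` above. The output directory is
# hypothetical, and NUM_SHARDS is assumed to be defined at module level; the
# URL is the public SQuAD 2.0 training file, which `cached_path` downloads to
# the local cache before `open_compressed` reads it.
if __name__ == "__main__":
    output_directory = "/tmp/squad_shards"  # hypothetical output location
    os.makedirs(output_directory, exist_ok=True)
    shard_dataset(
        "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json",
        output_directory,
    )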

def _direct_read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    logger.info("Reading the dataset:")
    logger.info("Reading file at %s", file_path)
    with open_compressed(file_path) as dataset_file:
        dataset = json.load(dataset_file)

    for json_obj in dataset:
        instance = self._item_to_instance(self.json_to_item(json_obj))
        if instance is not None:
            yield instance

    if self._save_elasticsearch_cache:
        # Merge our in-memory query cache with whatever another process may
        # have written in the meantime, and only rewrite the file on disk if
        # the two differ.
        with FileLock("cache.lock"):
            update_required = False
            with open(QUERIES_CACHE_PATH, "r", encoding="utf8") as f:
                recent_queries_cache = json.load(f)
            if self._queries_cache != recent_queries_cache:
                update_required = True
                self._queries_cache.update(recent_queries_cache)
            if update_required:
                with open(QUERIES_CACHE_PATH, "w", encoding="utf8") as f:
                    json.dump(self._queries_cache, f)
                    f.flush()

def _direct_read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    logger.info("Reading file at %s", file_path)

    with open_compressed(file_path) as dataset_file:
        dataset_json = json.load(dataset_file)
        dataset = dataset_json["data"]

    logger.info("Reading the dataset")
    yielded_question_count = 0
    questions_with_more_than_one_instance = 0
    for article in dataset:
        for paragraph_json in article["paragraphs"]:
            context, context_offset = standardize_text_simple(
                paragraph_json["context"], output_offset=True
            )
            for question_answer in paragraph_json["qas"]:
                answers = [
                    standardize_text_simple(answer_json["text"])
                    for answer_json in question_answer["answers"]
                ]

                # Just like huggingface, we only use the first answer for training.
                is_impossible = question_answer.get("is_impossible", None)
                if len(answers) > 0:
                    first_answer_start_offset = int(
                        question_answer["answers"][0]["answer_start"] + context_offset
                    )
                else:
                    # Unanswerable questions get an empty answer string and no
                    # start offset.
                    if is_impossible:
                        answers.append("")
                    first_answer_start_offset = None

                instances = self.make_instances(
                    standardize_text_simple(question_answer["question"]),
                    context,
                    answers,
                    first_answer_start_offset,
                    is_impossible,
                    question_answer.get("id", None),
                    "is_boolq" in question_answer,
                )

                instances_yielded = 0
                for instance in instances:
                    yield instance
                    instances_yielded += 1
                if instances_yielded > 1:
                    questions_with_more_than_one_instance += 1
                yielded_question_count += 1

    if questions_with_more_than_one_instance > 0:
        logger.info(
            "%d (%.2f%%) questions have more than one instance",
            questions_with_more_than_one_instance,
            100 * questions_with_more_than_one_instance / yielded_question_count,
        )

def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    logger.info("Reading file at %s", file_path)

    with open_compressed(file_path) as dataset_file:
        dataset_json = json.load(dataset_file)
        dataset = dataset_json["data"]

    logger.info("Reading the dataset")
    yielded_question_count = 0
    questions_with_more_than_one_instance = 0
    for article in dataset:
        for paragraph_json in article["paragraphs"]:
            context = paragraph_json["context"]
            cached_tokenized_context = self._tokenize_context(context)
            for question_answer in self.shard_iterable(paragraph_json["qas"]):
                answers = [
                    answer_json["text"] for answer_json in question_answer["answers"]
                ]

                # Just like huggingface, we only use the first answer for training.
                if len(answers) > 0:
                    first_answer_offset = int(question_answer["answers"][0]["answer_start"])
                else:
                    first_answer_offset = None

                instances = self.make_instances(
                    question_answer.get("id", None),
                    question_answer["question"],
                    answers,
                    context,
                    first_answer_offset=first_answer_offset,
                    always_add_answer_span=True,
                    is_training=True,
                    cached_tokenized_context=cached_tokenized_context,
                )

                instances_yielded = 0
                for instance in instances:
                    yield instance
                    instances_yielded += 1
                if instances_yielded > 1:
                    questions_with_more_than_one_instance += 1
                yielded_question_count += 1

    if questions_with_more_than_one_instance > 0:
        logger.info(
            "%d (%.2f%%) questions have more than one instance",
            questions_with_more_than_one_instance,
            100 * questions_with_more_than_one_instance / yielded_question_count,
        )

def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    logger.info("Reading the dataset:")
    logger.info("Reading file at %s", file_path)

    with open_compressed(file_path) as dataset_file:
        for line in dataset_file:
            item = json.loads(line)
            question = item["question"]

            if self._is_twenty_questions:
                # Replace the pronoun "it" with the question's subject, and
                # skip questions where no substitution applied.
                subject = item["subject"]
                original_question = question
                question = question.replace(" it ", " " + subject + " ")
                question = question.replace(" it?", " " + subject)
                if question.endswith(" it"):
                    question = question[: -len("it")] + subject
                if question.startswith("it "):
                    question = subject + question[len("it") :]
                if original_question == question:
                    continue
                question = question.replace("?", "")
                question = question.replace("\t", "")

            question = question if question.endswith("?") else question + "?"
            context: Optional[str] = item.get(self._context_key)
            answer: Optional[bool] = item.get(self._answer_key)

            if not self._is_training or answer is not None:
                instance = self.text_to_instance(question, context, answer)
                yield instance
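
# A worked example of the twenty-questions "it" substitution above, with a
# hypothetical item whose subject is "dog":
#
#     "is it a mammal?"   -> " it " match      -> "is dog a mammal?"
#     "would you eat it?" -> " it?" match      -> "would you eat dog" (+ "?" later)
#     "can you ride it"   -> endswith(" it")   -> "can you ride dog"
#     "it barks"          -> startswith("it ") -> "dog barks"
#
# A question like "itemize this" matches none of the rules, so it still equals
# original_question and is skipped by the `continue`.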

def _load_pretrained_model(filename: str) -> fasttext.FastText:
    with tempfile.NamedTemporaryFile("wb") as dc_file:
        # Decompress the (possibly gzip/bz2 compressed) model file into a
        # temporary file, since fasttext can only load from a real path.
        with open_compressed(filename, "rb", encoding=None) as fp:
            shutil.copyfileobj(fp, dc_file)
        dc_file.flush()  # make sure everything is on disk before fasttext reads it
        model = fasttext.load_model(dc_file.name)
    return model
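
# A minimal usage sketch for `_load_pretrained_model` (hypothetical file name;
# fasttext's pretrained cc.en.300 vectors ship as a gzipped binary like this):
#
#     model = _load_pretrained_model("cc.en.300.bin.gz")
#     vector = model.get_word_vector("hello")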