Example #1
    def test_open_compressed(self):
        uncompressed_file = self.FIXTURES_ROOT / "embeddings/fake_embeddings.5d.txt"
        with open_compressed(uncompressed_file) as f:
            uncompressed_lines = [line.strip() for line in f]

        for suffix in ["bz2", "gz"]:
            compressed_file = f"{uncompressed_file}.{suffix}"
            with open_compressed(compressed_file) as f:
                compressed_lines = [line.strip() for line in f]
            assert compressed_lines == uncompressed_lines
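
The test relies on `.gz` and `.bz2` copies of the fixture sitting next to the plain-text file. A minimal sketch for producing such fixtures with the standard library (the path is illustrative, not the repository's actual fixture location):

import bz2
import gzip
import shutil

src = "fake_embeddings.5d.txt"  # illustrative path to the uncompressed fixture

# Write a gzip and a bz2 copy next to the original so that open_compressed
# can be exercised on every suffix the test iterates over.
for suffix, opener in [("gz", gzip.open), ("bz2", bz2.open)]:
    with open(src, "rb") as fin, opener(f"{src}.{suffix}", "wb") as fout:
        shutil.copyfileobj(fin, fout)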
Example #2
def shard_dataset(dataset_path: str, output_directory: str):
    print(f"Reading data from {dataset_path}")
    with open_compressed(cached_path(dataset_path)) as dataset_file:
        dataset_json = json.load(dataset_file)
        dataset = dataset_json["data"]

    shards = [{"data": []} for _ in range(NUM_SHARDS)]

    i = 0
    for article in dataset:
        for paragraph_json in article["paragraphs"]:
            context = paragraph_json["context"]
            for question_answer in paragraph_json["qas"]:
                shard_num = i % NUM_SHARDS
                shard = shards[shard_num]
                shard["data"].append(
                    {"paragraphs": [{"context": context, "qas": [question_answer]}]}
                )
                i += 1

    for i, shard in enumerate(shards):
        print(f"Shard {i}: {len(shard['data'])} question answers")

    for i, shard in enumerate(shards):
        shard_file_path = os.path.join(output_directory, f"shard_{i}.json")
        print(f"Writing shard {i} to {shard_file_path}")
        with open(shard_file_path, "w") as shard_file:
            json.dump(shard, shard_file)
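
A possible entry point for the sharding function above; the argument parsing is an assumption, not part of the original snippet:

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Split a SQuAD-format file into shards.")
    parser.add_argument("dataset_path", help="local path or URL of the SQuAD-format JSON")
    parser.add_argument("output_directory", help="directory that receives the shard_*.json files")
    args = parser.parse_args()

    # shard_dataset writes into the directory directly, so it must exist first.
    os.makedirs(args.output_directory, exist_ok=True)
    shard_dataset(args.dataset_path, args.output_directory)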
Example #3
    def _direct_read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        logger.info("Reading the dataset:")
        logger.info("Reading file at %s", file_path)
        with open_compressed(file_path) as dataset_file:
            dataset = json.load(dataset_file)

        for json_obj in dataset:
            instance = self._item_to_instance(self.json_to_item(json_obj))
            if instance is not None:
                yield instance

        if self._save_elasticsearch_cache:
            with FileLock("cache.lock"):
                update_required = False
                with open(QUERIES_CACHE_PATH, "r", encoding="utf8") as f:
                    recent_queries_cache = json.load(f)
                    if self._queries_cache != recent_queries_cache:
                        update_required = True
                        self._queries_cache.update(recent_queries_cache)
                if update_required:
                    with open(QUERIES_CACHE_PATH, "w", encoding="utf8") as f:
                        json.dump(self._queries_cache, f)
                        f.flush()
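
The block guarded by `FileLock` is a read-merge-write cycle: re-read the cache another process may have written, let the on-disk entries win, and only rewrite the file when something changed. A standalone sketch of the same pattern using the filelock package (the path and function name are placeholders):

import json
from filelock import FileLock

CACHE_PATH = "queries_cache.json"  # placeholder path

def merge_cache_to_disk(in_memory_cache: dict) -> None:
    # Serialize concurrent writers with a lock file next to the cache.
    with FileLock(CACHE_PATH + ".lock"):
        try:
            with open(CACHE_PATH, "r", encoding="utf8") as f:
                on_disk = json.load(f)
        except FileNotFoundError:
            on_disk = {}
        if on_disk != in_memory_cache:
            # Entries already on disk take precedence, as in the reader above.
            in_memory_cache.update(on_disk)
            with open(CACHE_PATH, "w", encoding="utf8") as f:
                json.dump(in_memory_cache, f)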
Example #4
    def _direct_read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        logger.info("Reading file at %s", file_path)
        with open_compressed(file_path) as dataset_file:
            dataset_json = json.load(dataset_file)
            dataset = dataset_json["data"]
        logger.info("Reading the dataset")
        yielded_question_count = 0
        questions_with_more_than_one_instance = 0
        for article in dataset:
            for paragraph_json in article["paragraphs"]:
                context, context_offset = standardize_text_simple(
                    paragraph_json["context"], output_offset=True)
                for question_answer in paragraph_json["qas"]:
                    answers = [
                        standardize_text_simple(answer_json["text"])
                        for answer_json in question_answer["answers"]
                    ]

                    # Just like huggingface, we only use the first answer for training.
                    is_impossible = question_answer.get("is_impossible", None)
                    if len(answers) > 0:
                        first_answer_start_offset = int(
                            question_answer["answers"][0]["answer_start"] +
                            context_offset)
                    else:
                        if is_impossible:
                            answers.append("")
                        first_answer_start_offset = None

                    instances = self.make_instances(
                        standardize_text_simple(question_answer["question"]),
                        context,
                        answers,
                        first_answer_start_offset,
                        question_answer.get("is_impossible", None),
                        question_answer.get("id", None),
                        "is_boolq" in question_answer,
                    )
                    instances_yielded = 0
                    for instance in instances:
                        yield instance
                        instances_yielded += 1
                    if instances_yielded > 1:
                        questions_with_more_than_one_instance += 1
                    yielded_question_count += 1

        if questions_with_more_than_one_instance > 0:
            logger.info(
                "%d (%.2f%%) questions have more than one instance",
                questions_with_more_than_one_instance,
                100 * questions_with_more_than_one_instance /
                yielded_question_count,
            )
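
The reader walks the standard SQuAD nesting of articles, their paragraphs, and the question-answer (`qas`) entries under each paragraph. A minimal illustration of the JSON shape it expects (the values are made up):

context = "The quokka is a small marsupial found in Australia."

dataset_json = {
    "data": [
        {
            "paragraphs": [
                {
                    "context": context,
                    "qas": [
                        {
                            "id": "q1",
                            "question": "Where is the quokka found?",
                            "answers": [
                                {
                                    "text": "Australia",
                                    # character offset of the answer inside the context
                                    "answer_start": context.index("Australia"),
                                }
                            ],
                            "is_impossible": False,
                        }
                    ],
                }
            ]
        }
    ]
}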
Example #5
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        logger.info("Reading file at %s", file_path)
        with open_compressed(file_path) as dataset_file:
            dataset_json = json.load(dataset_file)
            dataset = dataset_json["data"]
        logger.info("Reading the dataset")
        yielded_question_count = 0
        questions_with_more_than_one_instance = 0
        for article in dataset:
            for paragraph_json in article["paragraphs"]:
                context = paragraph_json["context"]
                cached_tokenized_context = self._tokenize_context(context)
                for question_answer in self.shard_iterable(
                        paragraph_json["qas"]):
                    answers = [
                        answer_json["text"]
                        for answer_json in question_answer["answers"]
                    ]

                    # Just like huggingface, we only use the first answer for training.
                    if len(answers) > 0:
                        first_answer_offset = int(
                            question_answer["answers"][0]["answer_start"])
                    else:
                        first_answer_offset = None

                    instances = self.make_instances(
                        question_answer.get("id", None),
                        question_answer["question"],
                        answers,
                        context,
                        first_answer_offset=first_answer_offset,
                        always_add_answer_span=True,
                        is_training=True,
                        cached_tokenized_context=cached_tokenized_context,
                    )
                    instances_yielded = 0
                    for instance in instances:
                        yield instance
                        instances_yielded += 1
                    if instances_yielded > 1:
                        questions_with_more_than_one_instance += 1
                    yielded_question_count += 1

        if questions_with_more_than_one_instance > 0:
            logger.info(
                "%d (%.2f%%) questions have more than one instance",
                questions_with_more_than_one_instance,
                100 * questions_with_more_than_one_instance /
                yielded_question_count,
            )
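
The main difference from the previous reader is that the `qas` list is drawn through `shard_iterable`, so each data-loading worker handles a disjoint subset of questions. Conceptually this is round-robin sharding; a rough standalone sketch, not the library's implementation:

from itertools import islice
from typing import Iterable, Iterator, TypeVar

T = TypeVar("T")

def round_robin_shard(items: Iterable[T], worker_id: int, num_workers: int) -> Iterator[T]:
    # Worker k keeps items k, k + num_workers, k + 2 * num_workers, ...
    return islice(items, worker_id, None, num_workers)

# With two workers, worker 0 sees qas[0], qas[2], ... and worker 1 sees qas[1], qas[3], ...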
Example #6
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        logger.info("Reading the dataset:")
        logger.info("Reading file at %s", file_path)
        with open_compressed(file_path) as dataset_file:
            for line in dataset_file:
                item = json.loads(line)
                question = item["question"]

                if self._is_twenty_questions:
                    subject = item["subject"]
                    original_question = question
                    question = question.replace(" it ", " " + subject + " ")
                    question = question.replace(" it?", " " + subject)
                    if question.endswith(" it"):
                        question = question[: -len("it")] + subject
                    if question.startswith("it "):
                        question = subject + question[len("it")]
                    if original_question == question:
                        continue
                    question = question.replace("?", "")
                    question = question.replace("\t", "")

                question: str = question if question.endswith("?") else question + "?"
                context: Optional[str] = item[self._context_key] if self._context_key in item else None
                answer: Optional[bool] = (
                    item[self._answer_key] if self._answer_key in item else None
                )

                if not self._is_training or answer is not None:
                    instance = self.text_to_instance(
                        question,
                        context,
                        answer,
                    )
                    yield instance
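
A tiny worked example of the twenty-questions substitution, using a made-up record: every standalone "it" is replaced by the subject so the question reads on its own.

item = {"subject": "a penguin", "question": "Can it fly?"}  # made-up record

subject = item["subject"]
question = item["question"]
question = question.replace(" it ", " " + subject + " ")  # -> "Can a penguin fly?"
question = question.replace(" it?", " " + subject)        # no " it?" left, unchanged
print(question)  # Can a penguin fly?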
Example #7
def _load_pretrained_model(filename: str) -> fasttext.FastText:
    with tempfile.NamedTemporaryFile("wb") as dc_file:
        # Decompress the model into a real file on disk, since fastText can
        # only load from a path, not from a file object.
        with open_compressed(filename, "rb", encoding=None) as fp:
            shutil.copyfileobj(fp, dc_file)
        dc_file.flush()  # make sure all decompressed bytes are on disk before loading
        model = fasttext.load_model(dc_file.name)
    return model
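
A hedged usage sketch for the helper above; the path is a placeholder for any gzip- or bz2-compressed fastText binary, since open_compressed picks the decompressor from the file suffix:

model = _load_pretrained_model("models/cc.en.300.bin.gz")  # placeholder path
print(model.get_dimension())              # dimensionality of the loaded vectors
print(model.get_word_vector("cat")[:5])   # first few components of one word vector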