Пример #1
0
def run_fusion_extractive_abstractive(extractive_output, abstractive_output,
                                      ext_abs_fusion_outfile,
                                      ext_abs_fusion_config):
    """Choose, per question, between the extractive and abstractive answer.

    Score pairs are loaded through ``load_ext_abs_score_data`` and passed
    through a linear layer whose arguments come from
    ``ext_abs_fusion_config["parameters"]``. A positive logit selects the
    abstractive answer (index 1), anything else the extractive one (index 0).
    Every record of ``extractive_output`` is rewritten to hold only the chosen
    answer and its score and is written to ``ext_abs_fusion_outfile``.

    Raises:
        AssertionError: if a record's question, or its best extractive answer,
            disagrees with the loaded metadata.
    """
    raw_scores, metadata = load_ext_abs_score_data(extractive_output,
                                                   abstractive_output)
    score_tensor = torch.FloatTensor(raw_scores)
    linear_kwargs = {}
    for name, value in ext_abs_fusion_config["parameters"].items():
        linear_kwargs[name] = torch.FloatTensor(value)
    fusion_logits = F.linear(score_tensor,
                             **linear_kwargs).squeeze(-1).tolist()

    with jsonlines.open(extractive_output, "r") as reader_outputs, \
            jsonlines.open(ext_abs_fusion_outfile, "w") as ofwriter:
        aligned = zip(reader_outputs, metadata["questions"], fusion_logits,
                      metadata['proposed_answers'], raw_scores,
                      metadata['ext_best_span_idx'])
        for entry, question, logit, candidates, scores, span_idx in aligned:
            assert question == entry["raw_question"]
            assert candidates[0] == entry["answers"][span_idx]
            # 0 is extractive class, 1 is abstractive class
            chosen = int(logit > 0)
            entry["reader_scores"] = [scores[chosen]]
            entry["answers"] = [candidates[chosen]]
            if not chosen:
                # keep only the span information of the best extractive answer
                entry["passages"] = [entry["passages"][span_idx]]
                entry["char_offsets"] = [entry["char_offsets"][span_idx]]
            else:
                # abstractive does not contain span info
                del entry["passages"]
                del entry["char_offsets"]
            ofwriter.write(entry)
Пример #2
0
def main():
    """Attach retrieved contexts to every answer choice of each question file.

    Command line: a positional Elasticsearch ``host``, an optional
    ``-p/--port`` (default 9200) and one or more ``--question_paths``.

    For every input path two files are written next to it:
    ``<name>_with_hits.jsonl`` (the raw hits per question id and choice label)
    and ``<name>_with_para.jsonl`` (the questions with a ``para`` string added
    to each choice).
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("host", help="Elasticsearch host.")
    parser.add_argument("-p",
                        "--port",
                        default=9200,
                        help="port, default is 9200",
                        type=int)

    parser.add_argument("--question_paths",
                        nargs="+",
                        type=str,
                        required=True,
                        help="Path to the questions.")

    args = parser.parse_args()

    # The query callable does not depend on the file being processed, so it
    # is built once instead of once per path.
    f_context = partial(query_es_bulk, host=args.host, port=args.port)

    for path in args.question_paths:
        updated_questions = []
        raw_contexts = []

        with jsonlines.open(path) as reader:
            questions = list(reader)

        # Use the pool as a context manager so its worker processes are
        # released after each file instead of piling up across iterations.
        with Pool() as pool:
            resolved_contexts = list(
                tqdm(pool.imap(f_context, questions), total=len(questions)))

        for question, per_choice_contexts in zip(questions,
                                                 resolved_contexts):
            raw_context = {}
            # Shallow copy: the nested choice dicts are shared with
            # `question`, so the "para" keys set below show up in q_updated.
            q_updated = question.copy()

            for choice, contexts in zip(question["question"]["choices"],
                                        per_choice_contexts):
                choice["para"] = " ".join(
                    [c["hit"]["text"] for c in contexts])
                raw_context[choice["label"]] = contexts

            updated_questions.append(q_updated)
            raw_contexts.append({question["id"]: raw_context})

        base_dir = os.path.dirname(path)
        name = os.path.splitext(os.path.basename(path))[0]

        with jsonlines.open(os.path.join(base_dir, name + "_with_hits.jsonl"),
                            "w") as writer:
            writer.write_all(raw_contexts)

        with jsonlines.open(os.path.join(base_dir, name + "_with_para.jsonl"),
                            "w") as writer:
            writer.write_all(updated_questions)
def div_raw(rot_seq: Sequence[List[List[object]]], prob_train: float,
            train_fn: str, dev_fn: str):
    """Randomly split ``rot_seq`` into a train and a dev JSON-lines file.

    Args:
        rot_seq: items to distribute; each one is written as a single line.
        prob_train: probability that an item is written to ``train_fn``.
        train_fn: path of the training output file (overwritten).
        dev_fn: path of the dev output file (overwritten).
    """
    with jsonlines.open(train_fn, mode='w') as writer_t, \
            jsonlines.open(dev_fn, mode='w') as writer_d:
        for s in rot_seq:
            # random.random() is uniform on [0, 1), so this branch is taken
            # with probability `prob_train`. The previous test
            # (`prob_train < random.random()`) was inverted and sent items to
            # train with probability 1 - prob_train.
            if random.random() < prob_train:
                writer_t.write(s)
            else:
                writer_d.write(s)
Пример #4
0
    def extract_dataset(infile, outfile, negative_X=1, golden_passages=None):
        """Write positive and randomly sampled negative passages to ``outfile``.

        For every mapped example of ``infile`` whose positive passage is in
        ``golden_passages``, one record with label 0 (the positive passage)
        and ``negative_X`` records with label 1 (random passages outside the
        golden set) are written.

        Args:
            infile: JSON-lines file with the examples.
            outfile: JSON-lines file to create.
            negative_X: number of negatives sampled per positive.
            golden_passages: set of positive passage ids; computed from
                ``infile`` via ``get_golden_passages`` when empty or None.

        NOTE(review): ``total_length`` and ``para_db`` come from the enclosing
        scope, which is not visible here.
        """
        if not golden_passages:
            golden_passages = get_golden_passages(infile)

        # too slow
        print("Precomputing negatives...\n")
        all_negative_passages = list(set(range(total_length)) - golden_passages)

        def get_random_negative():
            return random.choice(all_negative_passages)

        print("Constructing dataset...\n")

        total_processed_examples = 0
        total_created_examples = 0
        # Open the input with a context manager as well; it was previously
        # opened inline and never closed.
        with jsonlines.open(outfile, mode='w') as writer, \
                jsonlines.open(infile) as reader:
            for example in reader:
                positive_id = example['contexts']['positive_ctx']
                if not example['is_mapped'] or positive_id not in golden_passages:
                    continue

                total_processed_examples += 1
                # write positive
                title, passage = para_db.get_doc_text(positive_id,
                                                      ['raw_document_title', 'raw_paragraph_context'])
                total_created_examples += 1
                writer.write({
                    "id": example['example_id'],
                    "title": title,
                    "psg": passage,
                    "label": 0,
                })

                # write negatives: sample all ids first, fetch all texts, then
                # emit, in the same order as before
                raw_negatives_ids = [get_random_negative() for _ in range(negative_X)]
                negative_docs = [
                    para_db.get_doc_text(negative_id,
                                         columns=["raw_document_title", "raw_paragraph_context"])
                    for negative_id in raw_negatives_ids
                ]
                for n_id, (n_title, n_psg) in zip(raw_negatives_ids, negative_docs):
                    total_created_examples += 1
                    writer.write({
                        "id": n_id,
                        "title": n_title,
                        "psg": n_psg,
                        "label": 1,
                    })
                # the counter is at least 1 here, so no extra "> 0" check
                if total_processed_examples % 2000 == 0:
                    print(f"Processed {total_processed_examples} examples")
                    print(f"Created {total_created_examples} examples")
        print(f"Total processed {total_processed_examples} examples")
        print(f"Total created {total_created_examples} examples")
Пример #5
0
def extract_predictions(reader_output, outfile):
    """Write the top-scoring answer of every reader record to ``outfile``.

    Each output line holds the record's ``raw_question`` under "question" and,
    under "prediction", the answer at the position ``argmax`` picks from the
    record's ``reader_scores``.
    """
    with jsonlines.open(reader_output, mode="r") as reader, \
            jsonlines.open(outfile, mode='w') as writer:
        logging.info("Extracting answers")
        for entry in reader:
            candidates = entry['answers']
            best_idx = argmax(entry['reader_scores'])
            best_answer = candidates[best_idx]
            writer.write({
                "question": entry['raw_question'],
                "prediction": best_answer,
            })
Пример #6
0
def convert(filepath):
    """Convert a three-lines-per-sample text file to ``<filepath>.jsonl``.

    Each sample occupies three consecutive lines: a sentence containing the
    ``$T$`` placeholder, the target phrase, and the polarity. The sentence is
    split around the placeholder; both halves and the target phrase are
    lower-cased and stripped, the polarity is only stripped.

    Raises:
        IndexError: if the number of lines is not a multiple of three.
    """
    # The context manager closes the input even when reading fails; the
    # manual open/close pair did not.
    with open(filepath, 'r', encoding='utf-8', newline='\n',
              errors='ignore') as fin:
        lines = fin.readlines()

    all_data = []

    with tqdm(total=len(lines)) as pbar:
        for i in range(0, len(lines), 3):
            text_left, _, text_right = (
                s.lower().strip() for s in lines[i].partition("$T$"))
            all_data.append({
                'text_left': text_left,
                'text_right': text_right,
                'target_phrase': lines[i + 1].lower().strip(),
                'polarity': lines[i + 2].strip(),
            })
            pbar.update(3)

    with jsonlines.open(filepath + '.jsonl', 'w') as writer:
        writer.write_all(all_data)
Пример #7
0
def load_raw_test_toefl():
    """Collect the verb tokens of the TOEFL test essays.

    POS tags are read from the ``P.jsonlines`` feature file and keyed by
    (essay id, sentence number, word number), both numbers 1-based. Every
    essay file is then read line by line, one line being one sentence.

    Returns:
        A pair ``(raw_test_toefl, toefl_test_sents)``. The first is a list of
        ``[sentence, verb_idx, key]`` entries with a 0-based ``verb_idx`` and
        ``key`` formatted as ``"<essay>_<sentence>_<word>"``; the second maps
        ``(essay id, sentence number)`` to the sentence text.
    """
    pos_tags = {}
    pos_file = f'{TOEFL_TEST}/toefl_skll_test_features_no_labels/all_pos/P.jsonlines'
    with jsonlines.open(pos_file) as reader:
        for record in reader:
            txt_id, sent_id, word_id, _word = record['id'].split("_")
            tag = record['x']['stanford_postag']
            pos_tags[(txt_id, int(sent_id), int(word_id))] = tag

    toefl_test_sents = {}  # (essay, sentence_id) mapping to sentence
    raw_test_toefl = []

    for filename in os.listdir(f'{TOEFL_TEST}/essays'):
        fileid = filename.split('.')[0]
        with open(f'{TOEFL_TEST}/essays/{filename}') as f:
            sentences = [line.rstrip() for line in f]
            for sent_no, sentence in enumerate(sentences, start=1):
                toefl_test_sents[(fileid, sent_no)] = sentence
                for tok_idx in range(len(sentence.split())):
                    word_no = tok_idx + 1
                    lookup = (fileid, sent_no, word_no)
                    # only tokens with a verb POS tag ('V...') are kept
                    if lookup in pos_tags and pos_tags[lookup].startswith('V'):
                        # sentence, verb_idx, key
                        raw_test_toefl.append(
                            [sentence, tok_idx, f'{fileid}_{sent_no}_{word_no}'])
    return raw_test_toefl, toefl_test_sents
Пример #8
0
def get_golden_passages(data_source):
    """Return the set of positive passage ids of all mapped examples.

    Reads the JSON-lines file ``data_source`` and collects
    ``ex['contexts']['positive_ctx']`` for every example whose ``is_mapped``
    field is truthy. The file is closed before returning.
    """
    with jsonlines.open(data_source) as dataset:
        return {
            ex['contexts']['positive_ctx']
            for ex in dataset
            if ex['is_mapped']
        }
Пример #9
0
def load_raw_train_toefl():
    """Collect the labelled verb tokens of the TOEFL training essays.

    POS tags and labels are read from the ``P.jsonlines`` feature file and
    keyed by (essay id, sentence number, word number), both numbers 1-based.
    Essay lines are tokenised on whitespace before the ``M_`` markers are
    removed; the stored sentence has the markers stripped.

    Returns:
        A pair ``(raw_train_toefl, toefl_train_sents)``. The first is a list
        of ``[sentence, verb_idx, label]`` entries with a 0-based ``verb_idx``
        and an integer ``label``; the second maps
        ``(essay id, sentence number)`` to the marker-free sentence.
    """
    pos_and_label = {}
    pos_file = f'{TOEFL_TRAIN}/toefl_skll_train_features/all_pos/P.jsonlines'
    with jsonlines.open(pos_file) as reader:
        for record in reader:
            txt_id, sent_id, word_id, _word = record['id'].split("_")
            tag = record['x']['stanford_postag']
            gold = record['y']
            pos_and_label[(txt_id, int(sent_id), int(word_id))] = (tag, gold)

    toefl_train_sents = {}  # (essay, sentence_id) mapping to sentence
    raw_train_toefl = []

    for filename in os.listdir(f'{TOEFL_TRAIN}/essays'):
        fileid = filename.split('.')[0]
        with open(f'{TOEFL_TRAIN}/essays/{filename}') as f:
            marked_lines = [line.rstrip() for line in f]
            for sent_no, marked in enumerate(marked_lines, start=1):
                token_count = len(marked.split())
                sentence = marked.replace('M_', '')
                toefl_train_sents[(fileid, sent_no)] = sentence
                for tok_idx in range(token_count):
                    lookup = (fileid, sent_no, tok_idx + 1)
                    if lookup not in pos_and_label:
                        continue
                    pos_tag, label = pos_and_label[lookup]
                    if pos_tag.startswith('V'):
                        # sentence, verb_idx, label
                        raw_train_toefl.append([sentence, tok_idx, int(label)])
    return raw_train_toefl, toefl_train_sents
Пример #10
0
    def post_training(
        self,
        selected_model_path,
        selected_model_filename,
        test_data_loader,
        selected_model_dev_stats,
        time_training_elapsed_mins,
    ):
        """Evaluate the selected checkpoint on the test set and store results.

        Loads the state dict at ``selected_model_path`` into ``self.model``,
        evaluates it on ``test_data_loader``, writes dev/test statistics,
        options and training time to ``experiment_results.jsonl`` in the
        experiment directory, saves the plotted confusion matrix, and finally
        prints the test value of the ``self.opt.snem`` metric to stdout.
        """
        logger.info("loading selected model from training: {}".format(
            selected_model_path))
        state_dict = torch.load(selected_model_path)
        self.model.load_state_dict(state_dict)

        logger.info("evaluating selected model on test-set")
        # switch the model to evaluation mode
        # (cf. https://pytorch.org/docs/stable/nn.html#torch.nn.Module.train)
        self.model.eval()

        # directory that receives the per-run statistics, with trailing slash
        stats_dir = os.path.join(self.opt.experiment_path, "statistics",
                                 selected_model_filename)
        os.makedirs(stats_dir, exist_ok=True)
        stats_prefix = stats_dir if stats_dir.endswith("/") else stats_dir + "/"

        test_stats = self._evaluate(test_data_loader,
                                    get_examples=True,
                                    basepath=stats_prefix)
        test_snem = test_stats[self.opt.snem]

        self.evaluator.print_stats(test_stats, "evaluation on test-set")

        # save dev and test results
        experiment_results = {
            "test_stats": self.get_serializable_stats(test_stats),
            "dev_stats": self.get_serializable_stats(selected_model_dev_stats),
            "options": self.get_serializable_opts(),
            "time_training_elapsed_mins": time_training_elapsed_mins,
        }
        results_file = os.path.join(self.opt.experiment_path,
                                    "experiment_results.jsonl")
        with jsonlines.open(results_file, "w") as writer:
            writer.write(experiment_results)

        # save confusion matrices
        create_save_plotted_confusion_matrix(
            test_stats["confusion_matrix"],
            expected_labels=self.sorted_expected_label_values,
            basepath=stats_prefix,
        )

        logger.info("finished execution of this run. exiting.")

        # print the snem value to stdout, for the controller to parse it
        print(test_snem)
Пример #11
0
def load_pos_annotations(ex_features, directory):
    """Add the POS tag from ``<directory>/P.jsonlines`` to each example.

    The tag is stored under the ``'stanford_postag'`` key of the entry
    ``ex_features[(txt_id, sent_id, int(word_id))]``; the entry itself must be
    obtainable by subscripting ``ex_features``. Returns ``ex_features``.
    """
    pos_file = f"{directory}/P.jsonlines"
    with jsonlines.open(pos_file) as reader:
        for record in reader:
            txt_id, sent_id, word_id = record['id'].split("_")
            tag = record['x']['postag']
            example_key = (txt_id, sent_id, int(word_id))
            ex_features[example_key]['stanford_postag'] = tag
    return ex_features
def original_sentences(filename: str) -> Sequence[List[str]]:
    """Yield the sentences of a JSON-lines file, skipping repeated ones.

    A sentence counts as repeated when its items, joined with ``'-:-'``,
    match an earlier sentence. The progress bar total is hard-coded.
    """
    seen_keys: Set = set()
    with jsonlines.open(filename) as reader:
        for sentence in tqdm(reader, total=2230373):
            fingerprint = '-:-'.join(sentence)
            if fingerprint not in seen_keys:
                seen_keys.add(fingerprint)
                yield sentence
Пример #13
0
def run_score_aggregation(outputs, aggregation_config, aggregation_outfile):
    """Replace reader scores with linearly aggregated pipeline scores.

    Pipeline data is loaded via ``load_pipeline_data``, its last two axes are
    swapped, and a linear layer parameterised by
    ``aggregation_config["parameters"]`` is applied. Each record of
    ``outputs["reader_output"]`` gets the resulting values as its
    ``reader_scores`` and is written to ``aggregation_outfile``.

    Raises:
        AssertionError: if a record's question differs from the metadata.
    """
    raw_features, metadata = load_pipeline_data(outputs, aggregation_config)

    feature_tensor = torch.FloatTensor(raw_features).transpose(-1, -2)
    linear_kwargs = {}
    for name, value in aggregation_config["parameters"].items():
        linear_kwargs[name] = torch.FloatTensor(value)
    projected = F.linear(feature_tensor, **linear_kwargs)
    aggregated_logits = projected.squeeze(-1).tolist()

    with jsonlines.open(outputs["reader_output"], "r") as reader_outputs, \
            jsonlines.open(aggregation_outfile, "w") as ofwriter:
        aligned = zip(reader_outputs, metadata["questions"],
                      aggregated_logits)
        for entry, question, logits in aligned:
            assert question == entry["raw_question"]
            entry["reader_scores"] = logits
            ofwriter.write(entry)
Пример #14
0
    def get_example_list(self):
        """Build the list of processed examples from ``self.datafile``.

        In training mode all examples returned by ``process_sample`` for a
        sample are kept; otherwise only the first one is, with the golden
        passage disabled. After the first sample, the first two sources (and
        the target, if it has more than one element) are logged as decoded
        tokens.
        """
        with open(self.datafile, encoding="utf-8") as f:
            num_lines = sum(1 for _ in f)

        examples = []
        with jsonlines.open(self.datafile, "r") as fd:
            for idx, sample in tqdm(enumerate(fd),
                                    total=num_lines):  # TODO: parallelize?
                # keyword arguments shared by both modes
                shared_kwargs = dict(
                    database=self.database,
                    tokenizer=self.tokenizer,
                    max_input_length=self.max_len,
                    context_size=self.context_length,
                    include_doc_masks=self.include_passage_masks,
                    preprocessing_truncation=self.preprocessing_truncation,
                )
                if not self.is_training:
                    # Do not use same question with multiple answers in validation
                    processed = FusionInDecoderDataset.process_sample(
                        sample, include_golden_passage=False, **shared_kwargs)
                    examples.append(processed[0])
                else:
                    examples.extend(
                        FusionInDecoderDataset.process_sample(
                            sample,
                            include_golden_passage=self.include_golden_passage,
                            one_answer_per_question=self.one_answer_per_question,
                            use_only_human_answer=self.use_only_human_answer,
                            **shared_kwargs))

                if idx != 0:
                    continue

                # log a decoded preview of the very first example
                logging.info("Example of input formats:")
                first = examples[0]
                decode = self.tokenizer.convert_ids_to_tokens
                src_example1 = " ".join(decode(first["sources"][0]))
                src_example2 = " ".join(decode(first["sources"][1]))
                show_target = len(first["target"]) > 1
                if show_target:
                    possible_target = first["target"]
                    if type(possible_target) is list:
                        possible_target = possible_target[0]
                    target_example = " ".join(decode(possible_target))
                logging.info("inputs 1:")
                logging.info(src_example1)
                logging.info("inputs 2:")
                logging.info(src_example2)
                if show_target:
                    logging.info("target:")
                    logging.info(target_example)

        return examples
Пример #15
0
def load_wordnet_annotations(ex_features, directory):
    """Add a 26-dimensional WordNet indicator vector to each example.

    For every record of ``<directory>/WordNet.jsonlines`` the number after the
    last underscore of each annotation is read as a 1-based position, and that
    position of the vector is set to 1. The vector is stored under the
    ``'wordnet'`` key of ``ex_features[(txt_id, sent_id, int(word_id))]``.
    Returns ``ex_features``.
    """
    with jsonlines.open(f"{directory}/WordNet.jsonlines") as reader:
        for record in reader:
            txt_id, sent_id, word_id = record['id'].split("_")
            wn_vector = np.zeros(26)
            for annotation in record['x']:
                position = int(annotation.rsplit("_", 1)[-1]) - 1
                wn_vector[position] = 1
            example_key = (txt_id, sent_id, int(word_id))
            ex_features[example_key]['wordnet'] = wn_vector
    return ex_features
Пример #16
0
def get_experiment_result_detailed(experiment_path):
    """Return the single record of ``experiment_results.jsonl``.

    Returns:
        The parsed record, or ``None`` when the results file does not exist.

    Raises:
        AssertionError: if the file does not contain exactly one record.
    """
    experiment_results_path = os.path.join(experiment_path,
                                           "experiment_results.jsonl")
    try:
        with jsonlines.open(experiment_results_path, "r") as reader:
            records = list(reader)
    except FileNotFoundError:
        return None
    assert len(records) == 1
    return records[0]
Пример #17
0
def load_topic_annotations(ex_features, directory):
    """Add a 100-dimensional topic vector to each example.

    For every record of ``<directory>/T.jsonlines`` the number after the last
    hyphen of each annotation name is read as a 1-based position, and that
    position receives the annotation's value as a float. The vector is stored
    under the ``'topic_lda'`` key of
    ``ex_features[(txt_id, sent_id, int(word_id))]``. Returns ``ex_features``.
    """
    with jsonlines.open(f"{directory}/T.jsonlines") as reader:
        for record in reader:
            txt_id, sent_id, word_id = record['id'].split("_")
            lda_vector = np.zeros(100)
            annotations = record['x']
            for name in annotations:
                position = int(name.rsplit("-", 1)[-1]) - 1
                lda_vector[position] = float(annotations[name])
            example_key = (txt_id, sent_id, int(word_id))
            ex_features[example_key]['topic_lda'] = lda_vector
    return ex_features
Пример #18
0
def load_cbias_annotations(ex_features, directory):
    """Add the 17-dimensional C-BiasUp and C-BiasDown vectors to each example.

    Both files are processed the same way, C-BiasUp first: the number after
    the last hyphen of each annotation name is read as a 1-based position,
    and that position receives the annotation's value as an int. The vectors
    are stored under ``'cbiasup'`` and ``'cbiasdown'`` respectively. Returns
    ``ex_features``.
    """
    sources = (
        (f"{directory}/C-BiasUp.jsonlines", 'cbiasup'),
        (f"{directory}/C-BiasDown.jsonlines", 'cbiasdown'),
    )
    for annotation_file, feature_name in sources:
        with jsonlines.open(annotation_file) as reader:
            for record in reader:
                txt_id, sent_id, word_id = record['id'].split("_")
                ccb_vector = np.zeros(17)
                annotations = record['x']
                for name in annotations:
                    position = int(name.rsplit("-", 1)[-1]) - 1
                    ccb_vector[position] = int(annotations[name])
                example_key = (txt_id, sent_id, int(word_id))
                ex_features[example_key][feature_name] = ccb_vector
    return ex_features
Пример #19
0
def load_label_ul_annotations(ex_features, directory, include_labels=True):
    """Create a fresh feature entry for every record of ``UL.jsonlines``.

    Each entry replaces whatever ``ex_features`` held for that id and holds
    the single value of the record's ``x`` mapping under ``'ul'``, the id
    tuple ``(txt_id, sent_id, int(word_id))`` under ``'id'`` and, when
    ``include_labels`` is true, ``int(record['y'])`` under ``'label'``.
    Returns ``ex_features``.

    Raises:
        ValueError: if a record's ``x`` mapping does not hold exactly one item.
    """
    with jsonlines.open(f"{directory}/UL.jsonlines") as reader:
        for record in reader:
            txt_id, sent_id, word_id = record['id'].split("_")
            ex_id = (txt_id, sent_id, int(word_id))
            # unpacking enforces that `x` holds exactly one item
            [(_, ul)] = record["x"].items()
            entry = {'ul': ul, 'id': ex_id}
            ex_features[ex_id] = entry
            if include_labels:
                entry['label'] = int(record['y'])
    return ex_features
Пример #20
0
 def _read_json_file(self, file: Path) -> List:
     """Read a JSON-lines file into a list of objects.

     Records with a ``'field'`` key contribute that nested object, with the
     record's ``entityId`` copied into it; all other records are kept
     unchanged. Lines are counted first so that ``self.iterate`` receives a
     total count.
     """
     with open(file, mode='r') as handle:
         total_count = sum(1 for _ in handle)
     records = []
     with jsonlines.open(file) as reader:
         progress = self.iterate(reader,
                                 f'Reading file {file.name}',
                                 total_count=total_count)
         for obj in progress:
             if 'field' not in obj:
                 records.append(obj)
                 continue
             nested = obj['field']
             records.append(nested)
             nested['entityId'] = obj['entityId']
     return records
Пример #21
0
def download(version, directory: str, dataset: list):
    """Run pose estimation for every datum and write ``index.jsonl``.

    Repeatedly looks for data whose pose directory is not yet present under
    ``<directory>/poses`` and hands them to ``distributed.run`` with
    ``pose_video`` until none are missing. Afterwards an index file with one
    record per datum is written to ``<directory>/index.jsonl``.

    Args:
        version: mapping whose ``"version"`` entry must be ``"Mediapipe"``.
        directory: output directory for the poses and the index.
        dataset: list of dicts with at least an ``"id"`` key; each gets a
            ``"pose_dir"`` key added in place.

    Raises:
        ValueError: if ``version["version"]`` is not ``"Mediapipe"``.
    """
    if version["version"] != "Mediapipe":
        raise ValueError("Running this addon version is not implemented")

    poses_dir = path.join(directory, "poses")
    makedir(poses_dir)

    Docker.verify_image_exists(DOCKER_NAME)

    # Set once at least one batch has been dispatched, so the final worker and
    # docker cleanup only happens when something was actually started.
    should_cleanup = False
    while True:
        # Re-scan the output directory on every pass to see what is done.
        existing = {path.join(poses_dir, di) for di in os.listdir(poses_dir)}
        missing_data = []
        for datum in dataset:
            datum["pose_dir"] = path.join(poses_dir, datum["id"])
            if datum["pose_dir"] not in existing:
                missing_data.append(datum)

        # Break when finished
        if len(missing_data) == 0:
            break

        print(missing_data)

        should_cleanup = True
        print("Done",
              len(dataset) - len(missing_data), "/", len(dataset), "tasks")

        # should_cleanup = False
        # for datum in tqdm(missing_data):
        #     pose_video(datum)

        # Reset the distributed state before dispatching the next batch.
        # NOTE(review): the exact semantics of these helpers are not visible
        # here - confirm against the `distributed` module.
        distributed.clear_tasks()
        distributed.kill_slaves()
        clean_dockers()
        distributed.spawn_workers().flower()
        # At most 50000 missing items are dispatched per pass; the loop picks
        # up the remainder on the next iteration.
        distributed.run(pose_video, missing_data[:50000])

    if should_cleanup:
        distributed.kill_slaves()
        clean_dockers()

    # One index record per datum: its id and the result of
    # get_directory_hands for its pose directory.
    with jsonlines.open(path.join(directory, "index.jsonl"),
                        mode='w') as writer:
        for datum in tqdm(dataset):
            writer.write({
                "id": datum["id"],
                "poses": get_directory_hands(datum["pose_dir"])
            })
def process_source_json(args):
    """Group one attribute of every record by story id.

    Reads the JSON-lines file at ``args["source_json"]`` and returns a
    ``defaultdict`` mapping each story id to a list of
    ``(absolute_position, relative_position, attribute)`` tuples in file
    order, where the attribute is the record's ``args["attribute"]`` field.
    """
    # Map story id and position to tuples so they can be extracted and
    # analysed separately.
    story_map = defaultdict(list)

    with jsonlines.open(args["source_json"], mode='r') as reader:
        for record in reader:
            meta = record["metadata"]
            story_id = meta["story_id"]
            abs_pos = meta["absolute_position"]
            rel_pos = meta["relative_position"]
            value = record[args["attribute"]]
            story_map[story_id].append((abs_pos, rel_pos, value))

    return story_map
Пример #23
0
def download(directory: str, version, module_path: str, dataset=None):
    """Make sure a dataset version is downloaded and return its index.

    The module at ``module_path`` is imported and its ``download`` function is
    called only when the version directory or its ``index.jsonl`` is missing.
    ``dataset`` is passed on to it only when it is not None.

    Returns:
        A pair ``(version_dir, data)`` where ``data`` is the list of records
        read from ``<version_dir>/index.jsonl``.
    """
    makedir(directory)
    version_dir = path.join(directory, version["version"])
    index_path = path.join(version_dir, 'index.jsonl')
    if not exists(version_dir) or not exists(index_path):
        makedir(version_dir)
        module = modular_import("module", module_path)
        if dataset is None:
            module.download(version, version_dir)
        else:
            module.download(version, version_dir, dataset)

    # Read through a context manager so the index file handle is closed;
    # `list(jsonlines.open(...))` left it open.
    with jsonlines.open(index_path) as reader:
        data = list(reader)

    return version_dir, data
Пример #24
0
def run_reader_extractive(checkpointDict, reader_output, reranker_output):
    """Run the extractive reader over reranked passages and store its answers.

    Builds the reader from ``checkpointDict``, optionally wraps it in
    ``DataParallel``, extracts the top-k answers for every query of
    ``reranker_output`` and writes one record per query to ``reader_output``
    (question, answers, scores, passage ids and character offsets).
    """
    ext_reader_cfg = config["reader"]["extractive"]["config"]
    cache_dir = config["transformers_cache"]

    # overwrite the old loaded cache path
    checkpointDict["config"]["cache"] = cache_dir
    model = Reader(checkpointDict["config"], initPretrainedWeights=False)
    Checkpoint.loadModel(model, checkpointDict, config["device"])

    multi_gpu_requested = ("multi_gpu" in ext_reader_cfg
                           and ext_reader_cfg["multi_gpu"])
    if multi_gpu_requested and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)
        logging.info("DataParallel active!")

    extractor = AnswerExtractor(model, config["device"])
    extractor.model.eval()
    tokenizer = AutoTokenizer.from_pretrained(
        checkpointDict["config"]['tokenizer_type'],
        cache_dir=cache_dir,
        use_fast=True)
    database = PassDatabase(get_database_path())
    with ReaderDataset(
            reranker_output, tokenizer, database, ext_reader_cfg["batch_size"],
            checkpointDict["config"]['include_doc_title']) as dataset:
        logging.info("Extracting top k answers scores")
        extraction = extractor.extract(
            dataset, ext_reader_cfg["top_k_answers"],
            ext_reader_cfg["max_tokens_for_answer"])
        # all results are collected first and written once extraction is done
        records = []
        for query, answers, scores, passageIds, charOffsets in \
                tqdm(extraction, total=len(dataset)):
            records.append({
                "raw_question": query,
                "answers": answers,
                "reader_scores": scores,
                "passages": passageIds,
                "char_offsets": charOffsets
            })

        with jsonlines.open(reader_output, "w") as wF:
            for record in records:
                wF.write(record)
def download_firebase_collection(args):
    """Dump every document of a Firestore collection to a JSON-lines file.

    Uses ``args["firebase_key_path"]`` for the credentials,
    ``args["collection_name"]`` as the collection and ``args["output_file"]``
    as the destination. Each output record holds the document id, its content
    as a dict, and the collection name.
    """
    print(f"Download Firebase collection: {args}")

    cred = credentials.Certificate(args["firebase_key_path"])
    firebase_admin.initialize_app(cred)
    db = firestore.client()

    # all documents are fetched before the output file is opened
    collection_data = []
    for doc in db.collection(args['collection_name']).stream():
        collection_data.append({
            "id": doc.id,
            "document": doc.to_dict(),
            "collection": args['collection_name'],
        })

    with jsonlines.open(args['output_file'], mode='w') as writer:
        for record in collection_data:
            writer.write(record)
Пример #26
0
def read_results(test_number: str, policy_name: str) -> pd.DataFrame:
    """
    Reads in a results file, returns a dataframe including ratios

    The file ``results/<test_number>-<policy_name>`` is read as JSON lines.
    Every result contributes one row per entry of its ``utilisations`` list.
    The columns ``test_number``, ``policy_name``, ``ratio`` and
    ``oblivious_ratio`` are added to the returned frame.
    """
    path = 'results/{}-{}'.format(test_number, policy_name)

    columns = [
        'utilisation', 'opt_utilisation', 'oblivious_utilisation',
        'sequence_type', 'graphs'
    ]
    # Collect plain dicts and build the frame once: DataFrame.append copied
    # the whole frame on every call and no longer exists in pandas >= 2.0.
    rows = []
    with jsonlines.open(path) as f:
        for result in f:
            sequence_type = result['sequence_type']
            graphs = '.'.join(result['graphs'])
            for i in range(len(result['utilisations'])):
                rows.append({
                    'utilisation': result['utilisations'][i],
                    'opt_utilisation': result['opt_utilisations'][i],
                    'oblivious_utilisation':
                    result['oblivious_utilisations'][i],
                    'sequence_type': sequence_type,
                    'graphs': graphs,
                })
    df = pd.DataFrame(rows, columns=columns)

    # to easily separate experiments in plots
    df['test_number'] = test_number
    df['policy_name'] = policy_name
    # calculate ratios
    df['ratio'] = df['utilisation'] / df['opt_utilisation']
    df['oblivious_ratio'] = df['oblivious_utilisation'] / df['opt_utilisation']

    return df
Пример #27
0
def read_result(model_id: str, test_id: str, policy_id: str) -> pd.DataFrame:
    """
    Reads in a results file, returns a dataframe including ratios

    The file ``results/overfit-<model_id>-<test_id>-<policy_id>`` is read as
    JSON lines. Every result contributes one row per entry of its
    ``utilisations`` list. The columns ``model_id``, ``x_value``, ``test_id``,
    ``policy_id``, ``ratio`` and ``oblivious_ratio`` are added.
    """
    path = "results/overfit-{}-{}-{}".format(model_id, test_id, policy_id)

    # Collect plain dicts and build the frame once: DataFrame.append copied
    # the whole frame on every call and no longer exists in pandas >= 2.0.
    rows = []
    with jsonlines.open(path) as f:
        for result in f:
            for i in range(len(result['utilisations'])):
                rows.append({
                    'utilisation': result['utilisations'][i],
                    'opt_utilisation': result['opt_utilisations'][i],
                    'oblivious_utilisation':
                    result['oblivious_utilisations'][i],
                    'action': result['actions'][i],
                })
    if rows:
        df = pd.DataFrame(rows)
    else:
        # Same columns the previous implementation produced for an empty file
        # ('action' only appeared once a row had been appended).
        df = pd.DataFrame(
            columns=['utilisation', 'opt_utilisation',
                     'oblivious_utilisation'])

    # to easily separate experiments in plots
    # NOTE(review): for multi-character ids the third character is used as
    # the x value - confirm this matches the model id naming scheme.
    if len(model_id) > 1:
        x_value = int(model_id[2])
    else:
        x_value = int(model_id)
    df['model_id'] = model_id
    df['x_value'] = x_value
    df['test_id'] = test_id
    df['policy_id'] = policy_id
    # calculate ratios
    df['ratio'] = df['utilisation'] / df['opt_utilisation']
    df['oblivious_ratio'] = df['oblivious_utilisation'] / df['opt_utilisation']

    return df
def main():
    """Query every entity against all domains and store the results.

    Loads the domains and entities, reads the crawl dates into the
    module-level ``domains_crawled_dates`` and, for each entity with a
    non-empty result, writes all result records to
    ``<OUTPUT_DIR>/<entity with spaces replaced by underscores>.jsonl``.
    """
    global domains_crawled_dates

    domains = load_domains()
    names = load_entities()

    print("querying:")
    print(f'{len(domains)} domains')
    print(f'{len(names)} entities')
    print()

    # read domains crawled span times
    with open('config_data/domains_crawled_dates.json') as f_in:
        domains_crawled_dates = json.load(f_in)

    for name in names:
        print(name)
        f_name = '_'.join(name.split()) + '.jsonl'
        results = runner(domains, name)
        if not results:
            continue
        with jsonlines.open(OUTPUT_DIR + "/" + f_name, mode='w') as writer:
            for records in results.values():
                for record in records:
                    writer.write(record)
Пример #29
0
 def dump_results(self, dict_for_dumping, file, is_test):
     """Append result records to a dump file and log their F1 as scalars.

     Does nothing unless ``self.is_writer`` is truthy. ``file`` is a key into
     ``self.test_files`` (when ``is_test``) or ``self.dev_files``; the mapped
     path is opened in append mode. For every record, ``f1_micro`` is logged
     to ``self.tb_writer`` under the tag ``"<train_head>/<eval_slice>"`` at
     the record's ``global_step`` before the record is written.

     Raises:
         AssertionError: if ``file`` is not a known key.
     """
     if not self.is_writer:
         return
     known_files = self.test_files if is_test else self.dev_files
     assert file in known_files, f'The dump file {file} is not in our possible files from {known_files.keys()}'
     target_path = known_files[file]
     with jsonlines.open(target_path, 'a') as f:
         for json_obj in dict_for_dumping:
             tag = "/".join([json_obj["train_head"], json_obj["eval_slice"]])
             self.tb_writer.add_scalar(
                 tag=tag,
                 scalar_value=json_obj["f1_micro"],
                 global_step=json_obj["global_step"])
             f.write(json_obj)
     return
def process_source_json(args):
    """Load embeddings and position metadata from a JSON-lines file.

    Reads ``args["source_json"]`` and turns the source and target embeddings
    into dask arrays, passing them through ``preprocessing.scale`` when
    ``args["normalize"]`` is truthy.

    Returns:
        A tuple ``(source_embeddings, target_embeddings, story_ids,
        absolute_positions, story_ids_and_pos, story_id_map, positions_map)``.
        The two maps give, for each story id and each absolute position, the
        row indices of the matching records; ``absolute_positions`` is a
        NumPy array and ``story_ids_and_pos`` holds ``"<story>_<position>"``
        strings.
    """
    # Map story id and position to row indices so they can be extracted and
    # analysed separately.
    story_id_map = defaultdict(list)
    positions_map = defaultdict(list)
    story_ids = []
    absolute_positions = []
    story_ids_and_pos = []
    source_embeddings = []
    target_embeddings = []

    with jsonlines.open(args["source_json"], mode='r') as reader:
        for row_idx, record in enumerate(reader):
            meta = record["metadata"]
            story_id = meta["story_id"]
            abs_pos = meta["absolute_position"]

            story_id_map[story_id].append(row_idx)
            positions_map[abs_pos].append(row_idx)

            story_ids.append(story_id)
            absolute_positions.append(abs_pos)
            story_ids_and_pos.append(f"{story_id}_{abs_pos}")

            source_embeddings.append(record["source_embeddings"])
            target_embeddings.append(record["target_embeddings"])

    chunk_shape = (1000, 1000)
    source_embeddings_arr = da.from_array(source_embeddings,
                                          chunks=chunk_shape)
    target_embeddings_arr = da.from_array(target_embeddings,
                                          chunks=chunk_shape)

    if args["normalize"]:
        source_embeddings_arr = da.from_array(
            preprocessing.scale(source_embeddings_arr), chunks=chunk_shape)
        target_embeddings_arr = da.from_array(
            preprocessing.scale(target_embeddings_arr), chunks=chunk_shape)

    return (source_embeddings_arr, target_embeddings_arr, story_ids,
            np.array(absolute_positions), story_ids_and_pos, story_id_map,
            positions_map)