Пример #1
0
 def __iterations_compute(self, x_current, y_current, z_current, mode, coord):
     """Iterate the x/y/z update for ``coord`` and return the decoded result.

     One update step is applied per item yielded by
     ``self.__select_items(coord)``; every step truncates the new x, y and z
     values to ``int``.  The final values are passed through ``decoding``
     together with ``self.__resolution``.

     NOTE(review): the shift-and-add form looks like a CORDIC iteration with
     each item used as the shift amount -- confirm against the selector
     helpers, which are not visible here.

     Returns:
         Tuple ``(x, y, z)`` of decoded values.
     """
     steps = self.__select_items(coord)
     u = self.__select_u(coord)
     x_acc, y_acc, z_acc = x_current, y_current, z_current
     for step in steps:
         direction = self.__select_d(mode, y_acc, z_acc)
         delta = self.__select_f(coord, direction, step)
         # All three updates read the values of the previous step, so they
         # are evaluated together before any of them is committed.
         x_acc, y_acc, z_acc = (
             int(x_acc - u * ((direction * y_acc) / (2 ** step))),
             int(y_acc + ((direction * x_acc) / (2 ** step))),
             int(z_acc - delta),
         )
     return (
         decoding(x_acc, self.__resolution),
         decoding(y_acc, self.__resolution),
         decoding(z_acc, self.__resolution),
     )
Пример #2
0
def evaluate(model, criterion, data_loader, file_path, mode):
    """
    Run ``model`` over ``data_loader`` and write the decoded predictions.

    mode eval:
    eval on development set and compute P/R/F1, called between training.
    The prediction file and its zip are removed afterwards and
    ``(precision, recall, f1)`` is returned.
    mode predict:
    eval on development / test set, then write predictions to
    ``predictions.json`` (and the zip produced by
    ``write_prediction_results``) under the ``args.data_path`` dir for later
    submission or evaluation. Nothing is returned.

    ``file_path`` is a JSON-lines file with one example per line; an
    ``id2spo.json`` file is read from the same directory.

    Raises:
        Exception: if ``mode`` is neither ``"eval"`` nor ``"predict"``.
    """
    # Reject an unknown mode before running the model or writing any file.
    if mode not in ("eval", "predict"):
        raise Exception("wrong mode for eval func")

    with open(file_path, "r", encoding="utf-8") as fp:
        example_all = [json.loads(line) for line in fp]
    id2spo_path = os.path.join(os.path.dirname(file_path), "id2spo.json")
    with open(id2spo_path, 'r', encoding='utf8') as fp:
        id2spo = json.load(fp)

    model.eval()
    loss_all = 0
    eval_steps = 0
    formatted_outputs = []
    current_idx = 0
    for batch in tqdm(data_loader, total=len(data_loader)):
        eval_steps += 1
        input_ids, seq_len, tok_to_orig_start_index, tok_to_orig_end_index, labels = batch
        logits = model(input_ids=input_ids)
        # Positions whose id is 0, 1 or 2 are excluded from the loss
        # (presumably special/padding tokens -- verify against the vocab).
        mask = (input_ids != 0).logical_and((input_ids != 1)).logical_and(
            (input_ids != 2))
        loss = criterion(logits, labels, mask)
        loss_all += loss.numpy().item()
        probs = F.sigmoid(logits)
        batch_size = len(logits)
        # Examples are consumed in file order, one slice per batch.
        formatted_outputs.extend(
            decoding(example_all[current_idx:current_idx + batch_size],
                     id2spo, probs.numpy(), seq_len.numpy(),
                     tok_to_orig_start_index.numpy(),
                     tok_to_orig_end_index.numpy()))
        current_idx += batch_size
    loss_avg = loss_all / eval_steps
    print("eval loss: %f" % (loss_avg))

    if mode == "predict":
        predict_file_path = os.path.join(args.data_path, 'predictions.json')
    else:
        predict_file_path = os.path.join(args.data_path, 'predict_eval.json')

    predict_zipfile_path = write_prediction_results(formatted_outputs,
                                                    predict_file_path)

    if mode == "eval":
        precision, recall, f1 = get_precision_recall_f1(
            file_path, predict_zipfile_path)
        # Remove the temporary files directly instead of shelling out to
        # `rm`: portable and safe for paths with spaces or shell characters.
        for path in (predict_file_path, predict_zipfile_path):
            try:
                os.remove(path)
            except OSError as err:
                # Cleanup is best effort; the metrics are still valid.
                print("failed to remove %s: %s" % (path, err))
        return precision, recall, f1
Пример #3
0
def predict(args, ext_model, cls_model, tokenizer, ext_id2label, cls_id2label):
    """Interactively extract aspects/opinions and classify their sentiment.

    Reads lines from standard input until the user types ``quit``.  For each
    line, ``ext_model`` tags the characters, ``decoding`` turns the tags into
    aspect/opinion groups, and ``cls_model`` predicts a polarity per aspect.
    The results are shown with ``format_print``.

    Args:
        args: Needs ``ext_max_seq_len`` and ``cls_max_seq_len``.
        ext_model: Token-classification model used for extraction.
        cls_model: Sequence-classification model used for polarity.
        tokenizer: Tokenizer shared by both models.
        ext_id2label: Maps extraction label ids to tag strings.
        cls_id2label: Maps classification label ids to polarity labels.
    """
    ext_model.eval()
    cls_model.eval()

    while True:
        # Strip before testing so whitespace-only input is skipped (rather
        # than reaching the tokenizer as an empty sequence) and " quit "
        # still exits.
        input_text = input("input text: \n").strip()
        if not input_text:
            continue
        if input_text == "quit":
            break

        input_text = input_text.replace(" ", "")
        # processing input text
        encoded_inputs = tokenizer(list(input_text),
                                   is_split_into_words=True,
                                   max_seq_len=args.ext_max_seq_len)
        input_ids = paddle.to_tensor([encoded_inputs["input_ids"]])
        token_type_ids = paddle.to_tensor([encoded_inputs["token_type_ids"]])

        # extract aspect and opinion words
        logits = ext_model(input_ids, token_type_ids=token_type_ids)
        predictions = logits.argmax(axis=2).numpy()[0]
        # Drop the first and last predicted tags (presumably the special
        # tokens added by the tokenizer -- verify).
        tag_seq = [ext_id2label[idx] for idx in predictions][1:-1]

        aps = decoding(input_text[:args.ext_max_seq_len - 2], tag_seq)

        # predict sentiment for aspect with cls_model
        results = []
        for ap in aps:
            aspect = ap[0]
            opinion_words = list(set(ap[1:]))
            aspect_text = concate_aspect_and_opinion(input_text, aspect,
                                                     opinion_words)

            encoded_inputs = tokenizer(aspect_text,
                                       text_pair=input_text,
                                       max_seq_len=args.cls_max_seq_len,
                                       return_length=True)
            input_ids = paddle.to_tensor([encoded_inputs["input_ids"]])
            token_type_ids = paddle.to_tensor(
                [encoded_inputs["token_type_ids"]])

            logits = cls_model(input_ids, token_type_ids=token_type_ids)
            prediction = logits.argmax(axis=1).numpy()[0]

            results.append({
                "aspect": aspect,
                "opinions": opinion_words,
                "sentiment_polarity": cls_id2label[prediction]
            })

        format_print(results)
Пример #4
0
    def predict_ext(self, args):
        """Run the extraction predictor over the test file.

        Loads ``args.test_path``, converts every example to features, feeds
        the batches through ``self.ext_predictor`` and decodes the predicted
        tag sequences into aspect/opinion groups.

        Returns:
            A list of dicts with keys ``id`` (``"<example>_<aspect>"``),
            ``aspect``, ``opinions``, ``text`` and ``aspect_text``.
        """
        raw_ds = load_dataset(read_test_file,
                              data_path=args.test_path,
                              lazy=False)
        to_feature = partial(convert_example_to_feature_ext,
                             tokenizer=self.tokenizer,
                             label2id=self.ext_label2id,
                             max_seq_len=args.ext_max_seq_len,
                             is_test=True)
        feature_ds = copy.copy(raw_ds).map(to_feature, lazy=False)
        batches = [
            feature_ds[begin:begin + args.batch_size]
            for begin in range(0, len(feature_ds), args.batch_size)
        ]

        # Pads input ids and token type ids, stacks the sequence lengths.
        collate = Tuple(
            Pad(axis=0, pad_val=self.tokenizer.pad_token_id, dtype="int64"),
            Pad(axis=0,
                pad_val=self.tokenizer.pad_token_type_id,
                dtype="int64"), Stack(dtype="int64"))

        results = []
        for bid, samples in enumerate(batches):
            input_ids, token_type_ids, seq_lens = collate(samples)
            self.ext_input_handles[0].copy_from_cpu(input_ids)
            self.ext_input_handles[1].copy_from_cpu(token_type_ids)
            self.ext_predictor.run()
            logits = self.ext_output_hanle.copy_to_cpu()

            predictions = logits.argmax(axis=2)
            # Number the examples by their position in the original dataset.
            first_idx = bid * args.batch_size
            for idx, (seq_len, prediction) in enumerate(
                    zip(seq_lens, predictions), start=first_idx):
                # Keep the first `seq_len` tags, then drop both end tags.
                tag_seq = [
                    self.ext_id2label[label_id]
                    for label_id in prediction[:seq_len][1:-1]
                ]
                text = raw_ds[idx]["text"]
                aps = decoding(text[:args.ext_max_seq_len - 2], tag_seq)
                for aid, ap in enumerate(aps):
                    aspect = ap[0]
                    opinions = list(set(ap[1:]))
                    results.append({
                        "id": f"{idx}_{aid}",
                        "aspect": aspect,
                        "opinions": opinions,
                        "text": text,
                        "aspect_text": self._concate_aspect_and_opinion(
                            text, aspect, opinions)
                    })
        return results
Пример #5
0
def show_results(resolution=14):
    """
    Decode the simulated outputs and plot them against the reference values.

    Files are read in alphabetical order
    1. Coordinate System
    2. Enable
    3. Mode
    4. X Python Values - Compute with numpy
    5. X VHDL Values
    6. Y Python Values - Compute with numpy
    7. Y VHDL Values
    8. Z Python Values - Compute with numpy
    9. Z VHDL Values

    Only the entries at indices 4, 6 and 8 are decoded for x, y and z.
    """
    (sin_python, cos_python, arctan_python, sinh_python, cosh_python,
     arctanh_python, axes_circular_python, axes_hyperbolic_python,
     axes_arctanh_python) = create_files_to_simulate()
    data_output = read_files()
    coord, enable, mode = data_output[0], data_output[1], data_output[2]
    x, y, z = data_output[4], data_output[6], data_output[8]

    sin, cos, arctan = [], [], []
    sinh, cosh, arctanh = [], [], []
    for index, enabled in enumerate(enable):
        if enabled != 1:  # Skip samples where the module is disabled
            continue
        circular = coord[index] == 0  # Otherwise hyperbolic coordinates
        rotation = mode[index] == 0  # Otherwise vectoring mode
        if rotation:
            # Rotation mode yields the (hyperbolic) cosine on x, sine on y.
            x_bucket, y_bucket = (cos, sin) if circular else (cosh, sinh)
            x_bucket.append(decoding(x[index], resolution))
            y_bucket.append(decoding(y[index], resolution))
        else:
            # Vectoring mode yields the (hyperbolic) arctangent on z.
            z_bucket = arctan if circular else arctanh
            z_bucket.append(rad_to_deg(decoding(z[index], resolution)))

    plots = (
        (cos, cos_python, axes_circular_python, 'Cos'),
        (sin, sin_python, axes_circular_python, 'Sin'),
        (arctan, arctan_python, axes_circular_python, 'Arctan'),
        (sinh, sinh_python, axes_hyperbolic_python, 'Sinh'),
        (cosh, cosh_python, axes_hyperbolic_python, 'Cosh'),
        (arctanh, arctanh_python, axes_arctanh_python, 'Arctanh'),
    )
    for measured, reference, axes, title in plots:
        plot_results(measured, reference, axes, title)
Пример #6
0
def evaluate(model, criterion, data_loader, test_loss, file_path, mode):
    """
    Run ``model`` over ``data_loader`` and write the decoded predictions.

    mode eval:
    eval on development set and compute P/R/F1, called between training.
    The prediction file and its zip are removed afterwards and
    ``(precision, recall, f1)`` is returned.
    mode predict:
    eval on development / test set, then write predictions to
    ``predictions.json`` (and the zip produced by
    ``write_prediction_results``) under the ``args.data_path`` dir for later
    submission or evaluation. Nothing is returned.

    ``test_loss`` is called with each batch loss and its ``.result()`` is
    accumulated into the reported average.

    Raises:
        Exception: if ``mode`` is neither ``"eval"`` nor ``"predict"``.
    """
    # Reject an unknown mode before running the model or writing any file.
    if mode not in ("eval", "predict"):
        raise Exception("wrong mode for eval func")

    def _concat(chunks):
        # One concatenation at the end instead of np.append per batch, which
        # re-copies everything collected so far. None means "no batches".
        return np.concatenate(chunks, axis=0) if chunks else None

    probs_chunks = []
    seq_len_chunks = []
    start_index_chunks = []
    end_index_chunks = []
    loss_all = 0
    eval_steps = 0
    for batch in tqdm(data_loader):
        eval_steps += 1
        input_ids, seq_len, tok_to_orig_start_index, tok_to_orig_end_index, labels = batch
        logits = model(input_ids=input_ids)
        # Positions whose id is 0, 1 or 2 are excluded from the loss
        # (presumably special/padding tokens -- verify against the vocab).
        mask = (input_ids != 0).logical_and((input_ids != 1)).logical_and((input_ids != 2))
        loss = criterion((logits, labels, mask))
        loss_all += test_loss(loss).result()
        # NOTE(review): the raw logits are used as probabilities here (no
        # sigmoid is applied in this function) -- confirm the model output.
        probs_chunks.append(logits.numpy())
        seq_len_chunks.append(seq_len.numpy())
        start_index_chunks.append(tok_to_orig_start_index.numpy())
        end_index_chunks.append(tok_to_orig_end_index.numpy())
    loss_avg = loss_all / eval_steps
    print("eval loss: %f" % (loss_avg))

    probs_all = _concat(probs_chunks)
    seq_len_all = _concat(seq_len_chunks)
    tok_to_orig_start_index_all = _concat(start_index_chunks)
    tok_to_orig_end_index_all = _concat(end_index_chunks)

    id2spo_path = os.path.join(os.path.dirname(file_path), "id2spo.json")
    with open(id2spo_path, 'r', encoding='utf8') as fp:
        id2spo = json.load(fp)
    formatted_outputs = decoding(file_path, id2spo, probs_all, seq_len_all,
                                 tok_to_orig_start_index_all,
                                 tok_to_orig_end_index_all)
    if mode == "predict":
        predict_file_path = os.path.join(args.data_path, 'predictions.json')
    else:
        predict_file_path = os.path.join(args.data_path, 'predict_eval.json')

    predict_zipfile_path = write_prediction_results(formatted_outputs,
                                                    predict_file_path)

    if mode == "eval":
        precision, recall, f1 = get_precision_recall_f1(file_path,
                                                        predict_zipfile_path)
        # Remove the temporary files directly instead of shelling out to
        # `rm`: portable and safe for paths with spaces or shell characters.
        for path in (predict_file_path, predict_zipfile_path):
            try:
                os.remove(path)
            except OSError as err:
                # Cleanup is best effort; the metrics are still valid.
                print("failed to remove %s: %s" % (path, err))
        return precision, recall, f1
Пример #7
0
def doccano2SA(doccano_file,
               save_ext_dir,
               save_cls_dir,
               splits=(0.8, 0.9),
               is_shuffle=True):
    """
        @Description: Convert a doccano export to the ext/cls data formats used by this application.
        @Param doccano_file: The annotated file exported from the doccano labeling platform (one JSON object per line with "data" and "label" keys).
        @Param save_ext_dir: The directory where the ext data is saved.
        @Param save_cls_dir: The directory where the cls data is saved.
        @Param splits: Train/dev/test thresholds. Either empty (no split, a single doccano.txt is written) or two values with 0 < splits[0] < splits[1] < 1.
        @Param is_shuffle: Whether to shuffle data.
        @Raises ValueError: If doccano_file does not exist or splits is malformed.
    """
    if not os.path.exists(doccano_file):
        raise ValueError("Please input the correct path of doccano file.")

    # Validate the arguments before touching the filesystem so a bad call
    # does not leave empty output directories behind.
    if len(splits) not in (0, 2):
        raise ValueError("Only []/ len(splits)==2 accepted for splits.")

    if splits and not 0 < splits[0] < splits[1] < 1:
        raise ValueError(
            "Please set correct splits, the element in it should be in (0,1), and splits[1]>splits[0]."
        )

    os.makedirs(save_ext_dir, exist_ok=True)
    os.makedirs(save_cls_dir, exist_ok=True)

    def label_ext_with_label_term(ext_label, start, end, tag):
        # "Opinion" spans get Opinion tags; every other tag is an aspect.
        kind = "Opinion" if tag == "Opinion" else "Aspect"
        ext_label[start] = f"B-{kind}"
        for i in range(start + 1, end):
            ext_label[i] = f"I-{kind}"

    def save_partitions(examples, order, save_dir, name):
        # Write `examples` in `order`, whole or split into train/dev/test.
        if not splits:
            path = os.path.join(save_dir, "doccano.txt")
            save_examples(examples, path, order)
            print(f"\n{name}: save data to {path}.")
            return
        th1 = int(len(examples) * splits[0])
        th2 = int(len(examples) * splits[1])
        partitions = (("train", order[:th1]), ("dev", order[th1:th2]),
                      ("test", order[th2:]))
        saved = []
        for part, subset in partitions:
            path = os.path.join(save_dir, f"{part}.txt")
            save_examples(examples, path, subset)
            saved.append((part, path))
        for position, (part, path) in enumerate(saved):
            lead = "\n" if position == 0 else ""
            print(f"{lead}{name}: save {part} data to {path}.")

    ext_examples, cls_examples = [], []
    # start to label for ext and cls data
    with open(doccano_file, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            items = json.loads(line)
            text, label_terms = items["data"], items["label"]
            # label ext data with label_terms
            ext_label = ["O"] * len(text)
            aspect_mapper = {}
            for start, end, tag in label_terms:
                label_ext_with_label_term(ext_label, start, end, tag)
                if tag == "Pos-Aspect":
                    aspect_mapper[text[start:end]] = "1"
                elif tag == "Neg-Aspect":
                    aspect_mapper[text[start:end]] = "0"
            ext_examples.append((text, " ".join(ext_label)))
            # label cls data
            for ap in decoding(text, ext_label):
                aspect, opinions = ap[0], list(set(ap[1:]))
                if aspect not in aspect_mapper:
                    continue
                aspect_text = concate_aspect_and_opinion(
                    text, aspect, opinions)
                cls_examples.append((aspect_mapper[aspect], aspect_text, text))

    # index for saving data
    ext_idx = np.arange(len(ext_examples))
    cls_idx = np.arange(len(cls_examples))

    if is_shuffle:
        ext_idx = np.random.permutation(ext_idx)
        cls_idx = np.random.permutation(cls_idx)

    save_partitions(ext_examples, ext_idx, save_ext_dir, "ext")
    save_partitions(cls_examples, cls_idx, save_cls_dir, "cls")

    # save ext dict
    ext_dict_path = os.path.join(save_ext_dir, "label.dict")
    cls_dict_path = os.path.join(save_cls_dir, "label.dict")
    save_dict(ext_dict_path, "ext")
    save_dict(cls_dict_path, "cls")
    print(f"\next: save dict to {ext_dict_path}.")
    print(f"cls: save dict to {cls_dict_path}.")
0
def predict_ext(args):
    """Run the extraction model over the test file.

    Loads the label dict, tokenizer and test data, restores the extraction
    model from ``args.ext_model_path`` and decodes the predicted tag
    sequences into aspect/opinion groups.

    Returns:
        A list of dicts with keys ``id`` (``"<example>_<aspect>"``),
        ``aspect``, ``opinions``, ``text`` and ``aspect_text``.
    """
    # load dict
    model_name = "skep_ernie_1.0_large_ch"
    ext_label2id, ext_id2label = load_dict(args.ext_label_path)

    tokenizer = SkepTokenizer.from_pretrained(model_name)
    raw_ds = load_dataset(read_test_file,
                          data_path=args.test_path,
                          lazy=False)
    to_feature = partial(convert_example_to_feature_ext,
                         tokenizer=tokenizer,
                         label2id=ext_label2id,
                         max_seq_len=args.ext_max_seq_len,
                         is_test=True)
    feature_ds = copy.copy(raw_ds).map(to_feature, lazy=False)

    # Pads input ids and token type ids, stacks the sequence lengths.
    collate = Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),
        Stack(dtype="int64"),
    )

    def batchify_fn(samples):
        return collate(samples)

    sampler = paddle.io.BatchSampler(feature_ds,
                                     batch_size=args.batch_size,
                                     shuffle=False)
    loader = paddle.io.DataLoader(feature_ds,
                                  batch_sampler=sampler,
                                  collate_fn=batchify_fn)
    print("test data loaded.")

    # load ext model
    ext_state_dict = paddle.load(args.ext_model_path)
    ext_model = SkepForTokenClassification.from_pretrained(
        model_name, num_classes=len(ext_label2id))
    ext_model.load_dict(ext_state_dict)
    print("extraction model loaded.")

    ext_model.eval()
    results = []
    for bid, (input_ids, token_type_ids, seq_lens) in enumerate(loader):
        logits = ext_model(input_ids, token_type_ids=token_type_ids)
        predictions = logits.argmax(axis=2).numpy()

        # Number the examples by their position in the original dataset.
        first_idx = bid * args.batch_size
        for idx, (seq_len, prediction) in enumerate(
                zip(seq_lens, predictions), start=first_idx):
            # Keep the first `seq_len` tags, then drop both end tags.
            tag_seq = [
                ext_id2label[label_id]
                for label_id in prediction[:seq_len][1:-1]
            ]
            text = raw_ds[idx]["text"]
            aps = decoding(text[:args.ext_max_seq_len - 2], tag_seq)
            for aid, ap in enumerate(aps):
                aspect = ap[0]
                opinions = list(set(ap[1:]))
                results.append({
                    "id": f"{idx}_{aid}",
                    "aspect": aspect,
                    "opinions": opinions,
                    "text": text,
                    "aspect_text": concate_aspect_and_opinion(
                        text, aspect, opinions)
                })

    return results
Пример #9
0
def evaluate(model, criterion, data_loader, file_path, mode, logger):
    """
    Run ``model`` over ``data_loader`` and write the decoded predictions.

    mode eval:
    eval on development set and compute P/R/F1, called between training.
    The prediction file and its zip are removed afterwards and
    ``(precision, recall, f1)`` is returned.
    mode predict:
    eval on development / test set, then write predictions to
    ``predictions.json`` (and the zip produced by
    ``write_prediction_results``) under the ``args.data_path`` dir for later
    submission or evaluation. Nothing is returned.

    Progress and the average loss are reported through ``logger``.

    Raises:
        Exception: if ``mode`` is neither ``"eval"`` nor ``"predict"``.
    """
    # Reject an unknown mode before running the model or writing any file.
    if mode not in ("eval", "predict"):
        logger.debug("wrong mode for eval func")
        raise Exception("wrong mode for eval func")

    def _concat(chunks):
        # One concatenation at the end instead of np.append per batch, which
        # re-copies everything collected so far. None means "no batches".
        return np.concatenate(chunks, axis=0) if chunks else None

    model.eval()
    probs_chunks = []
    seq_len_chunks = []
    start_index_chunks = []
    end_index_chunks = []
    loss_all = 0
    eval_steps = 0
    logger.info(
        "\n----------------------------------IN Evaluate func-----------------------------------\n"
    )
    for batch in tqdm(data_loader, total=len(data_loader)):
        eval_steps += 1
        input_ids, seq_len, tok_to_orig_start_index, tok_to_orig_end_index, labels = batch

        if args.device == 'cuda':
            input_ids = input_ids.cuda()
            labels = labels.cuda()

        # No gradients are needed for evaluation; skipping the autograd
        # graph saves memory without changing the outputs.
        with torch.no_grad():
            logits = model(input_ids=input_ids)
            # Positions whose id is 0, 1 or 2 are excluded from the loss
            # (presumably special/padding tokens -- verify against the vocab).
            mask = (input_ids != 0) & (input_ids != 1) & (input_ids != 2)
            loss = criterion(logits, labels, mask)
            probs = torch.sigmoid(logits).cpu()
        loss_all += loss.detach().cpu().numpy().item()
        probs_chunks.append(probs.numpy())
        seq_len_chunks.append(seq_len.numpy())
        start_index_chunks.append(tok_to_orig_start_index.numpy())
        end_index_chunks.append(tok_to_orig_end_index.numpy())
    loss_avg = loss_all / eval_steps
    logger.info("eval loss: %f", loss_avg)

    probs_all = _concat(probs_chunks)
    seq_len_all = _concat(seq_len_chunks)
    tok_to_orig_start_index_all = _concat(start_index_chunks)
    tok_to_orig_end_index_all = _concat(end_index_chunks)

    id2spo_path = os.path.join(os.path.dirname(file_path), "id2spo.json")
    with open(id2spo_path, 'r', encoding='utf8') as fp:
        id2spo = json.load(fp)
    formatted_outputs = decoding(file_path, id2spo, probs_all, seq_len_all,
                                 tok_to_orig_start_index_all,
                                 tok_to_orig_end_index_all)
    if mode == "predict":
        predict_file_path = os.path.join(args.data_path, 'predictions.json')
    else:
        predict_file_path = os.path.join(args.data_path, 'predict_eval.json')

    predict_zipfile_path = write_prediction_results(formatted_outputs,
                                                    predict_file_path)

    if mode == "eval":
        precision, recall, f1 = get_precision_recall_f1(
            file_path, predict_zipfile_path)
        # Remove the temporary files directly instead of shelling out to
        # `rm`: portable and safe for paths with spaces or shell characters.
        for path in (predict_file_path, predict_zipfile_path):
            try:
                os.remove(path)
            except OSError as err:
                # Cleanup is best effort; the metrics are still valid.
                logger.warning("failed to remove %s: %s", path, err)
        return precision, recall, f1
    logger.info("Finish evaluating.")