Example #1
    def crf_evaluate(self, verbose=False, labels=False):
        if labels:
            lab = labels
        else:
            lab = self.crf.classes_
            lab.remove("O")
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            print("Dev Results\n===========")
            dev_args = (self.dev_labels, self.dev_predicted)
            kwargs = {"average": "weighted", "labels": lab}
            if verbose:
                print("Precision:",
                      metrics.flat_precision_score(*dev_args, **kwargs))
                print("Recall:",
                      metrics.flat_recall_score(*dev_args, **kwargs))
            print("F1:", metrics.flat_f1_score(*dev_args, **kwargs))
            test_args = (self.test_labels, self.test_predicted)
            print("\nTest Results\n============")
            if verbose:
                print("Precision:",
                      metrics.flat_precision_score(*test_args, **kwargs))
                print("Recall:",
                      metrics.flat_recall_score(*test_args, **kwargs))
            print("F1:", metrics.flat_f1_score(*test_args, **kwargs))
Example #2

    def eval(self, sentence_result, y_data, progress=False):

        slot_result, domain_result = list(zip(*y_data))

        y_pred, y_pred_target = self.predict(sentence_result,
                                             progress=progress)
        y_test = slot_result
        y_target = np.array([[x] for x in domain_result])
        y_pred_target = np.array([[x] for x in y_pred_target])

        # print(y_target.shape)
        # print(y_pred_target.shape)

        return OrderedDict((
            ('accuracy', metrics.flat_accuracy_score(y_test, y_pred)),
            ('precision',
             metrics.flat_precision_score(y_test, y_pred, average='weighted')),
            ('recall',
             metrics.flat_recall_score(y_test, y_pred, average='weighted')),
            ('f1', metrics.flat_f1_score(y_test, y_pred, average='weighted')),
            ('softmax_accuracy',
             metrics.flat_accuracy_score(y_target, y_pred_target)),
            ('softmax_precision',
             metrics.flat_precision_score(y_target,
                                          y_pred_target,
                                          average='weighted')),
            ('softmax_recall',
             metrics.flat_recall_score(y_target,
                                       y_pred_target,
                                       average='weighted')),
            ('softmax_f1',
             metrics.flat_f1_score(y_target, y_pred_target,
                                   average='weighted')),
        ))
Example #3
def evaluate_rnn(y, preds):
    """Because the RNN sequences get clipped as necessary based
    on the `max_length` parameter, they have to be realigned to
    get a classification report. This function does that, building
    in the assumption that any clipped tokens are assigned an
    incorrect label.

    Parameters
    ----------
    y : list of list of labels
    preds : list of list of labels

    Both of these lists need to have the same length, but the
    sequences they contain can vary in length.
    """
    labels = sorted({c for ex in y for c in ex})
    new_preds = []
    for gold, pred in zip(y, preds):
        delta = len(gold) - len(pred)
        if delta > 0:
            # Make a *wrong* guess for these clipped tokens:
            pred += [random.choice(list(set(labels)-{label}))
                     for label in gold[-delta: ]]
        new_preds.append(pred)
    labels = sorted({cls for ex in y for cls in ex} - {'OTHER'})
    data = {}
    data['classification_report'] = flat_classification_report(y, new_preds)
    data['f1_macro'] = flat_f1_score(y, new_preds, average='macro')
    data['f1_micro'] = flat_f1_score(y, new_preds, average='micro')
    data['f1'] = flat_f1_score(y, new_preds, average=None)
    data['precision_score'] = flat_precision_score(y, new_preds, average=None)
    data['recall_score'] = flat_recall_score(y, new_preds, average=None)
    data['accuracy'] = flat_accuracy_score(y, new_preds)
    data['sequence_accuracy_score'] = sequence_accuracy_score(y, new_preds)
    return data
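
A minimal usage sketch (hedged; it assumes `random` and the flat_*/sequence helpers are imported from sklearn_crfsuite.metrics at module level, which evaluate_rnn requires anyway):

import random
from sklearn_crfsuite.metrics import (flat_classification_report,
                                      flat_f1_score, flat_precision_score,
                                      flat_recall_score, flat_accuracy_score,
                                      sequence_accuracy_score)

# Toy gold sequences and predictions; the second prediction lost its last
# token, so evaluate_rnn pads it with a deliberately wrong label before scoring.
y = [['B', 'I', 'O'], ['B', 'O', 'O']]
preds = [['B', 'I', 'O'], ['B', 'O']]

scores = evaluate_rnn(y, preds)
print(scores['f1_macro'])
print(scores['sequence_accuracy_score'])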
Example #4
def model_testing(Y_test, output_path, testing_start_date, testing_end_date,
                  chain_len):
    X_test = loadX(testing_start_date, testing_end_date)
    X_test = dataFillNA(X_test)  # fill na
    tmp_columns = X_test.columns.tolist()
    tmp_columns.remove('date')

    all_data = X_test.merge(Y_test, on='date', how='inner')
    X_test = all_data[tmp_columns]
    Y_test = all_data['Y']
    test_dates = all_data['date']
    del all_data
    gc.collect()

    X_test = Xpoint2Set(X_test, chain_len)
    Y_test_pair = Ypoint2Set(Y_test, chain_len)

    with open(output_path + 'crf_model.pkl', 'rb') as tmp_fi:  # load model
        crf = pickle.load(tmp_fi)

    y_pred = crf.predict(X_test)

    # test pair
    labels = ['-1.0', '1.0']
    print(
        metrics.flat_classification_report(Y_test_pair,
                                           y_pred,
                                           labels=labels,
                                           digits=3))

    # test single
    y_pred_single = y_pred[0].copy()
    y_pred_single.pop(-1)
    y_pred_single.extend([tmp_y[1] for tmp_y in y_pred])
    # y_pred_single.insert(0, y_pred[0][0])
    y_real_single = Y_test.astype('str').tolist()
    prsc = precision_score(y_real_single,
                           y_pred_single,
                           labels=labels,
                           average='micro')
    print('%s to %s micro precision: %f' %
          (testing_start_date, testing_end_date, prsc))
    print('f1 score: %f, precision: %f' %
          (metrics.flat_f1_score(
              Y_test_pair, y_pred, labels=labels, average='weighted'),
           metrics.flat_precision_score(
               Y_test_pair, y_pred, labels=labels, average='micro')))

    prediction = pd.DataFrame(test_dates)
    prediction.loc[:, 'predict'] = y_pred_single

    return prediction, prsc
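
For clarity, a toy illustration of the pair-to-single reconstruction above, assuming chain_len is 2 so that each predicted chain covers two consecutive dates:

# Hypothetical pairwise predictions over four consecutive dates (chain_len == 2).
y_pred = [['1.0', '-1.0'], ['-1.0', '1.0'], ['1.0', '1.0']]

y_pred_single = y_pred[0].copy()
y_pred_single.pop(-1)                                 # keep only the first element of chain 0
y_pred_single.extend([chain[1] for chain in y_pred])  # then the second element of every chain
print(y_pred_single)  # ['1.0', '-1.0', '1.0', '1.0']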
Example #5
def report(pred, truth):
    _pred = VecContext.y2lab(pred)
    _test = VecContext.y2lab(truth)
    print(
        metrics.flat_classification_report(_test,
                                           _pred,
                                           labels=('I', 'E'),
                                           digits=4))
    label = 'E'
    P = metrics.flat_precision_score(_test, _pred, pos_label=label)
    R = metrics.flat_recall_score(_test, _pred, pos_label=label)
    f1 = metrics.flat_f1_score(_test, _pred, pos_label=label)
    return {'P': P, 'R': R, 'f1': f1}
Example #6

    def evaluate(self, output_path):
        loss = self._model.evaluate(self._data_reader.test_X,
                                    self._data_reader.test_y)
        print('Loss is: %f' % loss)

        all_predicted_labels = []
        all_true_labels = []
        for i, _test_instance in enumerate(self._data_reader.test_X):
            test_prediction = self._model.predict(
                _test_instance.reshape(
                    1, self._data_reader.max_train_sentence_length))[0]

            predicted_labels, true_labels = [], []
            for encoded_true_label_array, encoded_test_label_array in zip(
                    self._data_reader.test_y[i], test_prediction):
                contains_all_zeros = not numpy.any(encoded_true_label_array)
                if not contains_all_zeros:
                    predicted_labels.append(
                        self._data_reader.decode_single_label(
                            encoded_test_label_array))
                    true_labels.append(
                        self._data_reader.decode_single_label(
                            encoded_true_label_array))

            all_predicted_labels.append(predicted_labels)
            all_true_labels.append(true_labels)

        classification_report = metrics.flat_classification_report(
            all_true_labels,
            all_predicted_labels,
            labels=self._data_reader.labels)

        sequence_accuracy = metrics.sequence_accuracy_score(
            all_true_labels, all_predicted_labels)

        precision = metrics.flat_precision_score(all_true_labels,
                                                 all_predicted_labels,
                                                 average='weighted')

        recall = metrics.flat_recall_score(all_true_labels,
                                           all_predicted_labels,
                                           average='weighted')

        _save_metrics(output_path=output_path,
                      classification_report=classification_report,
                      sequence_accuracy=sequence_accuracy,
                      precision=precision,
                      recall=recall)

        return classification_report, sequence_accuracy, precision, recall
Example #7
def report(pred, truth, csv_table, clf_name):
    label = 'E'
    pred_lab = VecContext.y2lab(pred)
    truth_lab = VecContext.y2lab(truth)
    P = metrics.flat_precision_score(truth_lab, pred_lab, pos_label=label)
    R = metrics.flat_recall_score(truth_lab, pred_lab, pos_label=label)
    f1 = metrics.flat_f1_score(truth_lab, pred_lab, pos_label=label)
    print(clf_name)
    print(
        metrics.flat_classification_report(truth_lab,
                                           pred_lab,
                                           labels=('I', 'E'),
                                           digits=4))
    csv_table.writerow([clf_name, P, R, f1])
Example #8
    def eval(self, sentence_result, slot_result):
        """Evaluate the results."""
        y_pred = self.predict(sentence_result)
        y_test = slot_result
        return {
            'precision':
            metrics.flat_precision_score(y_test, y_pred, average='weighted'),
            'recall':
            metrics.flat_recall_score(y_test, y_pred, average='weighted'),
            'f1':
            metrics.flat_f1_score(y_test, y_pred, average='weighted'),
            'accuracy':
            metrics.flat_accuracy_score(y_test, y_pred),
        }
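
As a quick reference for what the "flat" metrics compute, a small sketch with made-up BIO tags (sklearn_crfsuite flattens the sequences to token level before scoring):

from sklearn_crfsuite import metrics

y_test = [['B-LOC', 'O', 'O'], ['B-PER', 'I-PER']]
y_pred = [['B-LOC', 'O', 'B-PER'], ['B-PER', 'I-PER']]

# 4 of the 5 flattened tokens match.
print(metrics.flat_accuracy_score(y_test, y_pred))                       # 0.8
print(metrics.flat_f1_score(y_test, y_pred, average='weighted'))
print(metrics.flat_precision_score(y_test, y_pred, average='weighted'))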
Example #9
def evaluate(dataset_name, data_iter, model, full_report=False):
  
  model.eval()
  total_corrects, avg_loss = 0, 0
  for batch in data_iter:
    text, target = batch.Phrase, batch.Sentiment

    output = model(text)
    loss = F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
    pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
    correct = pred.eq(target.view_as(pred)).sum().item()

    avg_loss += loss
    total_corrects += correct

  size = len(data_iter.dataset)
  avg_loss /= size
  accuracy = 100.0 * total_corrects/size
  print('  Evaluation on {} - loss: {:.6f}  acc: {:.4f}%({}/{})'.format(dataset_name,
                                                                     avg_loss, 
                                                                     accuracy, 
                                                                     total_corrects, 
                                                                     size))

  # Wrap each gold label and prediction in a single-element list so the
  # sequence-level flat_* metrics below can score this flat classifier output.
  # Note: `target` and `pred` hold only the last batch from the loop above.
  targetList = [[int(tar)] for tar in target]
  predList = [[pre[0]] for pre in pred.tolist()]
  

  if full_report:
    print(sklearn_crfsuite.metrics.flat_classification_report(targetList, predList, labels=[0,1,2,3,4]))
    print("accuracy_score", flat_accuracy_score(targetList, predList))

    print("precision_score", flat_precision_score(targetList, predList, average='weighted'))
    print("recall_score", flat_recall_score(targetList, predList, average='weighted'))
    print("f1_score", flat_f1_score(targetList, predList, average='weighted'))
  return accuracy
Example #10
File: CRFNER.py, Project: Root0110/ODU-NLP
def crf(test_loc, train_loc):
    test_sents = convertCONLLFormJustExtractionSemEval(test_loc)
    train_sents = convertCONLLFormJustExtractionSemEval(train_loc)

    #pprint(train_sents[0])
    #pprint(test_sents[0])

    X_train = [sent2features(s) for s in train_sents]
    y_train = [sent2labels(s) for s in train_sents]

    X_test = [sent2features(s) for s in test_sents]
    y_test = [sent2labels(s) for s in test_sents]

    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               c1=0.1,
                               c2=0.1,
                               max_iterations=100,
                               all_possible_transitions=True)
    crf.fit(X_train, y_train)

    labels = list(crf.classes_)
    labels.remove('O')
    #print(labels)
    pickle.dump(crf,
                open("/data/xwang/models_origin/linear-chain-crf.model.pickle",
                     "wb"),
                protocol=0,
                fix_imports=True)
    y_pred = crf.predict(X_test)

    sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
    f1_score = metrics.flat_f1_score(y_test,
                                     y_pred,
                                     average='weighted',
                                     labels=sorted_labels)
    recall = metrics.flat_recall_score(y_test,
                                       y_pred,
                                       average='weighted',
                                       labels=sorted_labels)
    precision = metrics.flat_precision_score(y_test,
                                             y_pred,
                                             average='weighted',
                                             labels=sorted_labels)
    #print(metrics.flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))
    return (f1_score, recall, precision)
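
As an aside, the sort key used for sorted_labels groups the B-/I- variants of each entity type together; a tiny illustration with made-up tags:

labels = ['B-PER', 'I-ORG', 'B-ORG', 'I-PER', 'B-LOC']
sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
print(sorted_labels)  # ['B-LOC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']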
Example #11
def evaluate_rnn(y, preds):
    """ Evaluate the RNN performance using various metrics.

  Parameters
  ----------
  y: list of list of labels
  preds: list of list of labels

  Both of these lists need to have the same length, but the
  sequences they contain can vary in length.

  Returns
  -------
  data: dict
  """

    labels = sorted({c for ex in y for c in ex})
    new_preds = []
    for gold, pred in zip(y, preds):
        delta = len(gold) - len(pred)
        if delta > 0:
            # Make a *wrong* guess for these clipped tokens:
            pred += [
                random.choice(list(set(labels) - {label}))
                for label in gold[-delta:]
            ]
        new_preds.append(pred)
    labels = sorted({cls for ex in y for cls in ex} - {"OTHER"})
    data = {}
    data["classification_report"] = flat_classification_report(y,
                                                               new_preds,
                                                               digits=3)
    data["f1_macro"] = flat_f1_score(y, new_preds, average="macro")
    data["f1_micro"] = flat_f1_score(y, new_preds, average="micro")
    data["f1"] = flat_f1_score(y, new_preds, average=None)
    data["precision_score"] = flat_precision_score(y, new_preds, average=None)
    data["recall_score"] = flat_recall_score(y, new_preds, average=None)
    data["accuracy"] = flat_accuracy_score(y, new_preds)
    data["sequence_accuracy_score"] = sequence_accuracy_score(y, new_preds)

    return data
Example #12
def get_crf_metrics(y_pred, y_true, labels):
    token_acc_score = round(metrics.flat_accuracy_score(y_true, y_pred), 2)
    token_recall_score = round(
        metrics.flat_recall_score(y_true,
                                  y_pred,
                                  average='weighted',
                                  labels=labels), 2)
    token_f1_score = round(
        metrics.flat_f1_score(y_true,
                              y_pred,
                              average='weighted',
                              labels=labels), 2)
    token_precision_score = round(
        metrics.flat_precision_score(y_true,
                                     y_pred,
                                     average='weighted',
                                     labels=labels), 2)
    report = metrics.flat_classification_report(y_true,
                                                y_pred,
                                                labels=labels,
                                                output_dict=True)
    report_df = pd.DataFrame(report).T
    report_df = report_df.round(2)
    cm_dict = metrics.performance_measure(y_true, y_pred)
    cm = np.array([[cm_dict['TN'], cm_dict['FP']],
                   [cm_dict['FN'], cm_dict['TP']]])
    support = cm_dict['FN'] + cm_dict['TP']
    res_d = {
        'accuracy': token_acc_score,
        'recall': token_recall_score,
        'f1_score': token_f1_score,
        'precision': token_precision_score,
        'support': support,
        'cm': cm,
        'report': report_df
    }
    return res_d
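
A minimal usage sketch, assuming the same module-level imports as the function above (pandas as pd, numpy as np, and a metrics module that provides both the flat_* helpers and performance_measure); the tag names are made up:

y_true = [['B-DRUG', 'O', 'O'], ['O', 'B-DRUG']]
y_pred = [['B-DRUG', 'O', 'B-DRUG'], ['O', 'B-DRUG']]

res = get_crf_metrics(y_pred, y_true, labels=['B-DRUG'])
print(res['precision'], res['recall'], res['f1_score'], res['support'])
print(res['report'])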
Example #13
print(crf.sent2features(conll.sentences[0])[0])
train_sents = conll.sentences[:40000]
test_sents = conll.sentences[40000:]
crf.X_train = [crf.sent2features(s) for s in train_sents]
crf.y_train = [crf.sent2labels(s) for s in train_sents]

crf.X_test = [crf.sent2features(s) for s in test_sents]
crf.y_test = [crf.sent2labels(s) for s in test_sents]
crf.train()
labels = list(crf.crf_model.classes_)
labels.remove('O')
print(labels)

y_pred = crf.crf_model.predict(crf.X_test)
f1_score = metrics.flat_f1_score(crf.y_test, y_pred,
                      average='weighted', labels=labels)

precision_score = metrics.flat_precision_score(crf.y_test, y_pred,
                      average='weighted', labels=labels)

recall_score = metrics.flat_recall_score(crf.y_test, y_pred,
                      average='weighted', labels=labels)
stats = metrics.flat_classification_report(crf.y_test, y_pred,
                       labels=labels)
print("Precision: "+str(precision_score))
print("Recall: "+str(recall_score))
print("F1-score: "+str(recall_score))
print(stats)
filename = '../Models/crf_baseline_model.sav'
pickle.dump(crf.crf_model, open(filename, 'wb'))
print("Done with all")
Example #14
    def cross_validate(self,
                       folds=10,
                       training_dataset=None,
                       spacy_model_name=None,
                       epochs=None):
        """
        Runs a cross validation.

        :param folds: Number of folds to use for the cross validation.
        :param training_dataset: Path to the directory of BRAT files to use for the training data.
        :param spacy_model_name: Name of the spaCy model to start from.
        :param epochs: Number of epochs to use for every fold of training.
        """
        if folds <= 1:
            raise ValueError(
                "Number of folds for cross validation must be greater than 1")

        if training_dataset is None:
            raise ValueError("Need a dataset to evaluate")

        if spacy_model_name is None:
            raise ValueError("Need a spacy model to start with")

        train_data = training_dataset.get_training_data()

        x_data, y_data = zip(*train_data)

        skipped_files = []
        evaluation_statistics = {}

        folds = SequenceStratifiedKFold(folds=folds)
        fold = 1

        for train_indices, test_indices in folds(x_data, y_data):
            logging.info("\n----EVALUATING FOLD %d----", fold)
            self.model = None
            fold_statistics = {}

            x_subdataset = training_dataset.get_subdataset(train_indices)
            self.fit(x_subdataset, spacy_model_name, epochs)
            logging.info('Done training!\n')

            nlp = self.model
            labels = list(x_subdataset.get_labels())

            y_subdataset = training_dataset.get_subdataset(test_indices)

            y_test = []
            y_pred = []

            for data_file in y_subdataset.get_data_files():
                ann_path = data_file.get_annotation_path()
                annotations = Annotations(ann_path)
                txt_path = data_file.get_text_path()

                with open(txt_path, 'r') as source_text_file:
                    text = source_text_file.read()

                doc = nlp(text)

                test_entities = annotations.get_spacy_entities()
                test_entities = self.entities_to_biluo(doc, test_entities)
                y_test.append(test_entities)

                pred_entities = self.predict(text)
                pred_entities = self.entities_to_biluo(doc, pred_entities)
                y_pred.append(pred_entities)

            logging.debug('\n------y_test------')
            logging.debug(y_test)
            logging.debug('\n------y_pred------')
            logging.debug(y_pred)

            # Write the metrics for this fold.
            for label in labels:
                fold_statistics[label] = {}
                recall = metrics.flat_recall_score(y_test,
                                                   y_pred,
                                                   average='weighted',
                                                   labels=[label])
                precision = metrics.flat_precision_score(y_test,
                                                         y_pred,
                                                         average='weighted',
                                                         labels=[label])
                f1_score = metrics.flat_f1_score(y_test,
                                                 y_pred,
                                                 average='weighted',
                                                 labels=[label])
                fold_statistics[label]['precision'] = precision
                fold_statistics[label]['recall'] = recall
                fold_statistics[label]['f1'] = f1_score

            # add averages
            fold_statistics['system'] = {}
            recall = metrics.flat_recall_score(y_test,
                                               y_pred,
                                               average='weighted',
                                               labels=labels)
            precision = metrics.flat_precision_score(y_test,
                                                     y_pred,
                                                     average='weighted',
                                                     labels=labels)
            f1_score = metrics.flat_f1_score(y_test,
                                             y_pred,
                                             average='weighted',
                                             labels=labels)
            fold_statistics['system']['precision'] = precision
            fold_statistics['system']['recall'] = recall
            fold_statistics['system']['f1'] = f1_score

            table_data = [[
                label,
                format(fold_statistics[label]['precision'], ".3f"),
                format(fold_statistics[label]['recall'], ".3f"),
                format(fold_statistics[label]['f1'], ".3f")
            ] for label in labels + ['system']]

            logging.info(
                tabulate(table_data,
                         headers=['Entity', 'Precision', 'Recall', 'F1'],
                         tablefmt='orgtbl'))

            evaluation_statistics[fold] = fold_statistics
            fold += 1

        if skipped_files:
            logging.info('\nWARNING. SKIPPED THE FOLLOWING ANNOTATIONS:')
            logging.info(skipped_files)

        statistics_all_folds = {}

        for label in labels + ['system']:
            statistics_all_folds[label] = {}
            statistics_all_folds[label]['precision_average'] = mean([
                evaluation_statistics[fold][label]['precision']
                for fold in evaluation_statistics
            ])
            statistics_all_folds[label]['precision_max'] = max([
                evaluation_statistics[fold][label]['precision']
                for fold in evaluation_statistics
            ])
            statistics_all_folds[label]['precision_min'] = min([
                evaluation_statistics[fold][label]['precision']
                for fold in evaluation_statistics
            ])

            statistics_all_folds[label]['recall_average'] = mean([
                evaluation_statistics[fold][label]['recall']
                for fold in evaluation_statistics
            ])
            statistics_all_folds[label]['recall_max'] = max([
                evaluation_statistics[fold][label]['recall']
                for fold in evaluation_statistics
            ])
            statistics_all_folds[label]['recall_min'] = min([
                evaluation_statistics[fold][label]['recall']
                for fold in evaluation_statistics
            ])

            statistics_all_folds[label]['f1_average'] = mean([
                evaluation_statistics[fold][label]['f1']
                for fold in evaluation_statistics
            ])
            statistics_all_folds[label]['f1_max'] = max([
                evaluation_statistics[fold][label]['f1']
                for fold in evaluation_statistics
            ])
            statistics_all_folds[label]['f1_min'] = min([
                evaluation_statistics[fold][label]['f1']
                for fold in evaluation_statistics
            ])

        table_data = [[
            label,
            format(statistics_all_folds[label]['precision_average'], ".3f"),
            format(statistics_all_folds[label]['recall_average'], ".3f"),
            format(statistics_all_folds[label]['f1_average'], ".3f"),
            format(statistics_all_folds[label]['f1_min'], ".3f"),
            format(statistics_all_folds[label]['f1_max'], ".3f")
        ] for label in labels + ['system']]

        table_string = '\n' + tabulate(table_data,
                                       headers=[
                                           'Entity', 'Precision', 'Recall',
                                           'F1', 'F1_Min', 'F1_Max'
                                       ],
                                       tablefmt='orgtbl')
        logging.info(table_string)
Example #15
    print("=======================")
    print("Load trained model ...")
    model = pickle.load(open("./models/" + MODEL_NAME, "rb"))
    print("Done!!!")

    predict = model.predict(X_test)

    print("=======================")
    print("Testing ....")
    print(len(y_test), len(predict))

    avg_count = 0
    print(predict[0])
    for i in range(len(y_test)):
        acc = evaluate(predict[i], y_test[i])
        # print(acc)
        avg_count += acc

    # print(score)

    print("Avg acc:", avg_count / float(len(y_test)))
    print(model.classes_)
    print("Accuracy\t:", metrics.flat_accuracy_score(y_test, predict))
    print("Precision\t:",
          metrics.flat_precision_score(y_test, predict, average=None))
    print("Recall\t:",
          len(metrics.flat_recall_score(y_test, predict, average=None)))
    print("F1\t:", metrics.flat_f1_score(y_test, predict, average=None))

    print("Done!!!")
Example #16
    def cross_validate(self,
                       training_dataset=None,
                       num_folds=5,
                       prediction_directory=None,
                       groundtruth_directory=None,
                       asynchronous=False):
        """
        Performs k-fold stratified cross-validation using our model and pipeline.

        If the training dataset, groundtruth_directory and prediction_directory are passed, intermediate predictions during cross validation
        are written to the directory `write_predictions`. This allows one to construct a confusion matrix or to compute
        the prediction ambiguity with the methods present in the Dataset class to support pipeline development without
        a designated evaluation set.

        :param training_dataset: Dataset that is being cross validated (optional)
        :param num_folds: number of folds to split training data into for cross validation
        :param prediction_directory: directory to write predictions of cross validation to or `True` for default predictions sub-directory.
        :param groundtruth_directory: directory to write the ground truth MedaCy evaluates on
        :param asynchronous: Boolean for whether the preprocessing should be done asynchronously.
        :return: a Dataset over the generated predictions if prediction_directory is given, otherwise the cross-fold statistics
        """

        if num_folds <= 1:
            raise ValueError(
                "Number of folds for cross validation must be greater than 1, but is %s"
                % repr(num_folds))

        if prediction_directory is not None and training_dataset is None:
            raise ValueError(
                "Cannot generate predictions during cross validation if training dataset is not given."
                " Please pass the training dataset in the 'training_dataset' parameter."
            )
        if groundtruth_directory is not None and training_dataset is None:
            raise ValueError(
                "Cannot generate groundtruth during cross validation if training dataset is not given."
                " Please pass the training dataset in the 'training_dataset' parameter."
            )

        pipeline_report = self.pipeline.get_report()

        self.preprocess(training_dataset, asynchronous)

        if not (self.X_data and self.y_data):
            raise RuntimeError(
                "Must have features and labels extracted for cross validation")

        tags = sorted(training_dataset.get_labels(as_list=True))
        self.pipeline.entities = tags
        logging.info('Tagset: %s', tags)

        eval_stats = {}

        # Dict for storing mapping of sequences to their corresponding file
        groundtruth_by_document = {
            filename: [] for filename in {x[2] for x in self.X_data}
        }
        preds_by_document = {
            filename: [] for filename in {x[2] for x in self.X_data}
        }

        folds = create_folds(self.y_data, num_folds)

        for fold_num, fold_data in enumerate(folds, 1):
            train_indices, test_indices = fold_data
            fold_statistics = {}
            learner_name, learner = self.pipeline.get_learner()

            X_train = [self.X_data[index] for index in train_indices]
            y_train = [self.y_data[index] for index in train_indices]

            X_test = [self.X_data[index] for index in test_indices]
            y_test = [self.y_data[index] for index in test_indices]

            logging.info("Training Fold %i", fold_num)
            train_data = [x[0] for x in X_train]
            test_data = [x[0] for x in X_test]
            learner.fit(train_data, y_train)
            y_pred = learner.predict(test_data)

            if groundtruth_directory is not None:
                # Flattening nested structures into 2d lists
                document_indices = []
                span_indices = []
                for sequence in X_test:
                    document_indices += [sequence[2]] * len(sequence[0])
                    span_indices += list(sequence[1])
                groundtruth = [
                    element for sentence in y_test for element in sentence
                ]

                # Map the predicted sequences to their corresponding documents
                i = 0

                while i < len(groundtruth):
                    if groundtruth[i] == 'O':
                        i += 1
                        continue

                    entity = groundtruth[i]
                    document = document_indices[i]
                    first_start, first_end = span_indices[i]
                    # Ensure that consecutive tokens with the same label are merged
                    # If still inside the entity, keep incrementing
                    while i < len(groundtruth) - 1 and groundtruth[i + 1] == entity:
                        i += 1

                    last_start, last_end = span_indices[i]
                    groundtruth_by_document[document].append(
                        (entity, first_start, last_end))
                    i += 1

            if prediction_directory is not None:
                # Flattening nested structures into 2d lists
                document_indices = []
                span_indices = []

                for sequence in X_test:
                    document_indices += [sequence[2]] * len(sequence[0])
                    span_indices += list(sequence[1])

                predictions = [
                    element for sentence in y_pred for element in sentence
                ]

                # Map the predicted sequences to their corresponding documents
                i = 0

                while i < len(predictions):
                    if predictions[i] == 'O':
                        i += 1
                        continue

                    entity = predictions[i]
                    document = document_indices[i]
                    first_start, first_end = span_indices[i]

                    # Ensure that consecutive tokens with the same label are merged
                    # If still inside the entity, keep incrementing
                    while i < len(predictions) - 1 and predictions[i + 1] == entity:
                        i += 1

                    last_start, last_end = span_indices[i]
                    preds_by_document[document].append(
                        (entity, first_start, last_end))
                    i += 1

            # Write the metrics for this fold.
            for label in tags:
                fold_statistics[label] = {
                    "recall":
                    metrics.flat_recall_score(y_test,
                                              y_pred,
                                              average='weighted',
                                              labels=[label]),
                    "precision":
                    metrics.flat_precision_score(y_test,
                                                 y_pred,
                                                 average='weighted',
                                                 labels=[label]),
                    "f1":
                    metrics.flat_f1_score(y_test,
                                          y_pred,
                                          average='weighted',
                                          labels=[label])
                }

            # add averages
            fold_statistics['system'] = {
                "recall":
                metrics.flat_recall_score(y_test,
                                          y_pred,
                                          average='weighted',
                                          labels=tags),
                "precision":
                metrics.flat_precision_score(y_test,
                                             y_pred,
                                             average='weighted',
                                             labels=tags),
                "f1":
                metrics.flat_f1_score(y_test,
                                      y_pred,
                                      average='weighted',
                                      labels=tags)
            }

            table_data = [[
                label,
                format(fold_statistics[label]['precision'], ".3f"),
                format(fold_statistics[label]['recall'], ".3f"),
                format(fold_statistics[label]['f1'], ".3f")
            ] for label in tags + ['system']]

            logging.info(
                '\n' +
                tabulate(table_data,
                         headers=['Entity', 'Precision', 'Recall', 'F1'],
                         tablefmt='orgtbl'))

            eval_stats[fold_num] = fold_statistics

        statistics_all_folds = {}

        for label in tags + ['system']:
            statistics_all_folds[label] = {
                'precision_average':
                mean(eval_stats[fold][label]['precision']
                     for fold in eval_stats),
                'precision_max':
                max(eval_stats[fold][label]['precision']
                    for fold in eval_stats),
                'precision_min':
                min(eval_stats[fold][label]['precision']
                    for fold in eval_stats),
                'recall_average':
                mean(eval_stats[fold][label]['recall'] for fold in eval_stats),
                'recall_max':
                max(eval_stats[fold][label]['recall'] for fold in eval_stats),
                'f1_average':
                mean(eval_stats[fold][label]['f1'] for fold in eval_stats),
                'f1_max':
                max(eval_stats[fold][label]['f1'] for fold in eval_stats),
                'f1_min':
                min(eval_stats[fold][label]['f1'] for fold in eval_stats),
            }

        entity_counts = training_dataset.compute_counts()

        table_data = [
            [
                f"{label} ({entity_counts[label]})",  # Entity (Count)
                format(statistics_all_folds[label]['precision_average'],
                       ".3f"),
                format(statistics_all_folds[label]['recall_average'], ".3f"),
                format(statistics_all_folds[label]['f1_average'], ".3f"),
                format(statistics_all_folds[label]['f1_min'], ".3f"),
                format(statistics_all_folds[label]['f1_max'], ".3f")
            ] for label in tags + ['system']
        ]

        # Combine the pipeline report and the resulting data, then log it or print it (whichever ensures that it prints)

        output_str = '\n' + pipeline_report + '\n\n' + tabulate(
            table_data,
            headers=[
                'Entity (Count)', 'Precision', 'Recall', 'F1', 'F1_Min',
                'F1_Max'
            ],
            tablefmt='orgtbl')

        if logging.root.level > logging.INFO:
            print(output_str)
        else:
            logging.info(output_str)

        if prediction_directory:

            prediction_directory = os.path.join(
                training_dataset.data_directory, "predictions")
            groundtruth_directory = os.path.join(
                training_dataset.data_directory, "groundtruth")

            # Write annotations generated from cross-validation
            self.create_annotation_directory(directory=prediction_directory,
                                             training_dataset=training_dataset,
                                             option="predictions")

            # Write medaCy ground truth generated from cross-validation
            self.create_annotation_directory(directory=groundtruth_directory,
                                             training_dataset=training_dataset,
                                             option="groundtruth")

            # Add predicted/known annotations to the folders containing groundtruth and predictions respectively
            self.predict_annotation_evaluation(
                directory=groundtruth_directory,
                training_dataset=training_dataset,
                preds_by_document=preds_by_document,
                groundtruth_by_document=groundtruth_by_document,
                option="groundtruth")

            self.predict_annotation_evaluation(
                directory=prediction_directory,
                training_dataset=training_dataset,
                preds_by_document=preds_by_document,
                groundtruth_by_document=groundtruth_by_document,
                option="predictions")

            return Dataset(prediction_directory)
        else:
            return statistics_all_folds
Example #17
    labels = [[tuple2label(t) for t in tuples] for tuples in tuple_sets]
    return features, labels


print("Training CRF model on training data...")
train_features, train_labels = file2features_labels("train.txt")
crf = CRF()
crf.fit(train_features, train_labels)

print("Making predictions for test data...")
test_features, test_labels = file2features_labels("test.txt")
test_preds = crf.predict(test_features)

print("Performing own evaluation...")
labels = crf.classes_
p = flat_precision_score(test_labels, test_preds, labels=labels, average="micro")
r = flat_recall_score(test_labels, test_preds, labels=labels, average="micro")
f1 = flat_f1_score(test_labels, test_preds, labels=labels, average="micro")
print("p(micro)={} r(micro)={} f1(micro)={}".format(p, r, f1))


def to_conllevalfile(features, labels, preds, filename):
    with open(filename, "w") as conlleval_input_file:
        for feature_set, label_set, pred_set in zip(features, labels, preds):
            for feature, label, pred in zip(feature_set, label_set, pred_set):
                conlleval_input_file.write("{} {} {} {}\n".format(
                    feature["token"], feature["pos tag"], label, pred))
            conlleval_input_file.write("\n")


to_conllevalfile(test_features, test_labels, test_preds, "conlleval_input_crf.txt")
Example #18
    def cross_validate(self,
                       num_folds=5,
                       training_dataset=None,
                       epochs=20,
                       prediction_directory=None,
                       groundtruth_directory=None,
                       asynchronous=None):
        """
        Runs a cross validation.

        :param num_folds: Number of folds to use for the cross validation.
        :param training_dataset: Path to the directory of BRAT files to use for the training data.
        :param epochs: Number of epochs to use for every fold of training.
        """
        if num_folds <= 1:
            raise ValueError(
                "Number of folds for cross validation must be greater than 1")

        if training_dataset is None:
            raise ValueError("Need a dataset to evaluate")

        train_data = training_dataset.get_training_data()

        labels = set()
        for document in train_data:
            for entity in document[1]['entities']:
                tag = entity[2]
                labels.add(tag)
        labels = list(labels)
        labels.sort()
        logging.info('Labels: %s', labels)

        x_data, y_data = zip(*train_data)

        skipped_files = []
        eval_stats = {}

        folds = create_folds(y_data, num_folds)

        for fold_num, fold_data in enumerate(folds, 1):
            train_indices, test_indices = fold_data
            logging.info("\n----EVALUATING FOLD %d----", fold_num)
            self.model = None
            fold_statistics = {}

            x_subdataset = training_dataset.get_subdataset(train_indices)
            self.fit(x_subdataset, iterations=epochs, labels=labels)
            logging.info('Done training!\n')

            nlp = self.model

            y_subdataset = training_dataset.get_subdataset(test_indices)

            y_test = []
            y_pred = []

            for ann in y_subdataset.generate_annotations():

                with open(ann.source_text_path, 'r') as source_text_file:
                    text = source_text_file.read()

                doc = nlp(text)

                # test_entities = annotations.get_entities(format='spacy')[1]['entities']
                test_entities = ann.get_entity_annotations(
                    format='spacy')[1]['entities']
                test_entities = self.entities_to_biluo(doc, test_entities)
                y_test.append(test_entities)

                pred_entities = self.predict(text)
                pred_entities = self.entities_to_biluo(doc, pred_entities)
                y_pred.append(pred_entities)

            logging.debug('\n------y_test------')
            logging.debug(y_test)
            logging.debug('\n------y_pred------')
            logging.debug(y_pred)

            # Write the metrics for this fold.
            for label in labels:
                fold_statistics[label] = {
                    'recall':
                    metrics.flat_recall_score(y_test,
                                              y_pred,
                                              average='weighted',
                                              labels=[label]),
                    'precision':
                    metrics.flat_precision_score(y_test,
                                                 y_pred,
                                                 average='weighted',
                                                 labels=[label]),
                    'f1':
                    metrics.flat_f1_score(y_test,
                                          y_pred,
                                          average='weighted',
                                          labels=[label])
                }

            # add averages
            fold_statistics['system'] = {
                'recall':
                metrics.flat_recall_score(y_test,
                                          y_pred,
                                          average='weighted',
                                          labels=labels),
                'precision':
                metrics.flat_precision_score(y_test,
                                             y_pred,
                                             average='weighted',
                                             labels=labels),
                'f1':
                metrics.flat_f1_score(y_test,
                                      y_pred,
                                      average='weighted',
                                      labels=labels)
            }

            table_data = [[
                label,
                format(fold_statistics[label]['precision'], ".3f"),
                format(fold_statistics[label]['recall'], ".3f"),
                format(fold_statistics[label]['f1'], ".3f")
            ] for label in labels + ['system']]

            logging.info(
                '\n' +
                tabulate(table_data,
                         headers=['Entity', 'Precision', 'Recall', 'F1'],
                         tablefmt='orgtbl'))

            eval_stats[fold_num] = fold_statistics

        if skipped_files:
            logging.info('\nWARNING. SKIPPED THE FOLLOWING ANNOTATIONS:')
            logging.info(skipped_files)

        statistics_all_folds = {}

        for label in labels + ['system']:
            statistics_all_folds[label] = {
                'precision_average':
                mean(eval_stats[fold][label]['precision']
                     for fold in eval_stats),
                'precision_max':
                max(eval_stats[fold][label]['precision']
                    for fold in eval_stats),
                'precision_min':
                min(eval_stats[fold][label]['precision']
                    for fold in eval_stats),
                'recall_average':
                mean(eval_stats[fold][label]['recall'] for fold in eval_stats),
                'recall_max':
                max(eval_stats[fold][label]['recall'] for fold in eval_stats),
                'f1_average':
                mean(eval_stats[fold][label]['f1'] for fold in eval_stats),
                'f1_max':
                max(eval_stats[fold][label]['f1'] for fold in eval_stats),
                'f1_min':
                min(eval_stats[fold][label]['f1'] for fold in eval_stats),
            }

        table_data = [[
            label,
            format(statistics_all_folds[label]['precision_average'], ".3f"),
            format(statistics_all_folds[label]['recall_average'], ".3f"),
            format(statistics_all_folds[label]['f1_average'], ".3f"),
            format(statistics_all_folds[label]['f1_min'], ".3f"),
            format(statistics_all_folds[label]['f1_max'], ".3f")
        ] for label in labels + ['system']]

        table_string = '\n' + tabulate(table_data,
                                       headers=[
                                           'Entity', 'Precision', 'Recall',
                                           'F1', 'F1_Min', 'F1_Max'
                                       ],
                                       tablefmt='orgtbl')
        logging.info(table_string)
Example #19
    X_test = [sent2features(s) for s in test_data]
    y_test = [sent2labels(s) for s in test_data]

    y_train = [sent2labels(s) for s in train_data]
    y_validation = [sent2labels(s) for s in validation_data]

    labels = list(
        set([label for labels in y_train + y_test for label in labels]))
    sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))

    with open("best_crf_model.pkl", "rb") as in_file:
        crf = pickle.load(in_file)

    y_pred = crf.predict(X_test)

    print("### Classification Report ###")
    print(metrics.flat_classification_report(y_test,
                                             y_pred,
                                             labels=sorted_labels),
          end='\n\n')

    print("### Sequence Accuracy Score ###")
    print(metrics.sequence_accuracy_score(y_test, y_pred), end='\n\n')

    print("### Weighted Precision Score ###")
    print(metrics.flat_precision_score(y_test, y_pred, average='weighted'),
          end='\n\n')

    print("### Weighted Recall Score ###")
    print(flat_recall_score(y_test, y_pred, average='weighted'), end='\n\n')
Example #20
def test_flat_precision():
    score = metrics.flat_precision_score(y1, y2, average='micro')
    assert score == 3 / 5
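
The y1/y2 fixtures are defined elsewhere in the test module; a hypothetical pair that would satisfy this assertion (3 of the 5 flattened tokens match, and micro-averaged precision over all labels equals token accuracy):

y1 = [['a', 'a', 'b'], ['c', 'c']]   # gold
y2 = [['a', 'b', 'b'], ['c', 'b']]   # predicted: 3 of 5 tokens correct

assert metrics.flat_precision_score(y1, y2, average='micro') == 3 / 5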
Example #21
crf.fit(X_train, y_train)

labels = list(crf.classes_)
labels.remove('O')
print(labels)

y_pred = crf.predict(X_val)
actual_preds = crf.predict(X_test)
writeOutput(actual_preds)
print(metrics.flat_accuracy_score(y_val, y_pred))
print(
    metrics.flat_precision_score(y_val,
                                 y_pred,
                                 average='weighted',
                                 labels=labels))
print(
    metrics.flat_recall_score(y_val, y_pred, average='weighted',
                              labels=labels))
print(metrics.flat_f1_score(y_val, y_pred, average='weighted', labels=labels))

y_flat_pred = []
y_flat_val = []
x_flat_val = []
for x in y_pred:
    y_flat_pred.extend(x)
for x in y_val:
    y_flat_val.extend(x)
for x in X_val:
    x_flat_val.extend(x)
validate_NER(y_flat_val, y_flat_pred, x_flat_val)
Example #22
y_train = labelData

for i in range(len(X_train)):
    X_train_list.append([X_train[i]])
    y_train_list.append([y_train[i]])


crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=70,
    all_possible_transitions=True)

crf.fit(X_train_list, y_train_list)

tokens = te_list
tags = te_tags
labelData = te_labels
test_featuresets = [punct_features(tokens, tags, i, labelData) for i in range(0, len(tokens))]

X_test = test_featuresets
y_test = labelData
y_pred = crf.predict([X_test])


f1_score = metrics.flat_f1_score([y_test], y_pred, average='weighted')
precision = metrics.flat_precision_score([y_test], y_pred, average='weighted')
accuracy = metrics.flat_accuracy_score([y_test], y_pred)
print("F1 Score:", f1_score, "Precision:", precision, "Accuracy:", accuracy)
Example #23

crf.fit(X_train, y_train)

# Predicting on the test set.
y_pred = crf.predict(X_test)

# Performance
f1_score = flat_f1_score(y_test, y_pred, average='weighted')
print("F1 score: ", f1_score)

acc = flat_accuracy_score(y_test, y_pred)
print("Accuracy: ", acc)

rec = flat_recall_score(y_test, y_pred, average='weighted')
print("Recall: ", rec)

prec = flat_precision_score(y_test, y_pred, average='weighted')
print("Precision: ", prec)

report = flat_classification_report(y_test, y_pred)
print(report)


def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))


print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

print("\nTop unlikely transitions:")
Example #24
    def cross_validate(self,
                       num_folds=10,
                       training_dataset=None,
                       prediction_directory=None):
        """
        Performs k-fold stratified cross-validation using our model and pipeline.

        If the training dataset and prediction_directory are passed, intermediate predictions during cross validation
        are written to the directory `write_predictions`. This allows one to construct a confusion matrix or to compute
        the prediction ambiguity with the methods present in the Dataset class to support pipeline development without
        a designated evaluation set.

        :param num_folds: number of folds to split training data into for cross validation
        :param training_dataset: Dataset that is being cross validated (optional)
        :param prediction_directory: directory to write predictions of cross validation to or `True` for default predictions sub-directory.
        :return: Prints out performance metrics, if prediction_directory
        """

        if num_folds <= 1:
            raise ValueError(
                "Number of folds for cross validation must be greater than 1")

        if prediction_directory is not None and training_dataset is None:
            raise ValueError(
                "Cannot generated predictions during cross validation if training dataset is not given."
                " Please pass the training dataset in the 'training_dataset' parameter."
            )

        assert self.model is not None, "Cannot cross validate an un-fit model"
        assert self.X_data is not None and self.y_data is not None, \
            "Must have features and labels extracted for cross validation"

        X_data = self.X_data
        Y_data = self.y_data

        medacy_pipeline = self.pipeline

        cv = SequenceStratifiedKFold(folds=num_folds)

        named_entities = medacy_pipeline.entities

        evaluation_statistics = {}
        fold = 1
        for train_indices, test_indices in cv(X_data, Y_data):
            fold_statistics = {}
            learner_name, learner = medacy_pipeline.get_learner()

            X_train = [X_data[index] for index in train_indices]
            y_train = [Y_data[index] for index in train_indices]

            X_test = [X_data[index] for index in test_indices]
            y_test = [Y_data[index] for index in test_indices]

            logging.info("Training Fold %i", fold)
            train_data = [x[0] for x in X_train]
            test_data = [x[0] for x in X_test]
            learner.fit(train_data, y_train)
            y_pred = learner.predict(test_data)

            if prediction_directory is not None:
                # Dict for storing mapping of sequences to their corresponding file
                preds_by_document = {
                    filename: []
                    for filename in list(set([x[2] for x in X_data]))
                }

                # Flattening nested structures into 2d lists
                document_indices = []
                span_indices = []
                for sequence in X_test:
                    document_indices += [sequence[2]] * len(sequence[0])
                    span_indices += list(sequence[1])
                predictions = [
                    element for sentence in y_pred for element in sentence
                ]

                # Map the predicted sequences to their corresponding documents
                i = 0
                while i < len(predictions):
                    if predictions[i] == 'O':
                        i += 1
                        continue
                    entity = predictions[i]
                    document = document_indices[i]
                    first_start, first_end = span_indices[i]
                    # Ensure that consecutive tokens with the same label are merged
                    # If still inside the entity, keep incrementing
                    while i < len(predictions) - 1 and predictions[i + 1] == entity:
                        i += 1
                    last_start, last_end = span_indices[i]

                    preds_by_document[document].append(
                        (entity, first_start, last_end))
                    i += 1

            # Write the metrics for this fold.
            for label in named_entities:
                fold_statistics[label] = {}
                recall = metrics.flat_recall_score(y_test,
                                                   y_pred,
                                                   average='weighted',
                                                   labels=[label])
                precision = metrics.flat_precision_score(y_test,
                                                         y_pred,
                                                         average='weighted',
                                                         labels=[label])
                f1 = metrics.flat_f1_score(y_test,
                                           y_pred,
                                           average='weighted',
                                           labels=[label])
                fold_statistics[label]['precision'] = precision
                fold_statistics[label]['recall'] = recall
                fold_statistics[label]['f1'] = f1

            # add averages
            fold_statistics['system'] = {}
            recall = metrics.flat_recall_score(y_test,
                                               y_pred,
                                               average='weighted',
                                               labels=named_entities)
            precision = metrics.flat_precision_score(y_test,
                                                     y_pred,
                                                     average='weighted',
                                                     labels=named_entities)
            f1 = metrics.flat_f1_score(y_test,
                                       y_pred,
                                       average='weighted',
                                       labels=named_entities)
            fold_statistics['system']['precision'] = precision
            fold_statistics['system']['recall'] = recall
            fold_statistics['system']['f1'] = f1

            table_data = [[
                label,
                format(fold_statistics[label]['precision'], ".3f"),
                format(fold_statistics[label]['recall'], ".3f"),
                format(fold_statistics[label]['f1'], ".3f")
            ] for label in named_entities + ['system']]

            logging.info(
                tabulate(table_data,
                         headers=['Entity', 'Precision', 'Recall', 'F1'],
                         tablefmt='orgtbl'))

            evaluation_statistics[fold] = fold_statistics
            fold += 1

        statistics_all_folds = {}

        for label in named_entities + ['system']:
            statistics_all_folds[label] = {}
            statistics_all_folds[label]['precision_average'] = mean([
                evaluation_statistics[fold][label]['precision']
                for fold in evaluation_statistics
            ])
            statistics_all_folds[label]['precision_max'] = max([
                evaluation_statistics[fold][label]['precision']
                for fold in evaluation_statistics
            ])
            statistics_all_folds[label]['precision_min'] = min([
                evaluation_statistics[fold][label]['precision']
                for fold in evaluation_statistics
            ])

            statistics_all_folds[label]['recall_average'] = mean([
                evaluation_statistics[fold][label]['recall']
                for fold in evaluation_statistics
            ])
            statistics_all_folds[label]['recall_max'] = max([
                evaluation_statistics[fold][label]['recall']
                for fold in evaluation_statistics
            ])
            statistics_all_folds[label]['recall_min'] = min([
                evaluation_statistics[fold][label]['recall']
                for fold in evaluation_statistics
            ])

            statistics_all_folds[label]['f1_average'] = mean([
                evaluation_statistics[fold][label]['f1']
                for fold in evaluation_statistics
            ])
            statistics_all_folds[label]['f1_max'] = max([
                evaluation_statistics[fold][label]['f1']
                for fold in evaluation_statistics
            ])
            statistics_all_folds[label]['f1_min'] = min([
                evaluation_statistics[fold][label]['f1']
                for fold in evaluation_statistics
            ])

        table_data = [[
            label,
            format(statistics_all_folds[label]['precision_average'], ".3f"),
            format(statistics_all_folds[label]['recall_average'], ".3f"),
            format(statistics_all_folds[label]['f1_average'], ".3f"),
            format(statistics_all_folds[label]['f1_min'], ".3f"),
            format(statistics_all_folds[label]['f1_max'], ".3f")
        ] for label in named_entities + ['system']]

        logging.info("\n" + tabulate(table_data,
                                     headers=[
                                         'Entity', 'Precision', 'Recall', 'F1',
                                         'F1_Min', 'F1_Max'
                                     ],
                                     tablefmt='orgtbl'))

        if prediction_directory:
            # Write annotations generated from cross-validation
            if not isinstance(prediction_directory, str):
                prediction_directory = training_dataset.data_directory + "/predictions/"
            if os.path.isdir(prediction_directory):
                logging.warning("Overwriting existing predictions")
            else:
                os.makedirs(prediction_directory)
            for data_file in training_dataset.get_data_files():
                logging.info("Predicting file: %s", data_file.file_name)
                with open(data_file.raw_path, 'r') as raw_text:
                    doc = medacy_pipeline.spacy_pipeline.make_doc(
                        raw_text.read())
                    preds = preds_by_document[data_file.file_name]
                    annotations = construct_annotations_from_tuples(doc, preds)
                    annotations.to_ann(write_location=os.path.join(
                        prediction_directory, data_file.file_name + ".ann"))
            return Dataset(data_directory=prediction_directory)
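The per-label scores above rely on the fact that a weighted average restricted to a single label is just that label's own score. A minimal, self-contained sketch with made-up toy sequences (toy_true/toy_pred are invented here, not medaCy data):

from sklearn_crfsuite import metrics

toy_true = [['B-Drug', 'O', 'B-Dose'], ['O', 'B-Drug']]
toy_pred = [['B-Drug', 'B-Dose', 'O'], ['O', 'B-Drug']]

for label in ['B-Drug', 'B-Dose']:
    # With labels=[label], the weighted average collapses to that single label's score.
    print(label,
          metrics.flat_precision_score(toy_true, toy_pred, average='weighted', labels=[label]),
          metrics.flat_recall_score(toy_true, toy_pred, average='weighted', labels=[label]),
          metrics.flat_f1_score(toy_true, toy_pred, average='weighted', labels=[label]))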
Example #25
0
def cross_validate(x_folds, y_folds, params):
    f1_per = []
    f1_org = []
    f1_misc = []
    f1_loc = []
    f1_not = []

    precision_per = []
    precision_org = []
    precision_misc = []
    precision_loc = []
    precision_not = []

    recall_per = []
    recall_org = []
    recall_misc = []
    recall_loc = []
    recall_not = []

    for i in range(len(x_folds)):
        print('\rWorking on fold {}/{} ...'.format(i + 1, len(x_folds)),
              end='')

        crf = sklearn_crfsuite.CRF(**params)

        test_x, test_y, train_x, train_y = folds_2_tt(x_folds, y_folds, i)
        train_x, train_y = balance(train_x, train_y)

        crf.fit(train_x, train_y)
        pred_y = crf.predict(test_x)

        f1_per.append(
            metrics.flat_f1_score(test_y, pred_y, average=None,
                                  labels=['per']))
        f1_org.append(
            metrics.flat_f1_score(test_y, pred_y, average=None,
                                  labels=['org']))
        f1_misc.append(
            metrics.flat_f1_score(test_y,
                                  pred_y,
                                  average=None,
                                  labels=['misc']))
        f1_loc.append(
            metrics.flat_f1_score(test_y, pred_y, average=None,
                                  labels=['loc']))
        f1_not.append(
            metrics.flat_f1_score(test_y,
                                  pred_y,
                                  average=None,
                                  labels=['notpropn']))

        precision_per.append(
            metrics.flat_precision_score(test_y,
                                         pred_y,
                                         average=None,
                                         labels=['per']))
        precision_org.append(
            metrics.flat_precision_score(test_y,
                                         pred_y,
                                         average=None,
                                         labels=['org']))
        precision_misc.append(
            metrics.flat_precision_score(test_y,
                                         pred_y,
                                         average=None,
                                         labels=['misc']))
        precision_loc.append(
            metrics.flat_precision_score(test_y,
                                         pred_y,
                                         average=None,
                                         labels=['loc']))
        precision_not.append(
            metrics.flat_precision_score(test_y,
                                         pred_y,
                                         average=None,
                                         labels=['notpropn']))

        recall_per.append(
            metrics.flat_recall_score(test_y,
                                      pred_y,
                                      average=None,
                                      labels=['per']))
        recall_org.append(
            metrics.flat_recall_score(test_y,
                                      pred_y,
                                      average=None,
                                      labels=['org']))
        recall_misc.append(
            metrics.flat_recall_score(test_y,
                                      pred_y,
                                      average=None,
                                      labels=['misc']))
        recall_loc.append(
            metrics.flat_recall_score(test_y,
                                      pred_y,
                                      average=None,
                                      labels=['loc']))
        recall_not.append(
            metrics.flat_recall_score(test_y,
                                      pred_y,
                                      average=None,
                                      labels=['notpropn']))

    print()
    avg_per_f1 = sum(f1_per) / len(f1_per)
    avg_org_f1 = sum(f1_org) / len(f1_org)
    avg_loc_f1 = sum(f1_loc) / len(f1_loc)
    avg_misc_f1 = sum(f1_misc) / len(f1_misc)
    avg_not_f1 = sum(f1_not) / len(f1_not)

    avg_per_precision = sum(precision_per) / len(precision_per)
    avg_org_precision = sum(precision_org) / len(precision_org)
    avg_loc_precision = sum(precision_loc) / len(precision_loc)
    avg_misc_precision = sum(precision_misc) / len(precision_misc)
    avg_not_precision = sum(precision_not) / len(precision_not)

    avg_per_recall = sum(recall_per) / len(recall_per)
    avg_org_recall = sum(recall_org) / len(recall_org)
    avg_loc_recall = sum(recall_loc) / len(recall_loc)
    avg_misc_recall = sum(recall_misc) / len(recall_misc)
    avg_not_recall = sum(recall_not) / len(recall_not)

    result = {
        'per': (avg_per_precision, avg_per_recall, avg_per_f1),
        'org': (avg_org_precision, avg_org_recall, avg_org_f1),
        'misc': (avg_misc_precision, avg_misc_recall, avg_misc_f1),
        'loc': (avg_loc_precision, avg_loc_recall, avg_loc_f1),
        'not': (avg_not_precision, avg_not_recall, avg_not_f1)
    }

    return result
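Note that because the calls above pass average=None with a single-entry labels list, each appended score is a length-1 NumPy array rather than a float, so the averaged values in the returned dict are arrays too. A minimal sketch with made-up tag sequences illustrating this:

from sklearn_crfsuite import metrics

toy_true = [['per', 'notpropn'], ['org']]
toy_pred = [['per', 'notpropn'], ['per']]

score = metrics.flat_f1_score(toy_true, toy_pred, average=None, labels=['per'])
print(score)            # one entry per requested label, here roughly [0.667]
print(float(score[0]))  # unwrap to a plain float if needed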
    def validate_performance(self, test_set):
        sentences = self.__load_corpus__(test_set)

        y_test = [self.model.sentence2labels(s) for s in sentences]

        y_prediction = []
        for i, sent in enumerate(sentences):
            new_sent = ' '.join([word[0] for word in sent])
            prediction = self.model.predict(new_sent)
            new_prediction = []
            if len(prediction) > 1:
                for p in prediction:
                    new_prediction += [p1 for p1 in p]
                # print(prediction)
                # print(new_prediction)

                prediction = new_prediction
            else:
                prediction = prediction[0]

            try:
                pred = [w[1] for w in prediction]
            except Exception:
                print(prediction)
                return

            # if len(pred) != len(y_test[i]):
            #     print(sent)
            #     print(new_sent)
            #     print(y_test[i])
            #     print(len(y_test[i]))
            #     print(pred)
            #     print(len(pred))

            y_prediction.append(pred)

        labels = [
            'O-DOS', 'B-DOS', 'I-UNIT', 'B-UNIT', 'O-UNIT', 'I-FREQ', 'B-FREQ',
            'O-FREQ', 'I-DUR', 'B-DUR', 'O-DUR', 'I-WHO', 'B-WHO', 'O-WHO'
        ]

        for i in range(len(y_prediction)):
            for j in range(len(y_prediction[i])):
                y_prediction[i][j] = y_prediction[i][j].replace('B-', '')
                y_prediction[i][j] = y_prediction[i][j].replace('O-', '')
                y_prediction[i][j] = y_prediction[i][j].replace('I-', '')

        for i in range(len(y_test)):
            for j in range(len(y_test[i])):
                y_test[i][j] = y_test[i][j].replace('B-', '')
                y_test[i][j] = y_test[i][j].replace('O-', '')
                y_test[i][j] = y_test[i][j].replace('I-', '')

        labels = ['DOS', 'UNIT', 'FREQ', 'DUR', 'WHO']

        # labels = ['DOS', 'UNIT', 'WHO', 'DUR', 'FREQ']

        sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))

        precision = metrics.flat_precision_score(y_test,
                                                 y_prediction,
                                                 labels=sorted_labels,
                                                 average='micro')
        recall = metrics.flat_recall_score(y_test,
                                           y_prediction,
                                           labels=sorted_labels,
                                           average='micro')
        f1 = metrics.flat_f1_score(y_test,
                                   y_prediction,
                                   labels=sorted_labels,
                                   average='micro')

        print('MICRO')
        print(precision, recall, f1)

        precision = metrics.flat_precision_score(y_test,
                                                 y_prediction,
                                                 labels=sorted_labels,
                                                 average='macro')
        recall = metrics.flat_recall_score(y_test,
                                           y_prediction,
                                           labels=sorted_labels,
                                           average='macro')
        f1 = metrics.flat_f1_score(y_test,
                                   y_prediction,
                                   labels=sorted_labels,
                                   average='macro')

        print('MACRO')
        print(precision, recall, f1)

        print(
            metrics.flat_classification_report(y_test,
                                               y_prediction,
                                               labels=sorted_labels,
                                               digits=3))
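The snippet above reports both micro- and macro-averaged scores: micro averaging pools token-level counts across the selected labels, while macro averaging takes an unweighted mean of the per-label scores. A small self-contained sketch with invented tag sequences where the two diverge:

from sklearn_crfsuite import metrics

toy_true = [['DOS', 'DOS', 'UNIT'], ['DOS', 'O']]
toy_pred = [['DOS', 'DOS', 'O'], ['DOS', 'UNIT']]
toy_labels = ['DOS', 'UNIT']

# micro: pooled TP / (TP + FP) over both labels -> 3 / 4 = 0.75
print(metrics.flat_precision_score(toy_true, toy_pred, labels=toy_labels, average='micro'))
# macro: mean of per-label precisions (1.0 for DOS, 0.0 for UNIT) -> 0.5
print(metrics.flat_precision_score(toy_true, toy_pred, labels=toy_labels, average='macro'))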
Example #27
0
        open(os.path.join(config.dump_address, "y_dev_pred.pkl"), "wb"))
    test_loss = total_test_loss / len(corpus.test.labels)

    print(
        "->>>>>>>>>>>>>TOTAL>>>>>>>>>>>>>>>>>>>>>>> test_loss: {}, test_accuracy: {}, test_f1_score_micro: {} ROC:{}"
        .format(test_loss, (test_right_preds / test_total_preds),
                (test_f1_total_micro), roc_score))
    print()
    print(
        metrics.flat_classification_report(test_total_y_true,
                                           test_total_y_pred))
    print("test_f1_total_binary: ", test_f1_total_binary)
    print(
        "precision binary: ",
        metrics.flat_precision_score(test_total_y_true,
                                     test_total_y_pred,
                                     average="binary"))
    print(
        "recall binary: ",
        metrics.flat_recall_score(test_total_y_true,
                                  test_total_y_pred,
                                  average="binary"))

    print("[LOG] dumping results in ", config.dump_address)
    pickle.dump(
        np.array(total_scores_numpy_probs),
        open(os.path.join(config.dump_address, "dev_score_pobs.pkl"), "wb"))
    pickle.dump(
        np.array(total_labels_numpy_probs),
        open(os.path.join(config.dump_address, "dev_label_pobs.pkl"), "wb"))
    pickle.dump(
Example #28
0
    def cross_validate(self, num_folds=10):
        """
        Performs k-fold stratified cross-validation using our model and pipeline.
        :param num_folds: number of folds to split training data into for cross validation
        :return: Prints out performance metrics
        """

        assert num_folds > 1, "Number of folds for cross validation must be greater than 1"

        assert self.model is not None, "Cannot cross validate an unfitted model"
        assert self.X_data is not None and self.y_data is not None, \
            "Must have features and labels extracted for cross validation"

        X_data = self.X_data
        Y_data = self.y_data

        medacy_pipeline = self.pipeline

        cv = SequenceStratifiedKFold(folds=num_folds)

        named_entities = medacy_pipeline.entities

        evaluation_statistics = {}
        fold = 1
        for train_indices, test_indices in cv(X_data, Y_data):
            fold_statistics = {}
            learner_name, learner = medacy_pipeline.get_learner()

            X_train = [X_data[index] for index in train_indices]
            y_train = [Y_data[index] for index in train_indices]

            X_test = [X_data[index] for index in test_indices]
            y_test = [Y_data[index] for index in test_indices]

            logging.info("Training Fold %i", fold)
            learner.fit(X_train, y_train)
            y_pred = learner.predict(X_test)

            for label in named_entities:
                fold_statistics[label] = {}
                recall = metrics.flat_recall_score(y_test,
                                                   y_pred,
                                                   average='weighted',
                                                   labels=[label])
                precision = metrics.flat_precision_score(y_test,
                                                         y_pred,
                                                         average='weighted',
                                                         labels=[label])
                f1 = metrics.flat_f1_score(y_test,
                                           y_pred,
                                           average='weighted',
                                           labels=[label])
                fold_statistics[label]['precision'] = precision
                fold_statistics[label]['recall'] = recall
                fold_statistics[label]['f1'] = f1

            # add averages
            fold_statistics['system'] = {}
            recall = metrics.flat_recall_score(y_test,
                                               y_pred,
                                               average='weighted',
                                               labels=named_entities)
            precision = metrics.flat_precision_score(y_test,
                                                     y_pred,
                                                     average='weighted',
                                                     labels=named_entities)
            f1 = metrics.flat_f1_score(y_test,
                                       y_pred,
                                       average='weighted',
                                       labels=named_entities)
            fold_statistics['system']['precision'] = precision
            fold_statistics['system']['recall'] = recall
            fold_statistics['system']['f1'] = f1

            evaluation_statistics[fold] = fold_statistics
            fold += 1

        statistics_all_folds = {}

        for label in named_entities + ['system']:
            statistics_all_folds[label] = {}
            statistics_all_folds[label]['precision_average'] = mean([
                evaluation_statistics[fold][label]['precision']
                for fold in evaluation_statistics
            ])
            statistics_all_folds[label]['precision_max'] = max([
                evaluation_statistics[fold][label]['precision']
                for fold in evaluation_statistics
            ])
            statistics_all_folds[label]['precision_min'] = min([
                evaluation_statistics[fold][label]['precision']
                for fold in evaluation_statistics
            ])

            statistics_all_folds[label]['recall_average'] = mean([
                evaluation_statistics[fold][label]['recall']
                for fold in evaluation_statistics
            ])
            statistics_all_folds[label]['recall_max'] = max([
                evaluation_statistics[fold][label]['recall']
                for fold in evaluation_statistics
            ])
            statistics_all_folds[label]['recall_min'] = min([
                evaluation_statistics[fold][label]['recall']
                for fold in evaluation_statistics
            ])

            statistics_all_folds[label]['f1_average'] = mean([
                evaluation_statistics[fold][label]['f1']
                for fold in evaluation_statistics
            ])
            statistics_all_folds[label]['f1_max'] = max([
                evaluation_statistics[fold][label]['f1']
                for fold in evaluation_statistics
            ])
            statistics_all_folds[label]['f1_min'] = min([
                evaluation_statistics[fold][label]['f1']
                for fold in evaluation_statistics
            ])

        table_data = [[
            label,
            format(statistics_all_folds[label]['precision_average'], ".3f"),
            format(statistics_all_folds[label]['recall_average'], ".3f"),
            format(statistics_all_folds[label]['f1_average'], ".3f"),
            format(statistics_all_folds[label]['f1_min'], ".3f"),
            format(statistics_all_folds[label]['f1_max'], ".3f")
        ] for label in named_entities + ['system']]

        logging.info(
            tabulate(table_data,
                     headers=[
                         'Entity', 'Precision', 'Recall', 'F1', 'F1_Min',
                         'F1_Max'
                     ],
                     tablefmt='orgtbl'))
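The mean/min/max aggregation over folds follows a simple pattern; a minimal stand-alone sketch with hypothetical per-fold F1 numbers (invented here, not medaCy output):

from statistics import mean

fold_f1 = {1: 0.81, 2: 0.84, 3: 0.79}  # hypothetical per-fold F1 values
summary = {
    'f1_average': mean(fold_f1.values()),
    'f1_max': max(fold_f1.values()),
    'f1_min': min(fold_f1.values()),
}
print(summary)  # e.g. {'f1_average': 0.813..., 'f1_max': 0.84, 'f1_min': 0.79}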
    def gen_model(self, x_train, y_train, x_test, y_test):

        for i in range(len(y_train)):
            for j in range(len(y_train[i])):
                y_train[i][j] = y_train[i][j].replace('B-', '')
                y_train[i][j] = y_train[i][j].replace('O-', '')
                y_train[i][j] = y_train[i][j].replace('I-', '')

        for i in range(len(y_test)):
            for j in range(len(y_test[i])):
                y_test[i][j] = y_test[i][j].replace('B-', '')
                y_test[i][j] = y_test[i][j].replace('O-', '')
                y_test[i][j] = y_test[i][j].replace('I-', '')

        labels = ['DOS', 'UNIT', 'FREQ', 'DUR', 'WHO']
        # labels = ['O-DOS', 'B-DOS', 'I-UNIT', 'B-UNIT', 'O-UNIT', 'I-FREQ', 'B-FREQ', 'O-FREQ', 'I-DUR', 'B-DUR', 'O-DUR', 'I-WHO', 'B-WHO', 'O-WHO']
        # labels = ['m', 'r', 'f', 'do', 'du', 'mo']
        crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                   max_iterations=100,
                                   all_possible_transitions=True)
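        # c1 and c2 are the CRF's L1 and L2 regularization coefficients; the search
        # below samples candidate values for them from exponential distributions.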
        params_space = {
            'c1': scipy.stats.expon(scale=0.5),
            'c2': scipy.stats.expon(scale=0.05),
        }

        # use the same metric for evaluation
        f1_scorer = make_scorer(metrics.flat_f1_score,
                                average='weighted',
                                labels=labels)

        # search
        rand_search = RandomizedSearchCV(crf,
                                         params_space,
                                         cv=3,
                                         verbose=1,
                                         n_jobs=-1,
                                         n_iter=50,
                                         scoring=f1_scorer)
        rand_search.fit(x_train, y_train)

        crf = rand_search.best_estimator_

        y_prediction = crf.predict(x_test)

        # group B and I results
        sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))

        joblib.dump(crf, 'model.pkl')

        precision = metrics.flat_precision_score(y_test,
                                                 y_prediction,
                                                 labels=sorted_labels,
                                                 average='micro')
        recall = metrics.flat_recall_score(y_test,
                                           y_prediction,
                                           labels=sorted_labels,
                                           average='micro')
        f1 = metrics.flat_f1_score(y_test,
                                   y_prediction,
                                   labels=sorted_labels,
                                   average='micro')

        print('MICRO')
        print(precision, recall, f1)

        precision = metrics.flat_precision_score(y_test,
                                                 y_prediction,
                                                 labels=sorted_labels,
                                                 average='macro')
        recall = metrics.flat_recall_score(y_test,
                                           y_prediction,
                                           labels=sorted_labels,
                                           average='macro')
        f1 = metrics.flat_f1_score(y_test,
                                   y_prediction,
                                   labels=sorted_labels,
                                   average='macro')

        print('MACRO')
        print(precision, recall, f1)

        return metrics.flat_classification_report(y_test,
                                                  y_prediction,
                                                  labels=sorted_labels,
                                                  digits=3)
Example #30
0
def test_flat_precision():
    score = metrics.flat_precision_score(y1, y2, average='micro')
    assert score == 3 / 5
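A minimal sketch (toy tag sequences, not the y1/y2 fixtures used by the test above) showing that the flat_* metrics chain the per-sentence sequences together and delegate to the corresponding scikit-learn scorer:

from itertools import chain

from sklearn.metrics import precision_score
from sklearn_crfsuite import metrics

toy_true = [['O', 'B-LOC', 'O'], ['B-PER', 'O']]
toy_pred = [['O', 'B-LOC', 'B-LOC'], ['B-PER', 'O']]

flat = metrics.flat_precision_score(toy_true, toy_pred, average='micro')
manual = precision_score(list(chain.from_iterable(toy_true)),
                         list(chain.from_iterable(toy_pred)),
                         average='micro')
assert abs(flat - manual) < 1e-9
print(flat, manual)  # both 0.8 for this toy example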
Example #31
0
                           max_iterations=100,
                           all_possible_transitions=True)
crf.fit(X_train, y_train)

labels = list(crf.classes_)
# labels

# In[67]:

y_pred = crf.predict(X_test)
overall_f1 = metrics.flat_f1_score(y_test,
                                   y_pred,
                                   average='weighted',
                                   labels=labels)
overall_prec = metrics.flat_precision_score(y_test,
                                            y_pred,
                                            average='weighted',
                                            labels=labels)
overall_recall = metrics.flat_recall_score(y_test,
                                           y_pred,
                                           average='weighted',
                                           labels=labels)

print("Overall F1:", overall_f1)
print("Overall Precision:", overall_prec)
print("Overall Recall", overall_recall)

# Inspect per-class results in more detail:

print(
    metrics.flat_classification_report(y_test, y_pred, labels=labels,
                                       digits=3))
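# Hedged follow-up (not part of the original script): if crf.classes_ contains an
# 'O' (outside) tag, dropping it from the label list keeps the per-class report
# focused on the entity classes.
entity_labels = [label for label in labels if label != 'O']
print(
    metrics.flat_classification_report(y_test, y_pred, labels=entity_labels,
                                       digits=3))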