Exemplo n.º 1
0
def train(train_corpus: str,
          dev_corpus: str,
          c1: float = 0.0,
          c2: float = 0.0,
          algorithm: str = 'lbfgs',
          max_iterations: int = 100,
          all_possible_transitions: bool = False,
          window_size: int = 1,
          model_filename: str = None,
          _run: Run = None,
          _log: logger = None):
    """
    Fit a CRF on the training corpus and score it on the dev corpus.

    Both corpus paths are registered as resources on ``_run``. Overall
    f1/precision/recall and every per-type metric returned by ``evaluate``
    are stored in ``_run.info``, logged as scalars and (except overall
    precision/recall) written to ``_log``.

    Args:
        train_corpus: Path handed to ``get_tagged_sents_and_words`` for the
            training sentences.
        dev_corpus: Path handed to ``get_tagged_sents_and_words`` for the
            evaluation sentences.
        c1: Forwarded to ``sklearn_crfsuite.CRF``.
        c2: Forwarded to ``sklearn_crfsuite.CRF``.
        algorithm: Forwarded to ``sklearn_crfsuite.CRF``.
        max_iterations: Forwarded to ``sklearn_crfsuite.CRF``.
        all_possible_transitions: Forwarded to ``sklearn_crfsuite.CRF``.
        window_size: Second argument of ``sent2features``.
        model_filename: Forwarded to ``sklearn_crfsuite.CRF``; when not
            ``None`` the fitted estimator is also dumped with joblib to
            ``<model_filename>.pkl`` and added to ``_run`` as an artifact.
        _run: Run object receiving resources, info entries, scalars and
            artifacts.
        _log: Logger receiving the progress messages.
    """
    for corpus_path in (train_corpus, dev_corpus):
        _run.add_resource(corpus_path)

    # Only the tagged sentences are needed; the second return value is unused.
    train_sents, _train_words = get_tagged_sents_and_words(train_corpus)
    dev_sents, _dev_words = get_tagged_sents_and_words(dev_corpus)

    X_train = [sent2features(sent, window_size) for sent in train_sents]
    y_train = [sent2labels(sent) for sent in train_sents]

    X_dev = [sent2features(sent, window_size) for sent in dev_sents]
    y_dev = [sent2labels(sent) for sent in dev_sents]

    crf_params = {
        'algorithm': algorithm,
        'c1': c1,
        'c2': c2,
        'max_iterations': max_iterations,
        'all_possible_transitions': all_possible_transitions,
        'model_filename': model_filename,
    }
    crf = sklearn_crfsuite.CRF(**crf_params)
    crf.fit(X_train, y_train)

    overall, by_type = evaluate(y_dev, crf.predict(X_dev))

    # (name recorded on the run, attribute read from ``overall``)
    overall_metrics = (
        ('overall_f1', 'f1_score'),
        ('overall_precision', 'precision'),
        ('overall_recall', 'recall'),
    )
    for run_key, attr in overall_metrics:
        score = getattr(overall, attr)
        _run.info[run_key] = score
        _run.log_scalar(run_key, score)
    _log.info(f'Overall F1 score: {overall.f1_score}')

    # Per-type metrics, visited in sorted key order.
    for type_key in sorted(by_type.keys()):
        type_scores = by_type[type_key]
        for field in type_scores._fields:
            score = getattr(type_scores, field)
            run_key = f'{type_key}-{field}'
            _run.info[run_key] = score
            _run.log_scalar(run_key, score)
            _log.info(f'{run_key}: {score}')

    if model_filename is None:
        return

    pickle_path = f'{model_filename}.pkl'
    _log.info(f'saving to: {pickle_path}')
    joblib.dump(crf, pickle_path)
    _run.add_artifact(pickle_path)
Exemplo n.º 2
0
def tag_to_json(tagger, text, sep="\n", window_size=0):
    """
    Tag ``text`` segment by segment and collect the non-"O" spans.

    The text is split on ``sep``. Each non-empty segment is tokenised with
    ``word_tokenize``, featurised with ``sent2features`` and passed to
    ``tagger.predict``; ``tag`` then turns the prediction into spans whose
    segment-relative ``start``/``end`` are shifted to offsets in the whole
    text.

    Args:
        tagger: Object exposing ``predict(features)``.
        text: The document, either UTF-8 encoded ``bytes`` or an already
            decoded ``str``.
        sep: Separator used both to split ``text`` into segments and to
            account for the characters between segments when computing
            offsets.
        window_size: Second argument of ``sent2features``.

    Returns:
        A dict keyed by a running integer (0, 1, ...) whose values are
        ``{'type': tagname, 'offsets': ((start, end),), 'texts': (str,)}``.
    """
    # Decode only when needed so callers may pass either bytes or str.
    if isinstance(text, (bytes, bytearray)):
        text = text.decode("utf-8")

    annotations = {}

    def _add_ann(start, end, _type):
        annotations[len(annotations)] = {
            'type': _type,
            'offsets': ((start, end), ),
            'texts': ((text[start:end]), ),
        }

    offset = 0
    for sent in text.split(sep):
        if sent:
            # NOTE(review): features come from word_tokenize(sent) while the
            # spans are built from sent.split(); confirm both tokenisations
            # yield the same number of tokens for the expected input.
            x_feat = sent2features(word_tokenize(sent), window_size)
            result = tagger.predict(x_feat)
            for span in tag(sent, sent.split(), result):
                if span["tagname"] != "O":
                    _add_ann(offset + int(span["start"]),
                             offset + int(span["end"]),
                             span["tagname"])
        # Advance for every segment, including empty ones produced by
        # consecutive separators, so later offsets stay aligned with ``text``.
        offset += len(sent) + len(sep)

    return annotations
Exemplo n.º 3
0
def test(model_filename: str,
         test_corpus: str,
         window_size: int = 5,
         _run: Run = None,
         _log: logger = None):
    """
    Score a stored CRF model on the test corpus.

    The test corpus and ``<model_filename>.pkl`` are registered as resources
    on ``_run``. Overall f1/precision/recall and every per-type metric
    returned by ``evaluate`` are stored in ``_run.info``, logged as scalars
    and (except overall precision/recall) written to ``_log``.

    Args:
        model_filename: Passed to ``sklearn_crfsuite.CRF`` as
            ``model_filename``.
        test_corpus: Path handed to ``get_tagged_sents_and_words``.
        window_size: Second argument of ``sent2features``; presumably it has
            to match the value used when the model was trained (confirm).
        _run: Run object receiving resources, info entries and scalars.
        _log: Logger receiving the progress messages.
    """
    pickle_path = f'{model_filename}.pkl'
    _run.add_resource(test_corpus)
    _run.add_resource(pickle_path)

    # Only the tagged sentences are needed; the second return value is unused.
    test_sents, _test_words = get_tagged_sents_and_words(test_corpus)

    X_test = [sent2features(sent, window_size) for sent in test_sents]
    y_test = [sent2labels(sent) for sent in test_sents]

    # NOTE(review): the message names the .pkl file, but the estimator below
    # is built from ``model_filename`` itself, not from the pickle. Confirm
    # this is intended.
    _log.info(f'load from: {pickle_path}')
    crf = sklearn_crfsuite.CRF(model_filename=model_filename)

    overall, by_type = evaluate(y_test, crf.predict(X_test))

    # (name recorded on the run, attribute read from ``overall``)
    overall_metrics = (
        ('overall_f1', 'f1_score'),
        ('overall_precision', 'precision'),
        ('overall_recall', 'recall'),
    )
    for run_key, attr in overall_metrics:
        score = getattr(overall, attr)
        _run.info[run_key] = score
        _run.log_scalar(run_key, score)
    _log.info(f'Overall F1 score: {overall.f1_score}')

    # Per-type metrics, visited in sorted key order.
    for type_key in sorted(by_type.keys()):
        type_scores = by_type[type_key]
        for field in type_scores._fields:
            score = getattr(type_scores, field)
            run_key = f'{type_key}-{field}'
            _run.info[run_key] = score
            _run.log_scalar(run_key, score)
            _log.info(f'{run_key}: {score}')
Exemplo n.º 4
0
def print_corpus(corpus: List,
                 labels: List,
                 fileout: str,
                 window_size: int = 0):
    """
    Write the corpus as tab-separated ``tag<TAB>name=value...`` lines.

    One line is printed per (tag, feature dict) pair, and an empty line is
    printed after each sentence. When the whole corpus has been written an
    info message with ``fileout.name`` is logged.

    Args:
        corpus: Iterable of tagged sentences accepted by ``sent2features``
            and ``sent2partial_labels``.
        labels: Passed to ``sent2partial_labels`` as ``labels``.
        fileout: Despite the ``str`` annotation this is used as an open,
            writable file-like object (``print(..., file=fileout)``) that
            also exposes a ``name`` attribute.
        window_size: Second argument of ``sent2features``.
    """
    for sentence in corpus:
        token_feats = sent2features(sentence, window_size)
        token_tags = sent2partial_labels(sentence, labels=labels)
        for label, feat_map in zip(token_tags, token_feats):
            columns = [f'{name}={value}' for name, value in feat_map.items()]
            # The tab after the tag is emitted even when there are no columns.
            line = f'{label}\t' + '\t'.join(columns)
            print(line, file=fileout)
        # Sentence boundary.
        print('', file=fileout)
    logging.info(f"print corpus to {fileout.name}")
Exemplo n.º 5
0
                        help='window sizes')
    parser.add_argument('--features', default='default', choices=['default', 'stanford'], \
                        type=str, help='features prep, default, or stanford-ner')
    parser.add_argument('--encoding', default='utf-8', help='file encoding')
    args = parser.parse_args()

    pre = ['B', 'I']
    ent = ['Person', 'Place', 'Organisation']

    labels = [f'{x}-{y}' for x in pre for y in ent] + ['O']
    # print(labels)

    corpus, _ = get_tagged_sents_and_words(args.file)
    for tagged_sent in corpus:
        if args.features == 'default':
            feats = sent2features(tagged_sent, args.window_size)
            tags = sent2partial_labels(tagged_sent, labels=labels)
        elif args.features == 'stanford':
            feats = sent2stanfordfeats(tagged_sent)
            tags = sent2stanford_partial(tagged_sent, labels=labels)
        for tag, feature in zip(tags, feats):
            # feature = ['{}={}'.format(k, v) for k, v in feature.items()]
            feat = []
            for k, v in feature.items():
                if k.split(':')[0].isdigit():
                    weight_name = k.split(':')
                    feat.append('{}={}:{}'.format(
                        ''.join(weight_name[1:]).strip(':'), v,
                        weight_name[0]))
                else:
                    feat.append('{}={}'.format(k.strip(':'), v))