Example #1
import csv
import itertools
import os
import random

import mxnet as mx
import networkx as nx
import numpy as np
import spacy
import tqdm

# project-local helpers assumed to be importable from this repository:
# pipeline, mxnet_utils, Net, prepare_data_for_net, evaluate_on_test_data,
# download_embeddings, customize_tokenizer, LABELS_MAP, EMBEDDINGS_PATHS

def main(opts):
    # set a seed for reproducible runs
    random.seed(42)

    labels_map = opts.labels_map
    if not labels_map:
        # get a copy of the labels if not provided
        labels_map = dict(LABELS_MAP)

    if not opts.include_negative_samples:
        # drop the "NONE" label when negative samples are not
        # included; the index remap assumes "NONE" maps to 0
        labels_map.pop('NONE')
        labels_map = {k: v - 1 for k, v in labels_map.items()}

    # load the dataset
    dataset = pipeline.load_abstracts_relations(opts.subtask)

    # get list of all dependency tags used in the dataset
    dependencies_map = pipeline.get_dependencies_map(dataset)

    # get list of all pos tags used in the dataset
    pos_map = pipeline.get_part_of_speech_map(dataset)

    # split it by sentence, potentially include negative samples
    sentences_dataset = pipeline.split_dataset_into_sentences(
        *dataset, include_negative_samples=opts.include_negative_samples)

    # split sentences between train and test according to the
    # official dataset split
    train_sentences, validation_sentences = pipeline.split_train_test_sentences(
        opts.subtask, sentences_dataset)

    test_dataset = pipeline.load_abstracts_relations(opts.subtask,
                                                     load_test=True)
    test_sentences = pipeline.split_dataset_into_sentences(
        *test_dataset, include_negative_samples=opts.include_negative_samples)

    if opts.evaluate_output:
        evaluate_dataset = pipeline.load_abstracts_relations(opts.subtask,
                                                             load_test=True)
        evaluate_sentences_dataset = pipeline.split_dataset_into_sentences(
            *evaluate_dataset,
            include_negative_samples=opts.include_negative_samples)
    else:
        # so that static code analyzers don't freak out!
        evaluate_sentences_dataset = None

    # get distribution info for entities in training set
    ent_distr = pipeline.get_distribution_ent_length(train_sentences)

    # get the mxnet context (aka cpu or gpu) as
    # provided by the user. if none is provided, use cpu0
    context = mxnet_utils.get_context_from_string(opts.mxnet_context)

    # path to embeddings file in word2vec text format
    # as specified by the user
    embeddings_path = os.path.expanduser(
        EMBEDDINGS_PATHS[opts.embeddings_type])

    # download embeddings from Google Drive
    embeddings_path = download_embeddings(embeddings_path,
                                          opts.embeddings_type)

    # execute mxnet operations in the specified context
    with context:
        # load embeddings and vocabulary
        vocabulary, embeddings = \
            mxnet_utils.word2vec_mxnet_embedding_initializer(
                embeddings_path, max_embeddings=opts.max_embeddings
            )

        # get training data; has to be executed after vocabulary and
        # embeddings (which need to be placed on the GPU if specified,
        # hence the context) are loaded.
        train_data = prepare_data_for_net(
            vocabulary,
            train_sentences,
            labels_map,
            dependencies_map=dependencies_map,
            pos_map=pos_map,
            include_entities_nodes=opts.include_entities_nodes,
            include_entities_children=opts.include_entities_children,
            entity_length_distribution=ent_distr,
            case_sensitive=opts.case_sensitive)

        # doing the same thing, but with test data
        test_data = prepare_data_for_net(
            vocabulary,
            test_sentences,
            labels_map,
            dependencies_map=dependencies_map,
            pos_map=pos_map,
            include_entities_children=opts.include_entities_children,
            include_entities_nodes=opts.include_entities_nodes,
            entity_length_distribution=ent_distr,
            case_sensitive=opts.case_sensitive)

        # doing the same thing, but with validation data
        validation_data = prepare_data_for_net(
            vocabulary,
            validation_sentences,
            labels_map,
            dependencies_map=dependencies_map,
            pos_map=pos_map,
            include_entities_children=opts.include_entities_children,
            include_entities_nodes=opts.include_entities_nodes,
            entity_length_distribution=ent_distr,
            case_sensitive=opts.case_sensitive)

        # get stats about the average size of the parse trees; the
        # subtree is the sixth element of each sample (see the
        # training loop below)
        parse_tree_lengths = [
            len(t) for *_, t, _, _ in itertools.chain(train_data, test_data)
        ]
        print('[info] parse tree length: {:.2f} +/- {:.2f}'.format(
            np.mean(parse_tree_lengths), np.std(parse_tree_lengths)))

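        # the fifth element of each sample holds every token's distance
        # from the subtree root, so its maximum over the training data
        # (+1) bounds the tree height feature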
        max_tree_height = max(max(t[4]) for t in train_data) + 1
        max_tree_height = (max_tree_height
                           if 'height' in opts.extra_features else 0)

        dependencies_num = \
            len(dependencies_map) if 'dep' in opts.extra_features else 0

        pos_num = \
            len(pos_map) if 'pos' in opts.extra_features else 0

        include_ent_len = 'ent-len' in opts.extra_features

        net = Net(embeddings,
                  len(labels_map),
                  dropout=opts.dropout,
                  trainable_embeddings=opts.trainable_embeddings,
                  dependencies_num=dependencies_num,
                  part_of_speech_num=pos_num,
                  include_ent_len=include_ent_len,
                  max_tree_height=max_tree_height)
        net.initialize()

        # loss and trainer initialized here
        softmax_cross_entropy_labels = mx.gluon.loss.SoftmaxCrossEntropyLoss()

        trainer = mx.gluon.Trainer(net.collect_params(), 'adam',
                                   {'learning_rate': opts.learning_rate})

        # object to calculate F1 metric for the dataset
        f1_score_class = mxnet_utils.F1Score(num_classes=len(labels_map))

        for epoch in range(1, opts.epochs + 1):

            # random.shuffle(train_data)

            cumulative_loss = total_steps = 0
            probs, labels = [], []

            for sample in tqdm.tqdm(train_data, desc='Epoch {}'.format(epoch)):
                with mx.autograd.record():
                    (
                        tokens,  # the tokens in sentence
                        deps,  # dependency tags
                        pos,  # part of speech tags
                        ent_lens,  # length of the input entities
                        dist_from_tree,  # distance from root of subtree
                        tree,  # the subtree
                        entities,  # indication for entity location
                        label  # the label for this sample
                    ) = sample

                    tokens = mx.nd.array(tokens)
                    entities = mx.nd.array(entities)
                    idx = mx.nd.array([tree.idx])
                    adj = mx.nd.array(tree.to_array())
                    deps = mx.nd.array(deps)
                    pos = mx.nd.array(pos)
                    ent_lens = mx.nd.array(ent_lens)
                    dist_from_tree = mx.nd.array(dist_from_tree)

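                    # forward pass; the trailing boolean is assumed to
                    # flag training mode (e.g. to enable dropout)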
                    out = net(tokens, deps, pos, ent_lens, dist_from_tree, adj,
                              entities, idx, True)

                    probs.append(out)
                    labels.append([label])

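                # once a full batch of outputs has accumulated, compute
                # the loss and take one optimizer step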
                if len(probs) == opts.batch_size:
                    total_steps += opts.batch_size

                    with mx.autograd.record():
                        probs = mx.nd.concat(*probs, dim=0)
                        labels = mx.nd.array(labels)
                        loss = softmax_cross_entropy_labels(probs, labels)

                        if opts.include_negative_samples:
                            # up-weight (10x) the loss of samples the net
                            # currently predicts as NONE (class 0)
                            factor = (mx.nd.argmax(probs, axis=1) == 0) * 9 + 1
                            loss = mx.nd.multiply(loss, factor)

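                    # backpropagate and update; step() normalizes the
                    # accumulated gradients by the batch size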
                    loss.backward()
                    trainer.step(opts.batch_size)
                    cumulative_loss += mx.nd.sum(loss).asscalar()

                    pred_labels = mx.nd.argmax(probs, axis=1)
                    f1_score_class.update(preds=pred_labels, labels=labels)

                    probs, labels = [], []

            # get precision, recall, and F1 score (averaged across
            # classes) on the training set for this epoch
            prec, recall, f1 = map(
                lambda arr: mx.nd.mean(arr).asscalar() * 100,
                f1_score_class.get())

            # also calculate average loss
            avg_loss = cumulative_loss / total_steps

            # print everything
            msg = ('Epoch {e} // training data // avg_loss={l:.4f}\n'
                   'Classification: P={p:.2f}  R={r:.2f}  F1={f:.2f}').format(
                       e=epoch, l=avg_loss, p=prec, r=recall, f=f1)
            print(msg)

            if opts.validate_every > 0 and epoch % opts.validate_every == 0:

                if opts.error_analysis_path:
                    p = '{}{}.{}.txt'.format(
                        os.path.splitext(opts.error_analysis_path)[0], 'val',
                        epoch)
                else:
                    p = None

                evaluate_on_test_data(
                    net,
                    validation_sentences,
                    validation_data,
                    labels_map,
                    output_for_error_analysis=p,
                )

            if opts.test_every > 0 and epoch % opts.test_every == 0:

                if opts.error_analysis_path:
                    p = '{}{}.{}.txt'.format(
                        os.path.splitext(opts.error_analysis_path)[0], 'test',
                        epoch)
                else:
                    p = None

                evaluate_on_test_data(
                    net,
                    test_sentences,
                    test_data,
                    labels_map,
                    output_for_error_analysis=p,
                )

        if opts.evaluate_output:
            evaluate_data = prepare_data_for_net(
                vocabulary,
                evaluate_sentences_dataset,
                labels_map,
                dependencies_map=dependencies_map,
                pos_map=pos_map,
                include_entities_children=opts.include_entities_children,
                include_entities_nodes=opts.include_entities_nodes,
                entity_length_distribution=ent_distr,
                case_sensitive=opts.case_sensitive)

            evaluate_on_test_data(net,
                                  evaluate_sentences_dataset,
                                  evaluate_data,
                                  labels_map,
                                  evaluate_output=opts.evaluate_output)
    # load a topic hierarchy as a directed graph from a CSV of
    # (parent, child) pairs; `path` is assumed to be defined elsewhere
    # (e.g. the hierarchy CSV referenced in __main__ below)
    graph = nx.DiGraph()

    with open(path) as f:
        rd = csv.reader(f)
        for k1, k2 in rd:
            graph.add_edge(k1, k2)


if __name__ == '__main__':
    subtask = '1.1'
    oth_path = '/home/ls/blue-hd/datasets/saffron-hierarchies-acl/saffron-ACL-cleaned.csv'

    nlp = spacy.load('en')
    customize_tokenizer(nlp)

    dataset = pipeline.load_abstracts_relations(subtask)

    # get list of all dependency tags used in the dataset
    dependencies_map = pipeline.get_dependencies_map(dataset)

    # get list of all pos tags used in the dataset
    pos_map = pipeline.get_part_of_speech_map(dataset)

    # split it by sentence, potentially include negative samples
    sentences_dataset = pipeline.split_dataset_into_sentences(*dataset)

    # split sentences between train and test according to the
    # official dataset split
    train_sentences, _ = pipeline.split_train_test_sentences(
        subtask, sentences_dataset)