Example #1
def test_ignore_ns():
    feat_map = DictFeatMap()

    seqs, labels = read_vw_seq(vw_filename("ns.vw"), feat_map)
    assert b"1^a" in feat_map.feat2index_

    feat_map = DictFeatMap()
    seqs, labels = read_vw_seq(vw_filename("ns.vw"), feat_map, ignore=["1"])
    assert b"1^a" not in feat_map.feat2index_

    # Longer namespaces
    feat_map = DictFeatMap()
    seqs, labels = read_vw_seq(vw_filename("ns.vw"), feat_map, ignore=["3"])
    assert not any(key.startswith(b"3xx") for key in feat_map.feat2index_.keys())
Example #2
def test_ignore_ns():
    feat_map = DictFeatMap()

    seqs, labels = read_vw_seq(vw_filename('ns.vw'), feat_map)
    assert '1^a' in feat_map.feat2index_

    feat_map = DictFeatMap()
    seqs, labels = read_vw_seq(vw_filename('ns.vw'), feat_map, ignore=['1'])
    assert '1^a' not in feat_map.feat2index_

    # Longer namespaces
    feat_map = DictFeatMap()
    seqs, labels = read_vw_seq(vw_filename('ns.vw'), feat_map, ignore=['3'])
    assert not any(key.startswith('3xx') for key in feat_map.feat2index_.keys())
Example #3
def test_ignore_ns():
    feat_map = DictFeatMap()

    seqs, labels = read_vw_seq(vw_filename('ns.vw'), feat_map)
    assert b'1^a' in feat_map.feat2index_

    feat_map = DictFeatMap()
    seqs, labels = read_vw_seq(vw_filename('ns.vw'), feat_map, ignore=['1'])
    assert b'1^a' not in feat_map.feat2index_

    # Longer namespaces
    feat_map = DictFeatMap()
    seqs, labels = read_vw_seq(vw_filename('ns.vw'), feat_map, ignore=['3'])
    assert not any(
        key.startswith(b'3xx') for key in feat_map.feat2index_.keys())
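
The three variants above exercise namespace filtering: features are stored under keys of the form namespace^feature (for instance 1^a), and the ignore argument drops every namespace whose name begins with the given one-character prefix, including longer names such as 3xx. As a hedged sketch only, a line of the ns.vw fixture might look like the following, assuming standard Vowpal Wabbit syntax where |name opens a namespace (the actual fixture is not part of this listing):

    A |1 a b |2 c |3xx d
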
Example #4
def test_importance_weights():
    seqs, labels = read_vw_seq(vw_filename('importance.vw'), DictFeatMap())

    eq_(len(seqs), 1, "One example")
    eq_(len(seqs[0]), 2, "Two tokens")

    imp1, imp2 = seqs[0].importance_weights
    eq_(imp1, 2.0)
    eq_(imp2, 1.0)
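
The importance weight is the optional number that follows the label in a VW line; tokens without an explicit weight default to 1.0. A hedged sketch of an importance.vw fixture consistent with the assertions above, with one token per line and hypothetical feature names:

    A 2.0 |w the
    B |w cat
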
Example #5
def test_weighted_features():
    feat_map = DictFeatMap()
    seqs, labels = read_vw_seq(vw_filename('weighted.vw'), feat_map)

    expected = {'1^a': 3, '1^b': -3, '1^c': 2.5, '1^d': 1, '1^e': 1E6}

    # Map feature ids to names
    index2feat = {idx: feat_name for feat_name, idx in feat_map.feat2index_.items()}
    lookup = {index2feat[index]: val for index, val in seqs[0].features}

    for key, val in expected.items():
        assert key in lookup
        eq_(lookup[key], val)
Example #6
def test_weighted_features():
    feat_map = DictFeatMap()
    seqs, labels = read_vw_seq(vw_filename("weighted.vw"), feat_map)

    expected = {b"1^a": 3, b"1^b": -3, b"1^c": 2.5, b"1^d": 1, b"1^e": 1e6}

    # Map feature ids to names
    index2feat = {idx: feat_name for feat_name, idx in feat_map.feat2index_.items()}
    lookup = {index2feat[index]: val for index, val in seqs[0].features}

    for key, val in expected.items():
        assert key in lookup
        eq_(lookup[key], val)
Example #7
def test_weighted_features():
    feat_map = DictFeatMap()
    seqs, labels = read_vw_seq(vw_filename('weighted.vw'), feat_map)

    expected = {b'1^a': 3, b'1^b': -3, b'1^c': 2.5, b'1^d': 1, b'1^e': 1E6}

    # Map feature ids to names
    index2feat = {
        idx: feat_name
        for feat_name, idx in feat_map.feat2index_.items()
    }
    lookup = {index2feat[index]: val for index, val in seqs[0].features}

    for key, val in expected.items():
        assert key in lookup
        eq_(lookup[key], val)
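
These variants check per-feature weights, which VW writes as feature:value; a feature without an explicit value defaults to 1. A hedged, hypothetical sketch of a weighted.vw line that would produce the expected mapping above:

    A |1 a:3 b:-3 c:2.5 d e:1e6
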
Example #8
def test_cs_weights():
    seqs, labels = read_vw_seq(vw_filename('cs.vw'), DictFeatMap())
    eq_(len(seqs), 1)
    eq_(set(labels), set(['A', 'B', 'C']))

    label_costs = seqs[0].label_costs

    # First token
    token_a = label_costs[0]
    eq_(labels[token_a[0][0]], 'C')
    eq_(token_a[0][1], 0.5)

    # Second token
    token_b = label_costs[1]
    eq_(labels[token_b[0][0]], 'A')
    eq_(token_b[0][1], 0.1)

    eq_(labels[token_b[1][0]], 'B')
    eq_(token_b[1][1], 0.2)
Example #9
def test_cs_weights():
    seqs, labels = read_vw_seq(vw_filename('cs.vw'), DictFeatMap())
    eq_(len(seqs), 1)
    eq_(set(labels), set([b'A', b'B', b'C']))

    label_costs = seqs[0].label_costs

    # First token
    token_a = label_costs[0]
    eq_(labels[token_a[0][0]], b'C')
    eq_(token_a[0][1], 0.5)

    # Second token
    token_b = label_costs[1]
    eq_(labels[token_b[0][0]], b'A')
    eq_(token_b[0][1], 0.1)

    eq_(labels[token_b[1][0]], b'B')
    eq_(token_b[1][1], 0.2)
Example #10
def test_cs_weights():
    seqs, labels = read_vw_seq(vw_filename("cs.vw"), DictFeatMap())
    eq_(len(seqs), 1)
    eq_(set(labels), set([b"A", b"B", b"C"]))

    label_costs = seqs[0].label_costs

    # First token
    token_a = label_costs[0]
    eq_(labels[token_a[0][0]], b"C")
    eq_(token_a[0][1], 0.5)

    # Second token
    token_b = label_costs[1]
    eq_(labels[token_b[0][0]], b"A")
    eq_(token_b[0][1], 0.1)

    eq_(labels[token_b[1][0]], b"B")
    eq_(token_b[1][1], 0.2)
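
Cost-sensitive VW lines list one or more label:cost pairs in place of a single label, so each token can carry costs for several candidate labels. A hedged sketch of a cs.vw fixture matching the assertions above (two tokens, hypothetical feature names):

    C:0.5 |w token1
    A:0.1 B:0.2 |w token2
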
Example #11
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    parser = argparse.ArgumentParser(description="""Structured perceptron tagger.""")
    parser.add_argument('--train', help="Training data (vw format).")
    parser.add_argument('--test', help="Test data (vw format).")
    parser.add_argument('--hash-bits', '-b', help="Size of feature vector in bits (2**b).", type=int)
    parser.add_argument('--passes', help="Number of passes over the training set.", type=int, default=5)
    parser.add_argument('--predictions', '-p', help="File for outputting predictions.")
    parser.add_argument('--ignore', help="One-character prefix of namespaces to ignore.", nargs='*', default=[])
    parser.add_argument('--quadratic', '-q', help="Combine features in these two namespaces, identified by a one-character prefix of their names. "
                                                  "':' is a short-hand for all namespaces.", nargs='*', default=[])
    parser.add_argument('--no-average', help="Do not average over all updates.", action='store_false',
                        dest='average', default=True)
    parser.add_argument('--no-ada-grad', help="Do not use adaptive gradient scaling.",
                        action='store_false', dest='ada_grad', default=True)
    parser.add_argument('--initial-model', '-i', help="Initial model from this file.")
    parser.add_argument('--base-weights', help="Use initial model as base weights.", action='store_true')
    parser.add_argument('--final-model', '-f', help="Save model here after training.")
    parser.add_argument('--cost-sensitive', '--cs', help="Cost-sensitive weight updates.", action='store_true')
    parser.add_argument('--l2-decay', help="Shrink weights by this factor after each update.", type=float)
    parser.add_argument('--append-test', help="Append test result as JSON object to this file.")
    parser.add_argument('--audit', help="Print the interpretation of the input files to standard out. "
                                        "Useful for debugging. ", action='store_true')
    parser.add_argument('--name', help="Identify this invocation by NAME (use in conjunction with --append-test).")
    parser.add_argument('--labels', help="Read the set of labels from this file.")
    parser.add_argument('--drop-out', help="Regularize by randomly removing features (with probability 0.1).", action='store_true')
    parser.add_argument('--confusion-scaling', help="Scale updates by the values in the confusion matrix C.\n"
                                                    "With gold tag i and prediction j, the scaling is C[i, j]. "
                                                    "The matrix should be formatted as a CSV file whose rows and columns are labels.")


    args = parser.parse_args()

    if not (args.train or args.test):
        print("Error: Must specify either a training or a test file", file=sys.stderr)
        parser.print_usage()
        exit(1)


    timers = defaultdict(lambda: Timer())
    logging.info("Tagger started. \nCalled with {}".format(args))

    if args.hash_bits:
        feat_map = HashingFeatMap(args.hash_bits)
    else:
        feat_map = DictFeatMap()

    weight_updater = update_weights
    if args.cost_sensitive:
        weight_updater = update_weights_cs_sample

    if args.labels:
        labels = [line.strip() for line in open(args.labels)]
    else:
        labels = None

    if args.l2_decay:
        WV = ScaledWeightVector
    else:
        WV = WeightVector

    if args.initial_model:
        wt = WV.load(join(args.initial_model, 'transition.npz'), l2_decay=args.l2_decay)
        we = WV.load(join(args.initial_model, 'emission.npz'), l2_decay=args.l2_decay)

        labels = list(np.load(join(args.initial_model, 'labels.npy')))
        if not args.hash_bits:
            pickle_filename = join(args.initial_model, 'feature_map.pickle')
            feat_map.feat2index_ = pickle.load(open(pickle_filename, 'rb'))

        if args.base_weights:
            wt.base = wt.w
            wt.w = np.zeros_like(wt.w)
            we.base = we.w
            we.w = np.zeros_like(we.w)

    train = None
    if args.train:
        train, train_labels = read_vw_seq(args.train, ignore=args.ignore, quadratic=args.quadratic, feat_map=feat_map,
                                          labels=labels, audit=args.audit, require_labels=True)
        if args.initial_model:
            assert len(labels) == len(train_labels), \
                "Labels from training data not found in saved model: {}".format(set(train_labels) - set(labels))
        labels = train_labels
        logging.info("Training data {} sentences {} labels".format(len(train), len(train_labels)))

    # Prevents the addition of new features when loading the test set
    feat_map.freeze()
    test = None
    if args.test:
        test, test_labels = read_vw_seq(args.test, ignore=args.ignore, quadratic=args.quadratic, feat_map=feat_map,
                                        labels=labels, audit=args.audit)
        if args.initial_model:
            assert len(labels) == len(test_labels), \
                "Labels from test data not found in saved model: {}".format(set(test_labels) - set(labels))
        labels = test_labels
        logging.info("Test data {} sentences {} labels".format(len(test), len(test_labels)))

    n_labels = len(labels)
    if not args.hash_bits:
        feat_map.n_labels = n_labels

    # Loading weights
    if not args.initial_model:
        wt = WV((n_labels + 2, n_labels + 2),
                ada_grad=args.ada_grad, l2_decay=args.l2_decay)
        we = WV(feat_map.n_feats(),
                ada_grad=args.ada_grad, l2_decay=args.l2_decay)

    logging.info("Weight vector sizes. Transition={}. Emission={}".format(wt.dims, we.dims))

    # Load confusion scaling dataset
    if args.confusion_scaling:
        confusion_scaling_pd = pd.read_csv(args.confusion_scaling, index_col=0, encoding='utf-8')
        assert (confusion_scaling_pd.index == confusion_scaling_pd.columns).all(), \
            "Confusion scaling matrix should be square and have identical row and column names"
        # Select rows and columns in label order; labels missing from the matrix default to a scaling of 1
        confusion_scaling = confusion_scaling_pd.reindex(index=labels, columns=labels).fillna(1).values
        logging.info("Confusion scaling. Matrix specifies {} overlapping labels with mean scaling factor {:.3f}".format(
            len(set(labels) & set(confusion_scaling_pd.index)),
            confusion_scaling.mean()))
        weight_updater = update_weights_confusion


    # Corruption
    if args.train and args.drop_out:
        corrupter = FastBinomialCorruption(0.1, feat_map, n_labels)
        # corrupter = RecycledDistributionCorruption(inverse_zipfian_sampler, feat_map, n_labels)
        # corrupter = AdversialCorruption(0.1, feat_map, n_labels)

    # Only one decoder supported at the moment
    Viterbi = ViterbiStd

    def do_train(transition, emission):
        vit = Viterbi(n_labels, transition, emission, feat_map)

        timers['train'].begin()
        n_updates = 0
        for epoch in range(1, args.passes+1):
            for sent in train:
                if args.drop_out:
                    corrupter.corrupt_sequence(sent, emission, transition)
                vit.decode(sent)
                if args.confusion_scaling:
                    weight_updater(sent, transition, emission, 0.1, n_labels, feat_map, confusion_scaling)
                else:
                    weight_updater(sent, transition, emission, 0.1, n_labels, feat_map)

                n_updates += 1
                transition.update_done()
                emission.update_done()
                # print("Scaling", emission.scaling)

                if n_updates % 1000 == 0:
                    print('\r[{}] {}k sentences total'.format(epoch, n_updates / 1000), file=sys.stderr)


            epoch_msg = "[{}] train loss={:.4f} ".format(epoch, avg_loss(train))
            print("\r{}{}".format(epoch_msg, " "*72), file=sys.stderr)

        timers['train'].end()

        # Rescale
        transition.rescale()
        emission.rescale()

        if args.average:
            transition.average()
            emission.average()

        tokens_trained = epoch * sum(len(seq) for seq in train)
        print("Training took {:.2f} secs. {} words/sec".format(timers['train'].elapsed(),
                                                                             int(tokens_trained / timers['train'].elapsed())),
              file=sys.stderr)
    def do_test(transition, emission):
        vit = Viterbi(n_labels, transition, emission, feat_map)

        timers['test'].begin()
        with open(args.predictions or os.devnull, 'w') as out:
            labels_map = dict((i, label_str) for i, label_str in enumerate(test_labels))
            for sent_i, sent in enumerate(test):
                if sent_i > 0:
                    print("", file=out)
                vit.decode(sent)
                for example_id, gold_label, pred_label in zip(sent.ids, sent.gold_labels, sent.pred_labels):
                    print("{}\t{}\t{}".format(example_id, labels_map.get(gold_label, 'UNKNOWN'), labels_map[pred_label]), file=out)


        timers['test'].end()
        logging.info("Accuracy: {:.3f}".format(accuracy(test)))
        print("Test took {:.2f} secs. {} words/sec".format(timers['test'].elapsed(),
                                                                             int(sum(len(seq) for seq in test) / timers['test'].elapsed())),
              file=sys.stderr)

        if args.append_test:
            with open(args.append_test, 'a') as result_file:
                result = {'accuracy': accuracy(test), 'name': args.name}
                result.update(args.__dict__)
                json.dump(result, result_file)
                print("", file=result_file)


    # Training
    if args.train:
        do_train(wt, we)
        # Ensemble training
        # if args.drop_out:
        #     wt_total = np.zeros_like(wt.w)
        #     we_total = np.zeros_like(we.w)
        #     for i in range(25):
        #         dropout_groups(train, group_sizes)
        #         wt_round = wt.copy()
        #         we_round = we.copy()
        #
        #         do_train(wt_round, we_round)
        #         we_total += we_round.w
        #         wt_total += wt_round.w
        #
        #     wt = WeightVector(wt.dims, w=wt_total)
        #     we = WeightVector(we.dims, w=we_total)

    # Testing
    if args.test:
        do_test(wt, we)

    # Save model
    if args.final_model:
        if not exists(args.final_model):
            os.makedirs(args.final_model)

        wt.save(join(args.final_model, 'transition.npz'))
        we.save(join(args.final_model, 'emission.npz'))
        np.save(join(args.final_model, 'labels'), labels)
        json.dump(args.__dict__, open(join(args.final_model, 'settings.json'), 'w'))

        if not args.hash_bits:
            with open(join(args.final_model, 'feature_map.pickle'), 'wb') as out:
                pickle.dump(feat_map.feat2index_, out)
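
A hedged usage sketch of the command-line interface defined in main(); the script name tagger.py and the data paths are hypothetical, while the flags are the ones registered with argparse above:

    # Train with 2**22 hashed features, evaluate, and save predictions and the model.
    python tagger.py --train train.vw --test test.vw -b 22 --passes 10 \
        --predictions predictions.tsv --final-model model_dir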