Пример #1
0
    # train with early stopping on validation set
    best_f1_test, best_f1_test_val = -numpy.inf, -numpy.inf
    s['clr'] = s['lr']  # learning rate

    for e in xrange(s['nepochs']):
        # shuffle
        shuffle([train_lex, train_y, train_feat], s['seed'])
        s['ce'] = e
        tic = time.time()

        for i in xrange(num_sentences):
            context_words = contextwin(
                train_lex[i], s['win']
            )  #list of list of indexes corresponding to context windows surrounding each word in the sentence
            words = map(lambda x: numpy.asarray(x).astype('int32'),
                        minibatch(context_words, s['bs']))
            features = minibatch(train_feat[i], s['bs'])

            labels = train_y[i]

            for word_batch, feature_batch, label_last_word in zip(
                    words, features, labels):
                rnn.train(word_batch, feature_batch, label_last_word, s['clr'])
                rnn.normalize()

            if s['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (e, (i+1)*100./num_sentences),\
                    'completed in %.2f (sec) <<\r' % (time.time()-tic),
                sys.stdout.flush()

        # evaluation // back into the real world : idx -> words
Пример #2
0
    # create a folder for store the models
    if not os.path.exists(model_folder): os.mkdir(model_folder)

    # train with early stopping on validation set
    best_f1_test, best_f1_test_val = -numpy.inf, -numpy.inf
    s['clr'] = s['lr'] # learning rate

    for e in xrange(s['nepochs']):
        # shuffle
        shuffle([train_lex, train_y, train_feat], s['seed'])
        s['ce'] = e
        tic = time.time()

        for i in xrange(num_sentences):
            context_words = contextwin(train_lex[i], s['win'])
            words = map(lambda x: numpy.asarray(x).astype('int32'), minibatch(context_words, s['bs']))
            features = minibatch(train_feat[i], s['bs'])

            labels   = train_y[i]

            for word_batch, feature_batch, label_last_word in zip(words, features, labels):
                rnn.train(word_batch, feature_batch, label_last_word, s['clr'])
                rnn.normalize()

            if s['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (e, (i+1)*100./num_sentences),\
                    'completed in %.2f (sec) <<\r' % (time.time()-tic),
                sys.stdout.flush()

        # evaluation // back into the real world : idx -> words
        predictions_test = [map(lambda x: idx2label[x],
Пример #3
0
                ne=vocsize,
                de=s['emb_dimension'],
                cs=s['win'])

    # train with early stopping on validation set
    best_f1 = -numpy.inf
    s['clr'] = s['lr']
    for e in range(s['nepochs']):
        # shuffle
        shuffle([train_lex, train_ne, train_y], s['seed'])
        s['ce'] = e
        tic = time.time()
        for i in range(nsentences):
            cwords = contextwin(train_lex[i], s['win'])
            words  = map(lambda x: numpy.asarray(x).astype('int32'),\
                         minibatch(cwords, s['bs']))
            labels = train_y[i]

            for word_batch, label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, s['clr'])
                rnn.normalize()

            if s['verbose']:
                print(
                    '[learning] epoch {} >> {:2.2f}, completed in {:.2f} (sec) '
                    .format(e, (i + 1) * 100. / nsentences,
                            time.time() - tic))

        # evaluation // back into the real world : idx -> words
        predictions_test = [ map(lambda x: idx2label[x], \
                             rnn.classify(numpy.asarray(contextwin(x, s['win'])).astype('int32')))\
Пример #4
0
def play_with_spelling():
    """Play with spelling mistakes"""
    print CONF
    np.random.seed(CONF['seed'])
    random.seed(CONF['seed'])
    print "Calculate output"
    session_files = get_session_files(number_of_files=CONF['number_of_files'], random_seed=CONF['seed'])
    sentences = get_sentences(session_files)
    print len(sentences)
    labels2idx = char2idx = get_char_to_idx(sentences)

    print "Prepare train, validation and test sets"
    train_valid_sentences, test_sentences = train_test_split(sentences, test_size=0.15, random_state=CONF['seed'])
    train_sentences, valid_sentences = train_test_split(train_valid_sentences, test_size=0.2, random_state=CONF['seed'])
    print len(train_valid_sentences), len(test_sentences)
    test_lex, test_y = create_tests(test_sentences, CONF['error_probability'], labels2idx, char2idx)
    valid_lex, valid_y = create_tests(valid_sentences, CONF['error_probability'], labels2idx, char2idx)
    train_lex = []
    train_y = []
    for error_probability in (CONF['error_probability'], CONF['error_probability'] / 10, CONF['error_probability'] / 100, 0):
        _train_idxes, _train_labels_idxes = create_tests(train_sentences, error_probability, labels2idx, char2idx)
        train_lex.extend(_train_idxes)
        train_y.extend(_train_labels_idxes)
#     train_lex, valid_lex, train_y, valid_y = train_test_split(train_valid_idxes, train_valid_labels_idxes, test_size=0.2, random_state=CONF['seed'])
    print len(train_lex), len(valid_lex), len(train_y), len(valid_y)

    print "Some more prep"
    idx2label = dict((k, v) for v, k in labels2idx.iteritems()) # Reverse the dictionary
    idx2word = dict((k, v) for v, k in char2idx.iteritems()) # Reverse the dictionary
    groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
    windowed_test_lex = [np.asarray(contextwin(x, CONF['win'])).astype('int32') for x in test_lex]
    windowed_valid_lex = [np.asarray(contextwin(x, CONF['win'])).astype('int32') for x in valid_lex]

    words_test = [ map(lambda x: idx2word[x], w) for w in test_lex]
    groundtruth_valid = [ map(lambda x: idx2label[x], y) for y in valid_y ]
    words_valid = [ map(lambda x: idx2word[x], w) for w in valid_lex]
    vocsize = 1 + len(set(item for lex in (train_lex, valid_lex, test_lex) for sublist in lex for item in sublist))
    nclasses = 1 + len(set(item for _y in (train_y, test_y, valid_y) for sublist in _y for item in sublist))
    nsentences = len(train_lex)

    words_lex = []
    for i in xrange(nsentences):
        cwords = contextwin(train_lex[i], CONF['win'])
        words = [np.asarray(x).astype(np.int32) for x in minibatch(cwords, CONF['batch_size'])]
        words_lex.append(words)

    print "Some file os calls"
    folder = os.path.basename(__file__).split('.')[0] + "_3"
    if not os.path.exists(folder):
        os.mkdir(folder)
    print "Create a Neural Network"
    rnn = regular_elman(nh=CONF['nhidden'],
                        nc=nclasses,
                        ne=vocsize,
                        de=CONF['emb_dimension'],
                        cs=CONF['win'],)

    # train with early stopping on validation set
    best_f1 = -np.inf
    CONF['current_learning_rate'] = CONF['learning_rate']
    print "Start training"
    start_time = print_time = time.time()
    for epoch in xrange(CONF['nepochs']):
        # shuffle
        shuffle([words_lex, train_y], CONF['seed'])
        CONF['ce'] = epoch
        tic = time.time()
        percentage_of_sentences_to_train = (epoch + 1) / CONF['nepochs']
        numer_of_sentences_to_train = int(nsentences * percentage_of_sentences_to_train)
        print "starting an epoch, numer_of_sentences_to_train =", numer_of_sentences_to_train
        test_size = int(len(windowed_test_lex) * percentage_of_sentences_to_train)
        print "test_size", test_size
        validation_size = int(len(windowed_valid_lex) * percentage_of_sentences_to_train)
        print "validation_size", validation_size
        for _ in xrange(30): # Trauma!
            print "_", _
            for i in xrange(numer_of_sentences_to_train):
                words = words_lex[i]
                labels = train_y[i]
                for word_batch, label_last_word in zip(words, labels):
                    rnn.train(word_batch, label_last_word, CONF['current_learning_rate'])
                    rnn.normalize()
                if CONF['verbose'] and time.time() - print_time > 30:
                    print '[learning] epoch %i >> %2.2f%%' % (epoch, (i + 1) * 100. / numer_of_sentences_to_train), 'completed in %.2f (sec) <<\r' % (time.time() - tic),
                    print_time = time.time()            

        # evaluation // back into the real world : idx -> words
        if CONF['verbose']:
            print "Classify test"
        predictions_test = [[idx2label[x] for x in rnn.classify(windowed_test_lex_item)]
                            for windowed_test_lex_item in windowed_test_lex[:test_size]]

        if CONF['verbose']:
            print "Classify validation"
        predictions_valid = [[idx2label[x] for x in rnn.classify(windowed_valid_lex_item)]
                             for windowed_valid_lex_item in windowed_valid_lex[:validation_size]]
        # evaluation // compute the accuracy using conlleval.pl
        if CONF['verbose']:
            print "Evaluate test and validation"
        res_test = conlleval(predictions_test, groundtruth_test[:test_size], words_test[:test_size], folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid[:validation_size], words_valid[:validation_size], folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            print 'NEW BEST: epoch', epoch, 'valid F1', res_valid['f1'], 'best test F1', res_test['f1'], ' ' * 20
            CONF['vf1'], CONF['vp'], CONF['vr'] = res_valid['f1'], res_valid['p'], res_valid['r']
            CONF['tf1'], CONF['tp'], CONF['tr'] = res_test['f1'], res_test['p'], res_test['r']
            CONF['be'] = epoch
            subprocess.call(['mv', folder + '/current.test.txt', folder + '/best.test.txt'])
            subprocess.call(['mv', folder + '/current.valid.txt', folder + '/best.valid.txt'])
        else:
            print '        : epoch', epoch, 'valid F1', res_valid['f1'], '     test F1', res_test['f1'], ' ' * 20
#             rnn.load(folder)

        # learning rate decay if no improvement in 10 epochs
        if CONF['decay'] and abs(CONF['be'] - CONF['ce']) >= 10:
            CONF['current_learning_rate'] *= 0.5
        if CONF['current_learning_rate'] < 1e-5:
            break

    print 'BEST RESULT: epoch', CONF['be'], 'valid F1', best_f1, 'best test F1', CONF['tf1'], 'with the model', folder
    print "total time = {} seconds".format(time.time() - start_time)
Пример #5
0
def main():
    settings = {
        'fold': 3,  # 5 folds 0,1,2,3,4
        'lr': 0.0627142536696559,
        'verbose': 1,
        'decay': False,  # decay on the learning rate if improvement stops
        'win': 7,  # number of words in the context window
        'bs': 9,  # number of backprop through time steps
        'nhidden': 100,  # number of hidden units
        'seed': 345,
        'emb_dimension': 100,  # dimension of word embedding
        'nepochs': 50
    }

    folder = os.path.basename(__file__).split('.')[0]

    if not os.path.exists(folder):
        os.mkdir(folder)

    # load the dataset
    train_set, valid_set, test_set, dic = load.atisfold(settings['fold'])
    idx2label = dict((k, v) for v, k in dic['labels2idx'].iteritems())
    idx2word = dict((k, v) for v, k in dic['words2idx'].iteritems())

    train_lex, train_ne, train_y = train_set
    valid_lex, valid_ne, valid_y = valid_set
    test_lex,  test_ne,  test_y = test_set

    vocsize = len(dic['words2idx'])
    nclasses = len(dic['labels2idx'])
    nsentences = len(train_lex)

    # instantiate the model
    numpy.random.seed(settings['seed'])
    random.seed(settings['seed'])

    if LOAD:
        print "Loading model from %s..." % folder

        rnn = ElmanRNNModel.load(folder)
    else:
        rnn = ElmanRNNModel(
            hidden_dims=settings['nhidden'],
            num_classes=nclasses,
            vocab_size=vocsize,
            embed_dims=settings['emb_dimension'],
            context_size=settings['win']
        )

    # train with early stopping on validation set
    best_f1 = -numpy.inf
    settings['current_lr'] = settings['lr']
    for e in xrange(settings['nepochs']):
        # shuffle
        shuffle([train_lex, train_ne, train_y], settings['seed'])
        settings['current_epoch'] = e
        tic = time.time()
        for i in xrange(nsentences):
            cwords = contextwin(train_lex[i], settings['win'])

            words = map(
                lambda x: numpy.asarray(x).astype('int32'),
                minibatch(cwords, settings['bs'])
            )

            labels = train_y[i]

            for word_batch, label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, settings['current_lr'])
                rnn.normalize()

            if settings['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (e, (i+1)*100./nsentences), \
                    'completed in %.2f (sec) <<\r' % (time.time()-tic),
                sys.stdout.flush()

        # evaluation // back into the real world : idx -> words
        predictions_test = [
            map(lambda x: idx2label[x],
                rnn.classify(numpy.asarray(contextwin(x, settings['win'])).astype('int32')))
            for x in test_lex
        ]

        groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y ]

        words_test = [map(lambda x: idx2word[x], w) for w in test_lex]

        predictions_valid = [
            map(
                lambda idx: idx2label[idx],
                rnn.classify(
                    numpy.asarray(contextwin(x, settings['win'])).astype('int32'))
            )
            for x in valid_lex
        ]

        groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]

        words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test = conlleval(predictions_test, groundtruth_test, words_test, folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid, words_valid, folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            if settings['verbose']:
                print 'NEW BEST: epoch', e, 'valid F1', res_valid['f1'], 'best test F1', res_test['f1'], ' '*20
            settings['vf1'], settings['vp'], settings['vr'] = res_valid['f1'], res_valid['p'], res_valid['r']
            settings['tf1'], settings['tp'], settings['tr'] = res_test['f1'],  res_test['p'],  res_test['r']
            settings['be'] = e
            subprocess.call(['mv', folder + '/current.test.txt', folder + '/best.test.txt'])
            subprocess.call(['mv', folder + '/current.valid.txt', folder + '/best.valid.txt'])
        else:
            print ''

        # learning rate decay if no improvement in 10 epochs
        if settings['decay'] and abs(settings['be'] - settings['current_epoch']) >= 10:
            settings['current_lr'] *= 0.5

        if settings['current_lr'] < 1e-5:
            break

    print 'BEST RESULT: epoch', e, 'valid F1', settings['vf1'], 'best test F1', settings['tf1'], 'with the model', folder
Пример #6
0
def play_with_splitting_sentences():
    """Play with splitting sentences"""
    conf = {  # 'fold': 3, # 5 folds 0,1,2,3,4
        'lr': 0.0627142536696559,
        'verbose': False,
        'decay': True,  # decay on the learning rate if improvement stops
        'win': 15,  # number of characters in the context window
        'bs': 5,  # number of back-propagation through time steps
        'nhidden': 100,  # number of hidden units
        'seed': 345,
        'emb_dimension': 30,  # dimension of character embedding
        'nepochs': 10
    }
    number_of_files = 50000
    np.random.seed(conf['seed'])
    random.seed(conf['seed'])
    print "Calculate output"
    session_files = get_session_files(
        number_of_files=number_of_files,
        random_seed=conf['seed'])  # Limit the scope To speed things up...
    labels2idx = {"O": 0, "X": 1}
    sentences = []
    idxes = []
    labels_idxes = []
    labels = []
    char2idx = get_char_to_idx(session_files)
    for session_file in session_files:
        session = json.loads(open(session_file, "rb").read())
        sentence = session_to_text0(session)
        if not sentence.strip():
            continue
        sentence_out, label = create_test(sentence, probability=0.2)
        sentences.append(sentence_out)
        labels.append(label)
        labels_idxes.append(
            np.fromiter((labels2idx[l] for l in label), dtype=np.uint32))
        idxes.append(
            np.fromiter((char2idx[char] for char in sentence_out),
                        dtype=np.uint32))

    print "Prepare train, validation and test sets"
    train_valid_lex, test_lex, train_valid_y, test_y = train_test_split(
        idxes, labels_idxes, test_size=0.15, random_state=42)
    train_lex, valid_lex, train_y, valid_y = train_test_split(train_valid_lex,
                                                              train_valid_y,
                                                              test_size=0.2,
                                                              random_state=42)
    print "Some more prep"
    idx2label = dict(
        (k, v) for v, k in labels2idx.iteritems())  # Reverse the dictionary
    idx2word = dict(
        (k, v) for v, k in char2idx.iteritems())  # Reverse the dictionary

    #     vocsize = 1 + len(set(reduce(\
    #                                  lambda x, y: list(x)+list(y),\
    #                                  train_lex+valid_lex+test_lex)))
    vocsize = 1 + len(
        set(item for lex in (train_lex, valid_lex, test_lex) for sublist in lex
            for item in sublist))
    nclasses = 2  #len(set(reduce(lambda x, y: list(x) + list(y), train_y + test_y + valid_y)))
    nsentences = len(train_lex)
    print "Some file os calls"
    folder = os.path.basename(__file__).split('.')[0] + "_3"
    if not os.path.exists(folder):
        os.mkdir(folder)
    print "Create a Neural Network"
    rnn = regular_elman(
        nh=conf['nhidden'],
        nc=nclasses,
        ne=vocsize,
        de=conf['emb_dimension'],
        cs=conf['win'],
    )

    # train with early stopping on validation set
    best_f1 = -np.inf
    conf['clr'] = conf['lr']
    print "Start training"
    start_time = time.time()
    for epoch in xrange(conf['nepochs']):
        # shuffle
        shuffle([train_lex, train_y], conf['seed'])
        conf['ce'] = epoch
        tic = time.time()
        for i in xrange(nsentences):
            cwords = contextwin(train_lex[i], conf['win'])
            words = [
                np.asarray(x).astype(np.int32)
                for x in minibatch(cwords, conf['bs'])
            ]
            labels = train_y[i]
            for word_batch, label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, conf['clr'])
                rnn.normalize()
            if conf['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (
                    epoch, (i + 1) * 100. / nsentences
                ), 'completed in %.2f (sec) <<\r' % (time.time() - tic),
                sys.stdout.flush()

        # evaluation // back into the real world : idx -> words
        predictions_test = [ map(lambda x: idx2label[x], \
                         rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))\
                         for x in test_lex ]
        groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
        words_test = [map(lambda x: idx2word[x], w) for w in test_lex]

        predictions_valid = [ map(lambda x: idx2label[x], \
                             rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))\
                             for x in valid_lex ]
        groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
        words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test = conlleval(predictions_test, groundtruth_test, words_test,
                             folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid,
                              words_valid, folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            print 'NEW BEST: epoch', epoch, 'valid F1', res_valid[
                'f1'], 'best test F1', res_test['f1'], ' ' * 20
            conf['vf1'], conf['vp'], conf['vr'] = res_valid['f1'], res_valid[
                'p'], res_valid['r']
            conf['tf1'], conf['tp'], conf['tr'] = res_test['f1'], res_test[
                'p'], res_test['r']
            conf['be'] = epoch
            subprocess.call([
                'mv', folder + '/current.test.txt', folder + '/best.test.txt'
            ])
            subprocess.call([
                'mv', folder + '/current.valid.txt', folder + '/best.valid.txt'
            ])
        else:
            print '        : epoch', epoch, 'valid F1', res_valid[
                'f1'], '     test F1', res_test['f1'], ' ' * 20

        # learning rate decay if no improvement in 10 epochs
        if conf['decay'] and abs(conf['be'] - conf['ce']) >= 10:
            conf['clr'] *= 0.5
        if conf['clr'] < 1e-5:
            break

    print 'BEST RESULT: epoch', conf[
        'be'], 'valid F1', best_f1, 'best test F1', conf[
            'tf1'], 'with the model', folder
    print "total time = {} seconds".format(time.time() - start_time)
Пример #7
0
def prepare_data():
    """Prepare the data"""
    conf = {
        'fold': 3,  # 5 folds 0,1,2,3,4
        'lr': 0.0627142536696559,
        'verbose': True,
        'decay': True,  # decay on the learning rate if improvement stops
        'win': 7,  # number of words in the context window
        'bs': 9,  # number of back-propagation through time steps
        'nhidden': 100,  # number of hidden units
        'seed': 345,
        'emb_dimension': 300,  # dimension of word embedding
        'nepochs': 50
    }
    np.random.seed(conf['seed'])
    random.seed(conf['seed'])
    session_files = get_session_files(
        number_of_files=None,
        random_seed=conf['seed'])  # Limit the scope To speed things up...
    sentences = []
    idxes = []
    labels = []
    labels_idxes = []
    print "Calculate words2idx"
    words2idx = get_words2idx(session_files)
    unknown = words2idx["<UNK>"]
    print "Calculate output"
    for session_file in session_files:
        session = json.loads(open(session_file, "rb").read())
        sentence = session_to_text0(session)
        if not sentence.strip():
            continue
        sentences.append(sentence)
        token_list = tokenize(sentence.lower())
        dtp_search_res = dtp_search(sentence, None)
        iobes = to_iob(token_list, dtp_search_res)
        labels.append(iobes)
        labels_idxes.append(
            np.fromiter((LABELS2IDX[iob] for iob in iobes), dtype=np.int32))
        #         token_list = [re.sub(r"\d", "DIGIT", token) for token in token_list]
        idxes.append(
            np.fromiter(
                (words2idx.get(token, unknown) for token in token_list),
                dtype=np.int32))

    print "Prepare train, validation and test sets"
    train_valid_lex, test_lex, train_valid_y, test_y = train_test_split(
        idxes, labels_idxes, test_size=0.15, random_state=42)
    train_lex, valid_lex, train_y, valid_y = train_test_split(train_valid_lex,
                                                              train_valid_y,
                                                              test_size=0.2,
                                                              random_state=42)

    idx2label = dict(
        (k, v) for v, k in LABELS2IDX.iteritems())  # Reverse the dictionary
    idx2word = dict(
        (k, v) for v, k in words2idx.iteritems())  # Reverse the dictionary

    vocsize = len(idx2word)

    nclasses = len({label for labels in labels_idxes for label in labels})
    # nclasses = len(set(reduce(lambda x, y: list(x) + list(y), train_y + test_y + valid_y)))

    nsentences = len(train_lex)
    folder = os.path.basename(__file__).split('.')[0]
    if not os.path.exists(folder):
        os.mkdir(folder)

    print "Loading Word2Vec"
    word2vec = Word2Vec.load_word2vec_format(WORD2VEC_FILENAME,
                                             binary=True)  # C binary format

    print "Calculate word embeddings"
    embeddings = 0.2 * np.random.uniform(
        -1.0, 1.0, (vocsize + 1, conf['emb_dimension'])).astype(
            theano.config.floatX
        )  # add one for PADDING at the end @UndefinedVariable
    for idx, word in idx2word.iteritems():
        try:
            embedding = word2vec[word]
        except KeyError:
            try:
                embedding = word2vec[word.capitalize()]
            except KeyError:
                embedding = embeddings[idx]  # Keep it random
        embeddings[idx] = embedding

    del word2vec  # It is huge

    print "Create a Neural Network"
    rnn = elman2vec(nh=conf['nhidden'],
                    nc=nclasses,
                    ne=vocsize,
                    de=conf['emb_dimension'],
                    cs=conf['win'],
                    embeddings=embeddings)

    # train with early stopping on validation set
    best_f1 = -np.inf
    conf['clr'] = conf['lr']
    print "Start training"
    for epoch in xrange(conf['nepochs']):
        # shuffle
        shuffle([train_lex, train_y], conf['seed'])
        conf['ce'] = epoch
        tic = time.time()
        for i in xrange(nsentences):
            cwords = contextwin(train_lex[i], conf['win'])
            words = [
                np.asarray(x).astype(np.int32)
                for x in minibatch(cwords, conf['bs'])
            ]
            labels = train_y[i]
            for word_batch, label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, conf['clr'])


#                 rnn.normalize()
            if conf['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (
                    epoch, (i + 1) * 100. / nsentences
                ), 'completed in %.2f (sec) <<\r' % (time.time() - tic),
                sys.stdout.flush()

        # evaluation // back into the real world : idx -> words
        predictions_test = [ map(lambda x: idx2label[x], \
                         rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))\
                         for x in test_lex ]
        groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
        words_test = [map(lambda x: idx2word[x], w) for w in test_lex]

        predictions_valid = [ map(lambda x: idx2label[x], \
                             rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))\
                             for x in valid_lex ]
        groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
        words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test = conlleval(predictions_test, groundtruth_test, words_test,
                             folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid,
                              words_valid, folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            print 'NEW BEST: epoch', epoch, 'valid F1', res_valid[
                'f1'], 'best test F1', res_test['f1'], ' ' * 20
            conf['vf1'], conf['vp'], conf['vr'] = res_valid['f1'], res_valid[
                'p'], res_valid['r']
            conf['tf1'], conf['tp'], conf['tr'] = res_test['f1'], res_test[
                'p'], res_test['r']
            conf['be'] = epoch
            subprocess.call([
                'mv', folder + '/current.test.txt', folder + '/best.test.txt'
            ])
            subprocess.call([
                'mv', folder + '/current.valid.txt', folder + '/best.valid.txt'
            ])
        else:
            print '        : epoch', epoch, 'valid F1', res_valid[
                'f1'], '     test F1', res_test['f1'], ' ' * 20

        # learning rate decay if no improvement in 10 epochs
        if conf['decay'] and abs(conf['be'] - conf['ce']) >= 10:
            conf['clr'] *= 0.5
        if conf['clr'] < 1e-5:
            break

    print 'BEST RESULT: epoch', epoch, 'valid F1', res_valid[
        'f1'], 'best test F1', res_test['f1'], 'with the model', folder
Пример #8
0
                    ne = vocsize,
                    de = s['emb_dimension'],
                    cs = s['win'] )

    # train with early stopping on validation set
    best_f1 = -numpy.inf
    s['clr'] = s['lr']
    for e in xrange(s['nepochs']):
        # shuffle
        shuffle([train_lex, train_ne, train_y], s['seed'])
        s['ce'] = e
        tic = time.time()
        for i in xrange(nsentences):
            cwords = contextwin(train_lex[i], s['win'])
            words  = map(lambda x: numpy.asarray(x).astype('int32'),\
                         minibatch(cwords, s['bs']))
            labels = train_y[i]
            for word_batch , label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, s['clr'])
                rnn.normalize()
            if s['verbose']:
                print '[learning] epoch %i >> %2.2f%%'%(e,(i+1)*100./nsentences),'completed in %.2f (sec) <<\r'%(time.time()-tic),
                sys.stdout.flush()
            
        # evaluation // back into the real world : idx -> words
        predictions_test = [ map(lambda x: idx2label[x], \
                             rnn.classify(numpy.asarray(contextwin(x, s['win'])).astype('int32')))\
                             for x in test_lex ]
        groundtruth_test = [ map(lambda x: idx2label[x], y) for y in test_y ]
        words_test = [ map(lambda x: idx2word[x], w) for w in test_lex]
Пример #9
0
def play_with_splitting_sentences():
    """Play with splitting sentences"""
    conf = { # 'fold': 3, # 5 folds 0,1,2,3,4
        'lr': 0.0627142536696559,
        'verbose': False,
        'decay': True, # decay on the learning rate if improvement stops
        'win': 15, # number of characters in the context window
        'bs': 5, # number of back-propagation through time steps
        'nhidden': 100, # number of hidden units
        'seed': 345,
        'emb_dimension': 30, # dimension of character embedding
        'nepochs': 10}
    number_of_files = 50000
    np.random.seed(conf['seed'])
    random.seed(conf['seed'])
    print "Calculate output"
    session_files = get_session_files(number_of_files=number_of_files, random_seed=conf['seed']) # Limit the scope To speed things up...
    labels2idx = {"O": 0, "X": 1}
    sentences = []
    idxes = []
    labels_idxes = []
    labels = []
    char2idx = get_char_to_idx(session_files)
    for session_file in session_files:
        session = json.loads(open(session_file, "rb").read())
        sentence = session_to_text0(session)
        if not sentence.strip():
            continue
        sentence_out, label = create_test(sentence, probability=0.2)
        sentences.append(sentence_out)
        labels.append(label)
        labels_idxes.append(np.fromiter((labels2idx[l] for l in label), dtype=np.uint32))
        idxes.append(np.fromiter((char2idx[char] for char in sentence_out), dtype=np.uint32))

    print "Prepare train, validation and test sets"
    train_valid_lex, test_lex, train_valid_y, test_y = train_test_split(idxes, labels_idxes, test_size=0.15, random_state=42)
    train_lex, valid_lex, train_y, valid_y = train_test_split(train_valid_lex, train_valid_y, test_size=0.2, random_state=42)
    print "Some more prep"
    idx2label = dict((k, v) for v, k in labels2idx.iteritems()) # Reverse the dictionary
    idx2word = dict((k, v) for v, k in char2idx.iteritems()) # Reverse the dictionary

#     vocsize = 1 + len(set(reduce(\
#                                  lambda x, y: list(x)+list(y),\
#                                  train_lex+valid_lex+test_lex)))
    vocsize = 1 + len(set(item for lex in (train_lex, valid_lex, test_lex) for sublist in lex for item in sublist))
    nclasses = 2  #len(set(reduce(lambda x, y: list(x) + list(y), train_y + test_y + valid_y)))
    nsentences = len(train_lex)
    print "Some file os calls"
    folder = os.path.basename(__file__).split('.')[0] + "_3"
    if not os.path.exists(folder):
        os.mkdir(folder)
    print "Create a Neural Network"
    rnn = regular_elman(nh=conf['nhidden'],
                        nc=nclasses,
                        ne=vocsize,
                        de=conf['emb_dimension'],
                        cs=conf['win'],)

    # train with early stopping on validation set
    best_f1 = -np.inf
    conf['clr'] = conf['lr']
    print "Start training"
    start_time = time.time()
    for epoch in xrange(conf['nepochs']):
        # shuffle
        shuffle([train_lex, train_y], conf['seed'])
        conf['ce'] = epoch
        tic = time.time()
        for i in xrange(nsentences):
            cwords = contextwin(train_lex[i], conf['win'])
            words = [np.asarray(x).astype(np.int32) for x in minibatch(cwords, conf['bs'])]
            labels = train_y[i]
            for word_batch , label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, conf['clr'])
                rnn.normalize()
            if conf['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (epoch, (i + 1) * 100. / nsentences), 'completed in %.2f (sec) <<\r' % (time.time() - tic),
                sys.stdout.flush()

        # evaluation // back into the real world : idx -> words
        predictions_test = [ map(lambda x: idx2label[x], \
                         rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))\
                         for x in test_lex ]
        groundtruth_test = [ map(lambda x: idx2label[x], y) for y in test_y ]
        words_test = [ map(lambda x: idx2word[x], w) for w in test_lex]

        predictions_valid = [ map(lambda x: idx2label[x], \
                             rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))\
                             for x in valid_lex ]
        groundtruth_valid = [ map(lambda x: idx2label[x], y) for y in valid_y ]
        words_valid = [ map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test = conlleval(predictions_test, groundtruth_test, words_test, folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid, words_valid, folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            print 'NEW BEST: epoch', epoch, 'valid F1', res_valid['f1'], 'best test F1', res_test['f1'], ' ' * 20
            conf['vf1'], conf['vp'], conf['vr'] = res_valid['f1'], res_valid['p'], res_valid['r']
            conf['tf1'], conf['tp'], conf['tr'] = res_test['f1'], res_test['p'], res_test['r']
            conf['be'] = epoch
            subprocess.call(['mv', folder + '/current.test.txt', folder + '/best.test.txt'])
            subprocess.call(['mv', folder + '/current.valid.txt', folder + '/best.valid.txt'])
        else:
            print '        : epoch', epoch, 'valid F1', res_valid['f1'], '     test F1', res_test['f1'], ' ' * 20

        # learning rate decay if no improvement in 10 epochs
        if conf['decay'] and abs(conf['be'] - conf['ce']) >= 10:
            conf['clr'] *= 0.5
        if conf['clr'] < 1e-5:
            break

    print 'BEST RESULT: epoch', conf['be'], 'valid F1', best_f1, 'best test F1', conf['tf1'], 'with the model', folder
    print "total time = {} seconds".format(time.time() - start_time)
Пример #10
0
def prepare_data():
    """Prepare the data"""
    conf = {'fold': 3, # 5 folds 0,1,2,3,4
            'lr': 0.0627142536696559,
            'verbose': True,
            'decay': True, # decay on the learning rate if improvement stops
            'win': 7, # number of words in the context window
            'bs': 9, # number of back-propagation through time steps
            'nhidden': 100, # number of hidden units
            'seed': 345,
            'emb_dimension': 300, # dimension of word embedding
            'nepochs': 50}
    np.random.seed(conf['seed'])
    random.seed(conf['seed'])
    session_files = get_session_files(number_of_files=None, random_seed=conf['seed']) # Limit the scope To speed things up...
    sentences = []
    idxes = []
    labels = []
    labels_idxes = []
    print "Calculate words2idx"
    words2idx = get_words2idx(session_files)
    unknown = words2idx["<UNK>"]
    print "Calculate output"
    for session_file in session_files:
        session = json.loads(open(session_file, "rb").read())
        sentence = session_to_text0(session)
        if not sentence.strip():
            continue
        sentences.append(sentence)
        token_list = tokenize(sentence.lower())
        dtp_search_res = dtp_search(sentence, None)
        iobes = to_iob(token_list, dtp_search_res)
        labels.append(iobes)
        labels_idxes.append(np.fromiter((LABELS2IDX[iob] for iob in iobes), dtype=np.int32))
#         token_list = [re.sub(r"\d", "DIGIT", token) for token in token_list]
        idxes.append(np.fromiter((words2idx.get(token, unknown) for token in token_list), dtype=np.int32))



    print "Prepare train, validation and test sets"
    train_valid_lex, test_lex, train_valid_y, test_y = train_test_split(idxes, labels_idxes, test_size=0.15, random_state=42)
    train_lex, valid_lex, train_y, valid_y = train_test_split(train_valid_lex, train_valid_y, test_size=0.2, random_state=42)

    idx2label = dict((k, v) for v, k in LABELS2IDX.iteritems()) # Reverse the dictionary
    idx2word = dict((k, v) for v, k in words2idx.iteritems()) # Reverse the dictionary

    vocsize = len(idx2word)

    nclasses = len({label for labels in labels_idxes for label in labels})
    # nclasses = len(set(reduce(lambda x, y: list(x) + list(y), train_y + test_y + valid_y)))

    nsentences = len(train_lex)
    folder = os.path.basename(__file__).split('.')[0]
    if not os.path.exists(folder):
        os.mkdir(folder)

    print "Loading Word2Vec"
    word2vec = Word2Vec.load_word2vec_format(WORD2VEC_FILENAME, binary=True) # C binary format

    print "Calculate word embeddings"
    embeddings = 0.2 * np.random.uniform(-1.0, 1.0, (vocsize + 1, conf['emb_dimension'])).astype(theano.config.floatX) # add one for PADDING at the end @UndefinedVariable
    for idx, word in idx2word.iteritems():
        try:
            embedding = word2vec[word]
        except KeyError:
            try:
                embedding = word2vec[word.capitalize()]
            except KeyError:
                embedding = embeddings[idx] # Keep it random
        embeddings[idx] = embedding

    del word2vec # It is huge

    print "Create a Neural Network"
    rnn = elman2vec(nh=conf['nhidden'],
                nc=nclasses,
                ne=vocsize,
                de=conf['emb_dimension'],
                cs=conf['win'],
                embeddings=embeddings)

    # train with early stopping on validation set
    best_f1 = -np.inf
    conf['clr'] = conf['lr']
    print "Start training"
    for epoch in xrange(conf['nepochs']):
        # shuffle
        shuffle([train_lex, train_y], conf['seed'])
        conf['ce'] = epoch
        tic = time.time()
        for i in xrange(nsentences):
            cwords = contextwin(train_lex[i], conf['win'])
            words = [np.asarray(x).astype(np.int32) for x in minibatch(cwords, conf['bs'])]
            labels = train_y[i]
            for word_batch , label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, conf['clr'])
#                 rnn.normalize()
            if conf['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (epoch, (i + 1) * 100. / nsentences), 'completed in %.2f (sec) <<\r' % (time.time() - tic),
                sys.stdout.flush()

        # evaluation // back into the real world : idx -> words
        predictions_test = [ map(lambda x: idx2label[x], \
                         rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))\
                         for x in test_lex ]
        groundtruth_test = [ map(lambda x: idx2label[x], y) for y in test_y ]
        words_test = [ map(lambda x: idx2word[x], w) for w in test_lex]

        predictions_valid = [ map(lambda x: idx2label[x], \
                             rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))\
                             for x in valid_lex ]
        groundtruth_valid = [ map(lambda x: idx2label[x], y) for y in valid_y ]
        words_valid = [ map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test = conlleval(predictions_test, groundtruth_test, words_test, folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid, words_valid, folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            print 'NEW BEST: epoch', epoch, 'valid F1', res_valid['f1'], 'best test F1', res_test['f1'], ' ' * 20
            conf['vf1'], conf['vp'], conf['vr'] = res_valid['f1'], res_valid['p'], res_valid['r']
            conf['tf1'], conf['tp'], conf['tr'] = res_test['f1'], res_test['p'], res_test['r']
            conf['be'] = epoch
            subprocess.call(['mv', folder + '/current.test.txt', folder + '/best.test.txt'])
            subprocess.call(['mv', folder + '/current.valid.txt', folder + '/best.valid.txt'])
        else:
            print '        : epoch', epoch, 'valid F1', res_valid['f1'], '     test F1', res_test['f1'], ' ' * 20

        # learning rate decay if no improvement in 10 epochs
        if conf['decay'] and abs(conf['be'] - conf['ce']) >= 10:
            conf['clr'] *= 0.5
        if conf['clr'] < 1e-5:
            break

    print 'BEST RESULT: epoch', epoch, 'valid F1', res_valid['f1'], 'best test F1', res_test['f1'], 'with the model', folder