def main(
    model_dir=None,
    train_dir=None,
    dev_dir=None,
    is_runtime=False,
    nr_hidden=64,
    max_length=100,  # Shape
    dropout=0.5,
    learn_rate=0.001,  # General NN config
    nb_epoch=5,
    batch_size=256,
    nr_examples=-1,
):  # Training params
    if model_dir is not None:
        model_dir = pathlib.Path(model_dir)
        model_dir.mkdir(parents=True, exist_ok=True)
        (model_dir / "vocab").mkdir(parents=True, exist_ok=True)
    if train_dir is None or dev_dir is None:
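        # thinc's imdb() returns two splits of (text, label) pairs: [0] is used for training, [1] for evaluation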
        imdb_data = thinc.extra.datasets.imdb()
    if is_runtime:
        if dev_dir is None:
            dev_texts, dev_labels = zip(*imdb_data[1])
        else:
            dev_texts, dev_labels = read_data(dev_dir)
        acc = evaluate(model_dir, dev_texts, dev_labels, max_length=max_length)
        print(acc)
    else:
        if train_dir is None:
            train_texts, train_labels = zip(*imdb_data[0])
        else:
            print("Read data")
            train_texts, train_labels = read_data(train_dir, limit=nr_examples)
        if dev_dir is None:
            dev_texts, dev_labels = zip(*imdb_data[1])
        else:
            dev_texts, dev_labels = read_data(dev_dir, imdb_data, limit=nr_examples)
        train_labels = numpy.asarray(train_labels, dtype="int32")
        dev_labels = numpy.asarray(dev_labels, dtype="int32")
        lstm = train(
            train_texts,
            train_labels,
            dev_texts,
            dev_labels,
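            # positional dicts: model shape, general NN config, and an empty extra-settings dict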
            {"nr_hidden": nr_hidden, "max_length": max_length, "nr_class": 1},
            {"dropout": dropout, "lr": learn_rate},
            {},
            nb_epoch=nb_epoch,
            batch_size=batch_size,
            persist_vocab_path=model_dir / "vocab" if model_dir is not None else None,
        )
        weights = lstm.get_weights()
        if model_dir is not None:
            model_dir.mkdir(parents=True, exist_ok=True)
            with (model_dir / "model").open("wb") as file_:
                pickle.dump(weights[1:], file_)
            with (model_dir / "config.json").open("w") as file_:
                file_.write(lstm.to_json())
def main(
        model_dir=None,
        train_dir=None,
        dev_dir=None,
        is_runtime=False,
        nr_hidden=64,
        max_length=100,  # Shape
        dropout=0.5,
        learn_rate=0.001,  # General NN config
        nb_epoch=5,
        batch_size=256,
        nr_examples=-1):  # Training params
    if model_dir is not None:
        print('Setting model_dir to ' + model_dir)
        model_dir = pathlib.Path(model_dir)
    if train_dir is None or dev_dir is None:
        imdb_data = thinc.extra.datasets.imdb()
    if is_runtime:
        if dev_dir is None:
            dev_texts, dev_labels = zip(*imdb_data[1])
        else:
            dev_texts, dev_labels = read_data(dev_dir)
        acc = evaluate(model_dir, dev_texts, dev_labels, max_length=max_length)
        print(acc)
    else:
        if train_dir is None:
            train_texts, train_labels = zip(*imdb_data[0])
        else:
            print("Read data")
            train_texts, train_labels = read_data(train_dir, limit=nr_examples)
        if dev_dir is None:
            dev_texts, dev_labels = zip(*imdb_data[1])
        else:
            dev_texts, dev_labels = read_data(dev_dir,
                                              imdb_data,
                                              limit=nr_examples)
        train_labels = np.asarray(train_labels, dtype='int32')
        dev_labels = np.asarray(dev_labels, dtype='int32')
        lstm = train(train_texts,
                     train_labels,
                     dev_texts,
                     dev_labels, {
                         'nr_hidden': nr_hidden,
                         'max_length': max_length,
                         'nr_class': 1
                     }, {
                         'dropout': dropout,
                         'lr': learn_rate
                     }, {},
                     nb_epoch=nb_epoch,
                     batch_size=batch_size)
        weights = lstm.get_weights()
        if model_dir is not None:
            with (model_dir / 'model').open('wb') as file_:
                pickle.dump(weights[1:], file_)
            with (model_dir / 'config.json').open('w') as file_:
                file_.write(lstm.to_json())
Example #3
def main(
    model_dir=None,
    train_dir=None,
    dev_dir=None,
    is_runtime=False,
    nr_hidden=64,
    max_length=100,  # Shape
    dropout=0.5,
    learn_rate=0.001,  # General NN config
    nb_epoch=5,
    batch_size=256,
    nr_examples=-1,
):  # Training params
    if model_dir is not None:
        model_dir = pathlib.Path(model_dir)
    if train_dir is None or dev_dir is None:
        imdb_data = thinc.extra.datasets.imdb()
    if is_runtime:
        if dev_dir is None:
            dev_texts, dev_labels = zip(*imdb_data[1])
        else:
            dev_texts, dev_labels = read_data(dev_dir)
        acc = evaluate(model_dir, dev_texts, dev_labels, max_length=max_length)
        print(acc)
    else:
        if train_dir is None:
            train_texts, train_labels = zip(*imdb_data[0])
        else:
            print("Read data")
            train_texts, train_labels = read_data(train_dir, limit=nr_examples)
        if dev_dir is None:
            dev_texts, dev_labels = zip(*imdb_data[1])
        else:
            dev_texts, dev_labels = read_data(dev_dir, imdb_data, limit=nr_examples)
        train_labels = numpy.asarray(train_labels, dtype="int32")
        dev_labels = numpy.asarray(dev_labels, dtype="int32")
        lstm = train(
            train_texts,
            train_labels,
            dev_texts,
            dev_labels,
            {"nr_hidden": nr_hidden, "max_length": max_length, "nr_class": 1},
            {"dropout": dropout, "lr": learn_rate},
            {},
            nb_epoch=nb_epoch,
            batch_size=batch_size,
        )
        weights = lstm.get_weights()
        if model_dir is not None:
            with (model_dir / "model").open("wb") as file_:
                pickle.dump(weights[1:], file_)
            with (model_dir / "config.json").open("w") as file_:
                file_.write(lstm.to_json())
Example #4
def main(
    model_dir='model_lstm',
    is_runtime=False,
    nr_hidden=64,
    max_length=100,  # Shape
    dropout=0.5,
    learn_rate=0.001,  # General NN config
    nb_epoch=1,
    batch_size=256,
    nr_examples=-1,
):  # Training params
    if model_dir is not None:
        model_dir = pathlib.Path(model_dir)
    # if train_dir is None or dev_dir is None:
    #     imdb_data = thinc.extra.datasets.imdb()
    # if is_runtime:
    #     if dev_dir is None:
    #         dev_texts, dev_labels = zip(*imdb_data[1])
    #     else:
    #         dev_texts, dev_labels = read_data(dev_dir)
    #     acc = evaluate(model_dir, dev_texts, dev_labels, max_length=max_length)
    #     print(acc)
    # else:
    #     if train_dir is None:
    #         train_texts, train_labels = zip(*imdb_data[0])
    #     else:
    #         print("Read data")
    #         train_texts, train_labels = read_data(train_dir, limit=nr_examples)
    #     if dev_dir is None:
    #         dev_texts, dev_labels = zip(*imdb_data[1])
    #     else:
    #         dev_texts, dev_labels = read_data(dev_dir, limit=nr_examples)
        # print("Loading Train and Test Data.....")
        # with open('dev.txt','rb') as f:
        #     dev_docs=pickle.load(f)
        # with open('train.txt','rb') as f:
        #     train_docs=pickle.load(f)
        # print('Loading Complete..')
        # train_labels = numpy.asarray(train_labels, dtype="int32")
        # dev_labels = numpy.asarray(dev_labels, dtype="int32")
    lstm = train(
        {"nr_hidden": nr_hidden, "max_length": max_length, "nr_class": 1},
        {"dropout": dropout, "lr": learn_rate},
        {},
        nb_epoch=nb_epoch,
        batch_size=batch_size,
    )
    weights = lstm.get_weights()
    if model_dir is not None:
        with (model_dir / "model").open("wb") as file_:
            pickle.dump(weights[1:], file_)
        with (model_dir / "config.json").open("w") as file_:
            file_.write(lstm.to_json())
Example #5
def main():
    model_dir = pathlib.Path("./model/")
    nr_hidden = 64
    max_length = 100  # Shape
    dropout = 0.5
    learn_rate = 0.02
    nb_epoch = 10
    batch_size = 256
    nr_class = 4

    print("Read data")
    sentences, labels = load_corpus()
    sentences = remove_stopwords(sentences)
    size = len(sentences)
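    # 60/20/20 split into train / dev / test sets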
    train_texts, train_labels = sentences[:int(size * 0.6)], labels[:int(size * 0.6)]
    dev_texts, dev_labels = sentences[int(size * 0.6):int(size * 0.8)], labels[int(size * 0.6):int(size * 0.8)]
    test_texts, test_labels = sentences[int(size * 0.8):], labels[int(size * 0.8):]

    train_labels = numpy.asarray(train_labels, dtype='int32')
    dev_labels = numpy.asarray(dev_labels, dtype='int32')
    lstm = train(train_texts,
                 train_labels,
                 dev_texts,
                 dev_labels, {
                     'nr_hidden': nr_hidden,
                     'max_length': max_length,
                     'nr_class': nr_class
                 }, {
                     'dropout': dropout,
                     'lr': learn_rate
                 }, {},
                 nb_epoch=nb_epoch,
                 batch_size=batch_size)
    print("Model has been trained!")

    weights = lstm.get_weights()
    if model_dir is not None:
        with (model_dir / 'model').open('wb') as file_:
            pickle.dump(weights[1:], file_)
        with (model_dir / 'config.json').open('w') as file_:
            file_.write(lstm.to_json())

    nb_correct = evaluate(model_dir, test_texts, test_labels)
    print("Percent of correct prediction: " + str(nb_correct))
def main(
        model_dir=None,
        train_dir='test_plag_data.txt',
        dev_dir='dev_plag_data.txt',
        is_runtime=False,
        nr_hidden=64,
        max_length=100,  # Shape
        dropout=0.5,
        learn_rate=0.001,  # General NN config
        nb_epoch=5,
        batch_size=100,
        nr_examples=-1):  # Training params
    if model_dir is not None:
        model_dir = pathlib.Path(model_dir)
    data = read_data(train_dir)
    if is_runtime:
        dev_texts, dev_labels = read_data(dev_dir)
        acc = evaluate(model_dir, dev_texts, dev_labels, max_length=max_length)
        print(acc)
    else:
        print("Read data")
        train_labels, train_texts, train_texts2 = read_data(train_dir,
                                                            limit=nr_examples)
        dev_labels, dev_texts, dev_texts2 = read_data(dev_dir,
                                                      limit=nr_examples)
        train_labels = numpy.asarray(train_labels, dtype='int32')
        dev_labels = numpy.asarray(dev_labels, dtype='int32')
        lstm = train(train_texts,
                     train_texts2,
                     train_labels,
                     dev_texts,
                     dev_labels, {
                         'nr_hidden': nr_hidden,
                         'max_length': max_length,
                         'nr_class': 1
                     }, {
                         'dropout': dropout,
                         'lr': learn_rate
                     }, {},
                     nb_epoch=nb_epoch,
                     batch_size=batch_size)
        weights = lstm.get_weights()
        if model_dir is not None:
            with (model_dir / 'model').open('wb') as file_:
                pickle.dump(weights[1:], file_)
            with (model_dir / 'config.json').open('w') as file_:
                file_.write(lstm.to_json())
Example #7
def main(
    model_dir='/Users/masha/Data/Model',
    train_dir='/Users/masha/Data/Train',
    nr_hidden=128,
    max_length=100,
    dropout=0.2,
    learn_rate=0.0001,
    nb_epoch=150,
    batch_size=64,
    #nr_examples=-1,
    training_portion=.8,
):  # Training params
    if model_dir is not None:
        model_dir = pathlib.Path(model_dir)
    if train_dir is None:
        print('Please provide training directory!')
        return
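    # read_data splits the labelled texts into training and validation portions according to training_portion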
    train_texts, train_labels, val_texts, val_labels = read_data(
        train_dir, training_portion)

    model = train_model(train_texts,
                        train_labels,
                        val_texts,
                        val_labels, {
                            "nr_hidden": nr_hidden,
                            "max_length": max_length,
                            "nr_class": 1
                        }, {
                            "dropout": dropout,
                            "lr": learn_rate
                        }, {},
                        nb_epoch=nb_epoch,
                        batch_size=batch_size)

    weights = model.get_weights()
    if model_dir is not None:
        with (model_dir / "model").open("wb") as file_:
            pickle.dump(weights[1:], file_)
        with (model_dir / "config.json").open("w") as file_:
            file_.write(model.to_json())
Example #8
def train(
        model_dir,
        train_dir,
        dev_dir,  # fs locations
        model_type='lstm',
        feature_shapes=None,  # neural network type(s): overall or defined per feature via shapes
        nr_examples=-1,
        max_entries=-1,  # restrict data to a subset
        image_embedding_function=None,  # image data, e.g. enable by providing a function like 'vgg16.VGG16'
        dropout=0.5,
        learn_rate=0.001,
        setting=None,  # General NN config (via individual parameters or setting dict)
        nb_epoch=100,
        batch_size=100,
        early_stopping_window=5,  # Training params
        nb_threads=1,
        nb_threads_parse=10  # performance: resource restrictions
):
    global cache

    if nb_threads > 0:
        # restrict number of tensorflow threads
        session_conf = tf.ConfigProto(intra_op_parallelism_threads=nb_threads,
                                      inter_op_parallelism_threads=nb_threads)
        backend.set_session(backend.tf.Session(config=session_conf))

    assert dev_dir is not None, 'dev_dir is not set'
    dev_dir = pathlib.Path(dev_dir)
    if model_dir is not None:
        model_dir = pathlib.Path(model_dir)
        model_dir.mkdir(parents=True, exist_ok=True)

        logger_fh = logging.FileHandler((model_dir / 'log.txt'))
        logger_fh.setLevel(logging.DEBUG)
        logger_fh.setFormatter(
            logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
        logger.addHandler(logger_fh)
    else:
        logger_fh = None

    assert train_dir is not None, 'train_dir is not set'
    train_dir = pathlib.Path(train_dir)
    # some defaults...
    if feature_shapes is None or feature_shapes == '':
        lstm_shapes = {
            'targetParagraphs': {
                'model': create_lstm.__name__,
                'max_length': 500,
                'nr_hidden': 64
            },
            'postText': {
                'model': create_lstm.__name__,
                'max_length': 50,
                'nr_hidden': 30
            },
            'targetTitle': {
                'model': create_lstm.__name__,
                'max_length': 50,
                'nr_hidden': 30
            },
            'targetKeywords': {
                'model': create_lstm.__name__,
                'max_length': 100,
                'nr_hidden': 30
            },
            'targetDescription': {
                'model': create_lstm.__name__,
                'max_length': 100,
                'nr_hidden': 30
            },
        }
        #lstm_shapes = {
        #    'postText,targetTitle,targetDescription,targetParagraphs,targetKeywords':
        #        {'model': create_lstm.__name__, 'max_length': 500, 'nr_hidden': 128},
        #    # 'postMedia': {'model': create_cnn_image.__name__, 'input_shape': None}
        #}

        # max_length, filter_length, nb_filter
        cnn_shapes = {
            'targetParagraphs': {
                'model': create_cnn.__name__,
                'max_length': 500,
                'filter_length': 10,
                'nb_filter': 200
            },
            'postText': {
                'model': create_cnn.__name__,
                'max_length': 50,
                'filter_length': 3,
                'nb_filter': 50
            },
            'targetTitle': {
                'model': create_cnn.__name__,
                'max_length': 50,
                'filter_length': 2,
                'nb_filter': 50
            },
            'targetKeywords': {
                'model': create_cnn.__name__,
                'max_length': 100,
                'filter_length': 1,
                'nb_filter': 50
            },
            'targetDescription': {
                'model': create_cnn.__name__,
                'max_length': 100,
                'filter_length': 5,
                'nb_filter': 50
            },
        }

        if model_type == 'lstm':
            logger.info('use lstm model')
            feature_shapes = lstm_shapes
        elif model_type == 'cnn':
            logger.info('use cnn model')
            feature_shapes = cnn_shapes
        #elif model_type == 'cnn2':
        #    logger.info('use cnn2 model')
        #    shapes = cnn_shapes
        #elif model_type == 'lstm_stacked':
        #    logger.info('use lstm_stacked model')
        #    shapes = lstm_shapes
        else:
            raise ValueError('unknown model_type=%s. use one of: %s' %
                             (model_type, ' '.join(
                                 ['lstm', 'cnn', 'cnn2', 'lstm_stacked'])))
    else:
        feature_shapes = json.loads(feature_shapes)

    logger.info("Read data")
    train_records, _ = read_data(train_dir,
                                 limit=nr_examples,
                                 dont_shuffle=True)
    dev_records, _ = read_data(dev_dir, limit=nr_examples, dont_shuffle=True)

    if 'nlp' not in cache:
        cache['nlp'] = get_nlp()
    #nlp = get_nlp()

    use_images = image_embedding_function is not None and image_embedding_function.strip() != ''
    if use_images:
        logger.debug('use image data')
    else:
        feature_shapes.pop(IMAGE_KEY, None)

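    # cache parsed features keyed by the full preprocessing configuration so repeated calls
    # with the same data and shapes skip re-parsing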
    cache['train_X_and_labels'] = cache.get('train_X_and_labels', {})
    preprocessing_cache_key = json.dumps(
        (feature_shapes, max_entries, image_embedding_function, str(train_dir),
         str(dev_dir)),
        sort_keys=True)
    if preprocessing_cache_key not in cache['train_X_and_labels']:
        cache['train_X_and_labels'][
            preprocessing_cache_key] = records_to_features(
                records=train_records,
                nlp=cache['nlp'],
                shapes=feature_shapes,
                nb_threads_parse=nb_threads_parse,
                max_entries=max_entries,
                key_image=IMAGE_KEY,
                data_dir=train_dir,
                image_model_function_name=image_embedding_function)
    train_X, train_labels = cache['train_X_and_labels'][
        preprocessing_cache_key]

    cache['dev_X_and_labels'] = cache.get('dev_X_and_labels', {})
    if preprocessing_cache_key not in cache['dev_X_and_labels']:
        cache['dev_X_and_labels'][
            preprocessing_cache_key] = records_to_features(
                records=dev_records,
                nlp=cache['nlp'],
                shapes=feature_shapes,
                nb_threads_parse=nb_threads_parse,
                max_entries=max_entries,
                key_image=IMAGE_KEY,
                data_dir=dev_dir,
                image_model_function_name=image_embedding_function)
    dev_X, dev_labels = cache['dev_X_and_labels'][preprocessing_cache_key]

    #train_X, train_labels = records_to_features(records=train_records, nlp=cache['nlp'], shapes=feature_shapes,
    #                                            nb_threads_parse=nb_threads_parse, max_entries=max_entries,
    #                                            key_image=key_image, data_dir=train_dir,
    #                                            image_model_function_name=image_embedding_function)
    #dev_X, dev_labels = records_to_features(records=dev_records, nlp=cache['nlp'], shapes=feature_shapes,
    #                                        nb_threads_parse=nb_threads_parse, max_entries=max_entries,
    #                                        key_image=key_image, data_dir=dev_dir,
    #                                        image_model_function_name=image_embedding_function)

    if setting is None or setting == '':
        # default setting
        setting = {'final_layers': [512]}
    else:
        setting = json.loads(setting)

    # set dropout and learning rate if not already in setting
    setting['dropout'] = setting.get('dropout', None) or dropout
    setting['learn_rate'] = setting.get('learn_rate', None) or learn_rate

    # set image data settings if not given
    if use_images:
        if IMAGE_KEY not in feature_shapes:
            feature_shapes[IMAGE_KEY] = {
                'model': create_cnn_image.__name__,
                'layers': [128]
            }
        feature_shapes[IMAGE_KEY]['input_shape'] = train_X[IMAGE_KEY].shape[1:]
        # add "image available" flag
        feature_shapes[IMAGE_FLAG_KEY] = {
            'model': create_identity.__name__,
            'input_shape': train_X[IMAGE_FLAG_KEY].shape[1:]
        }

    logger.info('use setting: %s' % json.dumps(setting).replace(' ', ''))
    logger.info('use feature_shapes: %s' %
                json.dumps(feature_shapes).replace(' ', ''))

    model = create_model(embedding_weights=get_embeddings(cache['nlp'].vocab),
                         feature_shapes=feature_shapes,
                         setting=setting)

    metric = 'val_mean_squared_error'
    metric_best_func = min
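    # stop training once validation MSE has not improved (by min_delta) for early_stopping_window epochs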
    early_stopping_callback = EarlyStopping(monitor=metric,
                                            min_delta=1e-4,
                                            patience=early_stopping_window,
                                            verbose=1)

    callbacks = [early_stopping_callback]
    if model_dir is not None:
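        # keep only the best weights (lowest validation MSE) and append per-epoch metrics to log.tsv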
        callbacks.append(
            ModelCheckpoint(filepath=str(model_dir / 'model_weights'),
                            monitor=metric,
                            verbose=0,
                            save_best_only=True,
                            save_weights_only=True,
                            mode='auto',
                            period=1))
        callbacks.append(
            CSVLogger(str(model_dir / "log.tsv"), append=True, separator='\t'))

    history_callback = model.fit(as_list(train_X),
                                 train_labels,
                                 validation_data=(as_list(dev_X), dev_labels),
                                 epochs=nb_epoch,
                                 batch_size=batch_size,
                                 callbacks=callbacks)
    metric_history = history_callback.history[metric]

    if model_dir is not None:
        logger.info('remove embeddings from model...')
        # remove embeddings from saved model (already included in spacy model)
        # reload best weights
        model.load_weights(str(model_dir / 'model_weights'))
        weights = model.get_weights()
        with (model_dir / 'model_weights').open('wb') as file_:
            pickle.dump(weights[1:], file_)
        # save model config
        with (model_dir / 'model_config.json').open('w') as file_:
            config_json = model.to_json()
            config_dict = json.loads(config_json)
            config_dict[
                IMAGE_EMBEDDING_FUNCTION_KEY] = image_embedding_function
            json.dump(config_dict, file_)
            #file_.write(model.to_json())

    if logger_fh is not None:
        logger.removeHandler(logger_fh)
    return (metric,
            metric_best_func(metric_history),
            early_stopping_callback.stopped_epoch + 1
            if early_stopping_callback.stopped_epoch > 0 else nb_epoch)
Example #9
def main(
        model_dir=None,
        train_dir=None,
        dev_dir=None,
        is_runtime=False,
        nr_hidden=64,
        max_length=100,  # Shape
        dropout=0.5,
        learn_rate=0.001,  # General NN config
        nb_epoch=5,
        batch_size=100,
        nr_examples=-1):  # Training params

    df = load_document()
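    # DataSet.from_np_array shuffles the reviews and exposes the train/validation splits used below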
    data_set = DataSet.from_np_array(df['review'],
                                     numpy.asarray(df['sentimens'],
                                                   dtype='int32'),
                                     class_names=[1, 2, 3, 4, 5],
                                     shuffle=True)

    #if model_dir is not None:
    #    model_dir = pathlib.Path(model_dir)
    #if train_dir is None or dev_dir is None:
    #    imdb_data = thinc.extra.datasets.imdb()
    #if is_runtime:
    #    if dev_dir is None:
    #        dev_texts, dev_labels = zip(*imdb_data[1])
    #    else:
    #        dev_texts, dev_labels = read_data(dev_dir)
    #    acc = evaluate(model_dir, dev_texts, dev_labels, max_length=max_length)
    #    print(acc)
    #else:
    #    if train_dir is None:
    #        train_texts, train_labels = zip(*imdb_data[0])
    #    else:
    #        print("Read data")
    #        train_texts, train_labels = read_data(train_dir, limit=nr_examples)
    #    if dev_dir is None:
    #        dev_texts, dev_labels = zip(*imdb_data[1])
    #    else:
    #        dev_texts, dev_labels = read_data(dev_dir, imdb_data, limit=nr_examples)
    #    train_labels = numpy.asarray(train_labels, dtype='int32')
    #    dev_labels = numpy.asarray(dev_labels, dtype='int32')

    lstm = train(data_set.x_train,
                 data_set.y_train,
                 data_set.x_val,
                 data_set.y_val, {
                     'nr_hidden': nr_hidden,
                     'max_length': max_length,
                     'nr_class': 1
                 }, {
                     'dropout': dropout,
                     'lr': learn_rate
                 }, {},
                 nb_epoch=nb_epoch,
                 batch_size=batch_size)
    weights = lstm.get_weights()
    if model_dir is not None:
        model_dir = pathlib.Path(model_dir)
        with (model_dir / 'model').open('wb') as file_:
            pickle.dump(weights[1:], file_)
        with (model_dir / 'config.json').open('w') as file_:
            file_.write(lstm.to_json())