def train(train_dir, test_dir=None, nn='berger_cnn', nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, verbose=1):
    """
    Train a MagpieModel on the documents under train_dir.

    :param train_dir: directory with the training files
    :param test_dir: optional directory with the test files
    :param nn: name of the neural network architecture to use
    :param nb_epochs: number of training epochs
    :param batch_size: size of a training batch
    :param verbose: Keras verbosity level

    :return: tuple (training history, trained MagpieModel)
    """
    # A label count of 14 marks the category problem; anything else is keywords
    predicting_categories = NO_OF_LABELS == 14
    if predicting_categories:
        scaler_path, w2v_path = CATEGORY_SCALER, CATEGORY_WORD2VEC
    else:
        scaler_path, w2v_path = KEYWORD_SCALER, KEYWORD_WORD2VEC

    model = MagpieModel(
        word2vec_model=Word2Vec.load(w2v_path),
        scaler=load_from_disk(scaler_path),
    )

    logger = CustomLogger(nn)
    checkpoint_path = os.path.join(logger.log_dir, 'keras_model')
    checkpointer = ModelCheckpoint(checkpoint_path, save_best_only=True)

    history = model.train(
        train_dir,
        get_labels(NO_OF_LABELS),
        test_dir=test_dir,
        nn_model=nn,
        callbacks=[logger, checkpointer],
        batch_size=batch_size,
        nb_epochs=nb_epochs,
        verbose=verbose,
    )

    finish_logging(logger, history)

    return history, model
def build_model_for_corpus(corpus):
    """ Build an appropriate Keras NN model depending on the corpus """
    # Map each known corpus to the output size of its CNN
    output_lengths = {
        'keywords': 10000,
        'categories': 14,
        'experiments': 500,
    }
    if corpus not in output_lengths:
        raise ValueError('The corpus is not valid')

    keras_model = cnn(embedding_size=100, output_length=output_lengths[corpus])

    # Restore the pretrained weights for this corpus
    weights_path = os.path.join(DATA_DIR, corpus, 'model.pickle')
    keras_model.load_weights(weights_path)

    return MagpieModel(
        keras_model=keras_model,
        word2vec_model=Word2Vec.load(WORD2VEC_PATH),
        scaler=load_from_disk(SCALER_PATH),
        labels=get_labels(keras_model.output_shape[1]),
    )
def build_model_for_corpus(corpus):
    """ Build an appropriate Keras NN model depending on the corpus """
    if corpus == 'keywords':
        network = cnn(embedding_size=100, output_length=10000)
    elif corpus == 'categories':
        network = cnn(embedding_size=100, output_length=14)
    elif corpus == 'experiments':
        network = cnn(embedding_size=100, output_length=500)
    else:
        raise ValueError('The corpus is not valid')

    # Load the persisted weights for the chosen architecture
    network.load_weights(os.path.join(DATA_DIR, corpus, 'model.pickle'))

    word2vec = Word2Vec.load(WORD2VEC_PATH)
    scaler = load_from_disk(SCALER_PATH)
    # Derive the label list from the network's own output dimension
    labels = get_labels(network.output_shape[1])

    return MagpieModel(
        keras_model=network,
        word2vec_model=word2vec,
        scaler=scaler,
        labels=labels,
    )
def train(train_dir, test_dir=None, nn='cnn', nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, persist=False, no_of_labels=NO_OF_LABELS, verbose=1):
    """
    Train a MagpieModel on the files under train_dir.

    :param train_dir: directory with the training files
    :param test_dir: optional directory with the test files
    :param nn: name of the neural network architecture to use
    :param nb_epochs: number of training epochs
    :param batch_size: size of a training batch
    :param persist: whether to persist the trained Keras model
    :param no_of_labels: number of labels to predict
    :param verbose: Keras verbosity level

    :return: tuple (training history, trained MagpieModel)
    """
    word2vec = Word2Vec.load(WORD2VEC_PATH)
    scaler = load_from_disk(SCALER_PATH)
    model = MagpieModel(word2vec_model=word2vec, scaler=scaler)

    logger = CustomLogger(nn)
    callbacks = [
        logger,
        # Keep only the best model seen so far during training
        ModelCheckpoint(
            os.path.join(logger.log_dir, 'keras_model'),
            save_best_only=True,
        ),
    ]

    history = model.train(
        train_dir,
        get_labels(no_of_labels),
        test_dir=test_dir,
        nn_model=nn,
        callbacks=callbacks,
        batch_size=batch_size,
        nb_epochs=nb_epochs,
        verbose=verbose,
    )

    finish_logging(logger, history, model.keras_model, persist=persist)
    return history, model
def build_model_for_corpus(corpus):
    """
    Build an appropriate Keras NN model depending on the corpus.

    :param corpus: either 'keywords' or 'categories'

    :return: a MagpieModel wired up with the pretrained Keras network,
             word2vec model, scaler and label list for that corpus

    :raises ValueError: if the corpus name is not recognised
    """
    if corpus == 'keywords':
        keras_model = berger_cnn(embedding_size=100, output_length=1000)
        no_of_labels = 1000
    elif corpus == 'categories':
        keras_model = berger_cnn(embedding_size=50, output_length=14)
        no_of_labels = 14
    else:
        # Previously an unknown corpus fell through with keras_model=None and
        # crashed with an opaque AttributeError on load_weights below.
        # Fail fast with the same message the sibling builders use.
        raise ValueError('The corpus is not valid')

    # All per-corpus artifacts live under DATA_DIR/<corpus>/
    model_path = os.path.join(DATA_DIR, corpus, 'model.pickle')
    keras_model.load_weights(model_path)

    w2v_path = os.path.join(DATA_DIR, corpus, 'word2vec.pickle')
    w2v_model = Word2Vec.load(w2v_path)

    scaler_path = os.path.join(DATA_DIR, corpus, 'scaler.pickle')
    scaler = load_from_disk(scaler_path)

    labels = get_labels(no_of_labels)

    model = MagpieModel(
        keras_model=keras_model,
        word2vec_model=w2v_model,
        scaler=scaler,
        labels=labels,
    )

    return model
def train(train_dir, test_dir=None, nn='cnn', nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, persist=False, no_of_labels=NO_OF_LABELS, verbose=1):
    """
    Train a MagpieModel and log the run.

    :param train_dir: directory with the training files
    :param test_dir: optional directory with the test files
    :param nn: name of the neural network architecture to use
    :param nb_epochs: number of training epochs
    :param batch_size: size of a training batch
    :param persist: whether to persist the trained Keras model
    :param no_of_labels: number of labels to predict
    :param verbose: Keras verbosity level

    :return: tuple (training history, trained MagpieModel)
    """
    model = MagpieModel(
        word2vec_model=Word2Vec.load(WORD2VEC_PATH),
        scaler=load_from_disk(SCALER_PATH),
    )

    run_logger = CustomLogger(nn)
    # Checkpoint into this run's log directory, keeping only the best weights
    checkpoint = ModelCheckpoint(
        os.path.join(run_logger.log_dir, 'keras_model'),
        save_best_only=True,
    )

    labels = get_labels(no_of_labels)
    history = model.train(
        train_dir,
        labels,
        test_dir=test_dir,
        nn_model=nn,
        callbacks=[run_logger, checkpoint],
        batch_size=batch_size,
        nb_epochs=nb_epochs,
        verbose=verbose,
    )

    finish_logging(run_logger, history, model.keras_model, persist=persist)

    return history, model
def test(
    testset_path,
    ontology=ONTOLOGY_PATH,
    model=MODEL_PATH,
    recreate_ontology=False,
    verbose=True,
):
    """
    Test the trained model on a set under a given path.

    :param testset_path: path to the directory with the test set
    :param ontology: path to the ontology, or an already-built ontology object
    :param model: path where the model is pickled, or a loaded model object
    :param recreate_ontology: boolean flag whether to recreate the ontology
    :param verbose: whether to print computation times

    :return dict mapping each metric name to its mean over the test documents
    """
    # Accept either a loaded object or a path (str/unicode — Python 2 file).
    # isinstance is the idiomatic check and also covers subclasses.
    if isinstance(model, (str, unicode)):
        model = load_from_disk(model)

    if isinstance(ontology, (str, unicode)):
        ontology = get_ontology(path=ontology, recreate=recreate_ontology)

    keywords = get_keywords()
    keyword_indices = {kw: i for i, kw in enumerate(keywords)}

    # Probe once with a dummy ranking to learn which metric names exist
    all_metrics = calculate_basic_metrics([range(5)]).keys()
    metrics_agg = {m: [] for m in all_metrics}

    for doc in get_documents(testset_path, as_generator=True):
        x, answers, kw_vector = build_test_matrices(
            [doc],
            model,
            testset_path,
            ontology,
        )

        y_true = build_y_true(answers, keyword_indices, doc.doc_id)

        # Predict
        # NOTE(review): DataFrame.as_matrix() is deprecated in modern pandas
        # (use .values); kept as-is for this codebase's pandas version.
        ranking = model.scale_and_predict(x.as_matrix())

        # Reorder the ground truth by descending predicted rank
        y_pred = y_true[0][ranking[::-1]]

        metrics = calculate_basic_metrics([y_pred])

        for k, v in metrics.iteritems():
            metrics_agg[k].append(v)

    return {k: np.mean(v) for k, v in metrics_agg.iteritems()}
def get_data_for_model(train_dir, labels, test_dir=None, nn_model=None, as_generator=False, batch_size=BATCH_SIZE, word2vec_model=None, scaler=None):
    """
    Get data in the form of matrices or generators for both train and test sets.

    :param train_dir: directory with train files
    :param labels: an iterable of predefined labels (controlled vocabulary)
    :param test_dir: directory with test files
    :param nn_model: Keras model of the NN
    :param as_generator: flag whether to return a generator or in-memory matrix
    :param batch_size: integer, size of the batch
    :param word2vec_model: trained w2v gensim model
    :param scaler: scaling object for X matrix normalisation e.g. StandardScaler

    :return: tuple with 2 elements for train and test data. Each element can
    be either a pair of matrices (X, y) or their generator
    """
    # Arguments shared by every matrix-building call below; fall back to the
    # persisted word2vec model and scaler when none were supplied.
    shared_kwargs = {
        'label_indices': {lab: i for i, lab in enumerate(labels)},
        'word2vec_model': word2vec_model or Word2Vec.load(WORD2VEC_MODELPATH),
        'scaler': scaler or load_from_disk(SCALER_PATH),
        'nn_model': nn_model,
    }

    if as_generator:
        batches = FilenameIterator(train_dir, batch_size)
        train_data = iterate_over_batches(batches, **shared_kwargs)
    else:
        # Strip the 4-character extension to get the document base names
        names = {fname[:-4] for fname in os.listdir(train_dir)}
        train_data = build_x_and_y(names, train_dir, **shared_kwargs)

    if test_dir:
        test_names = {fname[:-4] for fname in os.listdir(test_dir)}
        test_data = build_x_and_y(test_names, test_dir, **shared_kwargs)
    else:
        test_data = None

    return train_data, test_data
def batch_train(train_dir, test_dir=None, nn='berger_cnn', nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, verbose=1):
    """
    Train a MagpieModel in batch mode on the documents under train_dir.

    :param train_dir: directory with the training files
    :param test_dir: optional directory with the test files
    :param nn: name of the neural network architecture to use
    :param nb_epochs: number of training epochs
    :param batch_size: size of a training batch
    :param verbose: Keras verbosity level

    :return: tuple (training history, trained MagpieModel)
    """
    # 14 labels marks the category problem, otherwise we predict keywords
    categories_mode = NO_OF_LABELS == 14
    scaler_path = CATEGORY_SCALER if categories_mode else KEYWORD_SCALER
    w2v_path = CATEGORY_WORD2VEC if categories_mode else KEYWORD_WORD2VEC

    model = MagpieModel(
        word2vec_model=Word2Vec.load(w2v_path),
        scaler=load_from_disk(scaler_path),
    )

    logger = CustomLogger(nn)
    checkpointer = ModelCheckpoint(
        os.path.join(logger.log_dir, 'keras_model'),
        save_best_only=True,
    )

    history = model.batch_train(
        train_dir,
        get_labels(NO_OF_LABELS),
        test_dir=test_dir,
        nn_model=nn,
        callbacks=[logger, checkpointer],
        batch_size=batch_size,
        nb_epochs=nb_epochs,
        verbose=verbose,
    )

    finish_logging(logger, history)

    return history, model
def get_data_for_model(train_dir, labels, test_dir=None, nn_model=None, as_generator=False, batch_size=BATCH_SIZE, word2vec_model=None, scaler=None):
    """
    Get data in the form of matrices or generators for both train and test sets.

    :param train_dir: directory with train files
    :param labels: an iterable of predefined labels (controlled vocabulary)
    :param test_dir: directory with test files
    :param nn_model: Keras model of the NN
    :param as_generator: flag whether to return a generator or in-memory matrix
    :param batch_size: integer, size of the batch
    :param word2vec_model: trained w2v gensim model
    :param scaler: scaling object for X matrix normalisation e.g. StandardScaler

    :return: tuple with 2 elements for train and test data. Each element can
    be either a pair of matrices (X, y) or their generator
    """
    label_indices = {lab: i for i, lab in enumerate(labels)}
    # Load the persisted defaults only when the caller did not supply them
    if word2vec_model is None:
        word2vec_model = Word2Vec.load(WORD2VEC_MODELPATH)
    if scaler is None:
        scaler = load_from_disk(SCALER_PATH)

    common = dict(
        label_indices=label_indices,
        word2vec_model=word2vec_model,
        scaler=scaler,
        nn_model=nn_model,
    )

    def _file_basenames(directory):
        # Document ids are the filenames with the 4-character extension cut off
        return {fname[:-4] for fname in os.listdir(directory)}

    if as_generator:
        train_data = iterate_over_batches(
            FilenameIterator(train_dir, batch_size), **common
        )
    else:
        train_data = build_x_and_y(_file_basenames(train_dir), train_dir, **common)

    test_data = None
    if test_dir:
        test_data = build_x_and_y(_file_basenames(test_dir), test_dir, **common)

    return train_data, test_data
def load_word2vec_model(self, filepath):
    """Load the word2vec model from a file and attach it to this instance."""
    loaded_model = load_from_disk(filepath)
    self.word2vec_model = loaded_model
def load_scaler(self, filepath):
    """Load the scaler object from a file and attach it to this instance."""
    loaded_scaler = load_from_disk(filepath)
    self.scaler = loaded_scaler
def load_word2vec_model(self, filepath):
    """Restore a persisted word2vec model from *filepath* onto the instance."""
    self.word2vec_model = load_from_disk(filepath)
def load_scaler(self, filepath):
    """Restore a persisted scaler object from *filepath* onto the instance."""
    self.scaler = load_from_disk(filepath)