def test_benchmark_performance(self):
    start = time.time()
    path_save_sentences = os.path.join(RelationsData.FOLDER, "test.txt")
    big_dataloader = RelationsDataLoader(self.path_big_csv, min_like=1)
    preparing_samples(big_dataloader, 0.5, 2, 80, 1, 10, path_save_sentences)
    # Delete the sampled-walks file when done
    if os.path.exists(path_save_sentences):
        os.remove(path_save_sentences)
    logging.info(f"{(time.time() - start):.2f} seconds elapsed")
def optimize(path_sentences: str,
             like_nodes: List[str],
             mode: str,
             path_save: str,
             epochs: int = 10,
             context_size: int = 10,
             dim_features: int = 128,
             path_model: str = None):
    """
    :param path_sentences: path to the input .txt file of sentences (one sentence per line)
    :param like_nodes: list of all like/item ids
    :param mode: {'train' or 'resume'}; 'resume' continues training an existing model
    :param path_save: where to save the embeddings
    :param epochs: number of epochs to train the model
    :param context_size: also called window size
    :param dim_features: dimensionality of the learned embeddings
    :param path_model: path to the saved model when resuming training
    :return:
    """
    cores = multiprocessing.cpu_count()
    # save model each epoch
    # epoch_logger = EpochSaver('word2vec')
    n_negative_samples = 10
    # minimum term frequency (to define the vocabulary)
    min_count = 2

    # a memory-friendly iterator
    sentences = MySentences(path_sentences)

    if mode in [TRAIN, ALL]:
        logging.info('Starting Training of Word2Vec Model')
        # sg=1 selects the skip-gram architecture, trained with negative sampling
        model = gensim.models.Word2Vec(sentences,
                                       min_count=min_count,
                                       sg=1,
                                       size=dim_features,
                                       iter=epochs,
                                       workers=cores,
                                       negative=n_negative_samples,
                                       window=context_size)
    elif mode == RESUME:
        logging.info('Resuming Training of Word2Vec Model')
        model = gensim.models.Word2Vec.load(path_model)
        # Restart at the learning rate we previously stopped at
        model.train(sentences,
                    total_examples=model.corpus_count,
                    epochs=epochs,
                    start_alpha=model.min_alpha_yet_reached)
    else:
        raise ValueError('Specify a valid value for mode (%s)' % mode)

    write_embeddings_to_file(model, like_nodes, path_save)
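# The MySentences iterator used in optimize() is defined elsewhere in this repo.
# As a hedged illustration only, a memory-friendly iterator of this kind could
# look like the sketch below (assumed behaviour: one whitespace-separated walk
# per line, yielded lazily so the corpus never has to fit in memory); this is
# not the project's actual implementation.
class _ExampleLineSentences:
    def __init__(self, path_sentences: str):
        self.path_sentences = path_sentences

    def __iter__(self):
        # Stream the walks file line by line; gensim accepts any restartable
        # iterable of token lists.
        with open(self.path_sentences, "r") as f_in:
            for line in f_in:
                yield line.split()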
def write_embeddings_to_file(model: gensim.models.Word2Vec,
                             like_nodes: List[str],
                             path_save: str) -> None:
    logging.info('Writing embeddings to file %s' % path_save)
    embeddings = {}
    for v in list(model.wv.vocab):
        # we only keep the likes' node embeddings
        if v in like_nodes:
            vec = model.wv[v]
            embeddings[str(v)] = vec

    with open(path_save, "wb") as f_out:
        pickle.dump(embeddings, f_out)
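# Hedged usage sketch: the pickle written by write_embeddings_to_file maps
# like-node ids (as strings) to their numpy embedding vectors. The function
# name, path argument, and node id below are placeholders for illustration,
# not artifacts produced by this repo.
def _example_load_embeddings(path_embeddings: str):
    # Illustration only: read the Dict[str, np.ndarray] back and look up a node.
    with open(path_embeddings, "rb") as f_in:
        embeddings = pickle.load(f_in)
    return embeddings.get("1")  # embedding vector of like/item node "1", if present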
def create_labels(label_csv):
    label_df = pd.read_csv(label_csv, header=None, names=[NODE, GROUP])
    num_groups = len(label_df[GROUP].unique())
    num_nodes = len(label_df[NODE].unique())
    logging.info("Create Label Matrix in Numpy: %s nodes, %s groups" %
                 (num_nodes, num_groups))
    labels = np.zeros((num_nodes, num_groups))
    for index, row in label_df.iterrows():
        node = row[NODE]
        group = row[GROUP]
        labels[node - 1, group - 1] = 1
    logging.info("Label Matrix Created.")
    return labels
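# Expected layout of label_csv (inferred from how create_labels reads and
# indexes it, not verified against the data files): two headerless columns,
# a 1-indexed node id and a 1-indexed group id, one (node, group) pair per
# row, e.g. "3,7".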
def preparing_samples(dataloader: DataLoader, p: float, q: float,
                      walk_length: int, walks_per_node: int,
                      context_size: int, path_save_sentences: str):
    logging.info("Precomputing transition probabilities...")
    matrix_prob, list_nodes = dataloader.get_transition_probabilites(p, q)

    if context_size >= walk_length:
        raise ValueError(
            "Context size can't be greater than or equal to walk length!")

    logging.info("Sampling walks to create our dataset")
    sample_walks(path_save_sentences, matrix_prob, list_nodes, walks_per_node,
                 walk_length)
    return dataloader.list_like_nodes()
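# In the node2vec formulation (Grover & Leskovec, 2016), p is the return
# parameter (how likely a walk is to step back to the node it just left) and
# q is the in-out parameter (interpolating between BFS-like local exploration
# and DFS-like outward exploration); both bias the precomputed transition
# probabilities used by sample_walks above.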
def create_features(features_pkl):
    # turn the features dict into a numpy matrix
    with open(features_pkl, 'rb') as f:
        n2v_dic = pickle.load(f)
    nodes = n2v_dic.keys()
    num_nodes, num_features = len(nodes), len(n2v_dic['1'])
    logging.info("Create Feature Matrix in Numpy: %s nodes, %s features" %
                 (num_nodes, num_features))
    features = np.zeros((num_nodes, num_features))
    for node in nodes:
        # node ids are 1-indexed strings
        idx = int(node) - 1
        features[idx] = n2v_dic[node]
    logging.info("Feature Matrix Created.")
    return features
def main():
    args = parse()

    if args.type.lower() == "relation" or args.type.lower() == "relations":
        dataloader = RelationsDataLoader(RelationsData.CSV_FILE,
                                         min_like=args.min_like)
        folder = RelationsData.FOLDER
    elif args.type.lower() == "blogcatalog":
        dataloader = BlogCatalogDataLoader(BlogCatalogData.EDGE_CSV,
                                           min_like=args.min_like)
        folder = BlogCatalogData.FOLDER
    else:
        raise NotImplementedError("Other data types are not yet implemented")

    if args.save is None:
        args.save = folder

    str_save = f"_p_{args.p}_q_{args.q}_minLike_{args.min_like}"
    file_sampled_walks = "sampled_walks" + str_save + ".txt"
    # Save sampled sentences (random walks) to a .txt file to be memory efficient
    path_sentences = os.path.join(args.save, file_sampled_walks)

    # append dimensionality, window size and number of epochs to the embeddings file name
    str_save += f"_dim_{args.dim_features}_window_{args.context_size}_epochs_{args.epochs}"
    file_embeddings = "features_node2vec" + str_save + ".pkl"
    args.save = os.path.join(args.save, file_embeddings)

    if args.mode in [PREPROCESS, ALL]:
        like_nodes = preparing_samples(dataloader, args.p, args.q,
                                       args.walk_length, args.walks_per_node,
                                       args.context_size, path_sentences)
    else:
        like_nodes = dataloader.list_like_nodes()

    if args.mode in [ALL, TRAIN, RESUME]:
        logging.info("Starting training of skip-gram model")
        optimize(path_sentences, like_nodes, args.mode, args.save, args.epochs,
                 args.context_size, args.dim_features)
def k_fold_average(features, labels, clf, k=10):
    test_result = []
    kf = KFold(n_splits=k, shuffle=True)
    for train_index, test_index in kf.split(features):
        x_train, x_test = features[train_index], features[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        clf = clf.fit(x_train, y_train)

        # adapted from
        # https://github.com/apoorvavinod/node2vec/blob/master/src/Classifier.py
        y_test_ = sparse2array_inv_binarize(y_test)
        num_predictions = [len(item) for item in y_test_]
        probabilities = clf.predict_proba(x_test)
        sorted_indices_probs = probabilities.argsort()
        # for each test node, keep its top-n most probable labels, where n is
        # the true number of labels for that node
        y_pred = [
            sorted_indices[-num:].tolist()
            for (sorted_indices, num) in zip(sorted_indices_probs, num_predictions)
        ]
        mi, ma = compute_metrics(y_test, y_pred)
        logging.info("Macro F1: %s" % ma)
        logging.info("Micro F1: %s" % mi)
        test_result.append(ma)
    return sum(test_result) / len(test_result)
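# Worked illustration (toy numbers, not taken from the data) of the top-n
# selection in k_fold_average: if one test node's predicted probabilities over
# labels {0, 1, 2} are [0.1, 0.7, 0.2] and the node truly has 2 labels, then
# argsort gives [0, 2, 1] and the last two indices [2, 1] are kept as the
# predicted label set.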
def main():
    parser = argparse.ArgumentParser(description='node2vec_blogcatalog')
    parser.add_argument(
        '--path',
        type=str,
        default=BlogCatalogData.FEATURES_FILE,
        help='path of the trained node2vec features for the BlogCatalog dataset')
    parser.add_argument('--label',
                        type=str,
                        default=BlogCatalogData.LABELS_FILE,
                        help='path of the label file')
    parser.add_argument('--k',
                        type=int,
                        default=10,
                        help='number of folds for cross-validation')
    args = parser.parse_known_args()[0]

    features = create_features(args.path)
    labels = create_labels(args.label)

    clf = OneVsRestClassifier(
        LogisticRegression(multi_class='ovr', solver='lbfgs'))
    kfold_avg = k_fold_average(features, labels, clf, k=args.k)
    logging.info("%s Fold CV Macro F1 Score: %s" % (args.k, kfold_avg))
from flask import Flask
from flask_migrate import Migrate, MigrateCommand
from waitress import serve

from src.app import app, db
from src.config import logging

migrate = Migrate(app, db)

if __name__ == '__main__':
    logging.info("Starting server...")
    serve(app, host="0.0.0.0", port=8000)