Example #1
    def test_benchmark_performance(self):
        start = time.time()
        path_save_sentences = os.path.join(RelationsData.FOLDER, "test.txt")
        big_dataloader = RelationsDataLoader(self.path_big_csv, min_like=1)
        # args: dataloader, p, q, walk_length, walks_per_node, context_size, path
        preparing_samples(big_dataloader, 0.5, 2, 80, 1, 10,
                          path_save_sentences)
        # Delete file when done
        if os.path.exists(path_save_sentences):
            os.remove(path_save_sentences)

        logging.info(f"{(time.time() - start):.2f} seconds elapsed")
Example #2
def optimize(path_sentences: str,
             like_nodes: List[str],
             mode: str,
             path_save: str,
             epochs: int = 10,
             context_size: int = 10,
             dim_features: int = 128,
             path_model: str = None):
    """
    :param path_sentences: path to the input .txt file of sentences (one sentence per line)
    :param like_nodes: list of all like/item node ids
    :param mode: one of {'train', 'all', 'resume'}; 'resume' continues training from path_model
    :param path_save: where to save the embeddings
    :param epochs: number of epochs to train the model
    :param context_size: window size of the skip-gram model
    :param dim_features: dimensionality of the embedding vectors
    :param path_model: path to the saved model when resuming training
    :return: None; the embeddings are written to path_save
    """
    cores = multiprocessing.cpu_count()

    # save model each epoch
    # epoch_logger = EpochSaver('word2vec')

    n_negative_samples = 10
    # minimum term frequency (to define the vocabulary)
    min_count = 2

    # a memory-friendly iterator
    sentences = MySentences(path_sentences)

    if mode in [TRAIN, ALL]:
        logging.info('Starting Training of Word2Vec Model')
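        # assumes the gensim < 4.0 API: `size` and `iter` were renamed to
        # `vector_size` and `epochs` in gensim 4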
        model = gensim.models.Word2Vec(sentences,
                                       min_count=min_count,
                                       sg=1,
                                       size=dim_features,
                                       iter=epochs,
                                       workers=cores,
                                       negative=n_negative_samples,
                                       window=context_size)
    elif mode == RESUME:
        logging.info('Resuming Training of Word2Vec Model')
        model = gensim.models.Word2Vec.load(path_model)
        # resume from the learning rate at which training previously stopped
        model.train(sentences,
                    total_examples=model.corpus_count,
                    epochs=epochs,
                    start_alpha=model.min_alpha_yet_reached)
    else:
        raise ValueError("'%s' is not a valid mode; expected 'train', 'all', or 'resume'" % mode)

    write_embeddings_to_file(model, like_nodes, path_save)
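
A minimal usage sketch (all paths and node ids below are illustrative, not from the project):

# hypothetical call: train embeddings from an existing walks file
optimize(path_sentences="data/sampled_walks.txt",
         like_nodes=["like_42", "like_77"],   # illustrative node ids
         mode=TRAIN,                          # or RESUME with path_model set
         path_save="data/features_node2vec.pkl",
         epochs=5,
         context_size=10,
         dim_features=128)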
Example #3
def write_embeddings_to_file(model: gensim.models.Word2Vec,
                             like_nodes: List[str], path_save: str) -> None:
    logging.info('Writing embeddings to file %s' % path_save)
    like_set = set(like_nodes)  # O(1) membership tests instead of list scans
    embeddings = {}
    for v in model.wv.vocab:
        # we only keep the likes' node embeddings
        if v in like_set:
            embeddings[str(v)] = model.wv[v]

    with open(path_save, "wb") as f_out:
        pickle.dump(embeddings, f_out)
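
The saved artifact is a plain pickle mapping node id strings to numpy vectors; a quick sketch of loading it back (the path is illustrative):

import pickle

with open("data/features_node2vec.pkl", "rb") as f_in:
    embeddings = pickle.load(f_in)   # dict: str node id -> np.ndarray
node_id = next(iter(embeddings))
print(node_id, embeddings[node_id].shape)   # e.g. '12' (128,)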
Example #4
def create_labels(label_csv):
    label_df = pd.read_csv(label_csv, header=None, names=[NODE, GROUP])
    num_groups = len(label_df[GROUP].unique())
    num_nodes = len(label_df[NODE].unique())
    logging.info("Create Label Matrix in Numpy: %s nodes, %s groups" %
                 (num_nodes, num_groups))
    labels = np.zeros((num_nodes, num_groups))
    for index, row in label_df.iterrows():
        node = row[NODE]
        group = row[GROUP]
        labels[node - 1, group - 1] = 1
    logging.info("Label Matrix Created.")
    return labels
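
For intuition: a label CSV with rows (1, 2), (2, 1), (2, 3) has 2 nodes and 3 groups, and since ids are 1-indexed the resulting multi-hot matrix is:

# labels = [[0., 1., 0.],   # node 1 is in group 2
#           [1., 0., 1.]]   # node 2 is in groups 1 and 3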
Example #5
def preparing_samples(dataloader: DataLoader, p: float, q: float,
                      walk_length: int, walks_per_node: int, context_size: int,
                      path_save_sentences: str):
    logging.info("Precomputing transition probabilities...")
    matrix_prob, list_nodes = dataloader.get_transition_probabilites(p, q)

    if context_size >= walk_length:
        raise ValueError(
            "Context size can't be greater or equal to walk length !")

    logging.info("Sampling walks to create our dataset")
    sample_walks(path_save_sentences, matrix_prob, list_nodes, walks_per_node,
                 walk_length)
    return dataloader.list_like_nodes()
Example #6
def create_features(features_pkl):
    # turn the features dict into a numpy matrix
    with open(features_pkl, 'rb') as f:
        n2v_dic = pickle.load(f)
    nodes = n2v_dic.keys()
    # infer the dimension from any entry instead of assuming node '1' exists
    num_nodes, num_features = len(nodes), len(next(iter(n2v_dic.values())))
    logging.info("Create Feature Matrix in Numpy: %s nodes, %s features" %
                 (num_nodes, num_features))
    features = np.zeros((num_nodes, num_features))
    for node in nodes:
        idx = int(node) - 1  # node ids are 1-indexed strings
        features[idx] = n2v_dic[node]
    logging.info("Feature Matrix Created.")
    return features
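
A small sanity check of the expected pickle layout (a hypothetical fixture with string, 1-indexed node ids and equal-length vectors):

import pickle
import numpy as np

fixture = {'1': np.array([0.1, 0.2, 0.3]),
           '2': np.array([0.4, 0.5, 0.6])}
with open('/tmp/features_fixture.pkl', 'wb') as f:
    pickle.dump(fixture, f)

features = create_features('/tmp/features_fixture.pkl')
assert features.shape == (2, 3)
assert np.allclose(features[0], fixture['1'])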
Example #7
def main():
    args = parse()

    if args.type.lower() in ("relation", "relations"):
        dataloader = RelationsDataLoader(RelationsData.CSV_FILE,
                                         min_like=args.min_like)
        folder = RelationsData.FOLDER
    elif args.type.lower() == "blogcatalog":
        dataloader = BlogCatalogDataLoader(BlogCatalogData.EDGE_CSV,
                                           min_like=args.min_like)
        folder = BlogCatalogData.FOLDER
    else:
        raise NotImplementedError("Other data types are not yet implemented")

    if args.save is None:
        args.save = folder

    str_save = f"_p_{args.p}_q_{args.q}_minLike_{args.min_like}"
    file_sampled_walks = "sampled_walks" + str_save + ".txt"
    # Save sampled sentences (random walks) to a .txt file to stay memory efficient
    path_sentences = os.path.join(args.save, file_sampled_walks)

    # add number of epochs for name file of embeddings
    str_save += f"_dim_{args.dim_features}_window_{args.context_size}_epochs_{args.epochs}"

    file_embeddings = "features_node2vec" + str_save + ".pkl"

    args.save = os.path.join(args.save, file_embeddings)

    if args.mode in [PREPROCESS, ALL]:
        like_nodes = preparing_samples(dataloader, args.p, args.q,
                                       args.walk_length, args.walks_per_node,
                                       args.context_size, path_sentences)
    else:
        like_nodes = dataloader.list_like_nodes()

    if args.mode in [ALL, TRAIN, RESUME]:
        logging.info("Starting training of skip-gram model")
        optimize(path_sentences, like_nodes, args.mode, args.save, args.epochs,
                 args.context_size, args.dim_features)
Example #8
def k_fold_average(features, labels, clf, k=10):
    test_result = []
    kf = KFold(n_splits=k, shuffle=True)
    for train_index, test_index in kf.split(features):
        x_train, x_test = features[train_index], features[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        clf = clf.fit(x_train, y_train)
        # code (from https://github.com/apoorvavinod/node2vec/blob/master/src/Classifier.py)
        y_test_ = sparse2array_inv_binarize(y_test)
        num_predictions = [len(item) for item in y_test_]
        probabilities = clf.predict_proba(x_test)
        sorted_indices_probs = probabilities.argsort()
        y_pred = [
            sorted_indices[-num:].tolist()
            for (sorted_indices,
                 num) in zip(sorted_indices_probs, num_predictions)
        ]
        mi, ma = compute_metrics(y_test, y_pred)
        logging.info("Macro F1: %s" % ma)
        logging.info("Micro F1: %s" % mi)
        test_result.append(ma)
    # average the macro F1 over all k folds
    return sum(test_result) / len(test_result)
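
A hedged end-to-end check on synthetic data (assumes numpy, OneVsRestClassifier, and LogisticRegression are imported as in the main script, and that compute_metrics and sparse2array_inv_binarize are in scope):

rng = np.random.default_rng(0)
features = rng.normal(size=(100, 16))
labels = np.zeros((100, 4), dtype=int)
labels[np.arange(100), rng.integers(0, 4, size=100)] = 1  # one group per node
clf = OneVsRestClassifier(LogisticRegression(solver='lbfgs'))
avg_macro_f1 = k_fold_average(features, labels, clf, k=5)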
Example #9
def main():
    parser = argparse.ArgumentParser(description='node2vec_blogcatalog')
    parser.add_argument(
        '--path',
        type=str,
        default=BlogCatalogData.FEATURES_FILE,
        help='path of trained BlogCatalog dataset node2vec feature')
    parser.add_argument('--label',
                        type=str,
                        default=BlogCatalogData.LABELS_FILE,
                        help='path of label file')
    parser.add_argument('--k',
                        type=int,
                        default=10,
                        help='number of folds for cross-validation')
    args = parser.parse_known_args()[0]

    features = create_features(args.path)
    labels = create_labels(args.label)
    clf = OneVsRestClassifier(
        LogisticRegression(multi_class='ovr', solver='lbfgs'))
    kfold_avg = k_fold_average(features, labels, clf, k=args.k)
    logging.info("%s Fold CV Macro F1 Score: %s " % (args.k, kfold_avg))
Example #10
from flask_migrate import Migrate
from waitress import serve

from src.app import app, db
from src.config import logging

# register Flask-Migrate with the app; the instance is kept for its side effects
migrate = Migrate(app, db)

if __name__ == '__main__':
    logging.info("Starting server...")
    serve(app, host="0.0.0.0", port=8000)