Example #1
import argparse
import logging

# Project-specific helpers (their module paths are elided in the source):
# read_HeiPLAS_data, MultiSenseWE, DataHandler, SupervisedModel, UnsupervisedModel.
logger = logging.getLogger(__name__)


def main(args):
    # global parser, args, dev_triplets, test_triplets, we_wrapper, data_handler, model
    parser = argparse.ArgumentParser(description='Train word2vec model.')
    parser.add_argument('dev_file', help='dev input file')
    parser.add_argument('test_file', help='test input file')
    parser.add_argument('we_file', help='normalized word embeddings model file')
    # parser.add_argument('output_folder', help='path to the output folder')
    parser.add_argument(
        'org_we_file',
        help='path to the original word embeddings model file, from before '
        'the adjective clustering step'
    )
    parser.add_argument('-s',
                        '--supervised',
                        default=False,
                        action='store_true',
                        help='train and evaluate also the supervised model')
    args = parser.parse_args(args)
    dev_triplets = read_HeiPLAS_data(args.dev_file)
    test_triplets = read_HeiPLAS_data(args.test_file)
    # load the pre-trained, normalized word2vec model
    we_wrapper = MultiSenseWE(args.org_we_file, args.we_file)
    we_wrapper.set_model()
    data_handler = DataHandler(we_wrapper)
    data_handler.run(dev_triplets, test_triplets)
    if args.supervised:
        model = SupervisedModel(data_handler)
        model.run()
    model = UnsupervisedModel(data_handler)
    model.run()
    logger.info("Done.")
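
# A minimal usage sketch for the entry point above (the script and file names
# are assumptions, not from the source):
if __name__ == '__main__':
    import sys

    # e.g. python train.py HeiPLAS-dev.txt HeiPLAS-test.txt we.normed.bin we.bin -s
    main(sys.argv[1:])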
Example #2
    # Constructor of an evaluation wrapper around the unsupervised GraphSAGE
    # model; the enclosing class definition and its module-level imports
    # (e.g. `import os`) are elided in the source.
    def __init__(self,
                 classifier,
                 embedding_type,
                 graph_type,
                 model_checkpoint,
                 train_prefix,
                 model_name,
                 model_size="small",
                 learning_rate=0.00001,
                 epochs=10,
                 dropout=0.0,
                 weight_decay=0.0,
                 max_degree=100,
                 samples_1=25,
                 samples_2=10,
                 dim_1=128,
                 dim_2=128,
                 random_context=True,
                 neg_sample_size=20,
                 batch_size=512,
                 identity_dim=0,
                 save_embeddings=False,
                 base_log_dir='../../../data/processed/graphsage/',
                 validate_iter=5000,
                 validate_batch_size=256,
                 gpu=0,
                 print_every=50,
                 max_total_steps=10**10,
                 log_device_placement=False,
                 recs=10):

        self.classifier = classifier
        self.embedding_type = embedding_type
        self.graph_type = graph_type
        self.model_checkpoint = model_checkpoint
        self.recs = recs

        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

        self.graphsage_model = UnsupervisedModel(
            train_prefix, model_name, model_size, learning_rate, epochs,
            dropout, weight_decay, max_degree, samples_1, samples_2, dim_1,
            dim_2, random_context, neg_sample_size, batch_size, identity_dim,
            save_embeddings, base_log_dir, validate_iter, validate_batch_size,
            gpu, print_every, max_total_steps, log_device_placement)
        self.preprocessor = Processor(self.embedding_type, self.graph_type,
                                      gpu)

        self.classifier_file = os.path.join(
            self.graphsage_model._log_dir(),
            self.classifier.__class__.__name__ + ".pkl")

        if not self._load_training_graph():
            print("The training graph does not exist.")

        if not self._load_training_walks():
            print("The walks do not exist.")
Example #3
if __name__ == "__main__":

    data = pd.read_csv("../tp2_training_dataset.csv", header=None)
    with open("./config.yml") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    label = data[0].to_numpy()
    dataset = data.drop(columns=[0]).to_numpy()

    model = UnsupervisedModel(
        dataset,
        dataset.shape[-1],
        config["output"],
        error=0.001,
        max_epochs=config["max_epochs"],
        lr=float(config["lr"]),
        algorithm=config["algorithm"],
        normalize=True,
        normal_params=(config["normal_params"]["mean"],
                       config["normal_params"]["var"]))

    train = True
    model_name = config["model_name"] + "_" + config["algorithm"]
    for f in glob.glob("*.npy"):
        if model_name + ".npy" == f:
            train = False
            break

    if train or config["force_train"]:
        print(model)
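
# The glob loop above only checks whether a saved weights file already exists.
# A minimal equivalent sketch using os.path (model_name as built above):
import os.path

train = not os.path.isfile(model_name + ".npy")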
Example #4
    # Constructor of another evaluation wrapper; the enclosing class definition
    # and its module-level imports (os, plus project helpers such as DataLoader
    # and Similarities) are elided in the source.
    def __init__(self,
                 embedding_type,
                 graph_type,
                 model_checkpoint,
                 train_prefix,
                 model_name,
                 model_size="small",
                 learning_rate=0.00001,
                 epochs=10,
                 dropout=0.0,
                 weight_decay=0.0,
                 max_degree=100,
                 samples_1=25,
                 samples_2=10,
                 dim_1=128,
                 dim_2=128,
                 random_context=True,
                 neg_sample_size=20,
                 batch_size=512,
                 identity_dim=0,
                 save_embeddings=False,
                 base_log_dir='../../../data/processed/graphsage/',
                 validate_iter=5000,
                 validate_batch_size=256,
                 gpu=0,
                 print_every=50,
                 max_total_steps=10**10,
                 log_device_placement=False,
                 recs=10):

        self.embedding_type = embedding_type
        self.graph_type = graph_type
        self.model_checkpoint = model_checkpoint
        self.recs = recs
        self.gpu = gpu

        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(self.gpu)

        self.graphsage_model = UnsupervisedModel(
            train_prefix, model_name, model_size, learning_rate, epochs,
            dropout, weight_decay, max_degree, samples_1, samples_2, dim_1,
            dim_2, random_context, neg_sample_size, batch_size, identity_dim,
            save_embeddings, base_log_dir, validate_iter, validate_batch_size,
            gpu, print_every, max_total_steps, log_device_placement)
        self.preprocessor = Processor(self.embedding_type, "citations",
                                      self.gpu)

        # Prepare the training data
        d_train = DataLoader()
        self.df_train = d_train.training_data_with_abstracts_citations().data

        print("Loading the training embeddings...")
        if not self._load_train_embeddings():
            print("The pretrained embeddings are missing.")
        else:
            print("Loaded.")

        training_ids = list(self.df_train.chapter)
        self.training_embeddings = self.pretrained_embeddings[[
            self.pretrained_embeddings_id_map[id] for id in training_ids
        ]]
        self.sim = Similarities(self.training_embeddings, training_ids)

        print("Loading training graph...")
        if not self._load_training_graph():
            print("The training graph does not exist.")
        else:
            print("Loaded.")

        print("Loading training walks...")
        if not self._load_training_walks():
            print("The walks do not exist.")
        else:
            print("Loaded.")
Example #5
    # Assumed module-level imports (elided in the source): argparse, plus the
    # project helper load_data.
    def main():
        parser = argparse.ArgumentParser(
            description='Arguments for unsupervised GraphSAGE model.')
        parser.add_argument('train_prefix',
                            help='Name of the object file that stores the ' +
                            'training data.')
        parser.add_argument("model_name",
                            choices=[
                                "graphsage_mean", "gcn", "graphsage_seq",
                                "graphsage_maxpool", "graphsage_meanpool"
                            ],
                            help="Model names.")
        parser.add_argument('--model_size',
                            choices=["small", "big"],
                            default="small",
                            help="Can be big or small; model specific def'ns")
        parser.add_argument('--learning_rate',
                            type=float,
                            default=0.00001,
                            help='Initial learning rate.')
        parser.add_argument('--epochs',
                            type=int,
                            default=10,
                            help='Number of epochs to train.')
        parser.add_argument('--dropout',
                            type=float,
                            default=0.0,
                            help='Dropout rate (1 - keep probability).')
        parser.add_argument('--weight_decay',
                            type=float,
                            default=0.0,
                            help='Weight for l2 loss on embedding matrix.')
        parser.add_argument('--max_degree',
                            type=int,
                            default=100,
                            help='Maximum node degree.')
        parser.add_argument('--samples_1',
                            type=int,
                            default=25,
                            help='Number of samples in layer 1.')
        parser.add_argument('--samples_2',
                            type=int,
                            default=10,
                            help='Number of samples in layer 2.')
        parser.add_argument('--dim_1',
                            type=int,
                            default=128,
                            help='Size of output dim ' +
                            '(final is 2x this, if using concat)')
        parser.add_argument('--dim_2',
                            type=int,
                            default=128,
                            help='Size of output dim ' +
                            '(final is 2x this, if using concat)')
        parser.add_argument('--random_context',
                            action="store_false",
                            default=True,
                            help='Use random context (default); passing the ' +
                            'flag switches to direct edges.')
        parser.add_argument('--neg_sample_size',
                            type=int,
                            default=20,
                            help='Number of negative samples.')
        parser.add_argument('--batch_size',
                            type=int,
                            default=512,
                            help='Minibatch size.')
        parser.add_argument('--identity_dim',
                            type=int,
                            default=0,
                            help='Set to positive value to use identity ' +
                            'embedding features of that dimension.')
        parser.add_argument('--save_embeddings',
                            action="store_false",
                            default=True,
                            help='Save embeddings for all nodes after ' +
                            'training (default; passing the flag disables it).')
        parser.add_argument('--base_log_dir',
                            default='../../../data/processed/graphsage/',
                            help='Base directory for logging and saving ' +
                            'embeddings')
        parser.add_argument('--validate_iter',
                            type=int,
                            default=5000,
                            help='How often to run a validation minibatch.')
        parser.add_argument('--validate_batch_size',
                            type=int,
                            default=256,
                            help='How many nodes per validation sample.')
        parser.add_argument('--gpu',
                            type=int,
                            default=0,
                            help='Which gpu to use.')
        parser.add_argument('--print_every',
                            type=int,
                            default=50,
                            help='How often to print training info.')
        parser.add_argument('--max_total_steps',
                            type=int,
                            default=10**10,
                            help='Maximum total number of iterations.')
        parser.add_argument('--log_device_placement',
                            action="store_true",
                            default=False,
                            help='Whether to log device placement.')
        args = parser.parse_args()

        print("Starting...")
        print("Loading training data..")
        train_data = load_data(args.train_prefix, load_walks=True)
        print("Done loading training data..\n")
        from unsupervised_model import UnsupervisedModel
        model = UnsupervisedModel(
            args.train_prefix, args.model_name, args.model_size,
            args.learning_rate, args.epochs, args.dropout, args.weight_decay,
            args.max_degree, args.samples_1, args.samples_2, args.dim_1,
            args.dim_2, args.random_context, args.neg_sample_size,
            args.batch_size, args.identity_dim, args.save_embeddings,
            args.base_log_dir, args.validate_iter, args.validate_batch_size,
            args.gpu, args.print_every, args.max_total_steps,
            args.log_device_placement)
        model.train(train_data)
        print("Finished.")
    # Assumed module-level imports (elided in the source): argparse, os,
    # multiprocessing as mp, numpy as np, plus project helpers UnsupervisedModel,
    # Timer, and EvaluationContainer.
    def main():
        parser = argparse.ArgumentParser(
            description='Arguments for GraphSAGE concatenated ' +
            'classifier model evaluation.')
        parser.add_argument(
            "classifier_name",
            choices=["KNN", "MLP", "MultinomialLogisticRegression"],
            help="The name of the classifier.")
        parser.add_argument('embedding_type',
                            choices=[
                                "AVG_L", "AVG_2L", "AVG_SUM_L4", "AVG_SUM_ALL",
                                "MAX_2L", "CONC_AVG_MAX_2L",
                                "CONC_AVG_MAX_SUM_L4", "SUM_L", "SUM_2L"
                            ],
                            help="Type of embedding.")
        parser.add_argument('model_checkpoint_citations',
                            help='Name of the GraphSAGE model checkpoint ' +
                            'for the citations graph.')
        parser.add_argument('model_checkpoint_authors',
                            help='Name of the GraphSAGE model checkpoint ' +
                            'for the authors graph.')
        parser.add_argument('train_prefix_citations',
                            help='Name of the object file that stores the ' +
                            'citations training data.')
        parser.add_argument('train_prefix_authors',
                            help='Name of the object file that stores the ' +
                            'authors training data.')
        parser.add_argument('model_name',
                            choices=[
                                "graphsage_mean", "gcn", "graphsage_seq",
                                "graphsage_maxpool", "graphsage_meanpool"
                            ],
                            help="Model names.")
        parser.add_argument('--model_size',
                            choices=["small", "big"],
                            default="small",
                            help="Can be big or small; model specific def'ns")
        parser.add_argument('--learning_rate',
                            type=float,
                            default=0.00001,
                            help='Initial learning rate.')
        parser.add_argument('--epochs',
                            type=int,
                            default=10,
                            help='Number of epochs to train.')
        parser.add_argument('--dropout',
                            type=float,
                            default=0.0,
                            help='Dropout rate (1 - keep probability).')
        parser.add_argument('--weight_decay',
                            type=float,
                            default=0.0,
                            help='Weight for l2 loss on embedding matrix.')
        parser.add_argument('--max_degree',
                            type=int,
                            default=100,
                            help='Maximum node degree.')
        parser.add_argument('--samples_1',
                            type=int,
                            default=25,
                            help='Number of samples in layer 1.')
        parser.add_argument('--samples_2',
                            type=int,
                            default=10,
                            help='Number of samples in layer 2.')
        parser.add_argument('--dim_1',
                            type=int,
                            default=128,
                            help='Size of output dim ' +
                            '(final is 2x this, if using concat)')
        parser.add_argument('--dim_2',
                            type=int,
                            default=128,
                            help='Size of output dim ' +
                            '(final is 2x this, if using concat)')
        parser.add_argument('--random_context',
                            action="store_false",
                            default=True,
                            help='Use random context (default); passing the ' +
                            'flag switches to direct edges.')
        parser.add_argument('--neg_sample_size',
                            type=int,
                            default=20,
                            help='Number of negative samples.')
        parser.add_argument('--batch_size',
                            type=int,
                            default=512,
                            help='Minibatch size.')
        parser.add_argument('--identity_dim',
                            type=int,
                            default=0,
                            help='Set to positive value to use identity ' +
                            'embedding features of that dimension.')
        parser.add_argument('--save_embeddings',
                            action="store_true",
                            default=False,
                            help='Whether to save embeddings for all nodes ' +
                            'after training')
        parser.add_argument('--base_log_dir',
                            default='../../../data/processed/graphsage/',
                            help='Base directory for logging and saving ' +
                            'embeddings')
        parser.add_argument('--validate_iter',
                            type=int,
                            default=5000,
                            help='How often to run a validation minibatch.')
        parser.add_argument('--validate_batch_size',
                            type=int,
                            default=256,
                            help='How many nodes per validation sample.')
        parser.add_argument('--gpu',
                            type=int,
                            default=0,
                            help='Which gpu to use.')
        parser.add_argument('--print_every',
                            type=int,
                            default=50,
                            help='How often to print training info.')
        parser.add_argument('--max_total_steps',
                            type=int,
                            default=10**10,
                            help='Maximum total number of iterations.')
        parser.add_argument('--log_device_placement',
                            action="store_true",
                            default=False,
                            help='Whether to log device placement.')
        parser.add_argument('--recs',
                            type=int,
                            default=10,
                            help='Number of recommendations.')
        args = parser.parse_args()

        print("Starting evaluation...")
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)
        print("Using GPU {}.".format(str(args.gpu)))

        from GraphSAGEClassifierConcatEvaluation import GraphSAGEClassifierConcatEvaluation
        evaluation_model = GraphSAGEClassifierConcatEvaluation(
            args.classifier_name, args.embedding_type, args.model_name,
            args.model_size, args.learning_rate, args.gpu, args.recs)

        # Initialize GraphSAGE models
        graphsage_model_citations = UnsupervisedModel(
            args.train_prefix_citations, args.model_name, args.model_size,
            args.learning_rate, args.epochs, args.dropout, args.weight_decay,
            args.max_degree, args.samples_1, args.samples_2, args.dim_1,
            args.dim_2, args.random_context, args.neg_sample_size,
            args.batch_size, args.identity_dim, args.save_embeddings,
            args.base_log_dir, args.validate_iter, args.validate_batch_size,
            args.gpu, args.print_every, args.max_total_steps,
            args.log_device_placement)
        graphsage_model_authors = UnsupervisedModel(
            args.train_prefix_authors, args.model_name, args.model_size,
            args.learning_rate, args.epochs, args.dropout, args.weight_decay,
            args.max_degree, args.samples_1, args.samples_2, args.dim_1,
            args.dim_2, args.random_context, args.neg_sample_size,
            args.batch_size, args.identity_dim, args.save_embeddings,
            args.base_log_dir, args.validate_iter, args.validate_batch_size,
            args.gpu, args.print_every, args.max_total_steps,
            args.log_device_placement)

        # Train model if needed:
        if not evaluation_model._has_persistent_model():
            print("Classifier not trained yet. Training now...")
            timer = Timer()
            timer.tic()
            evaluation_model.train(graphsage_model_citations,
                                   graphsage_model_authors)
            print("Training finished.")
            timer.toc()
        else:
            evaluation_model._load_model_classifier()

        # Load test data
        print("Loading test data...")
        query_test, query_test_authors, truth = evaluation_model.load_data()
        print("Loaded.")

        # Infer embeddings
        print("Inferring embeddings for citations graph.")
        queue_citations = mp.Queue()
        process_citations = mp.Process(
            target=evaluation_model.infer_embeddings,
            args=(query_test, None, "citations", graphsage_model_citations,
                  args.model_checkpoint_citations, queue_citations))
        process_citations.start()
        embeddings_citations = queue_citations.get()
        process_citations.join()
        process_citations.terminate()

        print("Inferring embeddings for authors graphs.")
        queue_authors = mp.Queue()
        process_authors = mp.Process(target=evaluation_model.infer_embeddings,
                                     args=(query_test, query_test_authors,
                                           "authors", graphsage_model_authors,
                                           args.model_checkpoint_authors,
                                           queue_authors))
        process_authors.start()
        embeddings_authors = queue_authors.get()
        process_authors.join()
        process_authors.terminate()
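
        # In both cases queue.get() runs before join(): per the multiprocessing
        # docs, joining a process that still has items buffered in a Queue can
        # deadlock. Running inference in child processes is also a common way
        # to ensure GPU memory is fully released once each process exits.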

        # Concatenate embeddings
        test_embeddings = np.concatenate(
            (embeddings_citations, embeddings_authors), axis=1)

        print("Computing predictions...")
        recommendation = evaluation_model.compute_predictions(test_embeddings)
        print("Predictions computed.")

        # Evaluate
        print("Evaluating...")
        evaluation = EvaluationContainer()
        evaluation.evaluate(recommendation, truth)
        print("Finished.")