def main(args):
    parser = argparse.ArgumentParser(description='Train word2vec model.')
    parser.add_argument('dev_file', help='dev input file')
    parser.add_argument('test_file', help='test input file')
    parser.add_argument('we_file', help='word embeddings normed model file')
    parser.add_argument(
        'org_we_file',
        help='path to the original we model file - before adjectives '
             'clustering')
    parser.add_argument('-s', '--supervised', default=False,
                        action='store_true',
                        help='train and evaluate also the supervised model')
    args = parser.parse_args(args)

    dev_triplets = read_HeiPLAS_data(args.dev_file)
    test_triplets = read_HeiPLAS_data(args.test_file)

    # Load the pre-trained, normalized word2vec model
    we_wrapper = MultiSenseWE(args.org_we_file, args.we_file)
    we_wrapper.set_model()

    data_handler = DataHandler(we_wrapper)
    data_handler.run(dev_triplets, test_triplets)

    if args.supervised:
        model = SupervisedModel(data_handler)
        model.run()

    model = UnsupervisedModel(data_handler)
    model.run()

    logger.info("Done!")
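
# A minimal entry-point sketch, assuming this module is executed directly;
# the file names in the example invocation are illustrative:
#
#   python train.py HeiPLAS-dev.txt HeiPLAS-test.txt we.norm we.orig -s
if __name__ == '__main__':
    import sys
    main(sys.argv[1:])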
def pretrain(self, train_set, validation_set=None):
    """Perform unsupervised pretraining of the autoencoder."""
    self.do_pretrain = True

    def set_params_func(rbmmachine, rbmgraph):
        params = rbmmachine.get_model_parameters(graph=rbmgraph)
        self.encoding_w_.append(params['W'])
        self.encoding_b_.append(params['bh_'])

    return UnsupervisedModel.pretrain_procedure(
        self, self.rbms, self.rbm_graphs, set_params_func=set_params_func,
        train_set=train_set, validation_set=validation_set)
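
# A minimal sketch of how pretrain() would be called, assuming `net` is an
# instance of the surrounding autoencoder class and `trX`/`vlX` are numpy
# arrays of shape (n_samples, n_features):
#
#   net.pretrain(train_set=trX, validation_set=vlX)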
def __init__(self, num_hidden, visible_unit_type='bin', main_dir='rbm/',
             models_dir='models/', data_dir='data/', summary_dir='logs/',
             model_name='rbm', dataset='mnist', loss_func='mean_squared',
             l2reg=5e-4, regtype='none', gibbs_sampling_steps=1,
             learning_rate=0.01, batch_size=10, num_epochs=10, stddev=0.1,
             D=[], verbose=0):
    """Constructor.

    :param num_hidden: number of hidden units
    :param loss_func: type of loss function
    :param visible_unit_type: type of the visible units (bin, gauss or rsm)
    :param gibbs_sampling_steps: optional, default 1
    :param stddev: default 0.1. Ignored if visible_unit_type is not 'gauss'
    :param D: default []. Optional array of document dimensions. Used only
        if visible_unit_type is 'rsm'
    :param verbose: level of verbosity. optional, default 0
    """
    UnsupervisedModel.__init__(self, model_name, main_dir, models_dir,
                               data_dir, summary_dir)

    self._initialize_training_parameters(
        loss_func=loss_func, learning_rate=learning_rate,
        num_epochs=num_epochs, batch_size=batch_size, dataset=dataset,
        regtype=regtype, l2reg=l2reg)

    self.num_hidden = num_hidden
    self.visible_unit_type = visible_unit_type
    self.gibbs_sampling_steps = gibbs_sampling_steps
    self.stddev = stddev
    self.D = D
    self.verbose = verbose

    self.W = None
    self.bh_ = None
    self.bv_ = None

    self.w_upd8 = None
    self.bh_upd8 = None
    self.bv_upd8 = None

    self.cost = None

    self.input_data = None
    self.hrand = None
    self.vrand = None
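
# A minimal construction sketch, assuming this constructor belongs to the
# RBM class instantiated as rbm.RBM elsewhere in this library; the
# hyperparameter values are illustrative:
#
#   r = rbm.RBM(num_hidden=250, visible_unit_type='bin',
#               learning_rate=0.01, num_epochs=10, batch_size=10)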
if __name__ == "__main__": data = pd.read_csv("../tp2_training_dataset.csv", header=None) #.to_numpy() config = yaml.load(open("./config.yml"), Loader=yaml.FullLoader) label = data[0].to_numpy() dataset = data.drop(columns=[0]).to_numpy() model = UnsupervisedModel( dataset, dataset.shape[-1], config["output"], error=0.001, # error=config["output"], max_epochs=config["max_epochs"], lr=float(config["lr"]), algorithm=config["algorithm"], normalize=True, normal_params=(config["normal_params"]["mean"], config["normal_params"]["var"])) train = True model_name = config["model_name"] + "_" + config["algorithm"] for f in glob.glob("*.npy"): if model_name + ".npy" == f: train = False break if train or config["force_train"]: print(model)
class GraphSAGENeighbourModel(AbstractModel):

    def __init__(self, embedding_type, graph_type, model_checkpoint,
                 train_prefix, model_name, model_size="small",
                 learning_rate=0.00001, epochs=10, dropout=0.0,
                 weight_decay=0.0, max_degree=100, samples_1=25, samples_2=10,
                 dim_1=128, dim_2=128, random_context=True,
                 neg_sample_size=20, batch_size=512, identity_dim=0,
                 save_embeddings=False,
                 base_log_dir='../../../data/processed/graphsage/',
                 validate_iter=5000, validate_batch_size=256, gpu=0,
                 print_every=50, max_total_steps=10**10,
                 log_device_placement=False, recs=10):
        self.embedding_type = embedding_type
        self.graph_type = graph_type
        self.model_checkpoint = model_checkpoint
        self.recs = recs
        self.gpu = gpu

        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(self.gpu)

        self.graphsage_model = UnsupervisedModel(
            train_prefix, model_name, model_size, learning_rate, epochs,
            dropout, weight_decay, max_degree, samples_1, samples_2, dim_1,
            dim_2, random_context, neg_sample_size, batch_size, identity_dim,
            save_embeddings, base_log_dir, validate_iter, validate_batch_size,
            gpu, print_every, max_total_steps, log_device_placement)
        self.preprocessor = Processor(self.embedding_type, "citations",
                                      self.gpu)

        # Prepare the training data
        d_train = DataLoader()
        self.df_train = d_train.training_data_with_abstracts_citations().data

        print("Loading the training embeddings...")
        if not self._load_train_embeddings():
            print("The pretrained embeddings are missing.")
        else:
            print("Loaded.")

        training_ids = list(self.df_train.chapter)
        self.training_embeddings = self.pretrained_embeddings[[
            self.pretrained_embeddings_id_map[id] for id in training_ids
        ]]
        self.sim = Similarities(self.training_embeddings, training_ids)

        print("Loading training graph...")
        if not self._load_training_graph():
            print("The training graph does not exist.")
        else:
            print("Loaded.")

        print("Loading training walks...")
        if not self._load_training_walks():
            print("The walks do not exist.")
        else:
            print("Loaded.")

    def query_single(self, query):
        """Queries the model and returns a list of recommendations.

        Args:
            query (list): The query as needed by the model, in the form
                [chapter_title, chapter_abstract, list(chapter_citations)].

        Returns:
            list: ids of the conferences
            double: confidence scores
        """
        if len(query) < 3:
            raise ValueError("The input does not contain enough data; "
                             "chapter title, chapter abstract, and chapter "
                             "citations are required.")
        # Generate an ID for the query
        query_id = "new_node_id:" + "-".join(
            [str(i) for i in random.sample(range(0, 10000), 5)])
        return self.query_batch([(query_id, query[0], query[1], query[2])])

    def query_batch(self, batch):
        """Queries the model and returns a list of recommendations.

        Args:
            batch (list of ntuples): The list of queries as needed by the
                model. The ntuples are in the form (chapter_id,
                chapter_title, chapter_abstract, list(chapter_citations)).

        Returns:
            list: ids of the conferences
            double: confidence scores
        """
        df_test = pd.DataFrame(batch,
                               columns=["chapter", "chapter_title",
                                        "chapter_abstract",
                                        "chapter_citations"])

        # Preprocess the data
        graph, features, id_map = self.preprocessor.test_data(
            df_test, self.G_train)

        # Infer embeddings
        test_nodes, test_embeddings = self.graphsage_model.predict(
            [graph, features, id_map, self.walks], self.model_checkpoint)

        # Obtain the most similar neighbours
        similarities = []
        with tqdm(desc="Computing similarities",
                  total=len(test_embeddings)) as pbar:
            for vector in test_embeddings:
                similarities.append(
                    self.sim.similar_by_vector(vector, topn=self.recs * 10))
                pbar.update(1)

        # Map similar papers to conferences
        conferenceseries = []
        confidences = []
        with tqdm(desc="Computing conference predictions",
                  total=len(similarities)) as pbar:
            for similarity in similarities:
                conferences = set()
                scores = []
                for idx in range(len(similarity)):
                    conferences_length = len(conferences)
                    if conferences_length < self.recs:
                        conferences.add(
                            list(self.df_train[
                                self.df_train.chapter ==
                                similarity[idx][0]].conferenceseries)[0])
                        if len(conferences) != conferences_length:
                            scores.append(similarity[idx][1])
                conferenceseries.append(list(conferences))
                confidences.append(scores)
                pbar.update(1)
        results = [conferenceseries, confidences]
        return results

    def train(self):
        pass

    def _load_train_embeddings(self):
        embeddings_file = os.path.join(self.graphsage_model._log_dir(),
                                       "embeddings.npy")
        embeddings_ids_file = os.path.join(self.graphsage_model._log_dir(),
                                           "embeddings_ids.txt")
        if os.path.isfile(embeddings_file) and os.path.isfile(
                embeddings_ids_file):
            self.pretrained_embeddings = np.load(embeddings_file)
            self.pretrained_embeddings_id_map = {}
            with open(embeddings_ids_file) as f:
                for i, line in enumerate(f):
                    self.pretrained_embeddings_id_map[line.strip()] = i
            return True
        return False

    def _load_training_graph(self):
        graph_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                  "..", "..", "..", "data", "interim",
                                  "graphsage", self.embedding_type,
                                  self.graph_type, "train_val-G.json")
        if os.path.isfile(graph_file):
            with open(graph_file) as f:
                self.G_train = json_graph.node_link_graph(json.load(f))
            return True
        return False

    def _load_training_walks(self):
        walks_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                  "..", "..", "..", "data", "interim",
                                  "graphsage", self.embedding_type,
                                  self.graph_type, "train_val-walks.txt")
        self.walks = []
        if isinstance(list(self.G_train.nodes)[0], int):
            conversion = lambda n: int(n)
        else:
            conversion = lambda n: n
        if os.path.isfile(walks_file):
            with open(walks_file) as f:
                for line in f:
                    # Materialize the map so each walk is a reusable list
                    # (a bare map object is single-use in Python 3).
                    self.walks.append(list(map(conversion, line.split())))
            return True
        return False
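
# A minimal usage sketch, assuming a trained checkpoint and the preprocessed
# training artifacts exist on disk; `checkpoint`, `train_prefix`, and the
# query values are illustrative:
#
#   model = GraphSAGENeighbourModel("AVG_L", "citations", checkpoint,
#                                   train_prefix, "graphsage_mean")
#   conferences, confidences = model.query_single(
#       ["Some chapter title", "Some chapter abstract.", ["cited-id-1"]])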
class GraphSAGEClassifierModel(AbstractModel):

    def __init__(self, classifier, embedding_type, graph_type,
                 model_checkpoint, train_prefix, model_name,
                 model_size="small", learning_rate=0.00001, epochs=10,
                 dropout=0.0, weight_decay=0.0, max_degree=100, samples_1=25,
                 samples_2=10, dim_1=128, dim_2=128, random_context=True,
                 neg_sample_size=20, batch_size=512, identity_dim=0,
                 save_embeddings=False,
                 base_log_dir='../../../data/processed/graphsage/',
                 validate_iter=5000, validate_batch_size=256, gpu=0,
                 print_every=50, max_total_steps=10**10,
                 log_device_placement=False, recs=10):
        self.classifier = classifier
        self.embedding_type = embedding_type
        self.graph_type = graph_type
        self.model_checkpoint = model_checkpoint
        self.recs = recs

        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

        self.graphsage_model = UnsupervisedModel(
            train_prefix, model_name, model_size, learning_rate, epochs,
            dropout, weight_decay, max_degree, samples_1, samples_2, dim_1,
            dim_2, random_context, neg_sample_size, batch_size, identity_dim,
            save_embeddings, base_log_dir, validate_iter, validate_batch_size,
            gpu, print_every, max_total_steps, log_device_placement)
        self.preprocessor = Processor(self.embedding_type, self.graph_type,
                                      gpu)
        self.classifier_file = os.path.join(
            self.graphsage_model._log_dir(),
            self.classifier.__class__.__name__ + ".pkl")

        if not self._load_training_graph():
            print("The training graph does not exist.")

        if not self._load_training_walks():
            print("The walks do not exist.")

    def query_single(self, query):
        """Queries the model and returns a list of recommendations.

        Args:
            query (list): The query as needed by the model, in the form
                [chapter_title, chapter_abstract, list(chapter_citations)]
                and, for the authors graph, an additional
                list(chapter_authors).

        Returns:
            list: ids of the conferences
            double: confidence scores
        """
        # Generate an ID for the query
        query_id = "new_node_id:" + "-".join(
            [str(i) for i in random.sample(range(0, 10000), 5)])
        if self.graph_type == "citations":
            if len(query) < 3:
                raise ValueError("The input does not contain enough data; "
                                 "chapter title, chapter abstract, and "
                                 "chapter citations are required.")
            return self.query_batch([(query_id, query[0], query[1],
                                      query[2])])
        elif self.graph_type == "authors":
            if len(query) < 4:
                raise ValueError("The input does not contain enough data; "
                                 "chapter title, chapter abstract, chapter "
                                 "citations, and chapter authors are "
                                 "required.")
            authors_df = pd.DataFrame({
                "author_name": query[3],
                "chapter": [query_id] * len(query[3])
            })
            # query_batch expects a (queries, authors_df) tuple for the
            # authors graph.
            return self.query_batch(([(query_id, query[0], query[1],
                                       query[2])], authors_df))
        else:
            raise ValueError("Graph type not recognised.")

    def query_batch(self, batch):
        """Queries the model and returns a list of recommendations.

        Args:
            batch (list of ntuples): The list of queries as needed by the
                model. The ntuples are in the form (chapter_id,
                chapter_title, chapter_abstract, list(chapter_citations)).
                For the authors graph, batch is a tuple of such a list and
                an authors data frame.

        Returns:
            list: ids of the conferences
            double: confidence scores
        """
        if self.graph_type == "citations":
            df_test = pd.DataFrame(batch,
                                   columns=["chapter", "chapter_title",
                                            "chapter_abstract",
                                            "chapter_citations"])
            # Preprocess the data
            graph, features, id_map = self.preprocessor.test_data(
                df_test, self.G_train)
        elif self.graph_type == "authors":
            df_test = pd.DataFrame(batch[0],
                                   columns=["chapter", "chapter_title",
                                            "chapter_abstract",
                                            "chapter_citations"])
            authors_df = batch[1]
            # Preprocess the data
            graph, features, id_map = self.preprocessor.test_data(
                df_test, self.G_train, authors_df=authors_df)
        else:
            raise ValueError("Graph type not recognised.")

        # Infer embeddings
        test_embeddings = self.graphsage_model.predict(
            [graph, features, id_map, self.walks],
            self.model_checkpoint)[1]

        # Compute predictions
        predictions = self.classifier.predict_proba(test_embeddings)
        sorted_predictions = np.argsort(-np.array(predictions))

        conferenceseries = list()
        confidences = list()
        for index, order in enumerate(sorted_predictions):
            conferences = list()
            scores = list()
            i = 0
            while len(conferences) < self.recs:
                conf = self.label_encoder.inverse_transform(
                    [order[i]]).tolist()[0]
                if conf not in conferences:
                    conferences.append(conf)
                    scores.append(predictions[index][order][i])
                i += 1
            conferenceseries.append(conferences)
            confidences.append(scores)
        results = [conferenceseries, confidences]
        return results

    def train(self, data):
        if not self._load_model_classifier():
            print("Classifier not trained yet. Training now...")
            timer = Timer()
            timer.tic()

            print("Loading the training embeddings...")
            if not self._load_train_embeddings():
                print("The pretrained embeddings are missing.")
            else:
                print("Loaded.")

            training_ids = list(data.chapter)
            training_embeddings = self.pretrained_embeddings[[
                self.pretrained_embeddings_id_map[id] for id in training_ids
            ]]
            self.label_encoder = LabelEncoder()
            self.labels = self.label_encoder.fit_transform(
                data.conferenceseries)
            self.classifier.fit(training_embeddings, self.labels)
            self._save_model_classifier()

            print("Training finished.")
            timer.toc()

    def _load_training_graph(self):
        graph_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                  "..", "..", "..", "data", "interim",
                                  "graphsage", self.embedding_type,
                                  self.graph_type, "train_val-G.json")
        if os.path.isfile(graph_file):
            print("Loading training graph...")
            with open(graph_file) as f:
                self.G_train = json_graph.node_link_graph(json.load(f))
            print("Loaded.")
            return True
        return False

    def _load_training_walks(self):
        walks_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                  "..", "..", "..", "data", "interim",
                                  "graphsage", self.embedding_type,
                                  self.graph_type, "train_val-walks.txt")
        self.walks = []
        if isinstance(list(self.G_train.nodes)[0], int):
            conversion = lambda n: int(n)
        else:
            conversion = lambda n: n
        if os.path.isfile(walks_file):
            print("Loading training walks...")
            with open(walks_file) as f:
                for line in f:
                    # Materialize the map so each walk is a reusable list
                    # (a bare map object is single-use in Python 3).
                    self.walks.append(list(map(conversion, line.split())))
            print("Loaded.")
            return True
        return False

    def _load_train_embeddings(self):
        embeddings_file = os.path.join(self.graphsage_model._log_dir(),
                                       "embeddings.npy")
        embeddings_ids_file = os.path.join(self.graphsage_model._log_dir(),
                                           "embeddings_ids.txt")
        if os.path.isfile(embeddings_file) and os.path.isfile(
                embeddings_ids_file):
            self.pretrained_embeddings = np.load(embeddings_file)
            self.pretrained_embeddings_id_map = {}
            with open(embeddings_ids_file) as f:
                for i, line in enumerate(f):
                    self.pretrained_embeddings_id_map[line.strip()] = i
            return True
        return False

    def _load_model_classifier(self):
        if os.path.isfile(self.classifier_file):
            print("Loading classifier...")
            with open(self.classifier_file, "rb") as f:
                self.label_encoder, self.labels, self.classifier = \
                    pickle.load(f)
            print("Loaded.")
            return True
        return False

    def _save_model_classifier(self):
        with open(self.classifier_file, "wb") as f:
            pickle.dump([self.label_encoder, self.labels, self.classifier],
                        f, protocol=4)

    def _has_persistent_model(self):
        return os.path.isfile(self.classifier_file)
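
# A minimal usage sketch, assuming `df_train` provides the `chapter` and
# `conferenceseries` columns consumed by train(); the KNN classifier,
# `checkpoint`, `train_prefix`, and the query values are illustrative:
#
#   from sklearn.neighbors import KNeighborsClassifier
#   model = GraphSAGEClassifierModel(KNeighborsClassifier(n_neighbors=10),
#                                    "AVG_L", "citations", checkpoint,
#                                    train_prefix, "graphsage_mean")
#   model.train(df_train)
#   conferences, confidences = model.query_single(
#       ["Some chapter title", "Some chapter abstract.", ["cited-id-1"]])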
def __init__(self, layers, model_name='srbm', main_dir='srbm/',
             models_dir='models/', data_dir='data/', summary_dir='logs/',
             num_epochs=[10], batch_size=[10], dataset='mnist',
             learning_rate=[0.01], gibbs_k=[1], loss_func=['mean_squared'],
             momentum=0.5, finetune_dropout=1, verbose=1,
             finetune_loss_func='cross_entropy',
             finetune_enc_act_func=[tf.nn.relu],
             finetune_dec_act_func=[tf.nn.sigmoid],
             finetune_opt='gradient_descent', finetune_learning_rate=0.001,
             l2reg=5e-4, finetune_num_epochs=10, noise=['gauss'], stddev=0.1,
             finetune_batch_size=20, do_pretrain=False, tied_weights=False,
             regtype=['none'], finetune_reg_type='none'):
    """Constructor.

    :param layers: list containing the hidden units for each layer
    :param finetune_loss_func: loss function for the softmax layer.
        string, default 'cross_entropy'. Possible values:
        ['cross_entropy', 'mean_squared']
    :param finetune_dropout: dropout parameter
    :param finetune_learning_rate: learning rate for the finetuning.
        float, default 0.001
    :param finetune_enc_act_func: activation function for the encoder
        finetuning phase
    :param finetune_dec_act_func: activation function for the decoder
        finetuning phase
    :param finetune_opt: optimizer for the finetuning phase
    :param finetune_num_epochs: number of epochs for the finetuning.
        int, default 10
    :param finetune_batch_size: size of each mini-batch for the finetuning.
        int, default 20
    :param verbose: level of verbosity. 0 - silent, 1 - print accuracy.
        int, default 1
    :param do_pretrain: True: use variables from pretraining,
        False: initialize new variables.
    """
    # WARNING! This must be the first expression in the function or else it
    # will send other variables to expanded_args().
    # This function takes all the passed parameters that are lists and
    # expands them to the number of layers, if the length of the list is
    # less than the number of layers.
    expanded_args = utilities.expand_args(**locals())

    UnsupervisedModel.__init__(self, model_name, main_dir, models_dir,
                               data_dir, summary_dir)

    self._initialize_training_parameters(
        loss_func=finetune_loss_func,
        learning_rate=finetune_learning_rate, regtype=finetune_reg_type,
        num_epochs=finetune_num_epochs, batch_size=finetune_batch_size,
        l2reg=l2reg, dropout=finetune_dropout, dataset=dataset,
        opt=finetune_opt, momentum=momentum)

    self.do_pretrain = do_pretrain
    self.layers = layers
    self.tied_weights = tied_weights
    self.verbose = verbose

    self.finetune_enc_act_func = expanded_args['finetune_enc_act_func']
    self.finetune_dec_act_func = expanded_args['finetune_dec_act_func']

    self.input_ref = None

    # Model parameters
    self.encoding_w_ = []  # list of matrices of encoding weights per layer
    self.encoding_b_ = []  # list of arrays of encoding biases per layer

    self.decoding_w = []  # list of matrices of decoding weights per layer
    self.decoding_b = []  # list of arrays of decoding biases per layer

    self.reconstruction = None
    self.rbms = []
    self.rbm_graphs = []

    for l, layer in enumerate(layers):
        rbm_str = 'rbm-' + str(l + 1)
        new_rbm = rbm.RBM(
            model_name=self.model_name + '-' + rbm_str,
            loss_func=expanded_args['loss_func'][l],
            models_dir=os.path.join(self.models_dir, rbm_str),
            data_dir=os.path.join(self.data_dir, rbm_str),
            summary_dir=os.path.join(self.tf_summary_dir, rbm_str),
            visible_unit_type=expanded_args['noise'][l],
            stddev=stddev,
            num_hidden=expanded_args['layers'][l],
            main_dir=self.main_dir,
            learning_rate=expanded_args['learning_rate'][l],
            gibbs_sampling_steps=expanded_args['gibbs_k'][l],
            num_epochs=expanded_args['num_epochs'][l],
            batch_size=expanded_args['batch_size'][l],
            verbose=self.verbose,
            regtype=expanded_args['regtype'][l])
        self.rbms.append(new_rbm)
        self.rbm_graphs.append(tf.Graph())
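
# A minimal construction sketch, assuming the surrounding class (called
# DeepBeliefAutoencoder here purely for illustration) exposes the pretrain()
# method shown earlier; layer sizes, epochs, and `trX`/`vlX` are illustrative:
#
#   net = DeepBeliefAutoencoder(layers=[512, 256, 128], do_pretrain=True,
#                               num_epochs=[5], batch_size=[64])
#   net.pretrain(train_set=trX, validation_set=vlX)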
def __init__(self, n_components, name='dae', loss_func='mse',
             enc_act_func=tf.nn.tanh, dec_act_func=None, num_epochs=10,
             batch_size=10, opt='sgd', learning_rate=0.01, momentum=0.9,
             corr_type='none', corr_frac=0., regtype='none', regcoef=5e-4):
    """Constructor.

    Parameters
    ----------
    n_components : int
        Number of hidden units.
    name : str, optional (default = "dae")
        Model name (used for save/load from disk).
    loss_func : str, optional (default = "mse")
        Loss function. ['mse', 'cross_entropy']
    enc_act_func : tf.nn.[activation]
        Activation function for the encoder.
    dec_act_func : tf.nn.[activation]
        Activation function for the decoder.
    num_epochs : int, optional (default = 10)
        Number of epochs.
    batch_size : int, optional (default = 10)
        Size of each mini-batch.
    opt : str, optional (default = "sgd")
        Which tensorflow optimizer to use.
        Possible values: ['sgd', 'momentum', 'adagrad', 'adam']
    learning_rate : float, optional (default = 0.01)
        Initial learning rate.
    momentum : float, optional (default = 0.9)
        Momentum parameter (only used if opt = "momentum").
    corr_type : str, optional (default = "none")
        Type of input corruption.
        Can be one of: ["none", "masking", "salt_and_pepper"]
    corr_frac : float, optional (default = 0.0)
        Fraction of the input to corrupt.
    regtype : str, optional (default = "none")
        Type of regularization to apply.
        Can be one of: ["none", "l1", "l2"].
    regcoef : float, optional (default = 5e-4)
        Regularization parameter. If 0, no regularization.
        Only considered if regtype != "none".
    """
    UnsupervisedModel.__init__(self, name)

    self.loss_func = loss_func
    self.learning_rate = learning_rate
    self.opt = opt
    self.num_epochs = num_epochs
    self.batch_size = batch_size
    self.momentum = momentum
    self.regtype = regtype
    self.regcoef = regcoef

    self.loss = Loss(self.loss_func)
    self.trainer = Trainer(opt, learning_rate=learning_rate,
                           momentum=momentum)

    self.n_components = n_components
    self.enc_act_func = enc_act_func
    self.dec_act_func = dec_act_func
    self.corr_type = corr_type
    self.corr_frac = corr_frac

    self.input_data_orig = None
    self.input_data = None

    self.W_ = None
    self.bh_ = None
    self.bv_ = None
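
# A minimal construction sketch, assuming this constructor belongs to a
# denoising-autoencoder class (called DenoisingAutoencoder here for
# illustration); hyperparameter values are illustrative:
#
#   dae = DenoisingAutoencoder(n_components=256, corr_type='masking',
#                              corr_frac=0.3, opt='momentum')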
def main():
    parser = argparse.ArgumentParser(
        description='Arguments for unsupervised GraphSAGE model.')
    parser.add_argument('train_prefix',
                        help='Name of the object file that stores the '
                             'training data.')
    parser.add_argument("model_name",
                        choices=["graphsage_mean", "gcn", "graphsage_seq",
                                 "graphsage_maxpool", "graphsage_meanpool"],
                        help="Model names.")
    parser.add_argument('--model_size', choices=["small", "big"],
                        default="small",
                        help="Can be big or small; model specific def'ns")
    parser.add_argument('--learning_rate', type=float, default=0.00001,
                        help='Initial learning rate.')
    parser.add_argument('--epochs', type=int, default=10,
                        help='Number of epochs to train.')
    parser.add_argument('--dropout', type=float, default=0.0,
                        help='Dropout rate (1 - keep probability).')
    parser.add_argument('--weight_decay', type=float, default=0.0,
                        help='Weight for l2 loss on embedding matrix.')
    parser.add_argument('--max_degree', type=int, default=100,
                        help='Maximum node degree.')
    parser.add_argument('--samples_1', type=int, default=25,
                        help='Number of samples in layer 1.')
    parser.add_argument('--samples_2', type=int, default=10,
                        help='Number of samples in layer 2.')
    parser.add_argument('--dim_1', type=int, default=128,
                        help='Size of output dim '
                             '(final is 2x this, if using concat)')
    parser.add_argument('--dim_2', type=int, default=128,
                        help='Size of output dim '
                             '(final is 2x this, if using concat)')
    parser.add_argument('--random_context', action="store_false",
                        default=True,
                        help='Use random context (default); pass this flag '
                             'to use direct edges instead.')
    parser.add_argument('--neg_sample_size', type=int, default=20,
                        help='Number of negative samples.')
    parser.add_argument('--batch_size', type=int, default=512,
                        help='Minibatch size.')
    parser.add_argument('--identity_dim', type=int, default=0,
                        help='Set to positive value to use identity '
                             'embedding features of that dimension.')
    parser.add_argument('--save_embeddings', action="store_false",
                        default=True,
                        help='Save embeddings for all nodes after training '
                             '(default); pass this flag to disable.')
    parser.add_argument('--base_log_dir',
                        default='../../../data/processed/graphsage/',
                        help='Base directory for logging and saving '
                             'embeddings')
    parser.add_argument('--validate_iter', type=int, default=5000,
                        help='How often to run a validation minibatch.')
    parser.add_argument('--validate_batch_size', type=int, default=256,
                        help='How many nodes per validation sample.')
    parser.add_argument('--gpu', type=int, default=0,
                        help='Which gpu to use.')
    parser.add_argument('--print_every', type=int, default=50,
                        help='How often to print training info.')
    parser.add_argument('--max_total_steps', type=int, default=10**10,
                        help='Maximum total number of iterations.')
    parser.add_argument('--log_device_placement', action="store_true",
                        default=False,
                        help='Whether to log device placement.')
    args = parser.parse_args()

    print("Starting...")
    print("Loading training data..")
    train_data = load_data(args.train_prefix, load_walks=True)
    print("Done loading training data..\n")

    from unsupervised_model import UnsupervisedModel
    model = UnsupervisedModel(
        args.train_prefix, args.model_name, args.model_size,
        args.learning_rate, args.epochs, args.dropout, args.weight_decay,
        args.max_degree, args.samples_1, args.samples_2, args.dim_1,
        args.dim_2, args.random_context, args.neg_sample_size,
        args.batch_size, args.identity_dim, args.save_embeddings,
        args.base_log_dir, args.validate_iter, args.validate_batch_size,
        args.gpu, args.print_every, args.max_total_steps,
        args.log_device_placement)
    model.train(train_data)
    print("Finished.")
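
# Example invocation (the script name and train prefix path are
# illustrative):
#
#   python train_graphsage.py \
#       ../../../data/interim/graphsage/AVG_L/citations/train_val \
#       graphsage_mean --epochs 10 --gpu 0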
def main():
    parser = argparse.ArgumentParser(
        description='Arguments for GraphSAGE concatenated '
                    'classifier model evaluation.')
    parser.add_argument("classifier_name",
                        choices=["KNN", "MLP",
                                 "MultinomialLogisticRegression"],
                        help="The name of the classifier.")
    parser.add_argument('embedding_type',
                        choices=["AVG_L", "AVG_2L", "AVG_SUM_L4",
                                 "AVG_SUM_ALL", "MAX_2L", "CONC_AVG_MAX_2L",
                                 "CONC_AVG_MAX_SUM_L4", "SUM_L", "SUM_2L"],
                        help="Type of embedding.")
    parser.add_argument('model_checkpoint_citations',
                        help='Name of the GraphSAGE model checkpoint '
                             'for the citations graph.')
    parser.add_argument('model_checkpoint_authors',
                        help='Name of the GraphSAGE model checkpoint '
                             'for the authors graph.')
    parser.add_argument('train_prefix_citations',
                        help='Name of the object file that stores the '
                             'citations training data.')
    parser.add_argument('train_prefix_authors',
                        help='Name of the object file that stores the '
                             'authors training data.')
    parser.add_argument('model_name',
                        choices=["graphsage_mean", "gcn", "graphsage_seq",
                                 "graphsage_maxpool", "graphsage_meanpool"],
                        help="Model names.")
    parser.add_argument('--model_size', choices=["small", "big"],
                        default="small",
                        help="Can be big or small; model specific def'ns")
    parser.add_argument('--learning_rate', type=float, default=0.00001,
                        help='Initial learning rate.')
    parser.add_argument('--epochs', type=int, default=10,
                        help='Number of epochs to train.')
    parser.add_argument('--dropout', type=float, default=0.0,
                        help='Dropout rate (1 - keep probability).')
    parser.add_argument('--weight_decay', type=float, default=0.0,
                        help='Weight for l2 loss on embedding matrix.')
    parser.add_argument('--max_degree', type=int, default=100,
                        help='Maximum node degree.')
    parser.add_argument('--samples_1', type=int, default=25,
                        help='Number of samples in layer 1.')
    parser.add_argument('--samples_2', type=int, default=10,
                        help='Number of samples in layer 2.')
    parser.add_argument('--dim_1', type=int, default=128,
                        help='Size of output dim '
                             '(final is 2x this, if using concat)')
    parser.add_argument('--dim_2', type=int, default=128,
                        help='Size of output dim '
                             '(final is 2x this, if using concat)')
    parser.add_argument('--random_context', action="store_false",
                        default=True,
                        help='Use random context (default); pass this flag '
                             'to use direct edges instead.')
    parser.add_argument('--neg_sample_size', type=int, default=20,
                        help='Number of negative samples.')
    parser.add_argument('--batch_size', type=int, default=512,
                        help='Minibatch size.')
    parser.add_argument('--identity_dim', type=int, default=0,
                        help='Set to positive value to use identity '
                             'embedding features of that dimension.')
    parser.add_argument('--save_embeddings', action="store_true",
                        default=False,
                        help='Whether to save embeddings for all nodes '
                             'after training.')
    parser.add_argument('--base_log_dir',
                        default='../../../data/processed/graphsage/',
                        help='Base directory for logging and saving '
                             'embeddings')
    parser.add_argument('--validate_iter', type=int, default=5000,
                        help='How often to run a validation minibatch.')
    parser.add_argument('--validate_batch_size', type=int, default=256,
                        help='How many nodes per validation sample.')
    parser.add_argument('--gpu', type=int, default=0,
                        help='Which gpu to use.')
    parser.add_argument('--print_every', type=int, default=50,
                        help='How often to print training info.')
    parser.add_argument('--max_total_steps', type=int, default=10**10,
                        help='Maximum total number of iterations.')
    parser.add_argument('--log_device_placement', action="store_true",
                        default=False,
                        help='Whether to log device placement.')
    parser.add_argument('--recs', type=int, default=10,
                        help='Number of recommendations.')
    args = parser.parse_args()

    print("Starting evaluation...")
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)
    print("Using GPU {}.".format(str(args.gpu)))

    from GraphSAGEClassifierConcatEvaluation import \
        GraphSAGEClassifierConcatEvaluation
    evaluation_model = GraphSAGEClassifierConcatEvaluation(
        args.classifier_name, args.embedding_type, args.model_name,
        args.model_size, args.learning_rate, args.gpu, args.recs)

    # Initialize the GraphSAGE models
    graphsage_model_citations = UnsupervisedModel(
        args.train_prefix_citations, args.model_name, args.model_size,
        args.learning_rate, args.epochs, args.dropout, args.weight_decay,
        args.max_degree, args.samples_1, args.samples_2, args.dim_1,
        args.dim_2, args.random_context, args.neg_sample_size,
        args.batch_size, args.identity_dim, args.save_embeddings,
        args.base_log_dir, args.validate_iter, args.validate_batch_size,
        args.gpu, args.print_every, args.max_total_steps,
        args.log_device_placement)
    graphsage_model_authors = UnsupervisedModel(
        args.train_prefix_authors, args.model_name, args.model_size,
        args.learning_rate, args.epochs, args.dropout, args.weight_decay,
        args.max_degree, args.samples_1, args.samples_2, args.dim_1,
        args.dim_2, args.random_context, args.neg_sample_size,
        args.batch_size, args.identity_dim, args.save_embeddings,
        args.base_log_dir, args.validate_iter, args.validate_batch_size,
        args.gpu, args.print_every, args.max_total_steps,
        args.log_device_placement)

    # Train the classifier if needed
    if not evaluation_model._has_persistent_model():
        print("Classifier not trained yet. Training now...")
        timer = Timer()
        timer.tic()
        evaluation_model.train(graphsage_model_citations,
                               graphsage_model_authors)
        print("Training finished.")
        timer.toc()
    else:
        evaluation_model._load_model_classifier()

    # Load test data
    print("Loading test data...")
    query_test, query_test_authors, truth = evaluation_model.load_data()
    print("Loaded.")

    # Infer embeddings for the citations graph in a separate process
    print("Inferring embeddings for the citations graph.")
    queue_citations = mp.Queue()
    process_citations = mp.Process(
        target=evaluation_model.infer_embeddings,
        args=(query_test, None, "citations", graphsage_model_citations,
              args.model_checkpoint_citations, queue_citations))
    process_citations.start()
    embeddings_citations = queue_citations.get()
    process_citations.join()
    process_citations.terminate()

    # Infer embeddings for the authors graph in a separate process
    print("Inferring embeddings for the authors graph.")
    queue_authors = mp.Queue()
    process_authors = mp.Process(
        target=evaluation_model.infer_embeddings,
        args=(query_test, query_test_authors, "authors",
              graphsage_model_authors, args.model_checkpoint_authors,
              queue_authors))
    process_authors.start()
    embeddings_authors = queue_authors.get()
    process_authors.join()
    process_authors.terminate()

    # Concatenate the embeddings from both graphs
    test_embeddings = np.concatenate(
        (embeddings_citations, embeddings_authors), axis=1)

    print("Computing predictions...")
    recommendation = evaluation_model.compute_predictions(test_embeddings)
    print("Predictions computed.")

    # Evaluate
    print("Evaluating...")
    evaluation = EvaluationContainer()
    evaluation.evaluate(recommendation, truth)
    print("Finished.")
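
# Example invocation (the script name, checkpoint names, and train prefixes
# are illustrative; the positional order matches the parser above):
#
#   python evaluate_concat.py KNN AVG_2L citations.ckpt authors.ckpt \
#       train_val_citations train_val_authors graphsage_mean --recs 10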