def parse_graph_to_hin(self, first_graph, second_graph, third_graph=None,
                       first_mapping_file='ec2compound.pkl', second_mapping_file=None,
                       hin_file='hin.pkl', ospath='objectset', display_params: bool = True):
    if display_params:
        self.__print_arguments()
        time.sleep(2)
    print('\t>> Building a multi-modal graph...')
    logger.info('\t>> Building a multi-modal graph...')
    hin = self.__compose_graphs(first_graph=first_graph, second_graph=second_graph,
                                first_adjaceny_matrix=first_mapping_file,
                                third_graph=third_graph,
                                second_adjaceny_matrix=second_mapping_file)
    if self.remove_isolates:
        desc = '\t\t--> Removing {0:d} isolated nodes from the multi-modal graph...'.format(
            len(list(nx.isolates(hin))))
        print(desc)
        logger.info(desc)
        hin.remove_nodes_from(list(nx.isolates(hin)))
    save_data(data=hin, file_name=hin_file, save_path=ospath,
              tag='heterogeneous information network', mode='w+b')
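# Illustrative sketch (not part of the library): assuming `__compose_graphs` roughly
# unions the per-modality node/edge sets and links modalities via a mapping file,
# a minimal stand-in on plain networkx with hypothetical toy graphs looks like:
#
#   import networkx as nx
#
#   ec_graph = nx.Graph([('E1', 'E2')])        # enzyme-enzyme edges
#   compound_graph = nx.Graph([('C1', 'C2')])  # compound-compound edges
#   hin_demo = nx.compose(ec_graph, compound_graph)
#   hin_demo.add_edge('E1', 'C1')              # cross-modal link from a mapping file
#   print(list(nx.isolates(hin_demo)))         # these are dropped when remove_isolates is set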
def synthesize_report(X, sample_ids, y_pred, y_dict_ids, y_common_name, component_dict,
                      labels_components, y_pred_score=None, batch_size=30, num_jobs=1,
                      rsfolder="Results", rspath="../.", dspath="../.", file_name='labels'):
    if y_pred is None:
        raise Exception("Please provide a (num_samples, num_labels) numpy matrix "
                        "representing the binary prediction status of pathways "
                        "(and, optionally, their scores).")
    num_samples = len(sample_ids)
    main_folder_path = os.path.join(rspath, rsfolder)
    list_batches = np.arange(start=0, stop=num_samples, step=batch_size)
    parallel = Parallel(n_jobs=num_jobs, verbose=0)
    # Delete the previous main folder and recreate a new one
    create_remove_dir(folder_path=main_folder_path)
    if y_pred_score is not None:
        results = parallel(delayed(__synthesize_report)(X[batch:batch + batch_size],
                                                        sample_ids[batch:batch + batch_size],
                                                        y_pred_score[batch:batch + batch_size],
                                                        y_pred[batch:batch + batch_size],
                                                        y_dict_ids, y_common_name,
                                                        component_dict, labels_components,
                                                        main_folder_path, batch_idx,
                                                        len(list_batches))
                           for batch_idx, batch in enumerate(list_batches))
    else:
        results = parallel(delayed(__synthesize_report)(X[batch:batch + batch_size],
                                                        sample_ids[batch:batch + batch_size],
                                                        y_pred_score,
                                                        y_pred[batch:batch + batch_size],
                                                        y_dict_ids, y_common_name,
                                                        component_dict, labels_components,
                                                        main_folder_path, batch_idx,
                                                        len(list_batches))
                           for batch_idx, batch in enumerate(list_batches))
    desc = '\t\t--> Synthesizing pathway reports {0:.4f}%...'.format(100)
    print(desc)
    y = list(zip(*results))
    y = [item for lst in y for item in lst]
    print('\t\t--> Storing predictions (label) to: {0:s}'.format(file_name + '_labels.pkl'))
    save_data(data=y, file_name=file_name + '_labels.pkl', save_path=dspath,
              mode="wb", print_tag=False)
    # Invert the mapping from index -> label id into label id -> column index.
    y_dict_ids = dict((y_id, y_idx) for y_idx, y_id in y_dict_ids.items())
    y_csr = np.zeros((len(y), len(y_dict_ids.keys())))
    for idx, lst in enumerate(y):
        for item in lst:
            if item in y_dict_ids:
                y_csr[idx, y_dict_ids[item]] = 1
    print('\t\t--> Storing predictions (label index) to: {0:s}'.format(file_name + '_y.pkl'))
    save_data(data=lil_matrix(y_csr), file_name=file_name + "_y.pkl", save_path=dspath,
              mode="wb", print_tag=False)
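# Illustrative sketch of the label-indicator step above (assumption: `y` is a list of
# per-sample label-id lists and the inverted dict maps label id -> column index),
# shown with toy values:
#
#   import numpy as np
#   from scipy.sparse import lil_matrix
#
#   y_demo = [['PWY-101', 'PWY-202'], ['PWY-202']]   # hypothetical per-sample labels
#   col_of = {'PWY-101': 0, 'PWY-202': 1}
#   indicator = np.zeros((len(y_demo), len(col_of)))
#   for row, labels in enumerate(y_demo):
#       for label in labels:
#           if label in col_of:
#               indicator[row, col_of[label]] = 1
#   print(lil_matrix(indicator).toarray())           # [[1. 1.] [0. 1.]]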
def generate_walks(self, constraint_type, just_type, just_memory_size, use_metapath_scheme,
                   metapath_scheme='ECTCE', burn_in_phase: int = 10,
                   burn_in_input_size: float = 0.5, hin='hin.pkl', save_file_name='hin',
                   ospath='objectset', dspath='dataset', display_params: bool = True):
    if burn_in_phase < 0:
        burn_in_phase = 1
    self.burn_in_phase = burn_in_phase
    if burn_in_input_size < 0 or burn_in_input_size > 1:
        burn_in_input_size = 0.1
    self.burn_in_input_size = burn_in_input_size
    # Load the serialized HIN graph from its pickle file.
    hin = load_data(file_name=hin, load_path=ospath,
                    tag='heterogeneous information network')
    if use_metapath_scheme:
        if metapath_scheme is not None and metapath_scheme.strip() != '':
            self.__check_metapath_validity(metapath_scheme=metapath_scheme)
            hin.metapath_scheme = metapath_scheme
        else:
            desc = '\n\t --> Error: Please provide a metapath scheme...'
            logger.warning(desc)
            raise Exception(desc)
    else:
        hin.metapath_scheme = None
        metapath_scheme = None
    if display_params:
        self.__print_arguments(
            use_metapath_scheme='Use a metapath scheme: {0}'.format(use_metapath_scheme),
            metapath_scheme='The specified metapath scheme: {0}'.format(metapath_scheme),
            constraint_type='Use node type: {0}'.format(constraint_type),
            just_type='Use JUST algorithm: {0}'.format(just_type),
            burn_in_phase='Burn in phase count: {0}'.format(self.burn_in_phase),
            burn_in_input_size='Subsampling size of the number of walks and length '
                               'for the burn in phase: {0}'.format(self.burn_in_input_size))
        time.sleep(2)
    init_node_prob, type2index, type2prob = self.__init_probability(hin)
    hin.type2index = type2index
    hin.type2prob = type2prob
    hin.trans_metapath_scheme = use_metapath_scheme
    hin.trans_constraint_type = constraint_type
    hin.trans_just_type = just_type
    hin.q = self.q
    hin.p = self.p
    hin.learning_rate = self.learning_rate
    hin.num_walks = self.num_walks
    hin.walk_length = self.walk_length
    print('\t>> Calculating initial transition probabilities...')
    logger.info('\t>> Calculating initial transition probabilities...')
    N = hin.number_of_nodes()
    trans_prob = lil_matrix((N, N))
    for curr_node, curr_node_data in hin.nodes(data=True):
        neigh_curr_node = np.array([hin.nodes[edge[1]]['mapped_idx']
                                    for edge in hin.edges(curr_node)], dtype=int)
        trans_prob[curr_node_data['mapped_idx'], neigh_curr_node] = 1
    # Row-normalize so each row is a proper transition distribution.
    trans_prob = lil_matrix(trans_prob.multiply(1 / trans_prob.sum(1)))
    print('\t>> Calculating transition probabilities...')
    logger.info('\t>> Calculating transition probabilities...')
    for burn_in_count in np.arange(start=1, stop=burn_in_phase + 1):
        desc = '\t\t## Burn in phase {0} (out of {1})...{2}'.format(
            burn_in_count, burn_in_phase, 20 * ' ')
        print(desc)
        for node_idx, node_data in enumerate(hin.nodes(data=True)):
            trans_prob = self._walks_per_node(node_idx=node_idx, node_curr=node_data[0],
                                              node_curr_data=node_data[1], hin=hin,
                                              just_memory_size=just_memory_size,
                                              trans_prob=trans_prob, burn_in_phase=True)
    node_prob = trans_prob.T.dot(init_node_prob)
    node_prob = node_prob.multiply(1 / node_prob.sum())
    hin.trans_prob = trans_prob
    for node in hin.nodes(data=True):
        attrs = {node[0]: {'weight': node_prob[node[1]['mapped_idx']]}}
        nx.set_node_attributes(hin, attrs)
    save_data(data=hin, file_name=save_file_name + '.pkl', save_path=ospath,
              tag='heterogeneous information network', mode='wb')
    print('\t>> Generating walks...')
    logger.info('\t>> Generating walks...')
    save_file_name = 'X_' + save_file_name + '.txt'
    if os.path.exists(os.path.join(dspath, save_file_name)):
        os.remove(path=os.path.join(dspath, save_file_name))
    pool = Pool(processes=self.num_jobs)
    results = [pool.apply_async(self._walks_per_node,
                                args=(node_idx, node_data[0], node_data[1], hin,
                                      just_memory_size, trans_prob, dspath,
                                      save_file_name, False))
               for node_idx, node_data in enumerate(hin.nodes(data=True))]
    output = [p.get() for p in results]
    pool.close()
    pool.join()
    desc = '\t\t## Stored generated walks to: {0}'.format(save_file_name)
    print(desc)
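# Illustrative sketch of the row-normalization used for `trans_prob` above, applied
# to a tiny hypothetical adjacency matrix:
#
#   import numpy as np
#   from scipy.sparse import lil_matrix
#
#   adj = lil_matrix((3, 3))
#   adj[0, [1, 2]] = 1
#   adj[1, 0] = 1
#   adj[2, 0] = 1
#   row_stochastic = lil_matrix(adj.multiply(1 / adj.sum(1)))
#   print(row_stochastic.toarray())  # each row now sums to 1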
def _walks_per_node(self, node_idx, node_curr, node_curr_data, hin, just_memory_size,
                    trans_prob, dspath=".", save_file_name=".", burn_in_phase=False):
    if len(list(hin.neighbors(node_curr))) == 0:
        desc = '\t\t\t--> Extracted walks for {0:.4f}% of nodes...'.format(
            ((node_idx + 1) / hin.number_of_nodes()) * 100)
        print(desc, end="\r")
        if burn_in_phase:
            return trans_prob
        else:
            return
    if hin.trans_metapath_scheme:
        metapath_scheme = None
        if node_curr_data['type'] in hin.metapath_scheme:
            # Tile the scheme (minus its repeated last type) so a walk can start
            # from any position inside the metapath.
            frequent_scheme = hin.metapath_scheme[:-1] * 2
            idx = str(frequent_scheme).index(node_curr_data['type'])
            metapath_scheme = frequent_scheme[idx:idx + len(hin.metapath_scheme) - 1]
            metapath_scheme = metapath_scheme * (self.walk_length // len(metapath_scheme))
        if metapath_scheme is None:
            if burn_in_phase:
                return trans_prob
            else:
                return
        if node_curr_data['type'] != metapath_scheme[0]:
            desc = '\t\t\t--> Extracted walks for {0:.4f}% of nodes...'.format(
                ((node_idx + 1) / hin.number_of_nodes()) * 100)
            print(desc, end="\r")
            if burn_in_phase:
                return trans_prob
            else:
                return
    walk_length = self.walk_length
    num_walks = self.num_walks + 1
    if burn_in_phase:
        num_walks = int(self.num_walks * self.burn_in_input_size)
        walk_length = int(self.walk_length * self.burn_in_input_size)
        if num_walks < 1:
            num_walks = 10
        if walk_length < 1:
            walk_length = 10
    for curr_walk in np.arange(start=1, stop=num_walks):
        X = [node_curr_data['mapped_idx']]
        prev_node = [node_curr]
        curr_node = node_curr
        curr_node_data = node_curr_data
        # Fixed-size memory holding the most recently visited node types; types are
        # single characters (e.g. 'E', 'C', 'T'), so extend pushes one type.
        q_hist = collections.deque(maxlen=just_memory_size)
        q_hist.extend(node_curr_data['type'])
        for curr_length in np.arange(start=1, stop=walk_length):
            if curr_length > 1:
                list_neigh_idx_prev_node = [hin.nodes[edge[1]]['mapped_idx']
                                            for edge in hin.edges(prev_node[-2])]
                prev_node_idx = X[-2]
            else:
                list_neigh_idx_prev_node = [hin.nodes[edge[1]]['mapped_idx']
                                            for edge in hin.edges(prev_node[-1])]
                prev_node_idx = X[-1]
            if hin.trans_metapath_scheme:
                neigh_curr_node = [(edge[1], edge[2])
                                   for edge in hin.edges(curr_node, data='weight')
                                   if hin.nodes[edge[1]]['type'] == metapath_scheme[curr_length]]
                if len(neigh_curr_node) == 0:
                    neigh_curr_node = [(edge[1], edge[2])
                                       for edge in hin.edges(curr_node, data='weight')
                                       if hin.nodes[edge[1]]['type'] == metapath_scheme[curr_length - 1]]
            else:
                neigh_curr_node = [(edge[1], edge[2])
                                   for edge in hin.edges(curr_node, data='weight')]
            list_neigh_curr_node = np.array([node[0] for node in neigh_curr_node])
            neigh_type_curr_node = np.array([hin.nodes[v]['type'] for v in list_neigh_curr_node])
            neigh_idx_curr_node = np.array([hin.nodes[node]['mapped_idx']
                                            for node in list_neigh_curr_node])
            # Retrieve weights of nodes (usually set to 1.) at the start of the burn in
            # phase; otherwise, retrieve the previous transition probabilities.
            trans_from_curr_node = trans_prob[X[-1], neigh_idx_curr_node].toarray()[0]
            if hin.trans_constraint_type or hin.trans_just_type:
                # Compute the transition probability based on the types of the current
                # node's neighbours. The probabilities are smoothed by adding EPSILON
                # to the weights of the current node, next node, and current node type.
                trans_node_type = [
                    self.__alpha(next_node=hin.nodes[next_node]['mapped_idx'],
                                 next_node_type=neigh_type_curr_node[idx],
                                 curr_node_type=curr_node_data['type'],
                                 weight_curr_node=len(list(hin.neighbors(next_node))) + EPSILON,
                                 weight_curr_node_type=sum(
                                     neigh_type_curr_node == curr_node_data['type']) + EPSILON,
                                 weight_next_node_type=sum(
                                     neigh_type_curr_node == hin.nodes[next_node]['type']) + EPSILON,
                                 explore_layer=hin.trans_just_type,
                                 constraint_type=True)
                    for idx, next_node in enumerate(list_neigh_curr_node)]
                trans_node_type = np.multiply(trans_node_type, trans_from_curr_node)
                if hin.trans_just_type and not hin.trans_metapath_scheme:
                    if len(q_hist) == just_memory_size:
                        available_types = set(q_hist)
                        for t in available_types:
                            # Explore within a layer more frequently, as suggested by
                            # JUST; however, the algorithm is modified here to explore
                            # a wider range when the memory size of q_hist exceeds the
                            # number of node types. When q == p, JUST is recovered.
                            if hin.q != hin.p:
                                weight_decay = 1 / q_hist.count(t)
                                if q_hist.count(t) == int(just_memory_size * hin.q):
                                    weight_decay = -q_hist.count(t)
                            else:
                                weight_decay = -q_hist.count(t)
                            tmp = trans_node_type[neigh_type_curr_node == t] * np.exp(weight_decay)
                            trans_node_type[neigh_type_curr_node == t] = tmp
                trans_node_type = trans_node_type / np.sum(trans_node_type)
                node_type = np.random.choice(neigh_type_curr_node, size=1, p=trans_node_type)[0]
                # Keep only the neighbours matching the chosen type.
                list_neigh_curr_node = [(edge[1], edge[2])
                                        for edge in hin.edges(curr_node, data='weight')
                                        if hin.nodes[edge[1]]['type'] == node_type]
                neigh_idx_curr_node = np.array([hin.nodes[node[0]]['mapped_idx']
                                                for node in list_neigh_curr_node])
                # Retrieve weights of nodes (usually set to 1.) at the start of the
                # burn in phase; otherwise, the previous transition probabilities.
                trans_from_curr_node = trans_prob[X[-1], neigh_idx_curr_node].toarray()[0]
                list_neigh_curr_node = np.array([node[0] for node in list_neigh_curr_node])
            # Compute the transition probability of the current node's neighbours based
            # on the chosen type, smoothing by adding EPSILON to the current node weight.
            trans_prob_next_node = [
                self.__alpha(next_node=hin.nodes[next_node]['mapped_idx'],
                             prev_node=prev_node_idx,
                             neighbours_prev_node=list_neigh_idx_prev_node,
                             weight_curr_node=len(list(hin.neighbors(next_node))) + EPSILON)
                for next_node in list_neigh_curr_node]
            trans_prob_next_node = np.multiply(trans_prob_next_node, trans_from_curr_node)
            trans_prob_next_node = trans_prob_next_node / np.sum(trans_prob_next_node)
            next_node = np.random.choice(neigh_idx_curr_node, 1, p=trans_prob_next_node)[0]
            # If the transition probability has not been computed yet, initialize it
            # with the most recent estimate; otherwise update the existing one.
            tmp = trans_prob_next_node[neigh_idx_curr_node == next_node]
            trans_prob[X[-1], next_node] = trans_prob[X[-1], next_node] + tmp * self.learning_rate
            curr_node = list_neigh_curr_node[neigh_idx_curr_node == next_node][0]
            curr_node_data = hin.nodes[curr_node]
            # Extend the walk and push the node type onto q_hist (bounded by the
            # predefined memory size).
            X = X + [next_node]
            prev_node = prev_node + [curr_node]
            q_hist.extend(hin.nodes[curr_node]['type'])
        # Save the generated walk into the .txt file
        if not burn_in_phase:
            X = '\t'.join([str(v) for v in X])
            save_data(data=X + '\n', file_name=save_file_name, save_path=dspath,
                      mode='a', w_string=True, print_tag=False)
    desc = '\t\t\t--> Extracted walks for {0:.4f}% of nodes...'.format(
        ((node_idx + 1) / hin.number_of_nodes()) * 100)
    print(desc, end="\r")
    if burn_in_phase:
        return trans_prob
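# Illustrative sketch of the JUST-style type decay above, simplified to the plain
# exp(-count) down-weighting of recently visited types (the library additionally
# modulates the decay through q and p; toy values below are hypothetical):
#
#   import collections
#   import numpy as np
#
#   q_hist = collections.deque('ECCE', maxlen=4)     # recently visited node types
#   neigh_types = np.array(['E', 'C', 'C'])
#   weights = np.ones(len(neigh_types))
#   for t in set(q_hist):
#       weights[neigh_types == t] *= np.exp(-q_hist.count(t))
#   probs = weights / weights.sum()
#   next_type = np.random.choice(neigh_types, p=probs)  # frequent types are less likely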
def __train(arg):
    # Set up the number of operations to employ
    steps = 1
    # Whether to display parameters at every operation
    display_params = True

    ##########################################################################################################
    ######################                     PREPROCESSING                           ######################
    ##########################################################################################################
    if arg.define_bags:
        print("\n{0})- Construct bags_labels centroids...".format(steps))
        steps = steps + 1
        # load a hin file
        hin = load_data(file_name=arg.hin_name, load_path=arg.mdpath,
                        tag="heterogeneous information network")
        node2idx_path2vec = dict((node[0], node[1]["mapped_idx"])
                                 for node in hin.nodes(data=True))
        # map pathway indices of vocab to path2vec pathway indices
        vocab = load_data(file_name=arg.vocab_name, load_path=arg.dspath, tag="vocabulary")
        idxvocab = np.array([idx for idx, v in vocab.items() if v in node2idx_path2vec])
        del hin
        # define pathways to bags_labels
        phi = np.load(file=os.path.join(arg.mdpath, arg.bag_phi_name))
        phi = phi[phi.files[0]]
        bags_labels = np.argsort(-phi)
        bags_labels = bags_labels[:, :arg.top_k]
        # keep as (possibly ragged) lists; MultiLabelBinarizer accepts them directly
        labels_distr_idx = [[pathway for pathway in bag if pathway in idxvocab]
                            for bag in bags_labels]
        bags_labels = preprocessing.MultiLabelBinarizer().fit_transform(labels_distr_idx)
        labels_distr_idx = [[list(idxvocab).index(label_idx) for label_idx in bag_idx]
                            for bag_idx in labels_distr_idx]
        # get trimmed phi distributions
        phi = -np.sort(-phi)
        phi = phi[:, :arg.top_k]
        # calculate the correlation matrix rho from the covariance sigma
        sigma = np.load(file=os.path.join(arg.mdpath, arg.bag_sigma_name))
        sigma = sigma[sigma.files[0]]
        sigma[sigma < 0] = EPSILON
        C = np.diag(np.sqrt(np.diag(sigma)))
        C_inv = np.linalg.inv(C)
        rho = np.dot(np.dot(C_inv, sigma), C_inv)
        # min-max scale rho into [0, 1]
        min_rho = np.min(rho)
        max_rho = np.max(rho)
        rho = rho - min_rho
        rho = rho / (max_rho - min_rho)
        # extract pathway features
        path2vec_features = np.load(file=os.path.join(arg.mdpath, arg.features_name))
        path2vec_features = path2vec_features[path2vec_features.files[0]]
        pathways_idx = np.array([node2idx_path2vec[v] for idx, v in vocab.items()
                                 if v in node2idx_path2vec])
        features = path2vec_features[pathways_idx, :]
        features = features / np.linalg.norm(features, axis=1)[:, np.newaxis]
        # get centroids of bags_labels
        C = np.dot(bags_labels, features) / np.sum(bags_labels, axis=1)[:, np.newaxis]
        C = arg.alpha * C
        # save files
        np.savez(os.path.join(arg.dspath, arg.file_name + "_exp_phi_trim.npz"), phi)
        np.savez(os.path.join(arg.dspath, arg.file_name + "_rho.npz"), rho)
        np.savez(os.path.join(arg.dspath, arg.file_name + "_features.npz"), features)
        np.savez(os.path.join(arg.dspath, arg.file_name + "_bag_centroid.npz"), C)
        save_data(data=bags_labels, file_name=arg.file_name + "_bag_pathway.pkl",
                  save_path=arg.dspath, tag="bags_labels with associated pathways", mode="wb")
        save_data(data=idxvocab, file_name=arg.file_name + "_idxvocab.pkl",
                  save_path=arg.dspath, tag="pathway ids to pathway features ids", mode="wb")
        save_data(data=labels_distr_idx, file_name=arg.file_name + "_labels_distr_idx.pkl",
                  save_path=arg.dspath,
                  tag="bags labels batch_idx with associated pathways", mode="wb")
        print("\t>> Done...")

    if arg.recover_max_bags:
        print("\n{0})- Recover maximum expected bags_labels...".format(steps))
        steps = steps + 1
        # load files
        features = np.load(file=os.path.join(arg.dspath, arg.file_name + "_features.npz"))
        features = features[features.files[0]]
        C = np.load(file=os.path.join(arg.dspath, arg.file_name + "_bag_centroid.npz"))
        C = C[C.files[0]]
        bags_labels = load_data(file_name=arg.file_name + "_bag_pathway.pkl",
                                load_path=arg.dspath,
                                tag="bags_labels with associated pathways")
        idxvocab = load_data(file_name=arg.file_name + "_idxvocab.pkl",
                             load_path=arg.dspath,
                             tag="pathway ids to pathway features ids")
        y = load_data(file_name=arg.y_name, load_path=arg.dspath, tag="y")
        y_Bag = np.zeros((y.shape[0], C.shape[0]), dtype=int)
        for s_idx, sample in enumerate(y):
            desc = "\t>> Recovering maximum number of bags_labels: {0:.2f}%...".format(
                ((s_idx + 1) / y.shape[0]) * 100)
            if (s_idx + 1) != y.shape[0]:
                print(desc, end="\r")
            if (s_idx + 1) == y.shape[0]:
                print(desc)
            pathways = np.zeros((len(list(idxvocab)),), dtype=int)
            for ptwy_idx in sample.rows[0]:
                if ptwy_idx in idxvocab:
                    pathways[list(idxvocab).index(ptwy_idx)] = 1
            pathways = np.diag(pathways)
            # Mask the features to this sample's pathways without clobbering the
            # global `features` used by later iterations.
            sample_features = pathways @ features
            sample_bag_features = np.dot(bags_labels, sample_features) / np.sum(
                bags_labels, axis=1)[:, np.newaxis]
            sample_bag_features = arg.alpha * sample_bag_features
            np.nan_to_num(sample_bag_features, copy=False)
            cos = cosine_distances(C, sample_bag_features) / 2
            cos = np.diag(cos)
            B_idx = np.argwhere(cos > arg.v_cos)
            B_idx = B_idx.reshape((B_idx.shape[0],))
            y_Bag[s_idx, B_idx] = 1
        # save dataset with maximum bags_labels
        save_data(data=lil_matrix(y_Bag), file_name=arg.file_name + "_B.pkl",
                  save_path=arg.dspath, mode="wb", tag="bags to labels data")
        print("\t>> Done...")

    ##########################################################################################################
    ######################                         TRAIN                               ######################
    ##########################################################################################################
    if arg.train:
        print("\n{0})- Training {1} dataset using reMap model...".format(steps, arg.y_name))
        steps = steps + 1
        # load files
        print("\t>> Loading files...")
        y_Bag = load_data(file_name=arg.yB_name, load_path=arg.dspath, tag="B")
        # set bags randomly
        if arg.random_allocation:
            num_samples = y_Bag.shape[0]
            y_Bag = y_Bag.toarray()
            for bag_idx in np.arange(y_Bag.shape[1]):
                if np.sum(y_Bag[:, bag_idx]) == num_samples:
                    y_Bag[:, bag_idx] = np.random.binomial(1, arg.theta_bern, num_samples)
            y_Bag[y_Bag == 0] = -1
            y_Bag = lil_matrix(y_Bag)
            # save dataset with maximum bags_labels
            save_data(data=lil_matrix(y_Bag), file_name=arg.model_name + "_B.pkl",
                      save_path=arg.dspath, mode="wb", tag="bags to labels data")
        else:
            features = np.load(file=os.path.join(arg.dspath, arg.features_name))
            features = features[features.files[0]]
            C = np.load(file=os.path.join(arg.dspath, arg.bag_centroid_name))
            C = C[C.files[0]]
            rho = np.load(file=os.path.join(arg.dspath, arg.rho_name))
            rho = rho[rho.files[0]]
            bags_labels = load_data(file_name=arg.bags_labels, load_path=arg.dspath,
                                    tag="bags_labels with associated pathways")
            X = load_data(file_name=arg.X_name, load_path=arg.dspath, tag="X")
            y = load_data(file_name=arg.y_name, load_path=arg.dspath, tag="y")
            model = reMap(alpha=arg.alpha,
                          binarize_input_feature=arg.binarize_input_feature,
                          fit_intercept=arg.fit_intercept,
                          decision_threshold=arg.decision_threshold,
                          learning_type=arg.learning_type, lr=arg.lr, lr0=arg.lr0,
                          forgetting_rate=arg.forgetting_rate,
                          delay_factor=arg.delay_factor,
                          max_sampling=arg.max_sampling,
                          subsample_input_size=arg.ssample_input_size,
                          subsample_labels_size=arg.ssample_label_size,
                          cost_subsample_size=arg.calc_subsample_size,
                          min_bags=arg.min_bags, max_bags=arg.max_bags,
                          score_strategy=arg.score_strategy,
                          loss_threshold=arg.loss_threshold,
                          early_stop=arg.early_stop, pi=arg.pi,
                          calc_bag_cost=arg.calc_bag_cost,
                          calc_label_cost=arg.calc_label_cost,
                          calc_total_cost=arg.calc_total_cost,
                          varomega=arg.varomega, varrho=arg.varrho,
                          min_negatives_ratio=arg.min_negatives_ratio,
                          lambdas=arg.lambdas, label_bag_sim=arg.label_bag_sim,
                          label_closeness_sim=arg.label_closeness_sim,
                          corr_bag_sim=arg.corr_bag_sim,
                          corr_label_sim=arg.corr_label_sim,
                          corr_input_sim=arg.corr_input_sim, batch=arg.batch,
                          num_epochs=arg.num_epochs, num_jobs=arg.num_jobs,
                          display_interval=arg.display_interval,
                          shuffle=arg.shuffle, random_state=arg.random_state,
                          log_path=arg.logpath)
            model.fit(X=X, y=y, y_Bag=y_Bag, bags_labels=bags_labels,
                      bags_correlation=rho, label_features=features, centroids=C,
                      model_name=arg.model_name, model_path=arg.mdpath,
                      result_path=arg.rspath, snapshot_history=arg.snapshot_history,
                      display_params=display_params)

    ##########################################################################################################
    ######################                       TRANSFORM                             ######################
    ##########################################################################################################
    if arg.transform:
        print("\n{0})- Predicting dataset using a pre-trained reMap model...".format(steps))
        # load files
        print("\t>> Loading files...")
        features = np.load(file=os.path.join(arg.dspath, arg.features_name))
        features = features[features.files[0]]
        C = np.load(file=os.path.join(arg.dspath, arg.bag_centroid_name))
        C = C[C.files[0]]
        rho = np.load(file=os.path.join(arg.dspath, arg.rho_name))
        rho = rho[rho.files[0]]
        bags_labels = load_data(file_name=arg.bags_labels, load_path=arg.dspath,
                                tag="bags_labels with associated pathways")
        # load data
        X = load_data(file_name=arg.X_name, load_path=arg.dspath, tag="X")
        y = load_data(file_name=arg.y_name, load_path=arg.dspath, tag="y")
        model = load_data(file_name=arg.model_name + ".pkl", load_path=arg.mdpath,
                          tag="reMap model")
        print("\t>> Predict bags...")
        y_Bag = model.transform(X=X, y=y, bags_labels=bags_labels,
                                bags_correlation=rho, label_features=features,
                                centroids=C,
                                subsample_labels_size=arg.ssample_label_size,
                                max_sampling=arg.max_sampling,
                                snapshot_history=arg.snapshot_history,
                                decision_threshold=arg.decision_threshold,
                                batch_size=arg.batch, num_jobs=arg.num_jobs,
                                file_name=arg.file_name, result_path=arg.rspath)
        # save dataset with maximum bags_labels
        save_data(data=lil_matrix(y_Bag), file_name=arg.file_name + "_B.pkl",
                  save_path=arg.dspath, mode="wb", tag="bags to labels data")
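# Illustrative sketch of the covariance-to-correlation step used when defining bag
# correlations above (rho = D^{-1/2} sigma D^{-1/2}, then min-max scaled into [0, 1]),
# shown with a hypothetical 2x2 covariance:
#
#   import numpy as np
#
#   sigma = np.array([[2.0, 0.8],
#                     [0.8, 1.0]])
#   D_inv = np.linalg.inv(np.diag(np.sqrt(np.diag(sigma))))
#   rho = D_inv @ sigma @ D_inv                        # unit-diagonal correlation matrix
#   rho = (rho - rho.min()) / (rho.max() - rho.min())  # rescaled into [0, 1]
#   print(rho)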
def fit(self, X, M=None, features=None, model_name='cbt', model_path="../../model",
        result_path=".", display_params: bool = True):
    if X is None:
        raise Exception("Please provide a dataset.")
    assert X.shape[1] == self.num_features
    X = self.__check_non_neg_array(X, "SparseCorrelatedBagPathway.fit")
    if not self.collapse2ctm:
        if features is not None:
            assert X.shape[1] == features.shape[0]
        else:
            features = np.ones((self.num_features, 20))
        features = features / np.linalg.norm(features, axis=1)[:, np.newaxis]
    # collect properties from data
    self.__init_latent_variables()
    num_samples = int(X.shape[0] * self.subsample_input_size)
    list_batches = np.arange(start=0, stop=num_samples, step=self.batch)
    if display_params:
        self.__print_arguments()
        time.sleep(2)
    if not self.collapse2ctm:
        if M is not None:
            assert M.shape == X.shape
            omega = M + self.xi_vec
        else:
            omega = np.zeros((X.shape[0], self.num_features)) + self.xi_vec
        omega = omega / np.sum(omega, axis=1)[:, np.newaxis]
    cost_file_name = model_name + "_cost.txt"
    save_data('', file_name=cost_file_name, save_path=result_path, mode='w',
              w_string=True, print_tag=False)
    print('\t>> Training by SOAP model...')
    logger.info('\t>> Training by SOAP model...')
    n_epochs = self.num_epochs + 1
    old_bound = np.inf
    timeref = time.time()
    for epoch in np.arange(start=1, stop=n_epochs):
        desc = '\t {0:d})- Epoch count ({0:d}/{1:d})...'.format(epoch, n_epochs - 1)
        print(desc)
        logger.info(desc)
        # Decaying learning rate: (epoch + delay)^(-forgetting_rate)
        learning_rate = np.power((epoch + self.delay_factor), -self.forgetting_rate)
        # Subsample the dataset
        idx = np.random.choice(X.shape[0], num_samples, False)
        start_epoch = time.time()
        # E-step
        if not self.collapse2ctm:
            sstats, tmp = self.__batch_e_step(X=X[idx, :], omega=omega[idx, :],
                                              features=features, list_batches=list_batches)
        else:
            sstats, tmp = self.__batch_e_step(X=X[idx, :], omega=None,
                                              features=features, list_batches=list_batches)
        del tmp
        # M-step
        self.__m_step(sstats=sstats, learning_rate=learning_rate, num_samples=num_samples)
        end_epoch = time.time()
        self.is_fit = True
        # Compute the approximate bound
        if not self.collapse2ctm:
            new_bound = self.perplexity(X=X[idx, :], M=omega[idx, :],
                                        features=features, sstats=sstats)
        else:
            new_bound = self.perplexity(X=X[idx, :], M=M, features=features, sstats=sstats)
        print('\t\t## Epoch {0} took {1} seconds...'.format(
            epoch, round(end_epoch - start_epoch, 3)))
        logger.info('\t\t## Epoch {0} took {1} seconds...'.format(
            epoch, round(end_epoch - start_epoch, 3)))
        data = str(epoch) + '\t' + str(round(end_epoch - start_epoch, 3)) + '\t' + str(new_bound) + '\n'
        save_data(data=data, file_name=cost_file_name, save_path=result_path, mode='a',
                  w_string=True, print_tag=False)
        # Save model parameters based on the display interval
        if (epoch % self.display_interval) == 0 or epoch == 1 or epoch == n_epochs - 1:
            print('\t\t --> New cost: {0:.4f}; Old cost: {1:.4f}'.format(new_bound, old_bound))
            logger.info('\t\t --> New cost: {0:.4f}; Old cost: {1:.4f}'.format(new_bound, old_bound))
            if new_bound <= old_bound or epoch == n_epochs - 1:
                phi_file_name = model_name + '_exp_phi.npz'
                sigma_file_name = model_name + '_sigma.npz'
                mu_file_name = model_name + '_mu.npz'
                model_file_name = model_name + '.pkl'
                if epoch == n_epochs - 1:
                    phi_file_name = model_name + '_exp_phi_final.npz'
                    sigma_file_name = model_name + '_sigma_final.npz'
                    mu_file_name = model_name + '_mu_final.npz'
                    model_file_name = model_name + '_final.pkl'
                print('\t\t --> Storing the SOAP phi to: {0:s}'.format(phi_file_name))
                logger.info('\t\t --> Storing the SOAP phi to: {0:s}'.format(phi_file_name))
                np.savez(os.path.join(model_path, phi_file_name), self.phi)
                print('\t\t --> Storing the SOAP sigma to: {0:s}'.format(sigma_file_name))
                logger.info('\t\t --> Storing the SOAP sigma to: {0:s}'.format(sigma_file_name))
                np.savez(os.path.join(model_path, sigma_file_name), self.sigma)
                print('\t\t --> Storing the SOAP mu to: {0:s}'.format(mu_file_name))
                logger.info('\t\t --> Storing the SOAP mu to: {0:s}'.format(mu_file_name))
                np.savez(os.path.join(model_path, mu_file_name), self.mu)
                print('\t\t --> Storing the SOAP model to: {0:s}'.format(model_file_name))
                logger.info('\t\t --> Storing the SOAP model to: {0:s}'.format(model_file_name))
                save_data(data=copy.copy(self), file_name=model_file_name,
                          save_path=model_path, mode="wb", print_tag=False)
                old_bound = new_bound
    print('\t --> Training consumed %.2f minutes' % (round((time.time() - timeref) / 60., 3)))
    logger.info('\t --> Training consumed %.2f minutes' % (round((time.time() - timeref) / 60., 3)))
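# Illustrative sketch of the decaying learning-rate schedule used in fit():
# lr_t = (t + delay_factor)^(-forgetting_rate), the standard stochastic
# variational-inference step size (the constants below are hypothetical):
#
#   import numpy as np
#
#   delay_factor, forgetting_rate = 1.0, 0.9
#   for epoch in range(1, 6):
#       lr = np.power(epoch + delay_factor, -forgetting_rate)
#       print(epoch, round(float(lr), 4))  # monotonically decreasing step sizes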
def fit(self, X, model_name='CTM', model_path="../../model", result_path=".",
        display_params: bool = True):
    if X is None:
        raise Exception("Please provide a dataset.")
    assert X.shape[1] == self.num_features
    X = self.__check_non_neg_array(X, "CorrelatedTopicModel.fit")
    # collect properties from data
    self.__init_latent_variables()
    num_samples = int(X.shape[0] * self.subsample_input_size)
    list_batches = np.arange(start=0, stop=num_samples, step=self.batch)
    if display_params:
        self.__print_arguments()
        time.sleep(2)
    cost_file_name = model_name + "_cost.txt"
    save_data('', file_name=cost_file_name, save_path=result_path, mode='w',
              w_string=True, print_tag=False)
    print('\t>> Training by CTM model...')
    logger.info('\t>> Training by CTM model...')
    n_epochs = self.num_epochs + 1
    old_bound = np.inf
    timeref = time.time()
    for epoch in np.arange(start=1, stop=n_epochs):
        desc = '\t {0:d})- Epoch count ({0:d}/{1:d})...'.format(epoch, n_epochs - 1)
        print(desc)
        logger.info(desc)
        learning_rate = np.power((epoch + self.delay_factor), -self.forgetting_rate)
        # Subsample the dataset
        idx = np.random.choice(X.shape[0], num_samples, False)
        start_epoch = time.time()
        # E-step
        sstats, tmp = self.__batch_e_step(X=X[idx, :], list_batches=list_batches)
        del tmp
        # M-step
        self.__m_step(sstats=sstats, learning_rate=learning_rate, num_samples=num_samples)
        end_epoch = time.time()
        self.is_fit = True
        # Compute the predictive perplexity
        new_bound = self.perplexity(X=X[idx, :], sstats=sstats["phi_sstats"])
        print('\t\t## Epoch {0} took {1} seconds...'.format(
            epoch, round(end_epoch - start_epoch, 3)))
        logger.info('\t\t## Epoch {0} took {1} seconds...'.format(
            epoch, round(end_epoch - start_epoch, 3)))
        data = str(epoch) + '\t' + str(round(end_epoch - start_epoch, 3)) + '\t' + str(new_bound) + '\n'
        save_data(data=data, file_name=cost_file_name, save_path=result_path, mode='a',
                  w_string=True, print_tag=False)
        # Save model parameters based on the display interval
        if (epoch % self.display_interval) == 0 or epoch == 1 or epoch == n_epochs - 1:
            print('\t\t --> New cost: {0:.4f}; Old cost: {1:.4f}'.format(new_bound, old_bound))
            logger.info('\t\t --> New cost: {0:.4f}; Old cost: {1:.4f}'.format(new_bound, old_bound))
            if new_bound <= old_bound or epoch == n_epochs - 1:
                omega_file_name = model_name + '_exp_omega.npz'
                sigma_file_name = model_name + '_sigma.npz'
                mu_file_name = model_name + '_mu.npz'
                model_file_name = model_name + '.pkl'
                if epoch == n_epochs - 1:
                    omega_file_name = model_name + '_exp_omega_final.npz'
                    sigma_file_name = model_name + '_sigma_final.npz'
                    mu_file_name = model_name + '_mu_final.npz'
                    model_file_name = model_name + '_final.pkl'
                print('\t\t --> Storing the CTM omega to: {0:s}'.format(omega_file_name))
                logger.info('\t\t --> Storing the CTM omega to: {0:s}'.format(omega_file_name))
                np.savez(os.path.join(model_path, omega_file_name), self.omega)
                print('\t\t --> Storing the CTM sigma to: {0:s}'.format(sigma_file_name))
                logger.info('\t\t --> Storing the CTM sigma to: {0:s}'.format(sigma_file_name))
                np.savez(os.path.join(model_path, sigma_file_name), self.sigma)
                print('\t\t --> Storing the CTM mu to: {0:s}'.format(mu_file_name))
                logger.info('\t\t --> Storing the CTM mu to: {0:s}'.format(mu_file_name))
                np.savez(os.path.join(model_path, mu_file_name), self.mu)
                print('\t\t --> Storing the CTM model to: {0:s}'.format(model_file_name))
                logger.info('\t\t --> Storing the CTM model to: {0:s}'.format(model_file_name))
                save_data(data=copy.copy(self), file_name=model_file_name,
                          save_path=model_path, mode="wb", print_tag=False)
                old_bound = new_bound
    print('\t --> Training consumed %.2f minutes' % (round((time.time() - timeref) / 60., 3)))
    logger.info('\t --> Training consumed %.2f minutes' % (round((time.time() - timeref) / 60., 3)))
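# Illustrative sketch of the checkpointing policy shared by the SOAP and CTM training
# loops above: snapshot parameters whenever the bound improves, and always on the
# final epoch (the bound values below are toy stand-ins):
#
#   best = float('inf')
#   bounds = [10.2, 9.1, 9.5, 8.7]
#   for epoch, bound in enumerate(bounds, start=1):
#       final = epoch == len(bounds)
#       if bound <= best or final:
#           suffix = '_final' if final else ''
#           print('checkpointing model{0}.pkl at epoch {1}'.format(suffix, epoch))
#           best = bound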
def __train(arg):
    # Set up the number of operations to employ
    steps = 1
    # Whether to display parameters at every operation
    display_params = True

    if arg.preprocess_dataset:
        print('\n{0})- Preprocessing dataset...'.format(steps))
        steps = steps + 1
        print('\t>> Loading files...')
        # load a biocyc file
        data_object = load_data(file_name=arg.object_name, load_path=arg.ospath,
                                tag='the biocyc object')
        # extract pathway and ec ids
        pathway_dict = data_object["pathway_id"]
        ec_dict = data_object["ec_id"]
        del data_object
        # load a hin file
        hin = load_data(file_name=arg.hin_name, load_path=arg.ospath,
                        tag='heterogeneous information network')
        # get path2vec mapping
        node2idx_path2vec = dict((node[0], node[1]['mapped_idx'])
                                 for node in hin.nodes(data=True))
        # get pathway2ec mapping
        node2idx_pathway2ec = [node[0] for node in hin.nodes(data=True)]
        Adj = nx.adjacency_matrix(G=hin)
        del hin
        # load pathway2ec mapping matrix
        pathway2ec_idx = load_data(file_name=arg.pathway2ec_idx_name, load_path=arg.ospath)
        path2vec_features = np.load(file=os.path.join(arg.mdpath, arg.features_name))
        # extract pathway and ec features
        labels_components = load_data(file_name=arg.pathway2ec_name,
                                      load_path=arg.ospath, tag='M')
        path2vec_features = path2vec_features[path2vec_features.files[0]]
        pathways_idx = np.array([node2idx_path2vec[v] for v, idx in pathway_dict.items()
                                 if v in node2idx_path2vec])
        P = path2vec_features[pathways_idx, :]
        tmp = [idx for v, idx in ec_dict.items() if v in node2idx_pathway2ec]
        ec_idx = np.array([idx for idx in tmp
                           if len(np.where(pathway2ec_idx == idx)[0]) > 0])
        E = path2vec_features[ec_idx, :]
        # constrain the feature space to [0, 1] to avoid negative values
        min_rho = np.min(P)
        max_rho = np.max(P)
        P = (P - min_rho) / (max_rho - min_rho)
        P = P / np.linalg.norm(P, axis=1)[:, np.newaxis]
        min_rho = np.min(E)
        max_rho = np.max(E)
        E = (E - min_rho) / (max_rho - min_rho)
        E = E / np.linalg.norm(E, axis=1)[:, np.newaxis]
        # build the A and B matrices
        Adj.setdiag(0)
        A = Adj[pathways_idx[:, None], pathways_idx]
        A = A / A.sum(1)
        A = np.nan_to_num(A)
        B = Adj[ec_idx[:, None], ec_idx]
        B = B / B.sum(1)
        B = np.nan_to_num(B)
        # train size
        if arg.ssample_input_size < 1:
            # add white noise to M
            train_size = labels_components.shape[0] * arg.ssample_input_size
            idx = np.random.choice(a=np.arange(labels_components.shape[0]),
                                   size=int(train_size), replace=False)
            labels_components = labels_components.toarray()
            labels_components[idx] = np.zeros((idx.shape[0], labels_components.shape[1]))
        if arg.white_links:
            if arg.ssample_input_size < 1:
                # add white noise to A
                train_size = A.shape[0] * arg.ssample_input_size
                idx = np.random.choice(a=np.arange(A.shape[0]), size=int(train_size),
                                       replace=False)
                A = lil_matrix(A).toarray()
                tmp = np.zeros((idx.shape[0], A.shape[0]))
                A[idx] = tmp
                A[:, idx] = tmp.T
                # add white noise to B
                train_size = B.shape[0] * arg.ssample_input_size
                idx = np.random.choice(a=np.arange(B.shape[0]), size=int(train_size),
                                       replace=False)
                B = lil_matrix(B).toarray()
                tmp = np.zeros((idx.shape[0], B.shape[0]))
                B[idx] = tmp
                B[:, idx] = tmp.T
        # save files
        print('\t>> Saving files...')
        save_data(data=lil_matrix(labels_components), file_name=arg.M_name,
                  save_path=arg.dspath, tag="M", mode="wb")
        save_data(data=lil_matrix(P), file_name=arg.P_name, save_path=arg.dspath,
                  tag="P", mode="wb")
        save_data(data=lil_matrix(E), file_name=arg.E_name, save_path=arg.dspath,
                  tag="E", mode="wb")
        save_data(data=lil_matrix(A), file_name=arg.A_name, save_path=arg.dspath,
                  tag="A", mode="wb")
        save_data(data=lil_matrix(B), file_name=arg.B_name, save_path=arg.dspath,
                  tag="B", mode="wb")
        print('\t>> Done...')

    ##########################################################################################################
    ######################                 TRAIN USING triUMPF                         ######################
    ##########################################################################################################
    if arg.train:
        print('\n{0})- Training {1} dataset using triUMPF model...'.format(steps, arg.y_name))
        steps = steps + 1
        # load files
        print('\t>> Loading files...')
        labels_components, W, H, P, E, A, B, X, y = (None,) * 9
        if arg.no_decomposition:
            W = load_data(file_name=arg.W_name, load_path=arg.mdpath, tag='W')
            H = load_data(file_name=arg.H_name, load_path=arg.mdpath, tag='H')
        else:
            labels_components = load_data(file_name=arg.M_name, load_path=arg.dspath, tag='M')
            if arg.fit_features:
                P = load_data(file_name=arg.P_name, load_path=arg.dspath, tag='P')
                E = load_data(file_name=arg.E_name, load_path=arg.dspath, tag='E')
        if arg.fit_comm:
            if not arg.fit_features:
                P = load_data(file_name=arg.P_name, load_path=arg.dspath, tag='P')
                E = load_data(file_name=arg.E_name, load_path=arg.dspath, tag='E')
            X = load_data(file_name=arg.X_name, load_path=arg.dspath, tag='X')
            y = load_data(file_name=arg.y_name, load_path=arg.dspath, tag='y')
            A = load_data(file_name=arg.A_name, load_path=arg.dspath, tag='A')
            B = load_data(file_name=arg.B_name, load_path=arg.dspath, tag='B')
        model = triUMPF(num_components=arg.num_components,
                        num_communities_p=arg.num_communities_p,
                        num_communities_e=arg.num_communities_e,
                        proxy_order_p=arg.proxy_order_p,
                        proxy_order_e=arg.proxy_order_e,
                        mu_omega=arg.mu_omega, mu_gamma=arg.mu_gamma,
                        fit_features=arg.fit_features, fit_comm=arg.fit_comm,
                        fit_pure_comm=arg.fit_pure_comm,
                        normalize_input_feature=arg.normalize_input_feature,
                        binarize_input_feature=arg.binarize_input_feature,
                        use_external_features=arg.use_external_features,
                        cutting_point=arg.cutting_point,
                        fit_intercept=arg.fit_intercept, alpha=arg.alpha,
                        beta=arg.beta, rho=arg.rho, lambdas=arg.lambdas,
                        eps=arg.eps, early_stop=arg.early_stop,
                        penalty=arg.penalty, alpha_elastic=arg.alpha_elastic,
                        l1_ratio=arg.l1_ratio, loss_threshold=arg.loss_threshold,
                        decision_threshold=arg.decision_threshold,
                        subsample_input_size=arg.ssample_input_size,
                        subsample_labels_size=arg.ssample_label_size,
                        learning_type=arg.learning_type, lr=arg.lr, lr0=arg.lr0,
                        delay_factor=arg.delay_factor,
                        forgetting_rate=arg.forgetting_rate, batch=arg.batch,
                        max_inner_iter=arg.max_inner_iter,
                        num_epochs=arg.num_epochs, num_jobs=arg.num_jobs,
                        display_interval=arg.display_interval,
                        shuffle=arg.shuffle, random_state=arg.random_state,
                        log_path=arg.logpath)
        model.fit(M=labels_components, W=W, H=H, X=X, y=y, P=P, E=E, A=A, B=B,
                  model_name=arg.model_name, model_path=arg.mdpath,
                  result_path=arg.rspath, display_params=display_params)

    ##########################################################################################################
    ######################                PREDICT USING triUMPF                        ######################
    ##########################################################################################################
    if arg.predict:
        print('\n{0})- Predicting using a pre-trained triUMPF model...'.format(steps))
        if arg.pathway_report:
            print('\t>> Loading biocyc object...')
            # load a biocyc file
            data_object = load_data(file_name=arg.object_name, load_path=arg.ospath,
                                    tag='the biocyc object', print_tag=False)
            pathway_dict = data_object["pathway_id"]
            pathway_common_names = dict((pidx, data_object['processed_kb']['metacyc'][5][pid][0][1])
                                        for pid, pidx in pathway_dict.items()
                                        if pid in data_object['processed_kb']['metacyc'][5])
            ec_dict = data_object['ec_id']
            del data_object
            pathway_dict = dict((idx, id) for id, idx in pathway_dict.items())
            ec_dict = dict((idx, id) for id, idx in ec_dict.items())
            labels_components = load_data(file_name=arg.pathway2ec_name,
                                          load_path=arg.ospath, tag='M')
            print('\t>> Loading label to component mapping file object...')
            pathway2ec_idx = load_data(file_name=arg.pathway2ec_idx_name,
                                       load_path=arg.ospath, print_tag=False)
            pathway2ec_idx = list(pathway2ec_idx)
            tmp = list(ec_dict.keys())
            ec_dict = dict((idx, ec_dict[tmp.index(ec)])
                           for idx, ec in enumerate(pathway2ec_idx))
            if arg.extract_pf:
                X, sample_ids = parse_files(ec_dict=ec_dict, input_folder=arg.dsfolder,
                                            rsfolder=arg.rsfolder, rspath=arg.rspath,
                                            num_jobs=arg.num_jobs)
                print('\t>> Storing X and sample_ids...')
                save_data(data=X, file_name=arg.file_name + '_X.pkl',
                          save_path=arg.dspath, tag='the pf dataset (X)',
                          mode='w+b', print_tag=False)
                save_data(data=sample_ids, file_name=arg.file_name + '_ids.pkl',
                          save_path=arg.dspath, tag='samples ids',
                          mode='w+b', print_tag=False)
            if arg.build_features:
                # load a hin file
                print('\t>> Loading heterogeneous information network file...')
                hin = load_data(file_name=arg.hin_name, load_path=arg.ospath,
                                tag='heterogeneous information network', print_tag=False)
                # get pathway2ec mapping
                node2idx_pathway2ec = [node[0] for node in hin.nodes(data=True)]
                del hin
                print('\t>> Loading path2vec_features file...')
                path2vec_features = np.load(file=os.path.join(arg.mdpath, arg.features_name))
                __build_features(X=X, pathwat_dict=pathway_dict, ec_dict=ec_dict,
                                 labels_components=labels_components,
                                 node2idx_pathway2ec=node2idx_pathway2ec,
                                 path2vec_features=path2vec_features,
                                 file_name=arg.file_name, dspath=arg.dspath,
                                 batch_size=arg.batch, num_jobs=arg.num_jobs)
        # load files
        print('\t>> Loading necessary files...')
        X = load_data(file_name=arg.X_name, load_path=arg.dspath, tag="X")
        sample_ids = np.arange(X.shape[0])
        if arg.samples_ids in os.listdir(arg.dspath):
            sample_ids = load_data(file_name=arg.samples_ids, load_path=arg.dspath,
                                   tag="samples ids")
        # load the model
        model = load_data(file_name=arg.model_name + '.pkl', load_path=arg.mdpath,
                          tag='triUMPF model')
        # binary label predictions
        y_pred = model.predict(X=X.toarray(), estimate_prob=False,
                               apply_t_criterion=arg.apply_tcriterion,
                               adaptive_beta=arg.adaptive_beta,
                               decision_threshold=arg.decision_threshold,
                               top_k=arg.top_k, batch_size=arg.batch,
                               num_jobs=arg.num_jobs)
        # label prediction scores
        y_pred_score = model.predict(X=X.toarray(), estimate_prob=True,
                                     apply_t_criterion=arg.apply_tcriterion,
                                     adaptive_beta=arg.adaptive_beta,
                                     decision_threshold=arg.decision_threshold,
                                     top_k=arg.top_k, batch_size=arg.batch,
                                     num_jobs=arg.num_jobs)
        if arg.pathway_report:
            print('\t>> Synthesizing pathway reports...')
            synthesize_report(X=X[:, :arg.cutting_point], sample_ids=sample_ids,
                              y_pred=y_pred, y_dict_ids=pathway_dict,
                              y_common_name=pathway_common_names,
                              component_dict=ec_dict,
                              labels_components=labels_components,
                              y_pred_score=y_pred_score, batch_size=arg.batch,
                              num_jobs=arg.num_jobs, rsfolder=arg.rsfolder,
                              rspath=arg.rspath, dspath=arg.dspath,
                              file_name=arg.file_name + '_triumpf')
        else:
            print('\t>> Storing predictions (label index) to: {0:s}'.format(
                arg.file_name + '_triumpf_y.pkl'))
            save_data(data=y_pred, file_name=arg.file_name + "_triumpf_y.pkl",
                      save_path=arg.dspath, mode="wb", print_tag=False)
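# Illustrative sketch of the embedding normalization used when building P and E above:
# min-max scale into [0, 1], then L2-normalize each row (toy embeddings):
#
#   import numpy as np
#
#   emb = np.array([[-2.0, 4.0],
#                   [ 1.0, 3.0]])
#   emb = (emb - emb.min()) / (emb.max() - emb.min())       # entries now in [0, 1]
#   emb = emb / np.linalg.norm(emb, axis=1)[:, np.newaxis]  # unit-length rows
#   print(emb)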
def __build_features(X, pathwat_dict, ec_dict, labels_components, node2idx_pathway2ec,
                     path2vec_features, file_name, dspath, batch_size=100, num_jobs=1):
    tmp = lil_matrix.copy(X)
    print('\t>> Building abundance and coverage features...')
    list_batches = np.arange(start=0, stop=tmp.shape[0], step=batch_size)
    total_progress = len(list_batches) * len(pathwat_dict.keys())
    parallel = Parallel(n_jobs=num_jobs, verbose=0)
    results = parallel(delayed(compute_abd_cov)(tmp[batch:batch + batch_size],
                                                labels_components, pathwat_dict,
                                                None, batch_idx, total_progress)
                       for batch_idx, batch in enumerate(list_batches))
    desc = '\t\t--> Building {0:.4f}%...'.format(100)
    print(desc)
    abd, cov = zip(*results)
    abd = np.vstack(abd)
    cov = np.vstack(cov)
    del results
    abd = preprocessing.normalize(abd)
    print('\t>> Using pathway2vec EC features...')
    path2vec_features = path2vec_features[path2vec_features.files[0]]
    path2vec_features = path2vec_features / np.linalg.norm(path2vec_features, axis=1)[:, np.newaxis]
    ec_feature_idx = [idx for idx, v in ec_dict.items() if v in node2idx_pathway2ec]
    path2vec_features = path2vec_features[ec_feature_idx, :]
    # Per-sample EC feature: the abundance-weighted mean of that sample's EC embeddings.
    ec_features = [np.mean(path2vec_features[row.rows[0]] * np.array(row.data[0])[:, None], axis=0)
                   for idx, row in enumerate(X)]
    save_data(data=lil_matrix(ec_features), file_name=file_name + "_Xp.pkl", save_path=dspath,
              mode="wb", tag="transformed instances to ec features")
    X = lil_matrix(hstack((tmp, ec_features)))
    save_data(data=X, file_name=file_name + "_Xe.pkl", save_path=dspath, mode="wb",
              tag="concatenated ec features with instances")
    X = lil_matrix(hstack((tmp, abd)))
    save_data(data=X, file_name=file_name + "_Xa.pkl", save_path=dspath, mode="wb",
              tag="concatenated abundance features with instances")
    X = lil_matrix(hstack((tmp, cov)))
    save_data(data=X, file_name=file_name + "_Xc.pkl", save_path=dspath, mode="wb",
              tag="concatenated coverage features with instances")
    X = lil_matrix(hstack((tmp, ec_features)))
    X = lil_matrix(hstack((X, abd)))
    save_data(data=X, file_name=file_name + "_Xea.pkl", save_path=dspath, mode="wb",
              tag="concatenated ec and abundance features with instances")
    X = lil_matrix(hstack((tmp, ec_features)))
    X = lil_matrix(hstack((X, cov)))
    save_data(data=X, file_name=file_name + "_Xec.pkl", save_path=dspath, mode="wb",
              tag="concatenated ec and coverage features with instances")
    X = lil_matrix(hstack((tmp, ec_features)))
    X = lil_matrix(hstack((X, abd)))
    X = lil_matrix(hstack((X, cov)))
    save_data(data=X, file_name=file_name + "_Xm.pkl", save_path=dspath, mode="wb",
              tag="concatenated ec, abundance, and coverage features with instances")
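# Illustrative sketch of the feature-block concatenation pattern above, stacking a
# sparse instance matrix with a dense engineered block column-wise (toy shapes):
#
#   import numpy as np
#   from scipy.sparse import hstack, lil_matrix
#
#   instances = lil_matrix(np.eye(2))   # 2 x 2 base features
#   extra = np.array([[0.5], [0.25]])   # 2 x 1 engineered block
#   combined = lil_matrix(hstack((instances, extra)))
#   print(combined.shape)               # (2, 3)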
def score(y_true, y_pred, item_lst, six_db=False, A=1, B=1, C=1, top_k=150, mode='a',
          file_name='results.txt', save_path=''):
    idx_lst = [1]
    if six_db:
        item_lst = ['AraCyc', 'EcoCyc', 'HumanCyc', 'LeishCyc', 'TrypanoCyc', 'YeastCyc']
        if y_true.shape[0] == 4:
            item_lst = ['AraCyc', 'EcoCyc', 'HumanCyc', 'YeastCyc']
        idx_lst = [idx for idx in np.arange(len(item_lst))]
    print('\t>> Scores are saved to {0:s}...'.format(str(file_name)))
    for i, idx in enumerate(idx_lst):
        y = y_true
        y_hat = y_pred
        if six_db:
            y = y_true[idx]
            y_hat = y_pred[idx]
            y = y.reshape((1, y.shape[0]))
            y_hat = np.reshape(y_hat, (1, len(y_hat)))
            save_data(data='*** Scores for {0:s}...\n'.format(str(item_lst[i])),
                      file_name=file_name, save_path=save_path, mode=mode,
                      w_string=True, print_tag=False)
        else:
            save_data(data='*** Scores for {0:s}...\n'.format(item_lst[i]),
                      file_name=file_name, save_path=save_path, mode='w',
                      w_string=True, print_tag=False)
        ce_samples = coverage_error(y, y_hat)
        save_data(data='\t\t1)- Coverage error score: {0:.4f}\n'.format(ce_samples),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        lrl_samples = label_ranking_loss(y, y_hat)
        save_data(data='\t\t2)- Ranking loss score: {0:.4f}\n'.format(lrl_samples),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        lrap_samples = label_ranking_average_precision_score(y, y_hat)
        save_data(data='\t\t3)- Label ranking average precision score: {0:.4f}\n'.format(lrap_samples),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        # If predictions are probabilities rather than binary labels, report the
        # propensity-scored ranking metrics and skip the thresholded metrics below.
        if not np.array_equal(y_pred, y_pred.astype(bool)):
            top_k = y_true.shape[1] if top_k > y_true.shape[1] else top_k
            psp_samples = psp(y_prob=y_hat, y_true=y, A=A, B=B, C=C, top_k=top_k)
            save_data(data='\t\t4)- Propensity scored precision at {0}: {1:.4f}\n'.format(
                          top_k, psp_samples),
                      file_name=file_name, save_path=save_path, mode=mode,
                      w_string=True, print_tag=False)
            ndcg_samples = psndcg(y_prob=y_hat, y_true=y, A=A, B=B, C=C, top_k=top_k)
            save_data(data='\t\t5)- Propensity scored nDCG at {0}: {1:.4f}\n'.format(
                          top_k, ndcg_samples),
                      file_name=file_name, save_path=save_path, mode=mode,
                      w_string=True, print_tag=False)
            continue
        hl_samples = hamming_loss(y, y_hat)
        save_data(data='\t\t4)- Hamming loss score: {0:.4f}\n'.format(hl_samples),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        pr_samples_average = precision_score(y, y_hat, average='samples')
        pr_samples_micro = precision_score(y, y_hat, average='micro')
        pr_samples_macro = precision_score(y, y_hat, average='macro')
        save_data(data='\t\t5)- Precision...\n', file_name=file_name,
                  save_path=save_path, mode=mode, w_string=True, print_tag=False)
        save_data(data='\t\t\t--> Average sample precision: {0:.4f}\n'.format(pr_samples_average),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        save_data(data='\t\t\t--> Micro precision: {0:.4f}\n'.format(pr_samples_micro),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        save_data(data='\t\t\t--> Macro precision: {0:.4f}\n'.format(pr_samples_macro),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        rc_samples_average = recall_score(y, y_hat, average='samples')
        rc_samples_micro = recall_score(y, y_hat, average='micro')
        rc_samples_macro = recall_score(y, y_hat, average='macro')
        save_data(data='\t\t6)- Recall...\n', file_name=file_name,
                  save_path=save_path, mode=mode, w_string=True, print_tag=False)
        save_data(data='\t\t\t--> Average sample recall: {0:.4f}\n'.format(rc_samples_average),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        save_data(data='\t\t\t--> Micro recall: {0:.4f}\n'.format(rc_samples_micro),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        save_data(data='\t\t\t--> Macro recall: {0:.4f}\n'.format(rc_samples_macro),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        f1_samples_average = f1_score(y, y_hat, average='samples')
        f1_samples_micro = f1_score(y, y_hat, average='micro')
        f1_samples_macro = f1_score(y, y_hat, average='macro')
        save_data(data='\t\t7)- F1-score...\n', file_name=file_name,
                  save_path=save_path, mode=mode, w_string=True, print_tag=False)
        save_data(data='\t\t\t--> Average sample f1-score: {0:.4f}\n'.format(f1_samples_average),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        save_data(data='\t\t\t--> Micro f1-score: {0:.4f}\n'.format(f1_samples_micro),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        save_data(data='\t\t\t--> Macro f1-score: {0:.4f}\n'.format(f1_samples_macro),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        js_score_samples = jaccard_score(y, y_hat, average='samples')
        js_score_micro = jaccard_score(y, y_hat, average='micro')
        js_score_macro = jaccard_score(y, y_hat, average='macro')
        js_score_weighted = jaccard_score(y, y_hat, average='weighted')
        save_data(data='\t\t8)- Jaccard score...\n', file_name=file_name,
                  save_path=save_path, mode=mode, w_string=True, print_tag=False)
        save_data(data='\t\t\t--> Jaccard score (samples): {0:.4f}\n'.format(js_score_samples),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        save_data(data='\t\t\t--> Jaccard score (micro): {0:.4f}\n'.format(js_score_micro),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        save_data(data='\t\t\t--> Jaccard score (macro): {0:.4f}\n'.format(js_score_macro),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        save_data(data='\t\t\t--> Jaccard score (weighted): {0:.4f}\n'.format(js_score_weighted),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        tn, fp, fn, tp = confusion_matrix(y.flatten(), y_hat.flatten()).ravel()
        save_data(data='\t\t9)- Confusion matrix...\n', file_name=file_name,
                  save_path=save_path, mode=mode, w_string=True, print_tag=False)
        save_data(data='\t\t\t--> True positive: {0}\n'.format(tp),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        save_data(data='\t\t\t--> True negative: {0}\n'.format(tn),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        save_data(data='\t\t\t--> False positive: {0}\n'.format(fp),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
        save_data(data='\t\t\t--> False negative: {0}\n'.format(fn),
                  file_name=file_name, save_path=save_path, mode=mode,
                  w_string=True, print_tag=False)
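# Minimal usage sketch of the thresholded multi-label metrics reported above, on a
# toy 2-sample, 3-label prediction:
#
#   import numpy as np
#   from sklearn.metrics import f1_score, hamming_loss, jaccard_score
#
#   y_true_demo = np.array([[1, 0, 1], [0, 1, 0]])
#   y_pred_demo = np.array([[1, 0, 0], [0, 1, 0]])
#   print(hamming_loss(y_true_demo, y_pred_demo))                    # fraction of wrong labels
#   print(f1_score(y_true_demo, y_pred_demo, average='samples'))     # per-sample F1, averaged
#   print(jaccard_score(y_true_demo, y_pred_demo, average='micro'))  # global intersection/union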
def __train(arg): # Setup the number of operations to employ steps = 1 # Whether to display parameters at every operation display_params = True ########################################################################################################## ###################### PREPROCESSING DATASET ###################### ########################################################################################################## if arg.preprocess_dataset: print('\n{0})- Preprocess dataset...'.format(steps)) steps = steps + 1 print('\t>> Loading files...') X = load_data(file_name=arg.X_name, load_path=arg.dspath, tag="instances") X = X[:, :arg.cutting_point] # load a biocyc file data_object = load_data(file_name=arg.object_name, load_path=arg.ospath, tag='the biocyc object') ec_dict = data_object["ec_id"] pathway_dict = data_object["pathway_id"] del data_object pathway_dict = dict((idx, id) for id, idx in pathway_dict.items()) ec_dict = dict((idx, id) for id, idx in ec_dict.items()) labels_components = load_data(file_name=arg.pathway2ec_name, load_path=arg.ospath, tag='M') print('\t>> Loading label to component mapping file object...') pathway2ec_idx = load_data(file_name=arg.pathway2ec_idx_name, load_path=arg.ospath, print_tag=False) pathway2ec_idx = list(pathway2ec_idx) tmp = list(ec_dict.keys()) ec_dict = dict((idx, ec_dict[tmp.index(ec)]) for idx, ec in enumerate(pathway2ec_idx)) # load path2vec features path2vec_features = np.load(file=os.path.join(arg.ospath, arg.features_name)) # load a hin file hin = load_data(file_name=arg.hin_name, load_path=arg.ospath, tag='heterogeneous information network') # get pathway2ec mapping node2idx_pathway2ec = [node[0] for node in hin.nodes(data=True)] del hin __build_features(X=X, pathwat_dict=pathway_dict, ec_dict=ec_dict, labels_components=labels_components, node2idx_pathway2ec=node2idx_pathway2ec, path2vec_features=path2vec_features, file_name=arg.file_name, dspath=arg.dspath, batch_size=arg.batch, num_jobs=arg.num_jobs) ########################################################################################################## ###################### TRAIN ###################### ########################################################################################################## if arg.train: print( '\n{0})- Training {1} dataset using leADS model...'.format(steps, arg.X_name)) steps = steps + 1 # load files print('\t>> Loading files...') X = load_data(file_name=arg.X_name, load_path=arg.dspath, tag="X") y = load_data(file_name=arg.y_name, load_path=arg.dspath, tag="y") y_Bags = None bags_labels = None label_features = None centroids = None if not arg.train_labels: y_Bags = load_data(file_name=arg.yB_name, load_path=arg.dspath, tag="B") bags_labels = load_data(file_name=arg.bags_labels, load_path=arg.ospath, tag="bags_labels with associated pathways") label_features = load_data(file_name=arg.features_name, load_path=arg.ospath, tag="features") centroids = np.load(file=os.path.join(arg.ospath, arg.centroids)) centroids = centroids[centroids.files[0]] A = None if arg.fuse_weight: A = load_item_features(file_name=os.path.join(arg.ospath, arg.similarity_name), use_components=False) if arg.train_selected_sample: if os.path.exists(os.path.join(arg.rspath, arg.samples_ids)): sample_ids = load_data(file_name=arg.samples_ids, load_path=arg.rspath, tag="selected samples") sample_ids = np.array(sample_ids) X = X[sample_ids, :] y = y[sample_ids, :] if not arg.train_labels: y_Bags = y_Bags[sample_ids, :] else: print('\t\t No sample ids file is provided...') model = 
leADS(alpha=arg.alpha, binarize_input_feature=arg.binarize_input_feature, normalize_input_feature=arg.normalize_input_feature, use_external_features=arg.use_external_features, cutting_point=arg.cutting_point, fit_intercept=arg.fit_intercept, decision_threshold=arg.decision_threshold, subsample_input_size=arg.ssample_input_size, subsample_labels_size=arg.ssample_label_size, calc_ads=arg.calc_ads, acquisition_type=arg.acquisition_type, top_k=arg.top_k, ads_percent=arg.ads_percent, advanced_subsampling=arg.advanced_subsampling, tol_labels_iter=arg.tol_labels_iter, cost_subsample_size=arg.calc_subsample_size, calc_label_cost=arg.calc_label_cost, calc_bag_cost=arg.calc_bag_cost, calc_total_cost=arg.calc_total_cost, label_uncertainty_type=arg.label_uncertainty_type, label_bag_sim=arg.label_bag_sim, label_closeness_sim=arg.label_closeness_sim, corr_bag_sim=arg.corr_bag_sim, corr_label_sim=arg.corr_label_sim, corr_input_sim=arg.corr_input_sim, penalty=arg.penalty, alpha_elastic=arg.alpha_elastic, l1_ratio=arg.l1_ratio, sigma=arg.sigma, fuse_weight=arg.fuse_weight, lambdas=arg.lambdas, loss_threshold=arg.loss_threshold, early_stop=arg.early_stop, learning_type=arg.learning_type, lr=arg.lr, lr0=arg.lr0, delay_factor=arg.delay_factor, forgetting_rate=arg.forgetting_rate, num_models=arg.num_models, batch=arg.batch, max_inner_iter=arg.max_inner_iter, num_epochs=arg.num_epochs, num_jobs=arg.num_jobs, display_interval=arg.display_interval, shuffle=arg.shuffle, random_state=arg.random_state, log_path=arg.logpath) model.fit(X=X, y=y, y_Bag=y_Bags, bags_labels=bags_labels, label_features=label_features, centroids=centroids, A=A, model_name=arg.model_name, model_path=arg.mdpath, result_path=arg.rspath, display_params=display_params) ########################################################################################################## ###################### EVALUATE ###################### ########################################################################################################## if arg.evaluate: print('\n{0})- Evaluating leADS model...'.format(steps)) steps = steps + 1 # load files print('\t>> Loading files...') X = load_data(file_name=arg.X_name, load_path=arg.dspath, tag="X") bags_labels = None label_features = None centroids = None if not arg.pred_bags: y = load_data(file_name=arg.y_name, load_path=arg.dspath, tag="y") if arg.pred_bags: y_Bags = load_data(file_name=arg.yB_name, load_path=arg.dspath, tag="B") # load model model = load_data(file_name=arg.model_name + '.pkl', load_path=arg.mdpath, tag='leADS') if model.learn_bags: bags_labels = load_data(file_name=arg.bags_labels, load_path=arg.dspath, tag="bags_labels with associated pathways") if model.label_uncertainty_type == "dependent": label_features = load_data(file_name=arg.features_name, load_path=arg.dspath, tag="features") centroids = np.load(file=os.path.join(arg.dspath, arg.centroids)) centroids = centroids[centroids.files[0]] # labels prediction score y_pred_Bags, y_pred = model.predict(X=X, bags_labels=bags_labels, label_features=label_features, centroids=centroids, estimate_prob=arg.estimate_prob, pred_bags=arg.pred_bags, pred_labels=arg.pred_labels, build_up=arg.build_up, pref_rank=arg.pref_rank, top_k_rank=arg.top_k_rank, subsample_labels_size=arg.ssample_label_size, soft_voting=arg.soft_voting, apply_t_criterion=arg.apply_tcriterion, adaptive_beta=arg.adaptive_beta, decision_threshold=arg.decision_threshold, batch_size=arg.batch, num_jobs=arg.num_jobs) file_name = arg.file_name + '_scores.txt' if arg.pred_bags: 
            score(y_true=y_Bags.toarray(), y_pred=y_pred_Bags.toarray(), item_lst=['biocyc_bags'],
                  six_db=False, top_k=arg.top_k, mode='a', file_name=file_name, save_path=arg.rspath)
        if arg.pred_labels:
            if arg.dsname == 'golden':
                score(y_true=y.toarray(), y_pred=y_pred.toarray(), item_lst=[arg.dsname], six_db=True,
                      top_k=arg.top_k, mode='a', file_name=file_name, save_path=arg.rspath)
            else:
                score(y_true=y.toarray(), y_pred=y_pred.toarray(), item_lst=[arg.dsname], six_db=False,
                      top_k=arg.top_k, mode='a', file_name=file_name, save_path=arg.rspath)

    ##########################################################################################################
    ######################                        PREDICT                              ######################
    ##########################################################################################################
    if arg.predict:
        print('\n{0})- Predicting dataset using a pre-trained leADS model...'.format(steps))
        if arg.pathway_report or arg.extract_pf:
            print('\t>> Loading biocyc object...')
            # load a biocyc file
            data_object = load_data(file_name=arg.object_name, load_path=arg.ospath,
                                    tag='the biocyc object', print_tag=False)
            pathway_dict = data_object["pathway_id"]
            pathway_common_names = dict((pidx, data_object['processed_kb']['metacyc'][5][pid][0][1])
                                        for pid, pidx in pathway_dict.items()
                                        if pid in data_object['processed_kb']['metacyc'][5])
            ec_dict = data_object['ec_id']
            del data_object
            pathway_dict = dict((idx, id) for id, idx in pathway_dict.items())
            ec_dict = dict((idx, id) for id, idx in ec_dict.items())
            labels_components = load_data(file_name=arg.pathway2ec_name, load_path=arg.ospath, tag='M')
            print('\t>> Loading label to component mapping file object...')
            pathway2ec_idx = load_data(file_name=arg.pathway2ec_idx_name, load_path=arg.ospath,
                                       print_tag=False)
            pathway2ec_idx = list(pathway2ec_idx)
            tmp = list(ec_dict.keys())
            ec_dict = dict((idx, ec_dict[tmp.index(ec)]) for idx, ec in enumerate(pathway2ec_idx))
            if arg.extract_pf:
                X, sample_ids = parse_files(ec_dict=ec_dict, ds_folder=arg.dsfolder, dspath=arg.dspath,
                                            rspath=arg.rspath, num_jobs=arg.num_jobs)
                print('\t>> Storing X and sample_ids...')
                save_data(data=X, file_name=arg.file_name + '_X.pkl', save_path=arg.dspath,
                          tag='the pf dataset (X)', mode='w+b', print_tag=False)
                save_data(data=sample_ids, file_name=arg.file_name + '_ids.pkl', save_path=arg.dspath,
                          tag='samples ids', mode='w+b', print_tag=False)
                print('\t>> Loading heterogeneous information network file...')
                hin = load_data(file_name=arg.hin_name, load_path=arg.ospath,
                                tag='heterogeneous information network', print_tag=False)
                # get pathway2ec mapping
                node2idx_pathway2ec = [node[0] for node in hin.nodes(data=True)]
                del hin
                print('\t>> Loading path2vec_features file...')
                path2vec_features = np.load(file=os.path.join(arg.ospath, arg.features_name))
                __build_features(X=X, pathwat_dict=pathway_dict, ec_dict=ec_dict,
                                 labels_components=labels_components,
                                 node2idx_pathway2ec=node2idx_pathway2ec,
                                 path2vec_features=path2vec_features, file_name=arg.file_name,
                                 dspath=arg.dspath, batch_size=arg.batch, num_jobs=arg.num_jobs)

        # load files
        print('\t>> Loading necessary files...')
        X = load_data(file_name=arg.X_name, load_path=arg.dspath, tag="X")
        tmp = lil_matrix.copy(X)
        bags_labels = None
        label_features = None
        centroids = None

        # load model
        model = load_data(file_name=arg.model_name + '.pkl', load_path=arg.mdpath, tag='leADS')
        if model.learn_bags:
            bags_labels = load_data(file_name=arg.bags_labels, load_path=arg.ospath,
                                    tag="bags_labels with associated pathways")
        if model.label_uncertainty_type == "dependent":
            label_features = load_data(file_name=arg.features_name, load_path=arg.ospath, tag="features")
            centroids = np.load(file=os.path.join(arg.ospath, arg.centroids))
            centroids = centroids[centroids.files[0]]

        # predict
        y_pred_Bags, y_pred = model.predict(X=X, bags_labels=bags_labels, label_features=label_features,
                                            centroids=centroids, estimate_prob=False,
                                            pred_bags=arg.pred_bags, pred_labels=arg.pred_labels,
                                            build_up=arg.build_up, pref_rank=arg.pref_rank,
                                            top_k_rank=arg.top_k_rank,
                                            subsample_labels_size=arg.ssample_label_size,
                                            soft_voting=arg.soft_voting,
                                            apply_t_criterion=arg.apply_tcriterion,
                                            adaptive_beta=arg.adaptive_beta,
                                            decision_threshold=arg.decision_threshold,
                                            batch_size=arg.batch, num_jobs=arg.num_jobs)
        # labels prediction score
        y_pred_Bags_score, y_pred_score = model.predict(X=X, bags_labels=bags_labels,
                                                        label_features=label_features,
                                                        centroids=centroids, estimate_prob=True,
                                                        pred_bags=arg.pred_bags,
                                                        pred_labels=arg.pred_labels,
                                                        build_up=arg.build_up, pref_rank=arg.pref_rank,
                                                        top_k_rank=arg.top_k_rank,
                                                        subsample_labels_size=arg.ssample_label_size,
                                                        soft_voting=arg.soft_voting,
                                                        apply_t_criterion=arg.apply_tcriterion,
                                                        adaptive_beta=arg.adaptive_beta,
                                                        decision_threshold=arg.decision_threshold,
                                                        batch_size=arg.batch, num_jobs=arg.num_jobs)
        if arg.pathway_report:
            print('\t>> Synthesizing pathway reports...')
            X = tmp
            sample_ids = np.arange(X.shape[0])
            if arg.extract_pf:
                sample_ids = load_data(file_name=arg.file_name + "_ids.pkl", load_path=arg.dspath,
                                       tag="samples ids")
            else:
                if arg.samples_ids is not None:
                    if arg.samples_ids in os.listdir(arg.dspath):
                        sample_ids = load_data(file_name=arg.samples_ids, load_path=arg.dspath,
                                               tag="samples ids")
            synthesize_report(X=X[:, :arg.cutting_point], sample_ids=sample_ids, y_pred=y_pred,
                              y_dict_ids=pathway_dict, y_common_name=pathway_common_names,
                              component_dict=ec_dict, labels_components=labels_components,
                              y_pred_score=y_pred_score, batch_size=arg.batch, num_jobs=arg.num_jobs,
                              rspath=arg.rspath, dspath=arg.dspath, file_name=arg.file_name)
        else:
            print('\t>> Storing predictions (label index) to: {0:s}'.format(arg.file_name + '_y_leads.pkl'))
            save_data(data=y_pred, file_name=arg.file_name + "_y_leads.pkl", save_path=arg.dspath,
                      mode="wb", print_tag=False)
            if arg.pred_bags:
                print('\t>> Storing predictions (bag index) to: {0:s}'.format(
                    arg.file_name + '_yBags_leads.pkl'))
                save_data(data=y_pred_Bags, file_name=arg.file_name + "_yBags_leads.pkl",
                          save_path=arg.dspath, mode="wb", print_tag=False)
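
# A minimal, self-contained sketch (not part of the original pipeline) of the dictionary
# inversion and re-keying idiom used in __train above: id->index mappings are inverted to
# index->id, and the EC dictionary is then re-keyed to follow the column order of the
# pathway-to-EC matrix M. All values below are toy stand-ins, not real BioCyc identifiers.
def _demo_invert_and_remap():
    ec_dict = {'EC-1.1.1.1': 0, 'EC-2.7.1.1': 1, 'EC-4.2.1.2': 2}  # id -> index
    ec_dict = dict((idx, id) for id, idx in ec_dict.items())       # index -> id
    pathway2ec_idx = [2, 0]                                        # column order of M (toy)
    tmp = list(ec_dict.keys())
    # tmp is [0, 1, 2], so tmp.index(ec) == ec here; the lookup keeps the idiom general
    ec_dict = dict((idx, ec_dict[tmp.index(ec)]) for idx, ec in enumerate(pathway2ec_idx))
    assert ec_dict == {0: 'EC-4.2.1.2', 1: 'EC-1.1.1.1'}
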
def __fit_by_tf(self, X, node_id, node_probability, index2type, type2index, type2prob, model_name,
                model_path, result_path):
    ## Build layers for path2vec
    print('\t>> Building: path2vec layers...')
    logger.info('\t>> Building: path2vec layers...')
    timeref = time.time()
    center_node_holder, context_node_holder, negative_samples_holder, loss = self.__build_tf_place_holders(
        node_probability=node_probability)
    ## Optimization function for path2vec
    optimizer = self.__optimizer(center_node_holder, context_node_holder, negative_samples_holder)
    print('\t\t## Building layers consumed %.2f minutes' % (round((time.time() - timeref) / 60., 3)))
    logger.info('\t\t## Building layers consumed %.2f minutes' % (round((time.time() - timeref) / 60., 3)))
    print('\t>> Training path2vec...')
    logger.info('\t>> Training path2vec...')
    old_cost = np.inf
    timeref = time.time()
    cost_file_name = model_name + "_cost.txt"
    save_data('', file_name=cost_file_name, save_path=result_path, mode='w', w_string=True, print_tag=False)
    merged = tf.summary.merge_all()
    saver = tf.train.Saver(max_to_keep=self.num_models)
    config = tf.ConfigProto(intra_op_parallelism_threads=0, inter_op_parallelism_threads=0,
                            allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(self.log_path, sess.graph)
        # Define metadata variable.
        run_metadata = tf.RunMetadata()
        for epoch in np.arange(start=1, stop=self.num_epochs + 1):
            desc = '\t {0:d})- Epoch count ({0:d}/{1:d})...'.format(epoch, self.num_epochs)
            print(desc)
            logger.info(desc)
            self.__shffule(X=X)
            list_batches = np.arange(start=0, stop=len(X), step=self.batch)
            epoch_timeref = time.time()
            new_cost = 0.0
            for idx, batch in enumerate(list_batches):
                total_samples = (idx + 1) / len(list_batches)
                desc = '\t --> Learning: {0:.4f}% ...'.format(total_samples * 100)
                logger.info(desc)
                if (idx + 1) != len(list_batches):
                    print(desc, end="\r")
                if (idx + 1) == len(list_batches):
                    print(desc)
                ## Generate batch negative samples
                center_nodes, context_nodes = self.__generate_batch(X=X[batch:batch + self.batch])
                negative_nodes = self.__get_negative_samples(center_nodes=center_nodes, node_id=node_id,
                                                             node_probability=node_probability,
                                                             index2type=index2type, type2index=type2index,
                                                             type2probs=type2prob)
                batch_X_size = self.batch
                if self.batch > 150000:
                    batch_X_size = 10000
                list_batch_X = np.arange(start=0, stop=center_nodes.shape[0], step=batch_X_size)
                for b_idx, batch_X_idx in enumerate(list_batch_X):
                    center_batch = center_nodes[batch_X_idx:batch_X_idx + batch_X_size]
                    context_batch = context_nodes[batch_X_idx:batch_X_idx + batch_X_size]
                    negative_batch = negative_nodes[batch_X_idx:batch_X_idx + batch_X_size]
                    for inner_iterations in np.arange(self.max_inner_iter):
                        feed_dict = {center_node_holder: center_batch,
                                     context_node_holder: context_batch,
                                     negative_samples_holder: negative_batch}
                        # Perform one update step by evaluating the optimizer op (included in the
                        # list of returned values for session.run()). Also evaluate the merged op
                        # to get all summaries from the returned "summary" variable. Feed the
                        # metadata variable to the session for visualizing the graph in TensorBoard.
                        loss_batch, _, summary_str = sess.run([loss, optimizer, merged],
                                                              feed_dict=feed_dict,
                                                              run_metadata=run_metadata)
                        writer.add_summary(summary_str, inner_iterations)
                        loss_batch /= center_batch.shape[0]
                        new_cost += loss_batch / self.max_inner_iter
                new_cost /= len(list_batch_X)
            new_cost /= len(list_batches)
            new_cost = new_cost * -1
            self.is_fit = True
            print('\t\t ## Epoch {0} took {1} seconds...'.format(epoch, round(time.time() - epoch_timeref, 3)))
            logger.info('\t\t ## Epoch {0} took {1} seconds...'.format(
                epoch, round(time.time() - epoch_timeref, 3)))
            data = str(epoch) + '\t' + str(round(time.time() - epoch_timeref, 3)) + '\t' + str(new_cost) + '\n'
            save_data(data=data, file_name=cost_file_name, save_path=result_path, mode='a', w_string=True,
                      print_tag=False)
            # Save model parameters based on display frequencies
            if (epoch % self.display_interval) == 0 or epoch == 1 or epoch == self.num_epochs:
                print('\t\t --> New cost: {0:.4f}; Old cost: {1:.4f}'.format(new_cost, old_cost))
                logger.info('\t\t --> New cost: {0:.4f}; Old cost: {1:.4f}'.format(new_cost, old_cost))
                if new_cost < old_cost or epoch == self.num_epochs:
                    old_cost = new_cost
                    tag_final_file = "_tf.ckpt"
                    tag_final_embeddings = "_tf_embeddings.npz"
                    if epoch == self.num_epochs:
                        tag_final_file = "_final_tf.ckpt"
                        tag_final_embeddings = "_final_tf_embeddings.npz"
                    print('\t\t --> Storing the path2vec model to: {0:s}'.format(model_name + tag_final_file))
                    logger.info('\t\t --> Storing the path2vec model to: {0:s}'.format(
                        model_name + tag_final_file))
                    saver.save(sess, os.path.join(model_path, model_name + tag_final_file))
                    print('\t\t --> Storing the path2vec node embeddings as numpy array to: {0:s}'.format(
                        model_name + tag_final_embeddings))
                    logger.info('\t\t --> Storing the path2vec node embeddings as numpy array to: {0:s}'.format(
                        model_name + tag_final_embeddings))
                    model_embeddings = tf.get_default_graph()
                    model_embeddings = model_embeddings.get_tensor_by_name("embeddings/embedding_matrix:0")
                    # Create a configuration for visualizing embeddings with the selected_pathways
                    # in TensorBoard.
                    # TODO: comment this
                    config = projector.ProjectorConfig()
                    embedding_conf = config.embeddings.add()
                    embedding_conf.tensor_name = model_embeddings.name
                    model_embeddings = sess.run(model_embeddings)
                    np.savez(os.path.join(model_path, model_name + tag_final_embeddings), model_embeddings)
                    # TODO: comment this
                    embedding_conf.metadata_path = os.path.join(model_path, model_name + '_metadata.tsv')
                    projector.visualize_embeddings(writer, config)
        writer.close()
    print('\t --> Training consumed %.2f minutes' % (round((time.time() - timeref) / 60., 3)))
    logger.info('\t --> Training consumed %.2f minutes' % (round((time.time() - timeref) / 60., 3)))
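
# A minimal NumPy sketch (assumption: __fit_by_tf optimizes the standard skip-gram loss
# with negative sampling, which matches the center/context/negative placeholders above).
# For one (center, context) pair and K negatives, the loss is
#   -log sigmoid(u_ctx . v_c) - sum_k log sigmoid(-u_neg_k . v_c).
# Embedding matrices and node indices here are toy stand-ins for the real TF variables;
# collisions between sampled negatives and the true context are ignored in this toy.
def _demo_skipgram_negative_sampling():
    rng = np.random.RandomState(0)
    num_nodes, dim, K = 10, 8, 3
    V = rng.randn(num_nodes, dim) * 0.1  # center ("input") embeddings
    U = rng.randn(num_nodes, dim) * 0.1  # context ("output") embeddings
    sigmoid = lambda x: 1.0 / (1.0 + np.exp(-x))
    center, context = 2, 5
    negatives = rng.choice(num_nodes, size=K, replace=False)
    pos_term = -np.log(sigmoid(U[context].dot(V[center])))
    neg_term = -np.sum(np.log(sigmoid(-U[negatives].dot(V[center]))))
    print('toy skip-gram negative-sampling loss: {0:.4f}'.format(pos_term + neg_term))
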
def __fit_by_word2vec(self, X, type2index, model_name, model_path, result_path):
    '''Learn embeddings by optimizing the skip-gram objective using SGD.'''
    old_cost = np.inf
    timeref = time.time()
    cost_file_name = model_name + "_word2vec_cost.txt"
    save_data('', file_name=cost_file_name, save_path=result_path, mode='w', w_string=True, print_tag=False)
    print('\t>> Training by word2vec model...')
    logger.info('\t>> Training by word2vec model...')
    model = word2vec.Word2Vec(size=self.embedding_dimension, window=self.window_size, min_count=0, sg=1,
                              workers=self.num_jobs, negative=self.num_negative_samples, compute_loss=True)
    print('\t>> Building vocabulary...')
    logger.info('\t>> Building vocabulary...')
    model.build_vocab(X)
    n_epochs = self.num_epochs + 1
    if self.constraint_type:
        n_epochs = self.num_epochs + 2
    node_type = [t for t, nodes in type2index.items()]
    list_type = list()
    for t in node_type:
        list_type.append([str(node) for node in type2index[t] if str(node) in model])
    for epoch in np.arange(start=1, stop=n_epochs):
        desc = '\t {0:d})- Epoch count ({0:d}/{1:d})...'.format(epoch, n_epochs - 1)
        print(desc)
        logger.info(desc)
        self.__shffule(X=X)
        list_batches = np.arange(start=0, stop=len(X), step=self.batch)
        epoch_timeref = time.time()
        new_cost = 0.0
        for idx, batch in enumerate(list_batches):
            desc = '\t --> Learning: {0:.2f}% ...'.format(((idx + 1) / len(list_batches)) * 100)
            logger.info(desc)
            if (idx + 1) != len(list_batches):
                print(desc, end="\r")
            if (idx + 1) == len(list_batches):
                print(desc)
            model.train(X[batch:batch + self.batch], total_examples=len(X[batch:batch + self.batch]),
                        epochs=self.max_inner_iter, compute_loss=True)
            if self.constraint_type:
                for items in list_type:
                    emb = model[items]
                    denominator = np.sum(np.triu(np.dot(emb, emb.T), 1))
                    emb = emb / denominator
                    for i, node in enumerate(items):
                        model.wv.syn0[model.wv.vocab[node].index] = emb[i]
            new_cost += model.get_latest_training_loss() / len(list_batches)
        new_cost /= self.max_inner_iter
        if self.constraint_type and epoch == 1:
            continue
        self.is_fit = True
        print('\t\t ## Epoch {0} took {1} seconds...'.format(epoch, round(time.time() - epoch_timeref, 3)))
        logger.info('\t\t ## Epoch {0} took {1} seconds...'.format(
            epoch, round(time.time() - epoch_timeref, 3)))
        data = str(epoch) + '\t' + str(round(time.time() - epoch_timeref, 3)) + '\t' + str(new_cost) + '\n'
        save_data(data=data, file_name=cost_file_name, save_path=result_path, mode='a', w_string=True,
                  print_tag=False)
        # Save model parameters based on display frequencies
        if (epoch % self.display_interval) == 0 or epoch == 1 or epoch == n_epochs - 1:
            print('\t\t --> New cost: {0:.4f}; Old cost: {1:.4f}'.format(new_cost, old_cost))
            logger.info('\t\t --> New cost: {0:.4f}; Old cost: {1:.4f}'.format(new_cost, old_cost))
            if new_cost < old_cost or epoch == n_epochs - 1:
                old_cost = new_cost
                tag_final_file = "_word2vec.ckpt"
                tag_final_embeddings = "_word2vec_embeddings.npz"
                if epoch == n_epochs - 1:
                    tag_final_file = "_final_word2vec.ckpt"
                    tag_final_embeddings = "_final_word2vec_embeddings.npz"
                print('\t\t --> Storing the path2vec model to: {0:s}'.format(model_name + tag_final_file))
                logger.info('\t\t --> Storing the path2vec model to: {0:s}'.format(
                    model_name + tag_final_file))
                model.wv.save_word2vec_format(os.path.join(model_path, model_name + tag_final_file))
                print('\t\t --> Storing the path2vec node embeddings as numpy array to: {0:s}'.format(
                    model_name + tag_final_embeddings))
                logger.info('\t\t --> Storing the path2vec node embeddings as numpy array to: {0:s}'.format(
                    model_name + tag_final_embeddings))
                model_embeddings = np.zeros((self.node_size, self.embedding_dimension), dtype=np.float32)
                for v_idx in np.arange(self.node_size):
                    if str(v_idx) in model.wv.vocab:
                        model_embeddings[v_idx] = model[str(v_idx)]
                np.savez(os.path.join(model_path, model_name + tag_final_embeddings), model_embeddings)
    print('\t --> Training consumed %.2f minutes' % (round((time.time() - timeref) / 60., 3)))
    logger.info('\t --> Training consumed %.2f minutes' % (round((time.time() - timeref) / 60., 3)))
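
# A minimal, self-contained sketch of the node2vec-style pattern __fit_by_word2vec follows:
# each random walk is treated as a "sentence" of stringified node ids and gensim's skip-gram
# learns the node embeddings. Uses the gensim 3.x API (size=, model.wv) consistent with the
# code above; the walks below are toy data, not real HIN walks.
def _demo_walks_to_embeddings():
    from gensim.models import word2vec
    walks = [['0', '3', '1', '4'], ['2', '4', '0', '3'], ['1', '0', '2', '4']]
    model = word2vec.Word2Vec(walks, size=16, window=2, min_count=0, sg=1, negative=2,
                              workers=1, compute_loss=True)
    emb_node_0 = model.wv['0']  # 16-dimensional vector for node "0"
    print('loss={0:.2f}, dim={1}'.format(model.get_latest_training_loss(), emb_node_0.shape[0]))
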
def __train(arg):
    # Set up the number of operations to employ
    steps = 1
    # Whether to display parameters at every operation
    display_params = True

    ##########################################################################################################
    ######################                        TRAIN                                ######################
    ##########################################################################################################
    if arg.train:
        print('\t>> Loading files...')
        dictionary = load_data(file_name=arg.vocab_name, load_path=arg.dspath, tag="dictionary",
                               print_tag=False)
        X = load_data(file_name=arg.X_name, load_path=arg.dspath, tag="X", print_tag=False)
        M = None
        features = None
        if arg.use_supplement:
            M = load_data(file_name=arg.M_name, load_path=arg.dspath, tag="supplementary components")
            M = M.toarray()
        if arg.use_features:
            features = load_data(file_name=arg.features_name, load_path=arg.dspath, tag="features")
        if arg.soap:
            print('\n{0})- Training using SOAP model...'.format(steps))
            steps = steps + 1
            model_name = 'soap_' + arg.model_name
            model = SOAP(vocab=dictionary.token2id, num_components=arg.num_components,
                         alpha_mu=arg.alpha_mu, alpha_sigma=arg.alpha_sigma, alpha_phi=arg.alpha_phi,
                         gamma=arg.gamma, kappa=arg.kappa, xi=arg.xi, varpi=arg.varpi,
                         optimization_method=arg.opt_method, cost_threshold=arg.cost_threshold,
                         component_threshold=arg.component_threshold, max_sampling=arg.max_sampling,
                         subsample_input_size=arg.subsample_input_size, batch=arg.batch,
                         num_epochs=arg.num_epochs, max_inner_iter=arg.max_inner_iter, top_k=arg.top_k,
                         collapse2ctm=arg.collapse2ctm, use_features=arg.use_features,
                         num_jobs=arg.num_jobs, display_interval=arg.display_interval,
                         shuffle=arg.shuffle, forgetting_rate=arg.forgetting_rate,
                         delay_factor=arg.delay_factor, random_state=arg.random_state,
                         log_path=arg.logpath)
            model.fit(X=X, M=M, features=features, model_name=model_name, model_path=arg.mdpath,
                      result_path=arg.rspath, display_params=display_params)
        if arg.spreat:
            print('\n{0})- Training using SPREAT model...'.format(steps))
            steps = steps + 1
            model_name = 'spreat_' + arg.model_name
            model = SPREAT(vocab=dictionary.token2id, num_components=arg.num_components,
                           alpha_mu=arg.alpha_mu, alpha_sigma=arg.alpha_sigma, alpha_phi=arg.alpha_phi,
                           gamma=arg.gamma, kappa=arg.kappa, xi=arg.xi, varpi=arg.varpi,
                           optimization_method=arg.opt_method, cost_threshold=arg.cost_threshold,
                           component_threshold=arg.component_threshold, max_sampling=arg.max_sampling,
                           subsample_input_size=arg.subsample_input_size, batch=arg.batch,
                           num_epochs=arg.num_epochs, max_inner_iter=arg.max_inner_iter,
                           top_k=arg.top_k, collapse2ctm=arg.collapse2ctm,
                           use_features=arg.use_features, num_jobs=arg.num_jobs,
                           display_interval=arg.display_interval, shuffle=arg.shuffle,
                           forgetting_rate=arg.forgetting_rate, delay_factor=arg.delay_factor,
                           random_state=arg.random_state, log_path=arg.logpath)
            model.fit(X=X, M=M, features=features, model_name=model_name, model_path=arg.mdpath,
                      result_path=arg.rspath, display_params=display_params)
        if arg.ctm:
            print('\n{0})- Training using CTM model...'.format(steps))
            steps = steps + 1
            model_name = 'ctm_' + arg.model_name
            model = CTM(vocab=dictionary.token2id, num_components=arg.num_components,
                        alpha_mu=arg.alpha_mu, alpha_sigma=arg.alpha_sigma, alpha_beta=arg.alpha_phi,
                        optimization_method=arg.opt_method, cost_threshold=arg.cost_threshold,
                        component_threshold=arg.component_threshold,
                        subsample_input_size=arg.subsample_input_size, batch=arg.batch,
                        num_epochs=arg.num_epochs, max_inner_iter=arg.max_inner_iter,
                        num_jobs=arg.num_jobs,
                        display_interval=arg.display_interval, shuffle=arg.shuffle,
                        forgetting_rate=arg.forgetting_rate, delay_factor=arg.delay_factor,
                        random_state=arg.random_state, log_path=arg.logpath)
            model.fit(X=X, model_name=model_name, model_path=arg.mdpath, result_path=arg.rspath,
                      display_params=display_params)
        if arg.lda:
            print('\n{0})- Training using LDA (sklearn) model...'.format(steps))
            steps = steps + 1
            model_name = 'sklda_' + arg.model_name
            model = skLDA(n_components=arg.num_components, learning_method='batch',
                          learning_decay=arg.delay_factor, learning_offset=arg.forgetting_rate,
                          max_iter=1, batch_size=arg.batch, evaluate_every=arg.display_interval,
                          perp_tol=arg.cost_threshold, mean_change_tol=arg.component_threshold,
                          max_doc_update_iter=arg.max_inner_iter, n_jobs=arg.num_jobs, verbose=0,
                          random_state=arg.random_state)
            print('\t>> Training by LDA model...')
            n_epochs = arg.num_epochs + 1
            old_bound = np.inf
            num_samples = int(X.shape[0] * arg.subsample_input_size)
            list_batches = np.arange(start=0, stop=num_samples, step=arg.batch)
            cost_file_name = model_name + "_cost.txt"
            save_data('', file_name=cost_file_name, save_path=arg.rspath, mode='w', w_string=True,
                      print_tag=False)
            for epoch in np.arange(start=1, stop=n_epochs):
                desc = '\t {0:d})- Epoch count ({0:d}/{1:d})...'.format(epoch, n_epochs - 1)
                print(desc)
                idx = np.random.choice(X.shape[0], num_samples, False)
                start_epoch = time.time()
                X_tmp = X[idx, :]
                for bidx, batch in enumerate(list_batches):
                    desc = '\t --> Training: {0:.2f}%...'.format(((bidx + 1) / len(list_batches)) * 100)
                    if (bidx + 1) != len(list_batches):
                        print(desc, end="\r")
                    if (bidx + 1) == len(list_batches):
                        print(desc)
                    model.partial_fit(X=X_tmp[batch:batch + arg.batch])
                end_epoch = time.time()
                new_bound = -model.score(X=X_tmp) / X.shape[1]
                new_bound = np.log(new_bound)
                print('\t\t ## Epoch {0} took {1} seconds...'.format(epoch,
                                                                     round(end_epoch - start_epoch, 3)))
                data = str(epoch) + '\t' + str(round(end_epoch - start_epoch, 3)) + '\t' + \
                       str(new_bound) + '\n'
                save_data(data=data, file_name=cost_file_name, save_path=arg.rspath, mode='a',
                          w_string=True, print_tag=False)
                print('\t\t --> New cost: {0:.4f}; Old cost: {1:.4f}'.format(new_bound, old_bound))
                if new_bound <= old_bound or epoch == n_epochs - 1:
                    print('\t\t --> Storing the LDA phi to: {0:s}'.format(model_name + '_phi.npz'))
                    np.savez(os.path.join(arg.mdpath, model_name + '_phi.npz'), model.components_)
                    print('\t\t --> Storing the LDA (sklearn) model to: {0:s}'.format(model_name + '.pkl'))
                    save_data(data=model, file_name=model_name + '.pkl', save_path=arg.mdpath,
                              mode="wb", print_tag=False)
                    if epoch == n_epochs - 1:
                        print('\t\t --> Storing the LDA phi to: {0:s}'.format(
                            model_name + '_phi_final.npz'))
                        np.savez(os.path.join(arg.mdpath, model_name + '_phi_final.npz'),
                                 model.components_)
                        print('\t\t --> Storing the LDA (sklearn) model to: {0:s}'.format(
                            model_name + '_final.pkl'))
                        save_data(data=model, file_name=model_name + '_final.pkl', save_path=arg.mdpath,
                                  mode="wb", print_tag=False)
                    old_bound = new_bound
        display_params = False

    ##########################################################################################################
    ######################                       EVALUATE                              ######################
    ##########################################################################################################
    if arg.evaluate:
        print('\t>> Loading files...')
        dictionary = load_data(file_name=arg.vocab_name, load_path=arg.dspath, tag="vocabulary",
                               print_tag=False)
        X = load_data(file_name=arg.X_name, load_path=arg.dspath, tag="X", print_tag=False)
        corpus = load_data(file_name=arg.text_name, load_path=arg.dspath, tag="X (a list of strings)",
                           print_tag=False)
        data = [[dictionary[i] for i, j in item] for item in corpus]
        M = None
        features = None
        if arg.use_supplement:
            M = load_data(file_name=arg.M_name, load_path=arg.dspath, tag="supplementary components")
            M = M.toarray()
        if arg.use_features:
            features = load_data(file_name=arg.features_name, load_path=arg.dspath, tag="features")
        if arg.soap:
            print('\n{0})- Evaluating SOAP model...'.format(steps))
            steps = steps + 1
            model_name = 'soap_' + arg.model_name + '.pkl'
            file_name = 'soap_' + arg.model_name + '_score.txt'
            print('\t>> Loading SOAP model...')
            model = load_data(file_name=model_name, load_path=arg.mdpath, tag='SOAP model',
                              print_tag=False)
            score = model.predictive_distribution(X=X, M=M, features=features,
                                                  cal_average=arg.cal_average, batch_size=arg.batch,
                                                  num_jobs=arg.num_jobs)
            print("\t>> Average log predictive score: {0:.4f}".format(score))
            save_data(data="# Average log predictive score: {0:.10f}\n".format(score),
                      file_name=file_name, save_path=arg.rspath, tag="log predictive score", mode='w',
                      w_string=True, print_tag=False)
            components = np.argsort(-model.phi)[:, :arg.top_k]
            components = [[dictionary[i] for i in item] for item in components]
            for cr in ['u_mass', 'c_v', 'c_uci', 'c_npmi']:
                cm = CoherenceModel(texts=data, topics=components, corpus=corpus, dictionary=dictionary,
                                    coherence=cr)
                coherence = cm.get_coherence()
                print("\t>> Average coherence ({0}) score: {1:.4f}".format(cr, coherence))
                save_data(data="# Average coherence ({0}) score: {1:.4f}\n".format(cr, coherence),
                          file_name=file_name, save_path=arg.rspath, tag="coherence score", mode='a',
                          w_string=True, print_tag=False)
        if arg.spreat:
            print('\n{0})- Evaluating SPREAT model...'.format(steps))
            steps = steps + 1
            model_name = 'spreat_' + arg.model_name + '.pkl'
            file_name = 'spreat_' + arg.model_name + '_score.txt'
            print('\t>> Loading SPREAT model...')
            model = load_data(file_name=model_name, load_path=arg.mdpath, tag='SPREAT model',
                              print_tag=False)
            score = model.predictive_distribution(X=X, M=M, features=features,
                                                  cal_average=arg.cal_average, batch_size=arg.batch,
                                                  num_jobs=arg.num_jobs)
            print("\t>> Average log predictive score: {0:.4f}".format(score))
            save_data(data="# Average log predictive score: {0:.10f}\n".format(score),
                      file_name=file_name, save_path=arg.rspath, tag="log predictive score", mode='w',
                      w_string=True, print_tag=False)
            components = np.argsort(-model.phi)[:, :arg.top_k]
            components = [[dictionary[i] for i in item] for item in components]
            for cr in ['u_mass', 'c_v', 'c_uci', 'c_npmi']:
                cm = CoherenceModel(texts=data, topics=components, corpus=corpus, dictionary=dictionary,
                                    coherence=cr)
                coherence = cm.get_coherence()
                print("\t>> Average coherence ({0}) score: {1:.4f}".format(cr, coherence))
                save_data(data="# Average coherence ({0}) score: {1:.4f}\n".format(cr, coherence),
                          file_name=file_name, save_path=arg.rspath, tag="coherence score", mode='a',
                          w_string=True, print_tag=False)
        if arg.ctm:
            print('\n{0})- Evaluating CTM model...'.format(steps))
            steps = steps + 1
            model_name = 'ctm_' + arg.model_name + '.pkl'
            file_name = 'ctm_' + arg.model_name + '_score.txt'
            print('\t>> Loading CTM model...')
            model = load_data(file_name=model_name, load_path=arg.mdpath, tag='CTM model',
                              print_tag=False)
            score = model.predictive_distribution(X=X, cal_average=arg.cal_average, batch_size=arg.batch,
                                                  num_jobs=arg.num_jobs)
            print("\t>> Average log predictive score: {0:.4f}".format(score))
            save_data(data="# Average log predictive score: {0:.10f}\n".format(score),
                      file_name=file_name, save_path=arg.rspath, tag="log predictive score", mode='w',
                      w_string=True, print_tag=False)
            components = np.argsort(-model.omega)[:, :arg.top_k]
            components = [[dictionary[i] for i in item] for item in components]
            for cr in ['u_mass', 'c_v', 'c_uci', 'c_npmi']:
                cm = CoherenceModel(texts=data, topics=components, corpus=corpus, dictionary=dictionary,
                                    coherence=cr)
                coherence = cm.get_coherence()
                print("\t>> Average coherence ({0}) score: {1:.4f}".format(cr, coherence))
                save_data(data="# Average coherence ({0}) score: {1:.4f}\n".format(cr, coherence),
                          file_name=file_name, save_path=arg.rspath, tag="coherence score", mode='a',
                          w_string=True, print_tag=False)
        if arg.lda:
            print('\n{0})- Evaluating LDA model...'.format(steps))
            steps = steps + 1
            model_name = 'sklda_' + arg.model_name + '.pkl'
            file_name = 'sklda_' + arg.model_name + '_score.txt'
            print('\t>> Loading LDA model...')
            model = load_data(file_name=model_name, load_path=arg.mdpath, tag='LDA model',
                              print_tag=False)
            model.components_ /= model.components_.sum(1)[:, np.newaxis]
            component_distribution = model.transform(X=X)
            score = 0.0
            for idx in np.arange(X.shape[0]):
                feature_idx = X[idx].indices
                temp = np.multiply(component_distribution[idx][:, np.newaxis],
                                   model.components_[:, feature_idx])
                score += np.sum(temp)
            if arg.cal_average:
                score = score / X.shape[0]
            score = np.log(score + np.finfo(float).eps)
            print("\t>> Average log predictive score: {0:.4f}".format(score))
            save_data(data="# Average log predictive score: {0:.10f}\n".format(score),
                      file_name=file_name, save_path=arg.rspath, tag="log predictive score", mode='w',
                      w_string=True, print_tag=False)
            components = np.argsort(-model.components_)[:, :arg.top_k]
            components = [[dictionary[i] for i in item] for item in components]
            for cr in ['u_mass', 'c_v', 'c_uci', 'c_npmi']:
                cm = CoherenceModel(texts=data, topics=components, corpus=corpus, dictionary=dictionary,
                                    coherence=cr)
                coherence = cm.get_coherence()
                print("\t>> Average coherence ({0}) score: {1:.4f}".format(cr, coherence))
                save_data(data="# Average coherence ({0}) score: {1:.4f}\n".format(cr, coherence),
                          file_name=file_name, save_path=arg.rspath, tag="coherence score", mode='a',
                          w_string=True, print_tag=False)

    ##########################################################################################################
    ######################                      TRANSFORM                              ######################
    ##########################################################################################################
    if arg.transform:
        print('\t>> Loading files...')
        X = load_data(file_name=arg.X_name, load_path=arg.dspath, tag="X", print_tag=False)
        M = None
        features = None
        if arg.use_supplement:
            M = load_data(file_name=arg.M_name, load_path=arg.dspath, tag="supplementary components")
            M = M.toarray()
        if arg.use_features:
            features = load_data(file_name=arg.features_name, load_path=arg.dspath, tag="features")
        if arg.soap:
            print('\n{0})- Transforming {1} using a pre-trained SOAP model...'.format(steps, arg.X_name))
            steps = steps + 1
            model_name = 'soap_' + arg.model_name + '.pkl'
            file_name = 'soap_' + arg.file_name + '.pkl'
            print('\t>> Loading SOAP model...')
            model = load_data(file_name=model_name, load_path=arg.mdpath, tag='SOAP model',
                              print_tag=False)
            X = model.transform(X=X, M=M, features=features, batch_size=arg.batch, num_jobs=arg.num_jobs)
            save_data(data=X, file_name=file_name, save_path=arg.dspath, tag="transformed X", mode='wb',
                      print_tag=True)
        if arg.spreat:
            print('\n{0})- Transforming {1} using a pre-trained SPREAT model...'.format(steps, arg.X_name))
            steps = steps + 1
            model_name = 'spreat_' + arg.model_name + '.pkl'
            file_name = 'spreat_' + arg.file_name + '.pkl'
            print('\t>> Loading SPREAT model...')
            model = load_data(file_name=model_name, load_path=arg.mdpath, tag='SPREAT model',
                              print_tag=False)
            X = model.transform(X=X, M=M, features=features, batch_size=arg.batch, num_jobs=arg.num_jobs)
            save_data(data=X, file_name=file_name, save_path=arg.dspath, tag="transformed X", mode='wb',
                      print_tag=True)
        if arg.ctm:
            print('\n{0})- Transforming {1} using a pre-trained CTM model...'.format(steps, arg.X_name))
            steps = steps + 1
            model_name = 'ctm_' + arg.model_name + '.pkl'
            file_name = 'ctm_' + arg.file_name + '.pkl'
            print('\t>> Loading CTM model...')
            model = load_data(file_name=model_name, load_path=arg.mdpath, tag='CTM model',
                              print_tag=False)
            X = model.transform(X=X, batch_size=arg.batch, num_jobs=arg.num_jobs)
            save_data(data=X, file_name=file_name, save_path=arg.dspath, tag="transformed X", mode='wb',
                      print_tag=True)
        if arg.lda:
            print('\n{0})- Transforming {1} using a pre-trained LDA model...'.format(steps, arg.X_name))
            steps = steps + 1
            model_name = 'sklda_' + arg.model_name + '.pkl'
            file_name = 'sklda_' + arg.file_name + '.pkl'
            print('\t>> Loading LDA model...')
            model = load_data(file_name=model_name, load_path=arg.mdpath, tag='LDA model',
                              print_tag=False)
            X = model.transform(X=X)
            save_data(data=X, file_name=file_name, save_path=arg.dspath, tag="transformed X", mode='wb',
                      print_tag=True)
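
# A minimal, toy-sized sketch of the quantity the LDA evaluation branch above computes:
# for each document d, sum theta_d . phi[:, w] over the words w present in d, optionally
# average over documents, then take the log. The random theta/phi and the toy document-term
# matrix below are stand-ins for the fitted sklearn model and the real X.
def _demo_lda_log_predictive_score():
    from scipy.sparse import csr_matrix
    rng = np.random.RandomState(1)
    num_docs, vocab, K = 4, 6, 3
    X = csr_matrix((rng.rand(num_docs, vocab) > 0.5).astype(int))  # toy doc-term counts
    phi = rng.dirichlet(np.ones(vocab), size=K)                    # K x vocab, rows sum to 1
    theta = rng.dirichlet(np.ones(K), size=num_docs)               # num_docs x K
    score = 0.0
    for idx in np.arange(X.shape[0]):
        feature_idx = X[idx].indices                               # words present in doc idx
        score += np.sum(theta[idx][:, np.newaxis] * phi[:, feature_idx])
    score = np.log(score / X.shape[0] + np.finfo(float).eps)
    print('toy average log predictive score: {0:.4f}'.format(score))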