import glob
import os

import numpy as np
import pandas as pd

import features  # project-local module providing FeatureConstructor


def collect_data(file_path):
    """Concatenate every split's classification-results CSV into one dataframe."""
    os.chdir(file_path)
    extension = 'csv'
    all_files = glob.glob('split_*_classification_results.{}'.format(extension))
    print(all_files)

    # Column layout: index, one column per attribute, then one column per
    # (classifier, metric) combination.
    attributes_calculator = features.FeatureConstructor()
    ordered_attributes_list = list(attributes_calculator.attributes_map.keys())
    col = ["index"] + ordered_attributes_list
    metric_columns = np.array(
        [[clf + metric for metric in ["Precision", "Recall", "F1", "AUC"]]
         for clf in ["RF_", "SVM_", "KNN_"]]).flatten()
    col.extend(metric_columns)
    print(ordered_attributes_list)

    df = pd.concat([
        pd.read_csv(f, sep=",", index_col=0, skiprows=1, names=col)
        for f in all_files
    ])
    df = df.reset_index()
    df = df[col[1:]]
    return df
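# Illustrative usage (the directory name below is a placeholder, not taken from
# the original code): aggregate every split's results from one run and average
# one of the metric columns built above, e.g. "RF_AUC".
results_df = collect_data("results/run_01")
print(results_df["RF_AUC"].mean())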
def calculate_classification_dataset(self):
    """
    Calculates the attributes for each example of the sample, and returns it
    as a matrix ready for applying the classification algorithms, in order
    to perform the link prediction.
    """
    attributes_calculator = features.FeatureConstructor(self.graph_training)

    if self.attributes_list == {}:
        # No attributes requested: default to every attribute the feature
        # constructor knows about, each with no extra parameters.
        self.ordered_attributes_list = sorted(
            attributes_calculator.attributes_map.keys())
        for attribute in self.ordered_attributes_list:
            self.attributes_list[attribute] = {}

    # One row per sampled pair: the attribute columns, then the class label
    # (column -2) and the fold id (column -1).
    classification_dataset = np.zeros(
        (self.sample_size, len(self.attributes_list) + 2))

    line_count = 0
    for line in self.sample_dataset:
        first_node, second_node, pair_class, pair_fold = line
        attributes_calculator.set_nodes(first_node, second_node)
        column = 0
        for function in self.ordered_attributes_list:
            parameters = self.attributes_list[function]
            classification_dataset[line_count][column] = \
                attributes_calculator.attributes_map[function](**parameters)
            column += 1
        classification_dataset[line_count][-2] = pair_class
        classification_dataset[line_count][-1] = pair_fold
        line_count += 1

    return classification_dataset
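# Illustrative follow-up (the object name `predictor` is a placeholder, not
# from the original code): the matrix built above keeps the attribute values
# in the leading columns, the class label in column -2 and the fold id in
# column -1, so it can be split like this before handing it to a classifier.
dataset = predictor.calculate_classification_dataset()
X, y, folds = dataset[:, :-2], dataset[:, -2], dataset[:, -1]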
def get_features_inner(self, inp):
    # `inp` is a (split_id, candidate_edges) pair; `args` is assumed to be a
    # module-level argparse namespace whose `f` attribute names the run.
    s = inp[0]
    p_edges = inp[1]

    if not os.path.exists(f'results/{args.f}'):
        os.makedirs(f'results/{args.f}')

    # Truncate (or create) the feature file for this split.
    open(f'results/{args.f}/split_{s}_features.csv', 'w+').close()

    attributes_calculator = features.FeatureConstructor(self.train_graph, self.page_rank)

    # Use every attribute the feature constructor provides, with no extra parameters.
    ordered_attributes_list = list(attributes_calculator.attributes_map.keys())
    attributes_list = {attribute: {} for attribute in ordered_attributes_list}

    # Positive examples: the held-out test edges, labelled 1 in the last column.
    line = 0
    for pair in self.test_edges:
        n1, n2 = pair
        attributes_calculator.set_nodes(n1, n2)
        column_values = np.zeros(len(ordered_attributes_list) + 1)
        column_values[:-1] = attributes_calculator.get_features(pair)
        column_values[-1] = 1
        line += 1
        with open(f'results/{args.f}/split_{s}_features.csv', 'a+') as file:
            np.savetxt(file, [column_values], delimiter=",", fmt='%f')

    # Negative examples: candidate pairs that are neither test nor training
    # edges, labelled 0, capped at the number of positive examples.
    c = 0
    for pair in p_edges:
        if pair in self.test_edges or pair in self.train_edges:
            continue
        n1, n2 = pair
        attributes_calculator.set_nodes(n1, n2)
        column_values = np.zeros(len(ordered_attributes_list) + 1)
        column_values[:-1] = attributes_calculator.get_features(pair)
        column_values[-1] = 0
        line += 1
        with open(f'results/{args.f}/split_{s}_features.csv', 'a+') as file:
            np.savetxt(file, [column_values], delimiter=",", fmt='%f')
        c += 1
        if c >= len(self.test_edges):
            break

    return 1
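# Illustrative call (everything named here is an assumption: `extractor` is an
# instance of the class defining get_features_inner, and `splits` is a list of
# (split_id, candidate_edge_list) tuples). Packing both arguments into a single
# tuple suggests the method is meant to be mapped over splits, e.g. in parallel.
from multiprocessing import Pool

with Pool() as pool:
    pool.map(extractor.get_features_inner, splits)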
def collect_data(file_path):
    """Concatenate every split's feature CSV into one labelled dataframe."""
    os.chdir(file_path)
    extension = 'csv'
    all_files = glob.glob('*_features.{}'.format(extension))
    print(all_files)

    # The files have no header: name the columns after the attributes, in the
    # order the feature constructor produces them, plus the class label.
    attributes_calculator = features.FeatureConstructor()
    ordered_attributes_list = list(attributes_calculator.attributes_map.keys())
    ordered_attributes_list.append("class")
    print(ordered_attributes_list)

    df = pd.concat([
        pd.read_csv(f, sep=",", names=ordered_attributes_list)
        for f in all_files
    ])
    return df
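# Illustrative usage (the folder name is a placeholder and scikit-learn is an
# assumption, not a dependency stated in the original code): load every split's
# features and fit a simple baseline classifier on the "class" column.
from sklearn.ensemble import RandomForestClassifier

feature_df = collect_data("results/run_01")
X, y = feature_df.drop(columns=["class"]), feature_df["class"]
clf = RandomForestClassifier().fit(X, y)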
def set_classification_dataset(self):
    """
    Calculates the attributes for each example of the sample, and returns it
    as a matrix ready for applying the classification algorithms, in order
    to perform the link prediction.
    """
    self.classification_dataset = np.zeros(
        (self.sample_size, len(self.attributes_list) + 2))
    line = 0
    attributes_calculator = features.FeatureConstructor(self.graph_training)

    for edge in self.positive_examples.union(self.negative_examples):
        first_node, second_node = edge
        attributes_calculator.set_nodes(first_node, second_node)
        pair_class = 0 if edge in self.negative_examples else 1
        column = 0
        for function in self.ordered_attributes_list:
            parameters = self.attributes_list[function]
            self.classification_dataset[line][column] = \
                attributes_calculator.attributes_map[function](**parameters)
            column += 1
        self.classification_dataset[line][-2] = pair_class
        line += 1

    self.normalize_attributes()
    self.set_dataset_folds()
    return self.classification_dataset
def expand_data(new_species_xlsx, output_hdf5, species_df_key, rxn_df_key,
                elements_csv, bonds_csv, new_xlsx_path):
    """
    Helps to inject new data into the species dataframe as more CIDs are
    fetched manually.

    :param new_species_xlsx: new species xlsx file which stores newly fetched CIDs
    :param output_hdf5: output HDF5 file that houses the species df
    :param species_df_key: species df key in output_hdf5
    :param rxn_df_key: reactions df key in output_hdf5
    :param elements_csv: elements CSV used by FeatureConstructor
    :param bonds_csv: bonds CSV used by FeatureConstructor
    :param new_xlsx_path: path of the new reactions xlsx written for ML training
    :return: new data for ML experiments
    """
    # Read the xlsx file containing newly fetched PubChem IDs into a pandas df
    new_df_from_xlsx = pd.read_excel(new_species_xlsx, header=0)

    # Read the old species dataframe to which the new PubChem IDs are transferred
    old_df_from_hdf = pd.read_hdf(output_hdf5, species_df_key)

    # Index by species name for efficient lookups
    old_df_from_hdf = old_df_from_hdf.reset_index()
    old_df_from_hdf = old_df_from_hdf.set_index(keys="Species", verify_integrity=True)

    # Initialize FeatureConstructor
    my_constructor = ft.FeatureConstructor(elements_csv, bonds_csv)

    # Transfer CIDs, add BondsInfo (stringified PubChem JSON) and species feature vectors
    new_species_count = 0
    for idx, row in new_df_from_xlsx.iterrows():
        if not math.isnan(row['CID']) and row['CID'] != "":
            if math.isnan(old_df_from_hdf.at[row['Species'], 'CID']) or \
                    old_df_from_hdf.at[row['Species'], 'CID'] == "":
                old_df_from_hdf.at[row['Species'], 'CID'] = row['CID']
                pubchem_str_json = my_constructor.get_full(row['CID'])
                print("--Data fetched for CID {}--".format(int(row['CID'])))
                old_df_from_hdf.at[row['Species'], 'BondsInfo'] = pubchem_str_json
                old_df_from_hdf.at[row['Species'], 'FeatureVector'] = \
                    my_constructor.bonds_count_json(None, pubchem_str_json)
                new_species_count += 1

    print('--Status--')
    print('--{} New Species Added--'.format(new_species_count))

    if new_species_count == 0:
        print('No new changes were made as there were no new species to add.')
        return

    # Update the HDF store with the updated species df
    old_df_from_hdf = old_df_from_hdf.reset_index()
    old_df_from_hdf = old_df_from_hdf.set_index(keys="SID", verify_integrity=True)
    old_df_from_hdf.to_hdf(output_hdf5, species_df_key)

    # Update the reactions df with the new CID list
    rm.RecordMapper.map_rid_to_cid(output_hdf5, rxn_df_key, species_df_key)

    # Filter out the reactions whose feature vectors can be calculated
    reduced_rxn_df = Extender.get_rxn_subset(output_hdf5, rxn_df_key)

    # Create feature vectors for the filtered reactions
    reduced_rxn_df = my_constructor.bond_brk(output_hdf5, species_df_key, reduced_rxn_df)
    print('--Status--')
    print('--Reactions Feature Vectors Created--')

    # Write the new reactions xlsx for ML training
    reduced_rxn_df.to_excel(new_xlsx_path)
    print('--Status--')
    print('--Database Expansion Routine Complete--')
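# Illustrative invocation (every path and key below is a hypothetical
# placeholder, not taken from the original project): transfer newly fetched
# CIDs into the HDF5 store and regenerate the reactions xlsx for training.
expand_data(
    new_species_xlsx="data/new_species.xlsx",
    output_hdf5="data/database.h5",
    species_df_key="species",
    rxn_df_key="reactions",
    elements_csv="data/elements.csv",
    bonds_csv="data/bonds.csv",
    new_xlsx_path="data/reactions_for_ml.xlsx",
)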