def load(args):
    """Load the needed data according to the arguments given.

    If needed, given elements are removed or kept as the only ones
    preserved.

    Args:
        args (object): An argparse Namespace that holds the command-line
            arguments as its attributes.

    Returns:
        names (array): The names of the data in the rows
        x (array): The data matrix
    """
    # BUG FIX: the original tested `args.information is None` and forwarded
    # dimensions=args.information inside THAT branch, i.e. it only ever
    # passed dimensions=None and dropped any real value. The condition is
    # inverted here so `dimensions` is forwarded exactly when one was given.
    if args.information is not None:
        names, x = ld.load_features(args.path,
                                    args.file,
                                    normalize=args.no_normalize,
                                    add_information=args.add_information,
                                    expand=args.expand,
                                    degree=args.degree,
                                    dimensions=args.information)
    else:
        names, x = ld.load_features(args.path,
                                    args.file,
                                    normalize=args.no_normalize,
                                    add_information=args.add_information,
                                    expand=args.expand,
                                    degree=args.degree)

    # Optionally keep only (or remove) molecules containing the listed
    # elements, depending on args.remove.
    if args.elements is not None:
        names, x = ei.select_elements_from_data(names, x, args.elements,
                                                args.remove)

    return names, x
def prepare_data(path, filename_features, filename_energies, normalize=True,
                 expand=False, degree=2, remove=False, elements=None):
    """Load and prepare the data necessary for the plotting.

    Args:
        path (str): path containing the data
        filename_features (str): name of the features data set file
        filename_energies (str): name of file for the energies associated
            to the molecules
        normalize (bool): if True, normalizes the data matrix (features)
        expand (bool): if True, makes column-wise expansion of the data
            matrix (features)
        degree (int): degree of expansion. Only works if expand=True
        remove (bool): if True, removes from the data the molecules
            containing the elements in the list `elements`
        elements (list): if remove=True, then all molecules with elements
            specified in this list will be removed from the data. Defaults
            to no filtering.

    Returns:
        names: names of the loaded molecules
        x: data matrix (features)
        energies: Delta G(Rxn A) energies of the molecules
        not_normalized_data: non normalized data matrix (features)
        min_energy: minimum present in all energies
        max_energy: maximum present in all energies

    Note:
        All the data is shuffled (but preserving the order between names,
        x, energies and not_normalized_data).
    """
    # BUG FIX: `elements=[]` was a mutable default argument; use None as
    # the sentinel instead (behaviour is identical: both skip filtering).
    if elements is None:
        elements = []

    names, x = ld.load_features(path, filename_features, expand=expand,
                                degree=degree, normalize=normalize)
    _, not_normalized_data = ld.load_features(path, filename_features,
                                              expand=False, normalize=False)
    energies, energies_names = ld.load_energies(path, filename_energies)

    min_energy = np.min(energies)
    max_energy = np.max(energies)

    if elements:
        # BUG FIX: the original filtered `names`/`x` first and then reused
        # the ALREADY-FILTERED `names` to filter `not_normalized_data`,
        # whose rows still correspond to the original, unfiltered names.
        # Keep the unfiltered names around for the second selection.
        all_names = names
        names, x = ei.select_elements_from_data(names, x, elements,
                                                remove=remove)
        _, not_normalized_data = ei.select_elements_from_data(
            all_names, not_normalized_data, elements, remove=remove)
        energies_names, energies = ei.select_energies_from_data(
            energies_names, energies, elements, remove=remove)

    # One shared permutation keeps rows of all arrays aligned.
    indices = np.random.permutation(names.shape[0])
    return (names[indices], x[indices], energies[indices],
            not_normalized_data[indices], min_energy, max_energy)
"""Grid search over (k best features, hidden-layer size, momentum) using
cross-validation; averages the per-fold scores for each combination."""

import cross_validation
import load_data
import pandas as pd

# Input files.
data_path = 'Data/białaczka.XLS'
features_path = 'Data/features.txt'

# Load the feature list, then the data matrix X and labels y.
features = load_data.load_features(features_path)
(X, y) = load_data.load_data_from_files(data_path, features)

# Hyper-parameter grid.
k_best_features = [10, 15, 20]
neurons_in_hidden_layer = [32, 64, 256]
momentum = [0, 0.9]

# Results table.
# NOTE(review): the visible code computes `average` but never inserts a row
# into `df` — a trailing df.loc/append statement may have been lost; confirm.
df = pd.DataFrame(columns=[
    'Best features', 'Neurons in hidden layer', 'Momentum', 'Average'
])

# Iterate the grid values directly instead of `range(len(...))` index loops.
for k_best in k_best_features:
    for neurons in neurons_in_hidden_layer:
        for mom in momentum:
            scores = cross_validation.run_crossvalid(X, y, k_best,
                                                     neurons, mom)
            # Guard against an empty score list (ZeroDivisionError).
            average = sum(scores) / len(scores) if scores else 0
"""Driver script: benchmark kNN cross-validation parameter sets on the
loaded samples, then run a statistical analysis over the scores."""

import load_data
import select_features
import cross_validation
import model_benchmark
import statistical_analysis

# Benchmark configuration.
no_of_crossvalid_runs = 2
no_of_folds = 5

# The feature list lives in a text file instead of being hard-coded here,
# so it can be edited without touching the Python code.
features = load_data.load_features()

# Scan the data directory (file names are hard-coded inside load_data) and
# gather every sample into one big features table plus diagnosis labels.
(X_features, Y_diagnosis) = load_data.load_data_from_files(features)

# Example: build and print a feature ranking over all samples.
#feature_ranking = select_features.create_feature_ranking(X_features, Y_diagnosis)
#print(feature_ranking)

# Example: a single cross-validation run with sample parameters.
#score = cross_validation.run_crossvalid(X_features, Y_diagnosis, 2, 3, 5, 'manhattan', 420)
#print(score)

# Benchmark several kNN parameter sets, then analyse the collected scores.
run_results = model_benchmark.run(X_features, Y_diagnosis,
                                  no_of_crossvalid_runs, no_of_folds)
print(run_results)

analysis_result = statistical_analysis.run(run_results,
                                           no_of_crossvalid_runs,
                                           no_of_folds)
print(analysis_result)