Example #1
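    # Build an index of speaker audio files under dir_path, cap the speaker
    # set at num_speakers, and split each speaker's files into train/dev
    # subsets according to val_frac.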
    def __init__(self,
                 dir_path,
                 num_speakers=100,
                 frames=64,
                 silence=False,
                 val_frac=0.2,
                 id="",
                 pkl_dir=""):
        self.frames = frames
        self.num_speakers = num_speakers
        self.val_frac = val_frac
        self.id = id
        self.pkl_dir = os.path.join(pkl_dir, self.id)
        self.silence = silence

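        # Collect the audio files whose path contains this dataset id and map
        # each distinct speaker (the file's parent directory name) to an
        # integer label; shuffling randomizes which speakers survive the cap.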
        files = list(
            list_files(dir_path,
                       lambda x: audio_predicate(x) and x.find(self.id) != -1))
        speakers = list(
            set([os.path.basename(os.path.split(f)[0]) for f in files]))
        random.shuffle(speakers)
        speakers = {s: i for i, s in enumerate(speakers[:num_speakers])}

        self.speakers = speakers
        logging.info("Speakers: {}".format(self.speakers))
        logging.info("Number of speakers: {}".format(len(self.speakers)))

        speaker_files, speaker_files_count, cnt = {}, defaultdict(float), 0
        for f in files:
            speaker = os.path.basename(os.path.split(f)[0])
            if speaker in self.speakers:
                speaker_files[cnt] = f
                speaker_files_count[speaker] += 1.0
                cnt += 1
        self.speaker_files = speaker_files
        logging.info("Number of speaker files: {}".format(
            len(self.speaker_files)))

        speaker_train_files, speaker_dev_files = [], []
        speaker_curr_count, featurized = defaultdict(float), {}
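        # Per-speaker split: the first (1 - val_frac) fraction of each
        # speaker's files goes to train, the remainder to dev; every file
        # starts out marked as not yet featurized.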
        for i, f in self.speaker_files.items():
            speaker = os.path.basename(os.path.split(f)[0])
            if speaker_curr_count[speaker] < (
                    1.0 - self.val_frac) * speaker_files_count[speaker]:
                speaker_train_files.append(i)
                featurized[i] = False
                speaker_curr_count[speaker] += 1.0
            else:
                speaker_dev_files.append(i)
                featurized[i] = False
                speaker_curr_count[speaker] += 1.0
        self.speaker_train_files = speaker_train_files
        self.speaker_dev_files = speaker_dev_files
        self.featurized = featurized
        self.dim = 39

        logging.info("Train Speaker Files: {}".format(
            self.speaker_train_files))
        logging.info("Dev Speaker Files: {}".format(self.speaker_dev_files))
        logging.info("Input dimensions: {}".format(self.dim))
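
A minimal usage sketch, assuming the initializer above belongs to a dataset class (hypothetically named SpeakerDataset here; only __init__ is shown) and that dir_path holds one subdirectory of audio files per speaker:

    # Hypothetical class name and paths, for illustration only.
    dataset = SpeakerDataset("/data/speakers",
                             num_speakers=50,
                             frames=64,
                             val_frac=0.2)
    print(len(dataset.speaker_train_files), len(dataset.speaker_dev_files))
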
def download_compranet(years):
  """
  Download Compranet data for a list of years, unzip the files and convert
  the XLS files to CSV.

  :param years:
    The years for which to download data
  :type years:
    List of strings
  :returns:
    None
  """
  
  tmp_folder = os.path.join(settings.folder_full_cache, 'tmp')
  check_create_folder(tmp_folder)

  for year in years:
    file_name = os.path.join(settings.fn_prefix + year + settings.fn_extension)
    src_url = settings.compranet_base_url + file_name

    print("Downloading %s" % file_name)
    download(url=src_url, path=tmp_folder)

    file_path = os.path.join(tmp_folder, file_name)
    with zipfile.ZipFile(file_path, 'r') as myzip:
      myzip.extractall(tmp_folder)

  pattern = os.path.join(tmp_folder, '*.xls*')

  for src_file in list_files(pattern):
    csv_path = os.path.join(settings.folder_full_cache, get_filename(src_file) + '.csv')
    wb = xlrd.open_workbook(src_file)
    sheet = wb.sheet_by_index(0)

    # unicodecsv emits encoded bytes, so open the target file in binary mode
    with open(csv_path, 'wb') as csvfile:
      writer = unicodecsv.writer(csvfile, encoding='utf-8')
      for rownum in xrange(sheet.nrows):
        writer.writerow(sheet.row_values(rownum))

  remove_folder(tmp_folder)
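
A hedged call sketch (the years must be strings, since they are concatenated directly into the remote file name):

  # Assumes the settings module is configured as used above.
  download_compranet(['2012', '2013'])
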
def main(args):
  """
  Main function - launches the program.

  :param args:
    The parser arguments
  :type args:
    Parser object
  :returns:
    A list holding a status message and an exit code
  :example:
    ["Prepared and cleaned the files from the Compranet site.", 0]
  """
  
  if args:

    if args.sample:
      source_folder = settings.folder_sample_data
    
    else:
      # Use cached versions of the source data in csv format
      source_folder = settings.folder_full_cache      
      check_create_folder(source_folder)
      
      if args.download:
        clean_folder(source_folder)
        download_compranet(settings.years)
        
    # Check if there are CSV files in the sample folder
    pattern = os.path.join(source_folder, '*.csv')
    source_data = list_files(pattern)

    if source_data:

      print("About to clean the data")
      clean_df = clean.clean_csv(source_data)

      print("About to store it in OCDS format")
      ocds.generate_json(clean_df)

    else:
      return ["No source data found. Make sure there is at least one CSV file in " + source_folder, 1]

    return ["Prepared and cleaned the files from the Compranet site.", 0]
Example #4
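# End-to-end pipeline: build per-pose interaction graphs from PDB poses,
# derive Laplacian/adjacency statistics from them, and then train
# classification or regression models on the resulting graph features.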
def main():

    start = time.time()

    param_args = parseyaml()

    pipe = False
    try_all = True

    # Default to a single CPU when the YAML config does not provide one
    # (later calls read the value from param_args, not a local variable)
    if 'cpus' not in param_args:
        param_args['cpus'] = 1

    if 'path_graph' not in param_args or pipe:
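        # Graph-building phase: parse each pose, select the binding pocket
        # around the ligand, weight the atom/element subgraphs with the decay
        # function, and dump Laplacian/adjacency statistics to CSV.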

        pose_list = list_files(param_args['path'])

        for pose in pose_list:

            pose_store, prody_parsed = PDBParser(path=param_args['path'] + pose)
            selected_protein, selected_ligand = binding_pocket_selection(
                pose_store=pose_store, p=prody_parsed, ligand_name=param_args['ligand_name'], selection_radius=param_args['selection_radius'], center=param_args['center'])

            if param_args['nodes'] == 'atoms':
                ligand_path = ligand_parse_write(
                    path=param_args['path'] + pose, out=param_args['output'], lig_name=param_args['ligand_name'])
                selected_ligand_at = ligand_atom_type_calc(
                    ligand=selected_ligand, ligand_path=ligand_path)
                interactions, atom_types, ligand_atom_types, protein_atom_types = atomTypesDistanceCalc(
                    binding_pocket=selected_protein, ligand=selected_ligand_at)
                final_weights, atom_combinations = atomSubgraphsWeights(atom_interactions=interactions, types=atom_types, decay_function=param_args['decay_function'],
                                                                        ligand_atom_types=ligand_atom_types, protein_atom_types=protein_atom_types)
            elif param_args['nodes'] == 'elements':
                interactions, elements, ligand_elements, protein_elements = elementsDistanceCalc(
                    binding_pocket=selected_protein, ligand=selected_ligand)
                final_weights, atom_combinations = elementSubgraphsWeights(atom_interactions=interactions, types=elements, decay_function=param_args['decay_function'],
                                                                           ligand_atom_types=ligand_elements, protein_atom_types=protein_elements)

            L_mat = laplacianMatrix(weights=final_weights,
                                    atom_combinations=atom_combinations)
            A_mat = adjacencyMatrix(weights=final_weights,
                                    atom_combinations=atom_combinations)

            LP = laplacianStats(matrices=L_mat, pose=pose)
            AD = adjacencyStats(matrices=A_mat, pose=pose)

            statsToCsv(laplacian_statistics=LP, adjacency_statistics=AD,
                       decay_function=param_args['decay_function'], nodes=param_args['nodes'], name=param_args['run_name'], out=param_args['output'])

    if 'path_graph' in param_args or pipe:
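        # Learning phase: read the graph-feature table (or take it from the
        # pipeline) and fit the requested models.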

        if not pipe:

            data = read_graph_data(path=param_args['path_graph'])

        elif pipe:

            data = append_targets(name=param_args['run_name'], out=param_args['output'],
                                  target=param_args['target'], decay_function=param_args['decay_function'])

        data = dataset_preparation(data=data)

        LM = LearningModels()

        if param_args['task'] == 'classification':
            train, activity_train, test, activity_test = data_splitting_classification(
                data=data, test_size=param_args['test_size'], seed=param_args['seed'])

            if 'algorithm' in param_args:

                if param_args['algorithm'] == 'GBC':

                    best_lr, best_train_acc, best_test_acc = LM.GBC_optimization(
                        train=train, activity_train=activity_train, test=test, activity_test=activity_test)
                    cf, report, MCC = LM.GBC(train=train, activity_train=activity_train,
                                             test=test, activity_test=activity_test, best_lr=best_lr)

                    classification_outputs = {'GBC_optimization': [best_lr, best_train_acc, best_test_acc],
                                              'GBC': [cf, report, MCC]}

                    write_classification_report_GBR(
                        classification_outputs=classification_outputs, out_file=param_args['output'] + 'classification_report.out')

                elif param_args['algorithm'] == 'XGBC':

                    XGB_accuracy, cf_XGB, report_XGB, MCC_XGB = LM.XGBoost(
                        train=train, activity_train=activity_train, test=test, activity_test=activity_test)

                    classification_outputs = {
                        'XGB': [XGB_accuracy, cf_XGB, report_XGB, MCC_XGB]}

                    write_classification_report_XGBR(
                        classification_outputs=classification_outputs, out_file=param_args['output'] + 'classification_report.out')

                elif param_args['algorithm'] == 'LGBC':

                    lgb_accuracy, cf_lgb, report_lgb, MCC_lgb = LM.LigthGB(
                        train=train, activity_train=activity_train, test=test, activity_test=activity_test)

                    classification_outputs = {
                        'LGBC': [lgb_accuracy, cf_lgb, report_lgb, MCC_lgb]}

                    write_classification_report_LGBR(
                        classification_outputs=classification_outputs, out_file=param_args['output'] + 'classification_report.out')

            else:

                best_lr, best_train_acc, best_test_acc = LM.GBC_optimization(
                    train=train, activity_train=activity_train, test=test, activity_test=activity_test)
                cf, report, MCC = LM.GBC(train=train, activity_train=activity_train,
                                         test=test, activity_test=activity_test, best_lr=best_lr)

                XGB_accuracy, cf_XGB, report_XGB, MCC_XGB = LM.XGBoost(
                    train=train, activity_train=activity_train, test=test, activity_test=activity_test)

                lgb_accuracy, cf_lgb, report_lgb, MCC_lgb = LM.LigthGB(
                    train=train, activity_train=activity_train, test=test, activity_test=activity_test)

                classification_outputs = {'GBC_optimization': [best_lr, best_train_acc, best_test_acc],
                                          'GBC': [cf, report, MCC], 'XGB': [XGB_accuracy, cf_XGB, report_XGB, MCC_XGB],
                                          'LGBC': [lgb_accuracy, cf_lgb, report_lgb, MCC_lgb]}

                write_classification_report(classification_outputs=classification_outputs,
                                            out_file=param_args['output'] + 'classification_report.out')

        elif param_args['task'] == 'regression':

            if param_args['algorithm'] == 'ffnn':

                S = Scaler()

                train, bindingEnergy_train, val, bindingEnergy_val, test, bindingEnergy_test, ligand_test, ligandRMSD_test = data_splitting_ffnn(
                    data=data, seed=param_args['seed'])

                train, val, test = S.min_max_scaler_ffnn(train=train, val=val, test=test)

                if param_args['pelePrep'] == 'profile':
                    NN_model = LM.FFNN_profile(train, val, test, param_args['learning_rate'])
                else:
                    NN_model = LM.FFNN_clustering(train, val, test, param_args['learning_rate'])

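                # Keep only the best weights seen so far (by validation loss)
                # and stop early once val_loss has not improved for 5 epochs.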
                checkpoint_name = 'weights.hdf5'
                checkpoint = ModelCheckpoint(checkpoint_name, monitor='val_loss',
                                             verbose=1, save_best_only=True, mode='auto')
                callbacks_list = [tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5), checkpoint]

                history = NN_model.fit(train, bindingEnergy_train, epochs=param_args['epochs'],
                                        batch_size=param_args['batch_size'], validation_data=(val, bindingEnergy_val),
                                        callbacks=callbacks_list)

                results_train = NN_model.evaluate(train, bindingEnergy_train)
                results_val = NN_model.evaluate(val, bindingEnergy_val)
                results_test = NN_model.evaluate(test, bindingEnergy_test)

                train_pred = NN_model.predict(train)
                val_pred = NN_model.predict(val)
                test_pred = NN_model.predict(test)

                r2_train = r2_score(bindingEnergy_train, train_pred)
                r2_val = r2_score(bindingEnergy_val, val_pred)
                r2_test = r2_score(bindingEnergy_test, test_pred)

                results_train.append(r2_train)
                results_val.append(r2_val)
                results_test.append(r2_test)

                results = [results_train, results_val, results_test]

                write_regression_report_ffnn(results, param_args, param_args['output'])

                train_plots(history, param_args['output'])

                if 'ligandRMSD' in list(data.columns):

                    target_plot(test_pred, ligand_test, bindingEnergy_test, ligandRMSD_test, param_args['output'])

            else:

                if 'scaler' in param_args:

                    best_scaler = None
                    S = Scaler()

                    scaling_functions = [(S.standard_scaler, 'standard'),
                                         (S.min_max_scaler, 'min_max'),
                                         (S.power_scaler, 'power'),
                                         (S.quantile_scaler_uniform, 'uniform'),
                                         (S.quantile_scaler_gauss, 'quantile_Gauss'),
                                         (S.robust_scaler, 'robust'),
                                         (S.max_abs_scaler, 'max_abs')]

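                    # 'search' mode: fit a quick GBR (fixed lr=0.25) with each
                    # scaler and keep the one with the best test R2.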
                    if param_args['scaler'] == 'search' and best_scaler is None:

                        scalers_performance = []

                        for scale in scaling_functions:

                            print("---------------------------------------")
                            print("\nApplying {} scaler\n".format(scale[1]))

                            try:
                                # Scale a fresh view each iteration so the
                                # transformations do not accumulate on train/test
                                # (assumes the Scaler methods return copies)
                                scaled_train, scaled_test = scale[0](train=train, test=test)

                            except ValueError:
                                print(
                                    "{} scaler cannot be applied to these data values.\n".format(scale[1]))
                                continue

                            pred, R2_test, MSE, MAE, epoch_pred = LM.GBR(
                                train=scaled_train, bindingEnergy_train=bindingEnergy_train, test=scaled_test, bindingEnergy_test=bindingEnergy_test, best_lr=0.25)

                            scalers_performance.append((R2_test, scale[0], scale[1]))

                        # Higher test R2 is better, so sort descending before
                        # taking the top entry
                        scalers_best_performance = sorted(
                            scalers_performance, key=lambda tup: tup[0], reverse=True)

                        best_scaler = scalers_best_performance[0]

                    if (param_args['scaler'] == 'search' and best_scaler is not None) or (param_args['scaler'] != 'search'):

                        if param_args['scaler'] == 'search':
                            # Reuse the winner of the search; best_scaler is a
                            # (R2, function, name) tuple
                            scale = (best_scaler[1], best_scaler[2])
                        else:
                            for scaler in scaling_functions:
                                if scaler[1] == param_args['scaler']:
                                    scale = scaler
                                    break

                        print("---------------------------------------")
                        print("\nApplying {} scaler\n".format(scale[1]))

                        try:
                            train, test = scale[0](train=train, test=test)

                        except ValueError:
                            print(
                                "{} scaler cannot be applied to these data values.\n".format(scale[1]))

                        if param_args['algorithm'] == 'GBR':

                            best_lr, best_train_R2, best_test_R2 = LM.GBR_optimization(
                                train=train, bindingEnergy_train=bindingEnergy_train, test=test, bindingEnergy_test=bindingEnergy_test)
                            pred, R2_test, MSE, MAE, epoch_pred = LM.GBR(
                                train=train, bindingEnergy_train=bindingEnergy_train, test=test, bindingEnergy_test=bindingEnergy_test, best_lr=best_lr)

                            regression_outputs = {'scaler': scale, 'GBR_optimization': [
                                best_lr, best_train_R2, best_test_R2], 'GBR': [R2_test, MSE, MAE, epoch_pred]}

                            write_regression_report_GBR(
                                regression_outputs=regression_outputs, out_file=param_args['output'] + 'regression_report.out')

                        elif param_args['algorithm'] == 'XGBR':

                            XGBR_best_lr, XGBR_best_train_R2, XGBR_best_test_R2 = LM.XGBR_optimization(
                                train=train, bindingEnergy_train=bindingEnergy_train, test=test, bindingEnergy_test=bindingEnergy_test, cpus=param_args['cpus'])
                            XGBR_pred, XGBR_R2_test, XGBR_MSE, XGBR_MAE = LM.XGBR(
                                train=train, bindingEnergy_train=bindingEnergy_train, test=test, bindingEnergy_test=bindingEnergy_test, best_lr=XGBR_best_lr, cpus=param_args['cpus'])

                            regression_outputs = {'scaler': scale, 'XGBR_optimization': [XGBR_best_lr, XGBR_best_train_R2, XGBR_best_test_R2],
                                                  'XGBR': [XGBR_R2_test, XGBR_MSE, XGBR_MAE]}

                            write_regression_report_XGBR(
                                regression_outputs=regression_outputs, out_file=param_args['output'] + 'regression_report.out')

                        elif param_args['algorithm'] == 'LGBR':

                            LGBR_best_lr, LGBR_best_train_R2, LGBR_best_test_R2 = LM.LGBR_optimization(
                                train=train, bindingEnergy_train=bindingEnergy_train, test=test, bindingEnergy_test=bindingEnergy_test, cpus=param_args['cpus'])
                            LGBR_pred, LGBR_R2_test, LGBR_MSE, LGBR_MAE = LM.LGBR(
                                train=train, bindingEnergy_train=bindingEnergy_train, test=test, bindingEnergy_test=bindingEnergy_test, best_lr=LGBR_best_lr, cpus=param_args['cpus'])

                            regression_outputs = {'scaler': scale, 'LGBR_optimization': [LGBR_best_lr, LGBR_best_train_R2, LGBR_best_test_R2],
                                                  'LGBR': [LGBR_pred, LGBR_R2_test, LGBR_MSE, LGBR_MAE]}

                            write_regression_report_LGBR(
                                regression_outputs=regression_outputs, out_file=param_args['output'] + 'regression_report.out')

                        elif param_args['algorithm'] == 'MLPR':

                            MLPR_pred, MLPR_R2_test, MLPR_MSE, MLPR_MAE, params = LM.MLPR(
                                train=train, bindingEnergy_train=bindingEnergy_train, test=test, bindingEnergy_test=bindingEnergy_test, cpus=param_args['cpus'])

                            regression_outputs = {'scaler': scale, 'MLPR': [
                                params, MLPR_pred, MLPR_R2_test, MLPR_MSE, MLPR_MAE]}

                            write_regression_report_MLPR(
                                regression_outputs=regression_outputs, out_file=param_args['output'] + 'regression_report.out')

                        else:

                            best_lr, best_train_R2, best_test_R2 = LM.GBR_optimization(
                                train=train, bindingEnergy_train=bindingEnergy_train, test=test, bindingEnergy_test=bindingEnergy_test)
                            pred, R2_test, MSE, MAE, epoch_pred = LM.GBR(
                                train=train, bindingEnergy_train=bindingEnergy_train, test=test, bindingEnergy_test=bindingEnergy_test, best_lr=best_lr)

                            XGBR_best_lr, XGBR_best_train_R2, XGBR_best_test_R2 = LM.XGBR_optimization(
                                train=train, bindingEnergy_train=bindingEnergy_train, test=test, bindingEnergy_test=bindingEnergy_test, cpus=param_args['cpus'])
                            XGBR_pred, XGBR_R2_test, XGBR_MSE, XGBR_MAE = LM.XGBR(
                                train=train, bindingEnergy_train=bindingEnergy_train, test=test, bindingEnergy_test=bindingEnergy_test, best_lr=XGBR_best_lr, cpus=param_args['cpus'])

                            LGBR_best_lr, LGBR_best_train_R2, LGBR_best_test_R2 = LM.LGBR_optimization(
                                train=train, bindingEnergy_train=bindingEnergy_train, test=test, bindingEnergy_test=bindingEnergy_test, cpus=param_args['cpus'])
                            LGBR_pred, LGBR_R2_test, LGBR_MSE, LGBR_MAE = LM.LGBR(
                                train=train, bindingEnergy_train=bindingEnergy_train, test=test, bindingEnergy_test=bindingEnergy_test, best_lr=LGBR_best_lr, cpus=param_args['cpus'])

                            MLPR_pred, MLPR_R2_test, MLPR_MSE, MLPR_MAE, params = LM.MLPR(
                                train=train, bindingEnergy_train=bindingEnergy_train, test=test, bindingEnergy_test=bindingEnergy_test, cpus=param_args['cpus'])

                            regression_outputs = {'scaler': scale, 'GBR_optimization': [best_lr, best_train_R2, best_test_R2], 'GBR': [R2_test, MSE, MAE, epoch_pred],
                                                  'XGBR_optimization': [XGBR_best_lr, XGBR_best_train_R2, XGBR_best_test_R2],
                                                  'XGBR': [XGBR_R2_test, XGBR_MSE, XGBR_MAE], 'LGBR_optimization': [LGBR_best_lr, LGBR_best_train_R2, LGBR_best_test_R2],
                                                  'LGBR': [LGBR_pred, LGBR_R2_test, LGBR_MSE, LGBR_MAE], 'MLPR': [params, MLPR_pred, MLPR_R2_test, MLPR_MSE, MLPR_MAE]}

                            write_regression_report(regression_outputs=regression_outputs,
                                                    out_file=param_args['output'] + 'regression_report.out')

                else:

                    print("\nFitting models without scaling... it may work, but be careful!")

                    scale = None

                    if 'algorithm' in param_args:
                        if param_args['algorithm'] == 'GBR':

                            best_lr, best_train_R2, best_test_R2 = LM.GBR_optimization(
                                train=train, bindingEnergy_train=bindingEnergy_train, test=test, bindingEnergy_test=bindingEnergy_test)
                            pred, R2_test, MSE, MAE, epoch_pred = LM.GBR(
                                train=train, bindingEnergy_train=bindingEnergy_train, test=test, bindingEnergy_test=bindingEnergy_test, best_lr=best_lr)

                            regression_outputs = {'scaler': scale, 'GBR_optimization': [
                                best_lr, best_train_R2, best_test_R2], 'GBR': [R2_test, MSE, MAE, epoch_pred]}

                            write_regression_report_GBR(
                                regression_outputs=regression_outputs, out_file=param_args['output'] + 'regression_report.out')

                        elif param_args['algorithm'] == 'XGBR':

                            XGBR_best_lr, XGBR_best_train_R2, XGBR_best_test_R2 = LM.XGBR_optimization(
                                train=train, bindingEnergy_train=bindingEnergy_train, test=test, bindingEnergy_test=bindingEnergy_test, cpus=param_args['cpus'])
                            XGBR_pred, XGBR_R2_test, XGBR_MSE, XGBR_MAE = LM.XGBR(
                                train=train, bindingEnergy_train=bindingEnergy_train, test=test, bindingEnergy_test=bindingEnergy_test, best_lr=XGBR_best_lr, cpus=param_args['cpus'])

                            regression_outputs = {'scaler': scale, 'XGBR_optimization': [XGBR_best_lr, XGBR_best_train_R2, XGBR_best_test_R2],
                                                  'XGBR': [XGBR_R2_test, XGBR_MSE, XGBR_MAE]}

                            write_regression_report_XGBR(
                                regression_outputs=regression_outputs, out_file=param_args['output'] + 'regression_report.out')

                        elif param_args['algorithm'] == 'LGBR':

                            LGBR_best_lr, LGBR_best_train_R2, LGBR_best_test_R2 = LM.LGBR_optimization(
                                train=train, bindingEnergy_train=bindingEnergy_train, test=test, bindingEnergy_test=bindingEnergy_test, cpus=param_args['cpus'])
                            LGBR_pred, LGBR_R2_test, LGBR_MSE, LGBR_MAE = LM.LGBR(
                                train=train, bindingEnergy_train=bindingEnergy_train, test=test, bindingEnergy_test=bindingEnergy_test, best_lr=LGBR_best_lr, cpus=param_args['cpus'])

                            regression_outputs = {'scaler': scale, 'LGBR_optimization': [LGBR_best_lr, LGBR_best_train_R2, LGBR_best_test_R2],
                                                  'LGBR': [LGBR_pred, LGBR_R2_test, LGBR_MSE, LGBR_MAE]}

                            write_regression_report_LGBR(
                                regression_outputs=regression_outputs, out_file=param_args['output'] + 'regression_report.out')

                        elif param_args['algorithm'] == 'MLPR':

                            MLPR_pred, MLPR_R2_test, MLPR_MSE, MLPR_MAE, params = LM.MLPR(
                                train=train, bindingEnergy_train=bindingEnergy_train, test=test, bindingEnergy_test=bindingEnergy_test, cpus=param_args['cpus'])

                            regression_outputs = {'scaler': scale, 'MLPR': [
                                params, MLPR_pred, MLPR_R2_test, MLPR_MSE, MLPR_MAE]}

                            write_regression_report_MLPR(
                                regression_outputs=regression_outputs, out_file=param_args['output'] + 'regression_report.out')

                    else:

                        best_lr, best_train_R2, best_test_R2 = LM.GBR_optimization(
                            train=train, bindingEnergy_train=bindingEnergy_train, test=test, bindingEnergy_test=bindingEnergy_test)
                        pred, R2_test, MSE, MAE, epoch_pred = LM.GBR(
                            train=train, bindingEnergy_train=bindingEnergy_train, test=test, bindingEnergy_test=bindingEnergy_test, best_lr=best_lr)

                        XGBR_best_lr, XGBR_best_train_R2, XGBR_best_test_R2 = LM.XGBR_optimization(
                            train=train, bindingEnergy_train=bindingEnergy_train, test=test, bindingEnergy_test=bindingEnergy_test, cpus=param_args['cpus'])
                        XGBR_pred, XGBR_R2_test, XGBR_MSE, XGBR_MAE = LM.XGBR(
                            train=train, bindingEnergy_train=bindingEnergy_train, test=test, bindingEnergy_test=bindingEnergy_test, best_lr=XGBR_best_lr, cpus=param_args['cpus'])

                        LGBR_best_lr, LGBR_best_train_R2, LGBR_best_test_R2 = LM.LGBR_optimization(
                            train=train, bindingEnergy_train=bindingEnergy_train, test=test, bindingEnergy_test=bindingEnergy_test, cpus=param_args['cpus'])
                        LGBR_pred, LGBR_R2_test, LGBR_MSE, LGBR_MAE = LM.LGBR(
                            train=train, bindingEnergy_train=bindingEnergy_train, test=test, bindingEnergy_test=bindingEnergy_test, best_lr=LGBR_best_lr, cpus=param_args['cpus'])

                        MLPR_pred, MLPR_R2_test, MLPR_MSE, MLPR_MAE, params = LM.MLPR(
                            train=train, bindingEnergy_train=bindingEnergy_train, test=test, bindingEnergy_test=bindingEnergy_test, cpus=param_args['cpus'])

                        regression_outputs = {'scaler': scale, 'GBR_optimization': [best_lr, best_train_R2, best_test_R2], 'GBR': [R2_test, MSE, MAE, epoch_pred],
                                              'XGBR_optimization': [XGBR_best_lr, XGBR_best_train_R2, XGBR_best_test_R2],
                                              'XGBR': [XGBR_R2_test, XGBR_MSE, XGBR_MAE], 'LGBR_optimization': [LGBR_best_lr, LGBR_best_train_R2, LGBR_best_test_R2],
                                              'LGBR': [LGBR_pred, LGBR_R2_test, LGBR_MSE, LGBR_MAE], 'MLPR': [params, MLPR_pred, MLPR_R2_test, MLPR_MSE, MLPR_MAE]}

                        write_regression_report(regression_outputs=regression_outputs,
                                                out_file=param_args['output'] + 'regression_report.out')

    end = time.time()

    print("\nRun time: {} seconds.".format(end - start))

    if 'path_graph' not in param_args:
        with open(param_args['output'] + "_run_times.txt", 'a') as rt:
            rt.write(str(end - start))
            rt.write('\n')

    else:
        with open(param_args['output'] + param_args['run_name'] + "_run_times.txt", 'a') as rt:
            rt.write(str(end - start))
            rt.write('\n')
def main(args):
  """
  Main function - launches the program.
  """

  if args:
    check_create_folder(settings.folder_charts)

    df = pd.DataFrame()

    # Read in the JSON files, flatten the contracts and add them to a DataFrame
    for f in list_files(args.source + '*'):
      df = flatten_contracts(f, df)

    # Parse the date columns, coercing unparseable values to NaT
    # (convert_objects is deprecated; pd.to_datetime is its replacement)
    df['contract_period_startDate'] = pd.to_datetime(df['contract_period_startDate'], errors='coerce')
    df['tender_publicationDate'] = pd.to_datetime(df['tender_publicationDate'], errors='coerce')
    df['tender_tenderPeriod_startDate'] = pd.to_datetime(df['tender_tenderPeriod_startDate'], errors='coerce')
    df['award_date'] = pd.to_datetime(df['award_date'], errors='coerce')


    # Keep only contracts whose main date falls within the configured window
    start_date = datetime.strptime(settings.start_date_charts, '%Y-%m-%d')
    end_date = datetime.strptime(settings.end_date_charts, '%Y-%m-%d')
    df = df[(df[settings.main_date_contract] >= start_date) & (df[settings.main_date_contract] <= end_date)]

    # Generate the summary statistics, independent of comparison or slice
    overview_data = chartdata.generate_overview(df)

    with open(os.path.join(settings.folder_charts, 'general.json'), 'w') as outfile:
      json.dump(overview_data, outfile)

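    # Each (dimension, comparison) pair yields one 'lense' file bundling the
    # charts for that dimension, with one data series per comparison slice.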
    for dimension in settings.dimensions:
      for comparison in settings.comparisons:

        # Each unique combination of dimension + comparison is a 'lense'
        lense_id = dimension + '--' + comparison['id']
        lense = { 
          'metadata': { 
            'id': lense_id
          },
          'charts': []
        }

        for chart in settings.charts:
          if chart['dimension'] == dimension:
            if chart['function']:
              chart['meta']['data'] = []
         
              previous_slice = False
              d = { }

              # Generate the chart data
              for sl in comparison['slices']:
                sliced_chart = { 'id': sl['id'], 'label': sl['label'] }
                
                # Prep the dataframe, slice it or serve it full
                if comparison['compare']:
                  sliced_df = slice_df(df, comparison['compare'], sl['field'])
                else:
                  sliced_df = df

                if not sliced_df.empty:
                  current_slice = chart['function'](sliced_df)

                  # Append the slice's data & meta-data 
                  sliced_chart['data'] = current_slice['data']
                  chart['meta']['data'].append(sliced_chart)
                  
                  # Update the domain based on the slice
                  for axis, func in chart['domain'].items():
                    if previous_slice:
                      d[axis] = func(d[axis], current_slice['domain'][axis])
                    else:
                      d[axis] = current_slice['domain'][axis]
                    
                  previous_slice = True


              # Add the domain to the chart
              for axis, func in chart['domain'].items():
                chart['meta'][axis]['domain'] = d[axis]
              
            # Append the chart data
            lense['charts'].append(chart['meta'])

        file_name = os.path.join(settings.folder_charts, lense_id + '.json')
        with open(file_name, 'w') as outfile:
          json.dump(lense, outfile)