def __init__(self, dir_path, num_speakers=100, frames=64, silence=False, val_frac=0.2, id="", pkl_dir=""):
    """Build a speaker dataset index from a directory of audio files.

    Scans ``dir_path`` for audio files whose path contains ``id``, keeps at
    most ``num_speakers`` randomly chosen speakers (one speaker per parent
    directory name), and splits each speaker's files into train/dev sets
    using ``val_frac`` as the dev fraction.

    :param dir_path: Root directory to scan for audio files.
    :param num_speakers: Maximum number of distinct speakers to keep.
    :param frames: Number of frames per example (stored, not used here).
    :param silence: Whether to include silence handling (stored, not used here).
    :param val_frac: Fraction of each speaker's files reserved for the dev set.
    :param id: Substring filter applied to file paths; also used as the
        pickle sub-directory name. NOTE: shadows the builtin ``id``.
    :param pkl_dir: Base directory for cached/pickled features.
    """
    self.frames = frames
    self.num_speakers = num_speakers
    self.val_frac = val_frac
    self.id = id
    self.pkl_dir = os.path.join(pkl_dir, self.id)
    self.silence = silence
    # Keep only audio files whose path mentions this dataset id.
    files = list(
        list_files(dir_path, lambda x: audio_predicate(x) and x.find(self.id) != -1))
    # The speaker label is the name of the file's parent directory.
    speakers = list(
        set([os.path.basename(os.path.split(f)[0]) for f in files]))
    # Shuffle so the num_speakers cut is a random sample, then map each
    # retained speaker name to a dense integer label.
    random.shuffle(speakers)
    speakers = dict(
        zip(speakers[:num_speakers], range(len(speakers[:num_speakers]))))
    self.speakers = speakers
    logging.info("Speakers: {}".format(self.speakers))
    logging.info("Number of speakers: {}".format(len(self.speakers)))
    # Index every file of a retained speaker by a running counter, and
    # count files per speaker (floats, used for the fractional split below).
    speaker_files, speaker_files_count, cnt = {}, defaultdict(float), 0
    for f in files:
        speaker = os.path.basename(os.path.split(f)[0])
        if speaker in self.speakers:
            speaker_files[cnt] = f
            speaker_files_count[speaker] += 1.0
            cnt += 1
    self.speaker_files = speaker_files
    logging.info("Number of speaker files: {}".format(
        len(self.speaker_files)))
    # Per-speaker train/dev split: the first (1 - val_frac) share of each
    # speaker's files (in insertion order) goes to train, the rest to dev.
    # ``featurized`` tracks, per file index, whether features were computed yet.
    speaker_train_files, speaker_dev_files, speaker_curr_count, featurized = [], [], defaultdict(float), {}
    for i, f in self.speaker_files.items():
        speaker = os.path.basename(os.path.split(f)[0])
        if speaker_curr_count[speaker] < (
                1.0 - self.val_frac) * speaker_files_count[speaker]:
            speaker_train_files.append(i)
            featurized[i] = False
            speaker_curr_count[speaker] += 1.0
        else:
            speaker_dev_files.append(i)
            featurized[i] = False
            speaker_curr_count[speaker] += 1.0
    self.speaker_train_files = speaker_train_files
    self.speaker_dev_files = speaker_dev_files
    self.featurized = featurized
    # Feature dimensionality; 39 is typical of MFCC + deltas + delta-deltas
    # (13 x 3) — presumably that is the featurizer used here; TODO confirm.
    self.dim = 39
    logging.info("Train Speaker Files: {}".format(
        self.speaker_train_files))
    logging.info("Dev Speaker Files: {}".format(self.speaker_dev_files))
    logging.info("Input dimensions: {}".format(self.dim))
def download_compranet(years):
    """
    Download Compranet data for a list of years, unzip the files and convert
    the XLS to CSV.

    NOTE: Python 2 code (print statement, ``xrange``). The zip archives are
    extracted into a temporary folder, every extracted ``.xls``/``.xlsx``
    workbook's first sheet is re-written as UTF-8 CSV into the full cache
    folder, and the temporary folder is removed afterwards.

    :param years: The years for which to download data
    :type years: List

    :returns:
    :example:
    """
    tmp_folder = os.path.join(settings.folder_full_cache, 'tmp')
    check_create_folder(tmp_folder)

    for year in years:
        # File name pattern and download URL come from the settings module.
        file_name = os.path.join(settings.fn_prefix + year + settings.fn_extension)
        src_url = settings.compranet_base_url + file_name

        print "Downloading %s" % file_name
        download(url=src_url, path=tmp_folder)

        file_path = os.path.join(tmp_folder, file_name)
        with zipfile.ZipFile(file_path, 'r') as myzip:
            myzip.extractall(tmp_folder)

    # Convert every extracted Excel workbook (first sheet only) to CSV.
    pattern = os.path.join(tmp_folder, '*.xls*')
    for src_file in list_files(pattern):
        csv_path = os.path.join(settings.folder_full_cache, get_filename(src_file) + '.csv')

        wb = xlrd.open_workbook(src_file)
        sheet = wb.sheet_by_index(0)

        # NOTE(review): unicodecsv writes encoded bytes; on Python 2 a
        # binary mode ('wb') would be safer on Windows — confirm.
        with open(csv_path, 'w') as csvfile:
            writer = unicodecsv.writer(csvfile, encoding='utf-8')
            for rownum in xrange(sheet.nrows):
                writer.writerow(sheet.row_values(rownum))

    remove_folder(tmp_folder)
def main(args):
    """
    Main function - launches the program.

    NOTE: Python 2 code (print statements). Selects the source folder
    (sample vs. full cache), optionally re-downloads the Compranet data,
    then cleans the CSVs and stores them in OCDS JSON format.

    :param args: The Parser arguments
    :type args: Parser object

    :returns: List
    :example: ["Downloading files from the Compranet site."]
    """
    if args:
        if args.sample:
            source_folder = settings.folder_sample_data
        else:
            # Use cached versions of the source data in csv format
            source_folder = settings.folder_full_cache

        check_create_folder(source_folder)

        # --download wipes the cache and fetches everything again.
        if args.download:
            clean_folder(source_folder)
            download_compranet(settings.years)

        # Check if there are CSV files in the sample folder
        pattern = os.path.join(source_folder, '*.csv')
        source_data = list_files(pattern)

        if source_data:
            print "About to clean the data"
            clean_df = clean.clean_csv(source_data)
            print "About to store it in OCDS format"
            ocds.generate_json(clean_df)
        else:
            # Second list element is the exit status (1 = error, 0 = success).
            return["No source data found. Make sure there is at least one CSV file in " + source_folder, 1]

        return["Prepared and cleaned the files from the Compranet site.", 0]
def main():
    """Entry point for the graph-featurization + model-training pipeline.

    Driven entirely by the YAML config returned by ``parseyaml()``:

    * If ``path_graph`` is absent, every pose in ``path`` is parsed,
      its binding pocket selected, and graph (Laplacian/adjacency)
      statistics are written to CSV.
    * If ``path_graph`` is present, the graph data is loaded and a
      classification or regression model (GBC/XGBC/LGBC, GBR/XGBR/LGBR/
      MLPR, or a feed-forward NN) is trained and reported.

    Fixes applied in this revision:
    * ``param_args['cpus']`` now defaults to 1 (the original assigned a
      dead local ``cpus`` that no call ever read, so a config without
      ``cpus`` raised ``KeyError``).
    * The scaler search now picks the scaler with the HIGHEST test R²
      (the original sorted ascending and took index 0, i.e. the worst),
      and the chosen scaler is actually used afterwards (the original
      lookup loop could never match the literal value ``'search'``).
    * ``MLPR_MAE`` unpacking typo (``MLRP_MAE``) fixed; the report dicts
      read ``MLPR_MAE`` and would have raised ``NameError``.
    """
    start = time.time()
    param_args = parseyaml()
    pipe = False

    # Default worker count. BUG FIX: store it in param_args, which is what
    # every *_optimization/LM call below actually reads.
    if 'cpus' not in param_args:
        param_args['cpus'] = 1

    # ------------------------------------------------------------------
    # Stage 1: featurize poses into graph statistics (CSV output).
    # ------------------------------------------------------------------
    if 'path_graph' not in param_args or pipe:
        pose_list = list_files(param_args['path'])

        for pose in pose_list:
            pose_store, prody_parsed = PDBParser(path=param_args['path'] + pose)
            selected_protein, selected_ligand = binding_pocket_selection(
                pose_store=pose_store, p=prody_parsed,
                ligand_name=param_args['ligand_name'],
                selection_radius=param_args['selection_radius'],
                center=param_args['center'])

            # Graph nodes are either typed atoms or chemical elements.
            if param_args['nodes'] == 'atoms':
                ligand_path = ligand_parse_write(
                    path=param_args['path'] + pose, out=param_args['output'],
                    lig_name=param_args['ligand_name'])
                selected_ligand_at = ligand_atom_type_calc(
                    ligand=selected_ligand, ligand_path=ligand_path)
                interactions, atom_types, ligand_atom_types, protein_atom_types = atomTypesDistanceCalc(
                    binding_pocket=selected_protein, ligand=selected_ligand_at)
                final_weigths, atom_combinations = atomSubgraphsWeights(
                    atom_interactions=interactions, types=atom_types,
                    decay_function=param_args['decay_function'],
                    ligand_atom_types=ligand_atom_types,
                    protein_atom_types=protein_atom_types)
            elif param_args['nodes'] == 'elements':
                interactions, elements, ligand_elements, protein_elements = elementsDistanceCalc(
                    binding_pocket=selected_protein, ligand=selected_ligand)
                final_weigths, atom_combinations = elementSubgraphsWeights(
                    atom_interactions=interactions, types=elements,
                    decay_function=param_args['decay_function'],
                    ligand_atom_types=ligand_elements,
                    protein_atom_types=protein_elements)

            # Per-pose spectral statistics of the weighted subgraphs.
            L_mat = laplacianMatrix(weights=final_weigths,
                                    atom_combinations=atom_combinations)
            A_mat = adjacencyMatrix(weights=final_weigths,
                                    atom_combinations=atom_combinations)
            LP = laplacianStats(matrices=L_mat, pose=pose)
            AD = adjacencyStats(matrices=A_mat, pose=pose)
            statsToCsv(laplacian_statistics=LP, adjacency_statistics=AD,
                       decay_function=param_args['decay_function'],
                       nodes=param_args['nodes'],
                       name=param_args['run_name'],
                       out=param_args['output'])

    # ------------------------------------------------------------------
    # Stage 2: train/evaluate models on the graph features.
    # ------------------------------------------------------------------
    if 'path_graph' in param_args or pipe:
        if not pipe:
            data = read_graph_data(path=param_args['path_graph'])
        elif pipe:
            data = append_targets(name=param_args['run_name'],
                                  out=param_args['output'],
                                  target=param_args['target'],
                                  decay_function=param_args['decay_function'])

        data = dataset_preparation(data=data)
        LM = LearningModels()

        if param_args['task'] == 'classification':
            train, activity_train, test, activity_test = data_splitting_classification(
                data=data, test_size=param_args['test_size'],
                seed=param_args['seed'])

            if 'algorithm' in param_args:
                if param_args['algorithm'] == 'GBC':
                    best_lr, best_train_acc, best_test_acc = LM.GBC_optimization(
                        train=train, activity_train=activity_train,
                        test=test, activity_test=activity_test)
                    cf, report, MCC = LM.GBC(train=train,
                                             activity_train=activity_train,
                                             test=test,
                                             activity_test=activity_test,
                                             best_lr=best_lr)
                    classification_outputs = {
                        'GBC_optimization': [best_lr, best_train_acc, best_test_acc],
                        'GBC': [cf, report, MCC]}
                    write_classification_report_GBR(
                        classification_outputs=classification_outputs,
                        out_file=param_args['output'] + 'classification_report.out')
                elif param_args['algorithm'] == 'XGBC':
                    XGB_accuracy, cf_XGB, report_XGB, MCC_XGB = LM.XGBoost(
                        train=train, activity_train=activity_train,
                        test=test, activity_test=activity_test)
                    classification_outputs = {
                        'XGB': [XGB_accuracy, cf_XGB, report_XGB, MCC_XGB]}
                    write_classification_report_XGBR(
                        classification_outputs=classification_outputs,
                        out_file=param_args['output'] + 'classification_report.out')
                elif param_args['algorithm'] == 'LGBC':
                    # NOTE: 'LigthGB' is the (misspelled) method name exposed
                    # by LearningModels; kept as-is.
                    lgb_accuracy, cf_lgb, report_lgb, MCC_lgb = LM.LigthGB(
                        train=train, activity_train=activity_train,
                        test=test, activity_test=activity_test)
                    classification_outputs = {
                        'LGBC': [lgb_accuracy, cf_lgb, report_lgb, MCC_lgb]}
                    write_classification_report_LGBR(
                        classification_outputs=classification_outputs,
                        out_file=param_args['output'] + 'classification_report.out')
            else:
                # No algorithm selected: run every classifier and write a
                # combined report.
                best_lr, best_train_acc, best_test_acc = LM.GBC_optimization(
                    train=train, activity_train=activity_train,
                    test=test, activity_test=activity_test)
                cf, report, MCC = LM.GBC(train=train,
                                         activity_train=activity_train,
                                         test=test,
                                         activity_test=activity_test,
                                         best_lr=best_lr)
                XGB_accuracy, cf_XGB, report_XGB, MCC_XGB = LM.XGBoost(
                    train=train, activity_train=activity_train,
                    test=test, activity_test=activity_test)
                lgb_accuracy, cf_lgb, report_lgb, MCC_lgb = LM.LigthGB(
                    train=train, activity_train=activity_train,
                    test=test, activity_test=activity_test)
                classification_outputs = {
                    'GBC_optimization': [best_lr, best_train_acc, best_test_acc],
                    'GBC': [cf, report, MCC],
                    'XGB': [XGB_accuracy, cf_XGB, report_XGB, MCC_XGB],
                    'LGBC': [lgb_accuracy, cf_lgb, report_lgb, MCC_lgb]}
                write_classification_report(
                    classification_outputs=classification_outputs,
                    out_file=param_args['output'] + 'classification_report.out')

        elif param_args['task'] == 'regression':
            if param_args['algorithm'] == 'ffnn':
                # Feed-forward NN path: scale, train with early stopping +
                # checkpointing, then report losses and R2 per split.
                S = Scaler()
                train, bindingEnergy_train, val, bindingEnergy_val, test, bindingEnergy_test, ligand_test, ligandRMSD_test = data_splitting_ffnn(
                    data=data, seed=param_args['seed'])
                train, val, test = S.min_max_scaler_ffnn(train=train, val=val,
                                                         test=test)

                if param_args['pelePrep'] == 'profile':
                    NN_model = LM.FFNN_profile(train, val, test,
                                               param_args['learning_rate'])
                else:
                    NN_model = LM.FFNN_clustering(train, val, test,
                                                  param_args['learning_rate'])

                checkpoint_name = 'weights.hdf5'
                checkpoint = ModelCheckpoint(checkpoint_name,
                                             monitor='val_loss', verbose=1,
                                             save_best_only=True, mode='auto')
                callbacks_list = [
                    tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                     patience=5),
                    checkpoint]

                history = NN_model.fit(train, bindingEnergy_train,
                                       epochs=param_args['epochs'],
                                       batch_size=param_args['batch_size'],
                                       validation_data=(val, bindingEnergy_val),
                                       callbacks=callbacks_list)

                results_train = NN_model.evaluate(train, bindingEnergy_train)
                results_val = NN_model.evaluate(val, bindingEnergy_val)
                results_test = NN_model.evaluate(test, bindingEnergy_test)

                train_pred = NN_model.predict(train)
                val_pred = NN_model.predict(val)
                test_pred = NN_model.predict(test)

                r2_train = r2_score(bindingEnergy_train, train_pred)
                r2_val = r2_score(bindingEnergy_val, val_pred)
                r2_test = r2_score(bindingEnergy_test, test_pred)

                results_train.append(r2_train)
                results_val.append(r2_val)
                results_test.append(r2_test)

                results = [results_train, results_val, results_test]
                write_regression_report_ffnn(results, param_args,
                                             param_args['output'])
                train_plots(history, param_args['output'])
                if 'ligandRMSD' in list(data.columns):
                    target_plot(test_pred, ligand_test, bindingEnergy_test,
                                ligandRMSD_test, param_args['output'])
            else:
                # NOTE(review): this branch consumes train/bindingEnergy_train/
                # test/bindingEnergy_test without a visible regression split in
                # this chunk — presumably data_splitting happens elsewhere or
                # earlier in the original file; confirm.
                if 'scaler' in param_args:
                    best_scaler = None
                    S = Scaler()
                    scaling_functions = [
                        (S.standard_scaler, "standard"),
                        (S.min_max_scaler, 'min_max'),
                        (S.power_scaler, 'power'),
                        (S.quantile_scaler_uniform, 'uniform'),
                        (S.quantile_scaler_gauss, 'quantile_Gauss'),
                        (S.robust_scaler, 'robust'),
                        (S.max_abs_scaler, 'max_abs')]

                    if param_args['scaler'] == 'search' and best_scaler is None:
                        # Try every scaler with a fixed-lr GBR and rank by
                        # test R2.
                        # NOTE(review): each scaler is applied on top of the
                        # previously scaled data (train/test are rebound in the
                        # loop) — kept as in the original; confirm intent.
                        scalers_performance = []
                        for scale in scaling_functions:
                            print("---------------------------------------")
                            print("\nApplying {} scaler\n".format(scale[1]))
                            try:
                                train, test = scale[0](train=train, test=test)
                            except ValueError:
                                print(
                                    "{} scaler cannot be applied to this data values.\n".format(scale[1]))
                                continue
                            pred, R2_test, MSE, MAE, epoch_pred = LM.GBR(
                                train=train,
                                bindingEnergy_train=bindingEnergy_train,
                                test=test,
                                bindingEnergy_test=bindingEnergy_test,
                                best_lr=0.25)
                            scalers_performance.append((R2_test, scale[0], scale[1]))
                        # BUG FIX: sort descending so index 0 is the BEST R2
                        # (the original ascending sort picked the worst).
                        scalers_best_performance = sorted(
                            scalers_performance, key=lambda tup: tup[0],
                            reverse=True)
                        best_scaler = scalers_best_performance[0]

                    if (param_args['scaler'] == 'search' and best_scaler is not None) or (param_args['scaler'] != 'search'):
                        # BUG FIX: when a search was run, use its winner; the
                        # original looked up the literal string 'search' in
                        # scaling_functions, never matched, and silently used
                        # a stale loop variable.
                        if param_args['scaler'] == 'search':
                            scale = (best_scaler[1], best_scaler[2])
                        else:
                            for scaler in scaling_functions:
                                if scaler[1] == param_args['scaler']:
                                    scale = scaler
                                    break
                        print("---------------------------------------")
                        print("\nApplying {} scaler\n".format(scale[1]))
                        try:
                            train, test = scale[0](train=train, test=test)
                        except ValueError:
                            print(
                                "{} scaler cannot be applied to this data values.\n".format(scale[1]))

                        if param_args['algorithm'] == 'GBR':
                            best_lr, best_train_R2, best_test_R2 = LM.GBR_optimization(
                                train=train,
                                bindingEnergy_train=bindingEnergy_train,
                                test=test,
                                bindingEnergy_test=bindingEnergy_test)
                            pred, R2_test, MSE, MAE, epoch_pred = LM.GBR(
                                train=train,
                                bindingEnergy_train=bindingEnergy_train,
                                test=test,
                                bindingEnergy_test=bindingEnergy_test,
                                best_lr=best_lr)
                            regression_outputs = {
                                'scaler': scale,
                                'GBR_optimization': [best_lr, best_train_R2, best_test_R2],
                                'GBR': [R2_test, MSE, MAE, epoch_pred]}
                            write_regression_report_GBR(
                                regression_outputs=regression_outputs,
                                out_file=param_args['output'] + 'regression_report.out')
                        elif param_args['algorithm'] == 'XGBR':
                            XGBR_best_lr, XGBR_best_train_R2, XGBR_best_test_R2 = LM.XGBR_optimization(
                                train=train,
                                bindingEnergy_train=bindingEnergy_train,
                                test=test,
                                bindingEnergy_test=bindingEnergy_test,
                                cpus=param_args['cpus'])
                            XGBR_pred, XGBR_R2_test, XGBR_MSE, XGBR_MAE = LM.XGBR(
                                train=train,
                                bindingEnergy_train=bindingEnergy_train,
                                test=test,
                                bindingEnergy_test=bindingEnergy_test,
                                best_lr=XGBR_best_lr,
                                cpus=param_args['cpus'])
                            regression_outputs = {
                                'scaler': scale,
                                'XGBR_optimization': [XGBR_best_lr, XGBR_best_train_R2, XGBR_best_test_R2],
                                'XGBR': [XGBR_R2_test, XGBR_MSE, XGBR_MAE]}
                            write_regression_report_XGBR(
                                regression_outputs=regression_outputs,
                                out_file=param_args['output'] + 'regression_report.out')
                        elif param_args['algorithm'] == 'LGBR':
                            LGBR_best_lr, LGBR_best_train_R2, LGBR_best_test_R2 = LM.LGBR_optimization(
                                train=train,
                                bindingEnergy_train=bindingEnergy_train,
                                test=test,
                                bindingEnergy_test=bindingEnergy_test,
                                cpus=param_args['cpus'])
                            LGBR_pred, LGBR_R2_test, LGBR_MSE, LGBR_MAE = LM.LGBR(
                                train=train,
                                bindingEnergy_train=bindingEnergy_train,
                                test=test,
                                bindingEnergy_test=bindingEnergy_test,
                                best_lr=LGBR_best_lr,
                                cpus=param_args['cpus'])
                            regression_outputs = {
                                'scaler': scale,
                                'LGBR_optimization': [LGBR_best_lr, LGBR_best_train_R2, LGBR_best_test_R2],
                                'LGBR': [LGBR_pred, LGBR_R2_test, LGBR_MSE, LGBR_MAE]}
                            write_regression_report_LGBR(
                                regression_outputs=regression_outputs,
                                out_file=param_args['output'] + 'regression_report.out')
                        elif param_args['algorithm'] == 'MLPR':
                            # BUG FIX: unpack MLPR_MAE (was MLRP_MAE; the
                            # report dict read the correctly spelled name).
                            MLPR_pred, MLPR_R2_test, MLPR_MSE, MLPR_MAE, params = LM.MLPR(
                                train=train,
                                bindingEnergy_train=bindingEnergy_train,
                                test=test,
                                bindingEnergy_test=bindingEnergy_test,
                                cpus=param_args['cpus'])
                            regression_outputs = {
                                'scaler': scale,
                                'MLPR': [params, MLPR_pred, MLPR_R2_test, MLPR_MSE, MLPR_MAE]}
                            write_regression_report_MLPR(
                                regression_outputs=regression_outputs,
                                out_file=param_args['output'] + 'regression_report.out')
                        else:
                            # No algorithm selected: run all regressors and
                            # write a combined report.
                            best_lr, best_train_R2, best_test_R2 = LM.GBR_optimization(
                                train=train,
                                bindingEnergy_train=bindingEnergy_train,
                                test=test,
                                bindingEnergy_test=bindingEnergy_test)
                            pred, R2_test, MSE, MAE, epoch_pred = LM.GBR(
                                train=train,
                                bindingEnergy_train=bindingEnergy_train,
                                test=test,
                                bindingEnergy_test=bindingEnergy_test,
                                best_lr=best_lr)
                            XGBR_best_lr, XGBR_best_train_R2, XGBR_best_test_R2 = LM.XGBR_optimization(
                                train=train,
                                bindingEnergy_train=bindingEnergy_train,
                                test=test,
                                bindingEnergy_test=bindingEnergy_test,
                                cpus=param_args['cpus'])
                            XGBR_pred, XGBR_R2_test, XGBR_MSE, XGBR_MAE = LM.XGBR(
                                train=train,
                                bindingEnergy_train=bindingEnergy_train,
                                test=test,
                                bindingEnergy_test=bindingEnergy_test,
                                best_lr=XGBR_best_lr,
                                cpus=param_args['cpus'])
                            LGBR_best_lr, LGBR_best_train_R2, LGBR_best_test_R2 = LM.LGBR_optimization(
                                train=train,
                                bindingEnergy_train=bindingEnergy_train,
                                test=test,
                                bindingEnergy_test=bindingEnergy_test,
                                cpus=param_args['cpus'])
                            LGBR_pred, LGBR_R2_test, LGBR_MSE, LGBR_MAE = LM.LGBR(
                                train=train,
                                bindingEnergy_train=bindingEnergy_train,
                                test=test,
                                bindingEnergy_test=bindingEnergy_test,
                                best_lr=LGBR_best_lr,
                                cpus=param_args['cpus'])
                            MLPR_pred, MLPR_R2_test, MLPR_MSE, MLPR_MAE, params = LM.MLPR(
                                train=train,
                                bindingEnergy_train=bindingEnergy_train,
                                test=test,
                                bindingEnergy_test=bindingEnergy_test,
                                cpus=param_args['cpus'])
                            regression_outputs = {
                                'scaler': scale,
                                'GBR_optimization': [best_lr, best_train_R2, best_test_R2],
                                'GBR': [R2_test, MSE, MAE, epoch_pred],
                                'XGBR_optimization': [XGBR_best_lr, XGBR_best_train_R2, XGBR_best_test_R2],
                                'XGBR': [XGBR_R2_test, XGBR_MSE, XGBR_MAE],
                                'LGBR_optimization': [LGBR_best_lr, LGBR_best_train_R2, LGBR_best_test_R2],
                                'LGBR': [LGBR_pred, LGBR_R2_test, LGBR_MSE, LGBR_MAE],
                                'MLPR': [params, MLPR_pred, MLPR_R2_test, MLPR_MSE, MLPR_MAE]}
                            write_regression_report(
                                regression_outputs=regression_outputs,
                                out_file=param_args['output'] + 'regression_report.out')
                else:
                    # No scaler configured: fit on the raw features.
                    print("\nFitting models without scaling...it may work but be careful!")
                    scale = None
                    if 'algorithm' in param_args:
                        if param_args['algorithm'] == 'GBR':
                            best_lr, best_train_R2, best_test_R2 = LM.GBR_optimization(
                                train=train,
                                bindingEnergy_train=bindingEnergy_train,
                                test=test,
                                bindingEnergy_test=bindingEnergy_test)
                            pred, R2_test, MSE, MAE, epoch_pred = LM.GBR(
                                train=train,
                                bindingEnergy_train=bindingEnergy_train,
                                test=test,
                                bindingEnergy_test=bindingEnergy_test,
                                best_lr=best_lr)
                            regression_outputs = {
                                'scaler': scale,
                                'GBR_optimization': [best_lr, best_train_R2, best_test_R2],
                                'GBR': [R2_test, MSE, MAE, epoch_pred]}
                            write_regression_report_GBR(
                                regression_outputs=regression_outputs,
                                out_file=param_args['output'] + 'regression_report.out')
                        elif param_args['algorithm'] == 'XGBR':
                            XGBR_best_lr, XGBR_best_train_R2, XGBR_best_test_R2 = LM.XGBR_optimization(
                                train=train,
                                bindingEnergy_train=bindingEnergy_train,
                                test=test,
                                bindingEnergy_test=bindingEnergy_test,
                                cpus=param_args['cpus'])
                            XGBR_pred, XGBR_R2_test, XGBR_MSE, XGBR_MAE = LM.XGBR(
                                train=train,
                                bindingEnergy_train=bindingEnergy_train,
                                test=test,
                                bindingEnergy_test=bindingEnergy_test,
                                best_lr=XGBR_best_lr,
                                cpus=param_args['cpus'])
                            regression_outputs = {
                                'scaler': scale,
                                'XGBR_optimization': [XGBR_best_lr, XGBR_best_train_R2, XGBR_best_test_R2],
                                'XGBR': [XGBR_R2_test, XGBR_MSE, XGBR_MAE]}
                            write_regression_report_XGBR(
                                regression_outputs=regression_outputs,
                                out_file=param_args['output'] + 'regression_report.out')
                        elif param_args['algorithm'] == 'LGBR':
                            LGBR_best_lr, LGBR_best_train_R2, LGBR_best_test_R2 = LM.LGBR_optimization(
                                train=train,
                                bindingEnergy_train=bindingEnergy_train,
                                test=test,
                                bindingEnergy_test=bindingEnergy_test,
                                cpus=param_args['cpus'])
                            LGBR_pred, LGBR_R2_test, LGBR_MSE, LGBR_MAE = LM.LGBR(
                                train=train,
                                bindingEnergy_train=bindingEnergy_train,
                                test=test,
                                bindingEnergy_test=bindingEnergy_test,
                                best_lr=LGBR_best_lr,
                                cpus=param_args['cpus'])
                            regression_outputs = {
                                'scaler': scale,
                                'LGBR_optimization': [LGBR_best_lr, LGBR_best_train_R2, LGBR_best_test_R2],
                                'LGBR': [LGBR_pred, LGBR_R2_test, LGBR_MSE, LGBR_MAE]}
                            write_regression_report_LGBR(
                                regression_outputs=regression_outputs,
                                out_file=param_args['output'] + 'regression_report.out')
                        elif param_args['algorithm'] == 'MLPR':
                            MLPR_pred, MLPR_R2_test, MLPR_MSE, MLPR_MAE, params = LM.MLPR(
                                train=train,
                                bindingEnergy_train=bindingEnergy_train,
                                test=test,
                                bindingEnergy_test=bindingEnergy_test,
                                cpus=param_args['cpus'])
                            regression_outputs = {
                                'scaler': scale,
                                'MLPR': [params, MLPR_pred, MLPR_R2_test, MLPR_MSE, MLPR_MAE]}
                            write_regression_report_MLPR(
                                regression_outputs=regression_outputs,
                                out_file=param_args['output'] + 'regression_report.out')
                    else:
                        # No algorithm selected: run all regressors and write
                        # a combined report.
                        best_lr, best_train_R2, best_test_R2 = LM.GBR_optimization(
                            train=train,
                            bindingEnergy_train=bindingEnergy_train,
                            test=test,
                            bindingEnergy_test=bindingEnergy_test)
                        pred, R2_test, MSE, MAE, epoch_pred = LM.GBR(
                            train=train,
                            bindingEnergy_train=bindingEnergy_train,
                            test=test,
                            bindingEnergy_test=bindingEnergy_test,
                            best_lr=best_lr)
                        XGBR_best_lr, XGBR_best_train_R2, XGBR_best_test_R2 = LM.XGBR_optimization(
                            train=train,
                            bindingEnergy_train=bindingEnergy_train,
                            test=test,
                            bindingEnergy_test=bindingEnergy_test,
                            cpus=param_args['cpus'])
                        XGBR_pred, XGBR_R2_test, XGBR_MSE, XGBR_MAE = LM.XGBR(
                            train=train,
                            bindingEnergy_train=bindingEnergy_train,
                            test=test,
                            bindingEnergy_test=bindingEnergy_test,
                            best_lr=XGBR_best_lr,
                            cpus=param_args['cpus'])
                        LGBR_best_lr, LGBR_best_train_R2, LGBR_best_test_R2 = LM.LGBR_optimization(
                            train=train,
                            bindingEnergy_train=bindingEnergy_train,
                            test=test,
                            bindingEnergy_test=bindingEnergy_test,
                            cpus=param_args['cpus'])
                        LGBR_pred, LGBR_R2_test, LGBR_MSE, LGBR_MAE = LM.LGBR(
                            train=train,
                            bindingEnergy_train=bindingEnergy_train,
                            test=test,
                            bindingEnergy_test=bindingEnergy_test,
                            best_lr=LGBR_best_lr,
                            cpus=param_args['cpus'])
                        MLPR_pred, MLPR_R2_test, MLPR_MSE, MLPR_MAE, params = LM.MLPR(
                            train=train,
                            bindingEnergy_train=bindingEnergy_train,
                            test=test,
                            bindingEnergy_test=bindingEnergy_test,
                            cpus=param_args['cpus'])
                        regression_outputs = {
                            'scaler': scale,
                            'GBR_optimization': [best_lr, best_train_R2, best_test_R2],
                            'GBR': [R2_test, MSE, MAE, epoch_pred],
                            'XGBR_optimization': [XGBR_best_lr, XGBR_best_train_R2, XGBR_best_test_R2],
                            'XGBR': [XGBR_R2_test, XGBR_MSE, XGBR_MAE],
                            'LGBR_optimization': [LGBR_best_lr, LGBR_best_train_R2, LGBR_best_test_R2],
                            'LGBR': [LGBR_pred, LGBR_R2_test, LGBR_MSE, LGBR_MAE],
                            'MLPR': [params, MLPR_pred, MLPR_R2_test, MLPR_MSE, MLPR_MAE]}
                        write_regression_report(
                            regression_outputs=regression_outputs,
                            out_file=param_args['output'] + 'regression_report.out')

    # ------------------------------------------------------------------
    # Timing epilogue: append the wall-clock run time to a per-run file.
    # ------------------------------------------------------------------
    end = time.time()
    print("\nRun time: {} seconds.".format(end - start))
    if 'path_graph' not in param_args:
        with open(param_args['output'] + "_run_times.txt", 'a') as rt:
            rt.write(str(end - start))
            rt.write('\n')
    else:
        with open(param_args['output'] + param_args['run_name'] + "_run_times.txt", 'a') as rt:
            rt.write(str(end - start))
            rt.write('\n')
def main(args):
    """
    Main function - launches the program.

    Flattens the OCDS contract JSON files into a DataFrame, coerces the
    date columns, restricts contracts to the configured date window, and
    writes the overview plus one chart-data JSON per dimension/comparison
    'lense' into the charts folder.

    :param args: The Parser arguments (expects ``args.source``)
    """
    if args:
        check_create_folder(settings.folder_charts)

        df = pd.DataFrame()
        # Read in the JSON files, flatten the contracts and add them to a DataFrame
        for f in list_files(args.source + '*'):
            df = flatten_contracts(f, df)

        # FIX: DataFrame.convert_objects(convert_dates='coerce') was
        # deprecated in pandas 0.17 and removed in 0.25;
        # pd.to_datetime(..., errors='coerce') is the supported equivalent.
        date_columns = ('contract_period_startDate', 'tender_publicationDate',
                        'tender_tenderPeriod_startDate', 'award_date')
        for col in date_columns:
            df[col] = pd.to_datetime(df[col], errors='coerce')

        # Cut every contract that's before a starting date
        start_date = datetime.strptime(settings.start_date_charts, '%Y-%m-%d')
        end_date = datetime.strptime(settings.end_date_charts, '%Y-%m-%d')
        df = df[(df[settings.main_date_contract] >= start_date) &
                (df[settings.main_date_contract] <= end_date)]

        # Generate the summary statistics, independent of comparison or slice
        overview_data = chartdata.generate_overview(df)
        with open(os.path.join(settings.folder_charts, 'general.json'), 'w') as outfile:
            json.dump(overview_data, outfile)

        for dimension in settings.dimensions:
            for comparison in settings.comparisons:
                # Each unique combination of dimension + comparison is a 'lense'
                lense_id = dimension + '--' + comparison['id']
                lense = {
                    'metadata': {'id': lense_id},
                    'charts': []
                }

                for chart in settings.charts:
                    if chart['dimension'] == dimension:
                        if chart['function']:
                            chart['meta']['data'] = []
                            previous_slice = False
                            d = {}

                            # Generate the chart data
                            for sl in comparison['slices']:
                                sliced_chart = {
                                    'id': sl['id'],
                                    'label': sl['label']
                                }

                                # Prep the dataframe, slice it or serve it full
                                if comparison['compare']:
                                    sliced_df = slice_df(df, comparison['compare'], sl['field'])
                                else:
                                    sliced_df = df

                                if not sliced_df.empty:
                                    current_slice = chart['function'](sliced_df)

                                    # Append the slice's data & meta-data
                                    sliced_chart['data'] = current_slice['data']
                                    chart['meta']['data'].append(sliced_chart)

                                    # Update the domain based on the slice
                                    for axis, func in chart['domain'].items():
                                        if previous_slice:
                                            d[axis] = func(d[axis], current_slice['domain'][axis])
                                        else:
                                            d[axis] = current_slice['domain'][axis]
                                    previous_slice = True

                            # Add the domain to the chart
                            for axis, func in chart['domain'].items():
                                chart['meta'][axis]['domain'] = d[axis]

                        # Append the chart data
                        lense['charts'].append(chart['meta'])

                file_name = os.path.join(settings.folder_charts, lense_id + '.json')
                with open(file_name, 'w') as outfile:
                    json.dump(lense, outfile)
def main(args):
    """
    Main function - launches the program.

    Flattens the OCDS contract JSON files into a DataFrame, coerces the
    date columns to datetimes, keeps only contracts inside the configured
    date window, and writes the overview JSON plus one chart-data JSON per
    dimension/comparison 'lense'.

    :param args: The Parser arguments (expects ``args.source``)
    """
    if args:
        check_create_folder(settings.folder_charts)

        df = pd.DataFrame()
        # Read in the JSON files, flatten the contracts and add them to a DataFrame
        for f in list_files(args.source + '*'):
            df = flatten_contracts(f, df)

        # FIX: DataFrame.convert_objects(convert_dates='coerce') was
        # deprecated in pandas 0.17 and removed in 0.25;
        # pd.to_datetime(..., errors='coerce') is the supported equivalent.
        df['contract_period_startDate'] = pd.to_datetime(
            df['contract_period_startDate'], errors='coerce')
        df['tender_publicationDate'] = pd.to_datetime(
            df['tender_publicationDate'], errors='coerce')
        df['tender_tenderPeriod_startDate'] = pd.to_datetime(
            df['tender_tenderPeriod_startDate'], errors='coerce')
        df['award_date'] = pd.to_datetime(df['award_date'], errors='coerce')

        # Cut every contract that's before a starting date
        start_date = datetime.strptime(settings.start_date_charts, '%Y-%m-%d')
        end_date = datetime.strptime(settings.end_date_charts, '%Y-%m-%d')
        df = df[(df[settings.main_date_contract] >= start_date) &
                (df[settings.main_date_contract] <= end_date)]

        # Generate the summary statistics, independent of comparison or slice
        overview_data = chartdata.generate_overview(df)
        with open(os.path.join(settings.folder_charts, 'general.json'),
                  'w') as outfile:
            json.dump(overview_data, outfile)

        for dimension in settings.dimensions:
            for comparison in settings.comparisons:
                # Each unique combination of dimension + comparison is a 'lense'
                lense_id = dimension + '--' + comparison['id']
                lense = {'metadata': {'id': lense_id}, 'charts': []}

                for chart in settings.charts:
                    if chart['dimension'] == dimension:
                        if chart['function']:
                            chart['meta']['data'] = []
                            previous_slice = False
                            d = {}

                            # Generate the chart data
                            for sl in comparison['slices']:
                                sliced_chart = {
                                    'id': sl['id'],
                                    'label': sl['label']
                                }

                                # Prep the dataframe, slice it or serve it full
                                if comparison['compare']:
                                    sliced_df = slice_df(
                                        df, comparison['compare'], sl['field'])
                                else:
                                    sliced_df = df

                                if not sliced_df.empty:
                                    current_slice = chart['function'](
                                        sliced_df)

                                    # Append the slice's data & meta-data
                                    sliced_chart['data'] = current_slice['data']
                                    chart['meta']['data'].append(sliced_chart)

                                    # Update the domain based on the slice
                                    for axis, func in chart['domain'].items():
                                        if previous_slice:
                                            d[axis] = func(
                                                d[axis],
                                                current_slice['domain'][axis])
                                        else:
                                            d[axis] = current_slice['domain'][axis]
                                    previous_slice = True

                            # Add the domain to the chart
                            for axis, func in chart['domain'].items():
                                chart['meta'][axis]['domain'] = d[axis]

                        # Append the chart data
                        lense['charts'].append(chart['meta'])

                file_name = os.path.join(settings.folder_charts,
                                         lense_id + '.json')
                with open(file_name, 'w') as outfile:
                    json.dump(lense, outfile)