import os
import sys
import textwrap
import traceback


def main():
    """
    Main call to the data_provider scripts.

    :return: None
    """
    # properties_file = get_args()
    options = process_arguments(sys.argv[1:])
    properties_file = options.properties_file
    prop = properties(properties_file)
    """ From class SelectaDB: return a properties object """
    conn = get_connection(prop.dbuser, prop.dbpassword, prop.dbhost,
                          prop.dbname, prop.dbport)
    """ Return a PostgreSQL connection object to localhost """
    workdir = prop.workdir_input
    lsf = prop.lsf
    error_list = list()
    accounts = get_datahub_accounts(conn)
    """ accounts is a list of dictionaries of user accounts and passwords """
    selections = get_selection_to_attributes_account(conn)
    print('-' * 100)
    print("SELECTION_LIST: {}".format(selections))
    print('-' * 100)
    """ selections is a list of datahub accounts, e.g. dcc_allison """
    global log_file
    log_file = 'log.txt'
    final_errors = ''
    for account in accounts:
        print('-' * 100)
        print('Looping in account list {}'.format(account))
        print(selections)
        print('-' * 100)
        account_id = account['account_id']
        if account_id in selections and account_id != 'dcc_fake':
            outputfile, job = fetch_datahub_metadatafile(account, workdir, lsf)
            """ Get the metadata file for each datahub account """
            localinfo = "account {} has the corresponding metadata file {}".format(
                account, outputfile)
            print(ruler)  # ruler: separator string defined elsewhere in this module
            print(localinfo)
            print(ruler)
            """
            get_selection_info returns a list of SELECTION objects (selectadb
            class), each made up of selection_id, tax_id, study_acc, run_acc,
            pipeline_name, analysis_id, public, webin and datahub
            (e.g. dcc_allison). These are fetched from the process_selection
            table.
            """
            selection_all = get_selection_info(conn, account_id)
            print('-' * 100)
            print('\n'.join(map(str, [a.__dict__ for a in selection_all])))
            print('-' * 100)
            if os.stat(outputfile).st_size == 0:
                for select in selection_all:
                    print('File size is zero .....')
                    message = "Failed to download the metadata file {} from account {}".format(
                        outputfile, account_id)
                    log_error(message, 'error', log_file)
                    error_list.append(message)
                    print(outputfile, select.selection_id,
                          select.pipeline_name, select.datahub,
                          select.study_accession, file=sys.stdout)
                    final_errors = ' '.join(
                        str(v).replace("'", "") for v in error_list)
                    # TODO: if the metadata file is not found, just log it to
                    # the log file and clear the start column so the selection
                    # can be re-run.
                    print(ruler)
                    print(textwrap.fill(final_errors, 250))
                    print("SELECTION_ID: {} ".format(select.selection_id))
                    print(ruler)
                    set_error(conn, select.selection_id, final_errors)
            else:
                print('Metadata found with following id:{}'.format(account_id),
                      file=sys.stdout)
                try:
                    attributes_all = get_default_attributes(outputfile)
                    print('_' * 100)
                    print("Processing datahub :{}\nInitial attribute_all length {}"
                          .format(account_id, len(attributes_all)))
                    print('_' * 100)
                    all_run_accs = []
                    """
                    get_default_attributes extracts the following columns from
                    the tab-delimited metadata file: tax_id, scientific_name,
                    sample_accession, secondary_sample_accession,
                    experiment_accession, study_accession,
                    secondary_study_accession, run_accession, center_name,
                    instrument_model, fastq_ftp, fastq_md5.
                    attributes_all is a list containing all the data for a
                    given dcc user, with process_id, selection_id and datahub
                    set to '' since these are missing from the metadata file.
                    """
                except Exception:
                    # TODO: if the study exists we should not throw a database
                    # error; just log it and clear the start column so the
                    # selection can be re-run.
                    print("ERROR: Cannot process {} file".format(outputfile),
                          file=sys.stderr)
                    # message = "Exception: " + str(sys.exc_info()[1])
                    exc_type, exc_value, exc_traceback = sys.exc_info()
                    err_lines = traceback.format_exception(
                        exc_type, exc_value, exc_traceback)
                    message = '\n'.join(err_lines)
                    print("message:{}".format(message))
                    error_list.append(message)
                    error_list.append(
                        "ERROR: Cannot process {} file".format(outputfile))
                    print("Exception: {}".format(message), file=sys.stderr)
                for select in selection_all:
                    """
                    selection_all holds selection_id, tax_id, study_acc,
                    run_acc, pipeline_name, analysis_id, public, webin,
                    selection_to_attribute_end, process_type, continuity and
                    datahub (e.g. dcc_allison), fetched from process_selection.
                    """
                    print('-' * 100)
                    print("Processing Selection_id:{}\nwith process_type:{}\nand continuity:{}"
                          .format(select.selection_id, select.process_type,
                                  select.continuity))
                    print('-' * 100)
                    processing_type = select.process_type
                    continuity = select.continuity
                    selection_to_attribute_end = select.selection_to_attribute_end
                    if not error_list:
                        print("\n\nThe processing of the metadata with project:{} datahub:{} had zero errors:\n"
                              .format(select.study_accession, select.datahub))
                        try:
                            if not check_started(conn, select.selection_id):
                                """
                                check_started returns True or False for the
                                selection process start, i.e. whether this
                                selection_id in process_selection has a null
                                selection_to_attribute_start.
                                """
                                set_started(conn, select.selection_id)
                                """
                                set_started sets selection_to_attribute_start
                                to now() for this selection_id in
                                process_selection. The continuity value in the
                                select object decides how to proceed further.
                                """
                                # study_to_run_acc = list()
                                # datahub_to_run_acc = list()
                                # run_to_run_acc = list()
                                if continuity.lower() == "yes":
                                    """
                                    Update the process_stage and process_report
                                    tables for the subset of run_ids not
                                    already processed.
                                    """
                                    print("CONTINUITY YES met")
                                    already_ran_run_accs = already_ran_runs(
                                        conn, select.selection_id,
                                        processing_type)
                                    print("Already ran runs: {}".format(
                                        len(already_ran_run_accs)))
                                    """ Exclude already processed runs, if any """
                                    print([attr.run_accession
                                           for attr in attributes_all][1:10])
                                    attributes_all = exclude_processed_run(
                                        select, attributes_all,
                                        already_ran_run_accs)
                                    print('*' * 100)
                                    print([attr.run_accession
                                           for attr in attributes_all][1:10])
                                    print('*' * 100)
                                    print('*' * 100)
                                    print("New attribute_all length {}".format(
                                        len(attributes_all)))
                                    print(attributes_all)
                                    print('*' * 100)
                                for attr in attributes_all:
                                    """ Select runs under the study_id in process_selection """
                                    if processing_type.lower() == 'study' and \
                                            select.study_accession.strip() == attr.study_accession.strip():
                                        report_process = Process_report(
                                            select, attr, error_list)
                                        report_process.log_process_report_info(conn)
                                    elif processing_type.lower() == 'run' and \
                                            select.run_accession.strip() == attr.run_accession.strip():
                                        print("*" * 100)
                                        print(processing_type)
                                        print("*" * 100)
                                        all_run_accs = [
                                            a.run_accession
                                            for a in attributes_all
                                            if select.run_accession.strip() == a.run_accession.strip()
                                        ]
                                        """ Select a specific run id for processing """
                                        report_process = Process_report(
                                            select, attr, error_list)
                                        report_process.log_process_report_info(conn)
                                    elif processing_type.lower() == 'datahub':
                                        all_run_accs = [
                                            a.run_accession
                                            for a in attributes_all
                                        ]
                                        """ Select all runs under the dcc_hub account for processing """
                                        report_process = Process_report(
                                            select, attr, error_list)
                                        report_process.log_process_report_info(conn)
                        except Exception:
                            print("ERROR: Cannot process selection_id {}".format(
                                select.selection_id), file=sys.stderr)
                            # message = str(sys.exc_info()[1])
                            message = str(sys.exc_info())
                            error_list.append(message)
                            print("Exception: {}".format(message),
                                  file=sys.stderr)
                            final_errors = ' '.join(
                                str(v).replace("'", "") for v in error_list)
                            print('*' * 100)
                            print(final_errors)
                            print('*' * 100)
                            set_error(conn, select.selection_id,
                                      final_errors.replace("'", ""))
                            error_list = list()
                    else:
                        final_errors = ' '.join(
                            str(v).replace("'", "") for v in error_list)
                        print("ERRORs: {}".format(final_errors),
                              file=sys.stderr)
                        set_error(conn, select.selection_id,
                                  final_errors.replace("'", ""))
                        print('-' * 100)
                    """ Amend the various SELECTA tables with metadata from each SELECTION_ID """
                    print('SELECT From SELECTION_ALL next Loop')
                    error_list = list()
    conn.close()
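
# --- Illustrative sketch (not part of the original pipeline) ---------------
# The docstring in main() lists the columns that get_default_attributes()
# pulls out of the tab-delimited metadata report. The helper below is a
# minimal, hypothetical sketch of such a parser, assuming the first line of
# the file is a header row using exactly those column names; the real
# implementation may differ.
import csv


def sketch_parse_metadata(metadata_path):
    """Parse a tab-delimited metadata report into a list of dicts (sketch)."""
    wanted = ['tax_id', 'scientific_name', 'sample_accession',
              'secondary_sample_accession', 'experiment_accession',
              'study_accession', 'secondary_study_accession',
              'run_accession', 'center_name', 'instrument_model',
              'fastq_ftp', 'fastq_md5']
    rows = []
    with open(metadata_path, newline='') as handle:
        for record in csv.DictReader(handle, delimiter='\t'):
            row = {key: record.get(key, '') for key in wanted}
            # process_id, selection_id and datahub are not in the report,
            # so they start out empty, as described in the docstring above.
            row.update({'process_id': '', 'selection_id': '', 'datahub': ''})
            rows.append(row)
    return rows
# ---------------------------------------------------------------------------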
    # (tail of the enclosing submission-handling function; its definition is
    # not shown in this excerpt)
    post_submission_error = post_submission(submission_error_messages,
                                            returncode, out, err)
    if post_submission_error != '':
        # pass
        error_list.append(post_submission_error)
        terminate(conn, analysis_reporter_stage, extract_analysis_id(out),
                  extract_submission_id(out))
        error_list = list()
        return True
    else:
        return False


if __name__ == '__main__':
    # get_args() is expected to set the module-level properties_file used below
    get_args()
    prop = properties(properties_file)
    print('-' * 100)
    print('-' * 100)
    conn = get_connection(prop.dbuser, prop.dbpassword, prop.dbhost,
                          prop.dbname, prop.dbport)
    print('-' * 100)
    print("Pipeline_version:{}".format(prop.uantwerp_bacpipe_version))
    print("analysis_submission_url_dev:", prop.analysis_submission_url_dev)
    print("analysis_submission_url_prod:", prop.analysis_submission_url_prod)
    print("analysis_submission_action:", prop.analysis_submission_action)
    print("analysis_submission_mode:", prop.analysis_submission_mode)
    print("DB_NAME: {}".format(prop.dbname))
    print("DB_HOST: {}".format(prop.dbhost))
    print('-' * 100)
    analysis_reporter_list = get_list(conn)
    error_list = list()
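
# --- Illustrative sketch (not part of the original script) ------------------
# terminate() above is passed extract_analysis_id(out) and
# extract_submission_id(out). The helpers below are hypothetical sketches of
# what such extraction could look like, assuming `out` holds the text of a
# submission receipt in which the analysis and submission accessions appear
# as ERZ/ERA identifiers; the real helpers and the receipt format may differ.
import re


def sketch_extract_analysis_id(receipt_text):
    """Return the first ERZ analysis accession found in the receipt, or ''."""
    match = re.search(r'\bERZ\d+\b', receipt_text)
    return match.group(0) if match else ''


def sketch_extract_submission_id(receipt_text):
    """Return the first ERA submission accession found in the receipt, or ''."""
    match = re.search(r'\bERA\d+\b', receipt_text)
    return match.group(0) if match else ''
# ----------------------------------------------------------------------------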
def main():
    """
    Main call to the data_provider scripts.

    :return: None
    """
    options = process_arguments(sys.argv[1:])
    properties_file = options.properties_file
    prop = properties(properties_file)
    error_list = list()
    # get_args()
    # prop = properties(properties_file)
    lsf = prop.lsf
    if options.db_live:
        try:
            print('-' * 100)
            print("Postgres DB is live and accepting connections")
            conn = get_connection(prop.dbuser, prop.dbpassword, prop.dbhost,
                                  prop.dbname, prop.dbport)
            print('-' * 100)
        except Exception:
            # NOTE: conn stays undefined if the connection attempt fails.
            print(sys.exc_info())
    else:
        # print(prop)
        conn = get_connection(prop.dbuser, prop.dbpassword, prop.dbhost,
                              prop.dbname, prop.dbport)
    data_provider_list = get_list(conn)
    print(data_provider_list)
    process_jobids = {}
    for data_provider_stage in data_provider_list:
        print(data_provider_stage.process_id,
              data_provider_stage.selection_id,
              data_provider_stage.stage_list)
        if not data_provider_stage.check_started(conn):
            print("\nTo be started job: process_id:{} selection_id: {} dataprovider id: {} ".format(
                data_provider_stage.process_id,
                data_provider_stage.selection_id,
                data_provider_stage.stage_list))
            data_provider_stage.set_started(conn)
            process_dir = prop.workdir + data_provider_stage.process_id
            print("Creating process directory:{}".format(process_dir))
            create_processing_dir(process_dir)
            account_name = get_datahub_names(conn,
                                             data_provider_stage.process_id)
            print("account to be processed:{}".format(account_name))
            files = get_file_names(conn, data_provider_stage.process_id)
            print("Files to be downloaded:{}".format(files))
            pass_word = get_datahub_account_password(conn, account_name)
            process_id = data_provider_stage.process_id
            jobids = download_datahub_file(account_name, pass_word, files,
                                           process_dir, process_id, lsf,
                                           dryrun=False)
            """
            We should be able to capture the LSF .err and .out output in the
            database. Maybe define a generic lsf_stat class that matches the
            "Successfully completed" string in the .out file; if found, set
            the length of error_list to 0, otherwise log the full path to the
            .out file in the database.
            """
            if not lsf:
                # if len(error_list) != 0:
                if len(error_list):
                    final_errors = '\n'.join(
                        str(v).replace("'", "") for v in error_list)
                    data_provider_stage.set_error(conn, final_errors)
                else:
                    data_provider_stage.set_finished(conn)
            elif lsf:
                err = [os.getcwd() + '/data_provider_' + process_id + '.' + y
                       for y in [x + '.err' for x in jobids]]
                out = [os.getcwd() + '/data_provider_' + process_id + '.' + y
                       for y in [x + '.out' for x in jobids]]
                final_errors = '\n'.join(str(v).replace("'", "") for v in out)
                print(final_errors)
                process_jobids[process_id] = out
        error_list = list()
    if lsf:
        print(process_jobids)
    """
    We should check the content of the LSF .out file and store the full path
    of the .err and .out files in the database.
    """
    if lsf:
        for data_provider_stage in data_provider_list:
            process_id = data_provider_stage.process_id
            # Stages that were already started have no entry in process_jobids,
            # so fall back to an empty list to avoid a KeyError.
            for lsf_out in process_jobids.get(process_id, []):
                print('*' * 100)
                print(lsf_out)
                print('*' * 100)
                jobid = lsf_out.split('.')[-2]
                bsub.poll(jobid)
                if os.path.isfile(lsf_out):
                    print("Processing the LSF .out file for jobid {}".format(jobid))
                    print("Processing: {}".format(lsf_out))
                    print('*' * 100)
                    localexitcode = readoutfile(lsf_out, jobid)
                    print(localexitcode)
                    if localexitcode != 0:
                        final_errors = lsf_out + ' with exit code ' + str(localexitcode)
                        data_provider_stage.set_error(conn, final_errors)
                    else:
                        data_provider_stage.set_finished(conn)
                    print('*' * 100)
                else:
                    print("Awaiting completion of jobid {}".format(jobid))
                    print("Processing: {}".format(lsf_out))
                    print('*' * 100)
                    # bsub.poll(jobid)
                    if os.path.isfile(lsf_out):
                        localexitcode = readoutfile(lsf_out, jobid)
                        print(localexitcode)
                        if localexitcode != 0:
                            final_errors = lsf_out + ' with exit code ' + str(localexitcode)
                            data_provider_stage.set_error(conn, final_errors)
                        else:
                            data_provider_stage.set_finished(conn)
                    else:
                        bsub.poll(jobid)
    conn.close()
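
# --- Illustrative sketch (not part of the original script) ------------------
# The comment in main() above describes how the LSF .out file should be
# checked: the job succeeded if it contains "Successfully completed",
# otherwise the exit code should be recorded. Below is a minimal sketch of
# such a check, assuming the standard wording LSF writes into its output
# files ("Successfully completed." / "Exited with exit code N."); the real
# readoutfile() may behave differently.
import re


def sketch_read_lsf_outfile(path):
    """Return 0 on success, the LSF exit code on failure, -1 if unknown."""
    with open(path) as handle:
        text = handle.read()
    if 'Successfully completed' in text:
        return 0
    match = re.search(r'Exited with exit code (\d+)', text)
    if match:
        return int(match.group(1))
    return -1  # could not determine the outcome from the file
# ----------------------------------------------------------------------------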