Example #1
import os
import sys
import textwrap
import traceback

# Project-specific helpers (properties, get_connection, process_arguments,
# fetch_datahub_metadatafile, ...) are assumed to be imported from the
# project's own modules, which are not shown in this example.


def main():
    """
    Main call to the data_provider scripts.
    :return: None
    """
    options = process_arguments(sys.argv[1:])
    properties_file = options.properties_file
    # properties() (from the SelectaDB module) returns a properties object
    prop = properties(properties_file)
    # get_connection() returns a PostgreSQL connection object
    conn = get_connection(prop.dbuser, prop.dbpassword, prop.dbhost,
                          prop.dbname, prop.dbport)
    workdir = prop.workdir_input
    lsf = prop.lsf
    error_list = list()
    # accounts is a list of dictionaries of user accounts and passwords
    accounts = get_datahub_accounts(conn)
    # selections is a list of datahub account names, e.g. dcc_allison
    selections = get_selection_to_attributes_account(conn)
    print('-' * 100)
    print("SELECTION_LIST: {}".format(selections))
    print('-' * 100)
    global log_file
    log_file = 'log.txt'
    final_errors = ''
    for account in accounts:
        print('-' * 100)
        print('Looping in account lists {}'.format(account))
        print(selections)
        print('-' * 100)
        account_id = account['account_id']
        if account_id in selections and account_id != 'dcc_fake':
            # fetch the metadata file for this datahub account
            outputfile, job = fetch_datahub_metadatafile(account, workdir, lsf)
            localinfo = "account {} has the corresponding metadata file {} ".format(
                account, outputfile)
            print(ruler)  # ruler is assumed to be a module-level separator string
            print(localinfo)
            print(ruler)
            # get_selection_info() returns a list of SELECTADB SELECTION objects,
            # each made up of selection_id, tax_id, study_acc, run_acc,
            # pipeline_name, analysis_id, public, webin and datahub
            # (e.g. dcc_allison), fetched from the process_selection table
            selection_all = get_selection_info(conn, account_id)
            print('-' * 100)
            print('\n'.join(map(str, [a.__dict__ for a in selection_all])))
            print('-' * 100)

            if os.stat(outputfile).st_size == 0:
                for select in selection_all:
                    print('File size is zero .....')
                    message = "Failed to download the metadata file {} from account {}".format(
                        outputfile, account_id)
                    log_error(message, 'error', log_file)
                    error_list.append(message)
                    print(outputfile,
                          select.selection_id,
                          select.pipeline_name,
                          select.datahub,
                          select.study_accession,
                          file=sys.stdout)
                    # single quotes are already stripped element by element here
                    final_errors = ' '.join(
                        str(v).replace("'", "") for v in error_list)
                    # TODO: if the metadata file is not found, just log it to the
                    # log file and clear the start column so it can be re-run
                    print(ruler)
                    print(textwrap.fill(final_errors, 250))
                    print("SELECTION_ID: {} ".format(select.selection_id))
                    print(ruler)
                    set_error(conn, select.selection_id, final_errors)
            else:
                print('Metadata found with following id:{}'.format(account_id),
                      file=sys.stdout)
                try:
                    # get_default_attributes() extracts the following columns from
                    # the tab-delimited metadata file: tax_id, scientific_name,
                    # sample_accession, secondary_sample_accession,
                    # experiment_accession, study_accession,
                    # secondary_study_accession, run_accession, center_name,
                    # instrument_model, fastq_ftp and fastq_md5.
                    # attributes_all is a list containing all the data for a given
                    # dcc user, with process_id, selection_id and datahub set to ''
                    # since the latter are missing from the metadata file.
                    attributes_all = get_default_attributes(outputfile)
                    print('_' * 100)
                    print(
                        "Processing datahub:{}\nInitial attributes_all length {}"
                        .format(account_id, len(attributes_all)))
                    print('_' * 100)
                    all_run_accs = []
                except Exception:
                    # TODO: in case the study exists, we shouldn't raise a
                    # database error; just log it and clear the start column so
                    # it can be re-run
                    print("ERROR: Cannot process {} file".format(outputfile),
                          file=sys.stderr)
                    exc_type, exc_value, exc_traceback = sys.exc_info()
                    err_lines = traceback.format_exception(
                        exc_type, exc_value, exc_traceback)
                    message = '\n'.join(err_lines)
                    print("message:{}".format(message))
                    error_list.append(message)
                    error_list.append(
                        "ERROR: Cannot process {} file".format(outputfile))
                    print("Exception: {}".format(message), file=sys.stderr)

                for select in selection_all:
                    # each select holds selection_id, tax_id, study_acc, run_acc,
                    # pipeline_name, analysis_id, public, webin,
                    # selection_to_attribute_end, process_type, continuity and
                    # datahub (e.g. dcc_allison), fetched from process_selection.
                    # process_type determines what is processed below: a study,
                    # a single run, or a whole datahub.
                    print('-' * 100)
                    print(
                        "Processing Selection_id:{}\nwith process_type:{}\nand continuity:{}"
                        .format(select.selection_id, select.process_type,
                                select.continuity))
                    print('-' * 100)
                    processing_type = select.process_type
                    continuity = select.continuity
                    selection_to_attribute_end = select.selection_to_attribute_end
                    if not error_list:
                        print(
                            "\n\nThe processing of the metadata with project:{} datahub:{} had zero errors:\n"
                            .format(select.study_accession, select.datahub))

                        try:
                            # check_started() returns True or False, i.e. whether
                            # this selection_id in process_selection still has
                            # selection_to_attribute_start null
                            if not check_started(conn, select.selection_id):
                                # set_started() sets selection_to_attribute_start
                                # to now() for this selection_id in
                                # process_selection. From here the continuity
                                # value in the select object decides how to
                                # proceed.
                                set_started(conn, select.selection_id)

                                if continuity.lower() == "yes":
                                    # update the process_stage and process_report
                                    # tables for the subset of run ids not
                                    # already processed
                                    print("CONTINUITY YES met")
                                    already_ran_run_accs = already_ran_runs(
                                        conn, select.selection_id,
                                        processing_type)
                                    print("Already ran runs: {}".format(
                                        len(already_ran_run_accs)))
                                    # exclude already processed runs, if any
                                    print([
                                        attr.run_accession
                                        for attr in attributes_all
                                    ][1:10])
                                    attributes_all = exclude_processed_run(
                                        select, attributes_all,
                                        already_ran_run_accs)
                                    print('*' * 100)
                                    print([
                                        attr.run_accession
                                        for attr in attributes_all
                                    ][1:10])
                                    print('*' * 100)
                                    print('*' * 100)
                                    print("New attribute_all length {}".format(
                                        len(attributes_all)))
                                    print(attributes_all)
                                    print('*' * 100)

                                for attr in attributes_all:
                                    # 'study': select runs under the study_id in
                                    # process_selection
                                    if processing_type.lower() == 'study' and \
                                        select.study_accession.strip() == attr.study_accession.strip():
                                        report_process = Process_report(
                                            select, attr, error_list)
                                        report_process.log_process_report_info(
                                            conn)
                                    elif processing_type.lower() == 'run' and \
                                        select.run_accession.strip() == attr.run_accession.strip():
                                        print("*" * 100)
                                        print(processing_type)
                                        print("*" * 100)
                                        all_run_accs = [
                                            attr.run_accession
                                            for attr in attributes_all
                                            if select.run_accession.strip() ==
                                            attr.run_accession.strip()
                                        ]
                                        """ Select specific run id for processing """
                                        report_process = Process_report(
                                            select, attr, error_list)
                                        report_process.log_process_report_info(
                                            conn)
                                    elif processing_type.lower() == 'datahub':
                                        all_run_accs = [
                                            attr.run_accession
                                            for attr in attributes_all
                                        ]
                                        """ Select all runs under the dcc_hub account for processing """
                                        report_process = Process_report(
                                            select, attr, error_list)
                                        report_process.log_process_report_info(
                                            conn)

                        except Exception:
                            print(
                                "ERROR: Cannot process selection_id {}".format(
                                    select.selection_id),
                                file=sys.stderr)
                            message = str(sys.exc_info())
                            error_list.append(message)
                            print("Exception: {}".format(message),
                                  file=sys.stderr)
                            final_errors = ' '.join(
                                str(v).replace("'", "") for v in error_list)
                            print('*' * 100)
                            print(final_errors)
                            print('*' * 100)
                            set_error(conn, select.selection_id,
                                      final_errors.replace("'", ""))
                            error_list = list()

                    else:
                        final_errors = ' '.join(
                            str(v).replace("'", "") for v in error_list)
                        print("ERRORs: {}".format(final_errors),
                              file=sys.stderr)
                        set_error(conn, select.selection_id,
                                  final_errors.replace("'", ""))
                    print('-' * 100)
                    """ Amend various SELECTA table with metadata from each SELECTION_ID"""

                    print('SELECT From SELECTION_ALL next Loop')
                error_list = list()
    conn.close()
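
A minimal sketch of what exclude_processed_run() is assumed to do in the continuity branch above: keep only the attribute rows whose run accession has not already been processed for this selection. The signature and field names are inferred from the call sites; the body itself is hypothetical.

def exclude_processed_run(select, attributes_all, already_ran_run_accs):
    # set of run accessions already processed for this selection
    processed = {acc.strip() for acc in already_ran_run_accs}
    # keep only the attribute rows whose run accession has not been seen yet
    return [attr for attr in attributes_all
            if attr.run_accession.strip() not in processed]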
Example #2
        # post_submission() returns '' on success, or an error message built
        # from the submission return code and output
        post_submission_error = post_submission(submission_error_messages,
                                                returncode, out, err)
        if post_submission_error != '':
            error_list.append(post_submission_error)
        terminate(conn, analysis_reporter_stage, extract_analysis_id(out),
                  extract_submission_id(out))
        error_list = list()
        return True
    else:
        return False
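
A minimal sketch of what post_submission() is assumed to do at the call site above: turn a failed submission into a single error string, returning '' on success. The name and parameters come from the call site; the body is an assumption.

def post_submission(submission_error_messages, returncode, out, err):
    # a non-zero return code means the submission command itself failed
    if returncode != 0:
        return "submission failed with exit code {}: {}".format(returncode, err)
    # submission_error_messages is assumed to be a list of known error
    # strings to look for in the receipt output
    for msg in submission_error_messages:
        if msg in out:
            return msg
    return ''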


if __name__ == '__main__':
    get_args()  # get_args() is assumed to set the module-level properties_file
    prop = properties(properties_file)
    print('-' * 100)
    conn = get_connection(prop.dbuser, prop.dbpassword, prop.dbhost,
                          prop.dbname, prop.dbport)
    print('-' * 100)
    print("Pipeline_version:{}".format(prop.uantwerp_bacpipe_version))
    print("analysis_submission_url_dev:", prop.analysis_submission_url_dev)
    print("analysis_submission_url_prod:", prop.analysis_submission_url_prod)
    print("analysis_submission_action:", prop.analysis_submission_action)
    print("analysis_submission_mode:", prop.analysis_submission_mode)
    print("DB_NAME: {}".format(prop.dbname))
    print("DB_HOST: {}".format(prop.dbhost))
    print('-' * 100)
    analysis_reporter_list = get_list(conn)
    error_list = list()
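
A minimal sketch of how get_connection() could be implemented with psycopg2, based on the parameters passed throughout these examples. This is an assumption; the project's actual implementation is not shown here.

import psycopg2

def get_connection(dbuser, dbpassword, dbhost, dbname, dbport):
    # open a PostgreSQL connection with the credentials from the properties file
    return psycopg2.connect(user=dbuser, password=dbpassword, host=dbhost,
                            dbname=dbname, port=dbport)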
Example #3
import os
import sys

# Project-specific helpers (properties, get_connection, get_list, bsub,
# readoutfile, ...) are again assumed to come from the project's own modules.


def main():
    """
    Main call to the data_provider scripts.
    :return: None
    """
    options = process_arguments(sys.argv[1:])
    properties_file = options.properties_file
    prop = properties(properties_file)

    error_list = list()
    lsf = prop.lsf
    if options.db_live:
        try:
            print('-' * 100)
            print("PostGres DB is live and accepting connection")
            conn = get_connection(prop.dbuser, prop.dbpassword, prop.dbhost,
                                  prop.dbname, prop.dbport)
            print('-' * 100)
        except Exception:
            print(sys.exc_info())
    else:
        conn = get_connection(prop.dbuser, prop.dbpassword, prop.dbhost,
                              prop.dbname, prop.dbport)
        data_provider_list = get_list(conn)
        print(data_provider_list)
        process_jobids = {}
        for data_provider_stage in data_provider_list:
            print(data_provider_stage.process_id,
                  data_provider_stage.selection_id,
                  data_provider_stage.stage_list)
            if not data_provider_stage.check_started(conn):
                print("\nTo be started job: process_id: {} "
                      "selection_id: {} stage_list: {}".format(
                          data_provider_stage.process_id,
                          data_provider_stage.selection_id,
                          data_provider_stage.stage_list))
                data_provider_stage.set_started(conn)
                process_dir = prop.workdir + data_provider_stage.process_id
                print("Creating process directory:{}".format(process_dir))
                create_processing_dir(process_dir)
                account_name = get_datahub_names(
                    conn, data_provider_stage.process_id)
                print("account to be processed:{}".format(account_name))
                files = get_file_names(conn, data_provider_stage.process_id)
                print("Files to be downloaded:{}".format(files))
                pass_word = get_datahub_account_password(conn, account_name)
                process_id = data_provider_stage.process_id
                jobids = download_datahub_file(account_name,
                                               pass_word,
                                               files,
                                               process_dir,
                                               process_id,
                                               lsf,
                                               dryrun=False)
                """
                We should be able to capture the .err and .out lsf output into the
                database. Maybe define a a generic lsf_stat class, that will match in
                .out the "Successfully completed" string if true set length of error_list to 0
                other wise logs the full path to the .out file in database
                """
                if not lsf:
                    #if len(error_list) != 0:
                    if len(error_list):
                        final_errors = '\n'.join(
                            str(v).replace("'", "") for v in error_list)
                        data_provider_stage.set_error(conn, final_errors)
                    else:
                        data_provider_stage.set_finished(conn)
                elif lsf:
                    # build the expected LSF output file paths, one per job id:
                    # data_provider_<process_id>.<jobid>.err and .out
                    err = [
                        os.getcwd() + '/data_provider_' + process_id + '.' + y
                        for y in [x + '.err' for x in jobids]
                    ]
                    out = [
                        os.getcwd() + '/data_provider_' + process_id + '.' + y
                        for y in [x + '.out' for x in jobids]
                    ]
                    final_errors = '\n'.join(
                        str(v).replace("'", "") for v in out)
                    print(final_errors)
                    process_jobids[process_id] = out
            error_list = list()
            if lsf:
                print(process_jobids)
        """
        We should check for the content of lsmyfile.out file and store the 
        full path of the error and out file in DB
        """
        if lsf:
            for data_provider_stage in data_provider_list:
                process_id = data_provider_stage.process_id
                for lsf_out in process_jobids[process_id]:
                    print('*' * 100)
                    print(lsf_out)
                    print('*' * 100)
                    jobid = lsf_out.split('.')[-2]
                    bsub.poll(jobid)
                    if os.path.isfile(lsf_out):
                        print("Processing lsmyfile.out for: jobid {}".format(
                            jobid))
                        print("Processing: {}".format(lsf_out))
                        print('*' * 100)
                        localexitcode = readoutfile(lsf_out, jobid)
                        print(localexitcode)
                        if localexitcode != 0:
                            final_errors = lsf_out + ' with exit code ' + str(
                                localexitcode)
                            data_provider_stage.set_error(conn, final_errors)
                        else:
                            data_provider_stage.set_finished(conn)
                        print('*' * 100)
                    else:
                        print("Awaiting completion of: jobid {}".format(jobid))
                        print("Processing: {}".format(lsf_out))
                        print('*' * 100)
                        if os.path.isfile(lsf_out):
                            localexitcode = readoutfile(lsf_out, jobid)
                            print(localexitcode)
                            if localexitcode != 0:
                                final_errors = lsf_out + ' with exit code ' + str(
                                    localexitcode)
                                data_provider_stage.set_error(
                                    conn, final_errors)
                            else:
                                data_provider_stage.set_finished(conn)
                        else:
                            bsub.poll(jobid)

        conn.close()
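
A minimal sketch of the readoutfile() helper used above: scan an LSF .out job report for the "Successfully completed" marker and return 0 on success, otherwise return the exit code parsed from LSF's "Exited with exit code N." line. The name and return convention are taken from the call sites; the body is an assumption.

import re

def readoutfile(lsf_out, jobid):
    # read the whole LSF job report for this job id
    with open(lsf_out) as handle:
        content = handle.read()
    if "Successfully completed" in content:
        return 0
    # LSF reports failures as "Exited with exit code N."
    match = re.search(r"Exited with exit code (\d+)", content)
    return int(match.group(1)) if match else 1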