示例#1
0
def main():
    """Run the data-application pipeline end to end.

    Steps, in order:

    1. Read the arguments from ``parse_args()`` and initialise logging.
    2. Parse the application file (``args.app``) and create its output
       directory.
    3. If the application allows no data types, log a warning and return.
    4. Collect metadata for the requested cohorts and apply the consent
       filter.
    5. Look up the FASTQ, BAM, BAI and VCF files for the requested file
       types.
    6. Produce the output files: anonymised copies/links when
       ``'Anonymised'`` is allowed, otherwise plain links when
       ``'Re-identifiable'`` is allowed. Metadata is written to
       ``args.metaout`` in both cases.
    7. Generate MD5 checksums for the output files.

    Raises:
        SystemExit: with code ``ERROR_BAD_ALLOWED_DATA`` when the allowed
            data types are non-empty but contain neither ``'Anonymised'``
            nor ``'Re-identifiable'``.
    """
    # Imported locally so the error path does not depend on the `exit`
    # helper that the `site` module injects (missing under `python -S` and
    # in frozen interpreters).
    import sys

    args = parse_args()
    init_log(args.log)
    with open(args.app) as app_file:
        # Parse and validate the requested data application JSON file.
        application = Application(app_file)
        logging.info("Input data application parsed: {}".format(args.app))
        # Create output directory for the results.
        application_dir = create_app_dir(application)
        # Check what data types are allowed for this application.
        allowed_data_types = application.allowed_data_types()
        logging.info("Allowed data types: {}".format(
            ' '.join(allowed_data_types)))

        # Guard clause: nothing to do when no data type is allowed.
        if not allowed_data_types:
            logging.warning("No data available for this application")
            return

        # Get all the sample metadata for all requested cohorts.
        requested_cohorts = application.cohorts()
        metadata = Metadata(args.data, requested_cohorts)
        logging.info("Metadata collected for requested cohorts: {}".format(
            ' '.join(requested_cohorts)))
        metadata_sample_ids = sorted(metadata.get_sample_ids())
        logging.info("Metadata for sample IDs: {}".format(
            ' '.join(metadata_sample_ids)))

        # Filter the sample metadata based on patient consent.
        metadata.filter_consent(args.consent, allowed_data_types)
        # NOTE(review): this warning is emitted unconditionally even though
        # filter_consent() was just called -- confirm whether consent
        # filtering is actually implemented and drop the warning if so.
        logging.warning("Consent not handled yet. FIXME")

        # Find all the file paths for requested file types for each
        # consented sample.
        requested_file_types = application.file_types()
        logging.info("Requested file types: {}".format(
            ' '.join(requested_file_types)))
        fastqs, bams, bais, vcfs = get_files(args.data,
                                             requested_file_types,
                                             metadata)
        logging.info("VCF files selected:\n{}".format('\n'.join(vcfs)))
        logging.info("BAM files selected:\n{}".format('\n'.join(bams)))
        logging.info("BAI files selected:\n{}".format('\n'.join(bais)))
        logging.info("FASTQ files selected:\n{}".format('\n'.join(fastqs)))

        output_files = []
        if 'Anonymised' in allowed_data_types:
            # Generate random IDs for all output samples.
            # NOTE(review): this reads the `sample_ids` attribute while the
            # logging above calls `get_sample_ids()` -- confirm both exist
            # and agree after consent filtering.
            randomised_ids = make_random_ids(args.usedids,
                                             metadata.sample_ids)
            metadata.anonymise(randomised_ids)
            metadata.write(args.metaout)
            logging.info("Anonymised metadata written to: {}".format(
                args.metaout))
            # VCFs and BAMs are passed an edit function in addition to the
            # filename function.
            new_vcfs = anonymise_files(vcfs, randomised_ids,
                                       application_dir, VCF_filename,
                                       vcf_edit)
            new_bams = anonymise_files(bams, randomised_ids,
                                       application_dir, BAM_filename,
                                       bam_edit)
            # BAIs and FASTQs are just sym-linked to output with
            # randomised name.
            new_bais = anonymise_files(bais, randomised_ids,
                                       application_dir, BAI_filename)
            new_fastqs = anonymise_files(fastqs, randomised_ids,
                                         application_dir, FASTQ_filename)
            output_files.extend(new_vcfs + new_bams + new_bais +
                                new_fastqs)
            logging.info("Output files are anonymised")
        elif 'Re-identifiable' in allowed_data_types:
            new_links = link_files(application_dir,
                                   vcfs + bams + bais + fastqs)
            output_files.extend(new_links)
            logging.info(
                "Files linked in directory: {}".format(application_dir))
            metadata.write(args.metaout)
            logging.info("Output files are re-identifiable")
        else:
            print_error(
                "Allowed data is neither anonymised nor re-identifiable")
            sys.exit(ERROR_BAD_ALLOWED_DATA)

        logging.info("Generating MD5 checksums on output files")
        md5_files(args.md5, output_files)