Example #1

import gzip
import hashlib
import logging
import os
import pandas as pd
import sys

from sample_metadata.utils import get_joined_metadata_df, get_gtex_rnaseq_sample_metadata_df

logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s',
                    level=logging.INFO)
logger = logging.getLogger(__name__)

#%%

df = get_joined_metadata_df()
#%%

df.columns

#%%

df[['proj (seqr)', 'imputed tissue']].reset_index().groupby(
    'imputed tissue').count()

#%%

df = df[[
    'proj (seqr)', 'imputed tissue', 'imputed sex', 'read length (rnaseqc)',
    'batch_date_from_hg19_bam_header', 'star_pipeline_batch'
]].reset_index()
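#%%

# The examples below all rely on get_joined_metadata_df() from the
# sample_metadata package. For readers without access to that package, a
# hypothetical stand-in that mimics the columns used above might look like
# the sketch below; the column names are taken from this example's code,
# but the stub and its values are illustrative only.

def get_joined_metadata_df_stub():
    """Hypothetical stand-in for sample_metadata.utils.get_joined_metadata_df()."""
    return pd.DataFrame({
        'sample_id': ['SAMPLE1', 'SAMPLE2'],
        'proj (seqr)': ['projA', 'projB'],
        'imputed tissue': ['muscle', 'blood'],
        'imputed sex': ['F', 'M'],
        'read length (rnaseqc)': [151, 151],
        'batch_date_from_hg19_bam_header': ['2020-01-01', '2020-02-01'],
        'star_pipeline_batch': ['batch1', 'batch2'],
    }).set_index('sample_id')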

Example #2

import argparse
import os

import hail as hl  # used below for hl.hadoop_is_file() and hl.hadoop_stat()
import hailtop.batch as hb

# Note: get_joined_metadata_df() comes from sample_metadata.utils (see Example #1).
# GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT, and GCLOUD_PROJECT are
# module-level constants defined elsewhere in the original source file.

def main():
    rnaseq_sample_metadata_df = get_joined_metadata_df()

    p = argparse.ArgumentParser()
    grp = p.add_mutually_exclusive_group(required=True)
    grp.add_argument("--local", action="store_true", help="Batch: run locally")
    grp.add_argument("--cluster",
                     action="store_true",
                     help="Batch: submit to cluster")
    p.add_argument(
        "--batch-billing-project",
        default="tgg-rare-disease",
        help="Batch: billing project. Required if submitting to cluster.")
    p.add_argument("--batch-job-name", help="Batch: (optional) job name")

    p.add_argument(
        "-f",
        "--force",
        action="store_true",
        help="Recompute and overwrite cached or previously computed data")
    grp = p.add_mutually_exclusive_group(required=True)
    grp.add_argument("-b",
                     "--rnaseq-batch-name",
                     nargs="*",
                     help="RNA-seq batch names to process",
                     choices=set(
                         rnaseq_sample_metadata_df['star_pipeline_batch']))
    grp.add_argument("-s",
                     "--rnaseq-sample-id",
                     nargs="*",
                     help="RNA-seq sample IDs to process",
                     choices=set(rnaseq_sample_metadata_df['sample_id']))
    args = p.parse_args()

    #logger.info("\n".join(rnaseq_sample_metadata_df.columns))

    if args.rnaseq_batch_name:
        batch_names = args.rnaseq_batch_name
        sample_ids = rnaseq_sample_metadata_df[rnaseq_sample_metadata_df[
            'star_pipeline_batch'].isin(batch_names)].sample_id
    elif args.rnaseq_sample_id:
        sample_ids = args.rnaseq_sample_id

    logger.info(
        f"Processing {len(sample_ids)} sample ids: {', '.join(sample_ids)}")

    # see https://hail.zulipchat.com/#narrow/stream/223457-Batch-support/topic/auth.20as.20user.20account for more details
    if args.local:
        backend = hb.LocalBackend(gsa_key_file=os.path.expanduser(
            "~/.config/gcloud/misc-270914-cb9992ec9b25.json"))
    else:
        backend = hb.ServiceBackend(args.batch_billing_project)

    b = hb.Batch(backend=backend, name=args.batch_job_name)

    # define workflow inputs
    if args.local:
        genes_gtf = b.read_input("gencode.v26.annotation.gff3",
                                 extension=".gff3")
    else:
        genes_gtf = b.read_input(
            "gs://macarthurlab-rnaseq/ref/gencode.v26.annotation.GRCh38.gff3",
            extension=".gff3")

    # define parallel execution for samples
    for sample_id in sample_ids:
        metadata_row = rnaseq_sample_metadata_df.loc[sample_id]
        batch_name = metadata_row['star_pipeline_batch']

        # set job inputs & outputs
        input_read_data = b.read_input_group(
            bam=metadata_row['star_bam'],
            bai=metadata_row['star_bai'],
        )

        output_dir = f"gs://macarthurlab-rnaseq/{batch_name}/majiq_build/"
        output_file_path = os.path.join(output_dir,
                                        f"majiq_build_{sample_id}.tar.gz")

        # check if output file already exists
        if hl.hadoop_is_file(output_file_path) and not args.force:
            logger.info(
                f"{sample_id} output file already exists: {output_file_path}. Skipping..."
            )
            continue

        file_stats = hl.hadoop_stat(metadata_row['star_bam'])
        bam_size = int(round(file_stats['size_bytes'] / 10.**9))  # BAM size in GB, used to scale the job's storage request below

        # define majiq build commands for this sample
        j = b.new_job(name=args.batch_job_name)
        j.image("weisburd/majiq:latest")
        j.storage(f'{bam_size*3}Gi')
        j.cpu(1)  # default: 1
        j.memory("15G")  # default: 3.75G
        logger.info(
            f'Requesting: {j._storage or "default"} storage, {j._cpu or "default"} CPU, {j._memory or "default"} memory'
        )

        # switch to user account
        j.command(
            f"gcloud auth activate-service-account --key-file /gsa-key/key.json"
        )
        j.command(
            f"gsutil -m cp -r {GCLOUD_CREDENTIALS_LOCATION}/.config /tmp/")
        j.command(f"rm -rf ~/.config")
        j.command(f"mv /tmp/.config ~/")
        j.command(f"gcloud config set account {GCLOUD_USER_ACCOUNT}")
        j.command(f"gcloud config set project {GCLOUD_PROJECT}")

        # run majiq build
        #j.command(f"gsutil -u {GCLOUD_PROJECT} -m cp gs://gtex-resources/GENCODE/gencode.v26.GRCh38.ERCC.genes.collapsed_only.gtf .")
        j.command(f"mv {genes_gtf} gencode.gff3")
        j.command(f"mv {input_read_data.bam} {sample_id}.bam")
        j.command(f"mv {input_read_data.bai} {sample_id}.bam.bai")

        j.command(f"echo '[info]' >> majiq_build.cfg")
        j.command(
            f"echo 'readlen={metadata_row['read length (rnaseqc)']}' >> majiq_build.cfg"
        )
        j.command(f"echo 'bamdirs=.' >> majiq_build.cfg")
        j.command(f"echo 'genome=hg38' >> majiq_build.cfg")
        j.command(
            f"echo 'strandness={'None' if metadata_row['stranded? (rnaseqc)'] == 'no' else 'reverse'}' >> majiq_build.cfg"
        )
        j.command(f"echo '[experiments]' >> majiq_build.cfg")
        j.command(f"echo '{sample_id}={sample_id}' >> majiq_build.cfg")

        j.command(f"cat majiq_build.cfg >> {j.logfile}")
        j.command(
            f"majiq build gencode.gff3 -c majiq_build.cfg -j 1 -o majiq_build_{sample_id} >> {j.logfile}"
        )

        j.command(
            f"tar czf majiq_build_{sample_id}.tar.gz majiq_build_{sample_id}")
        j.command(f"cp majiq_build_{sample_id}.tar.gz {j.output_tar_gz}")

        #j.command(f"ls -lh . >> {j.logfile}")
        #j.command(f"echo ls majiq_build_{sample_id} >> {j.logfile}")
        #j.command(f"ls -1 majiq_build_{sample_id} >> {j.logfile}")
        j.command(f"echo --- done  {output_file_path} >> {j.logfile}")

        # copy output
        b.write_output(j.output_tar_gz, output_file_path)
        b.write_output(
            j.logfile, os.path.join(output_dir,
                                    f"majiq_build_{sample_id}.log"))

    b.run()

    if isinstance(backend, hb.ServiceBackend):
        backend.close()
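
# For reference, j.logfile and j.output_tar_gz above are output resource files
# that Hail Batch creates implicitly the first time those attributes are
# referenced. The echo commands assemble a MAJIQ build config that, for a
# hypothetical 151 bp reverse-stranded sample named SAMPLE1, would look like:
#
#   [info]
#   readlen=151
#   bamdirs=.
#   genome=hg38
#   strandness=reverse
#   [experiments]
#   SAMPLE1=SAMPLE1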

Example #3

# This example assumes the same imports and GCLOUD_* constants as Example #2,
# plus the batch_utils helper module, a DOCKER_IMAGE constant, and
# transfer_metadata_columns_from_df() (a hypothetical sketch of which follows
# this example), all defined elsewhere in the original source file.
def main():
    rnaseq_sample_metadata_df = get_joined_metadata_df()
    #gtex_rnaseq_sample_metadata_df = get_gtex_rnaseq_sample_metadata_df()

    p = batch_utils.init_arg_parser(
        default_cpu=4,
        gsa_key_file=os.path.expanduser(
            "~/.config/gcloud/misc-270914-cb9992ec9b25.json"))
    grp = p.add_mutually_exclusive_group(required=True)
    grp.add_argument(
        "-b",
        "--rnaseq-batch-name",
        nargs="*",
        help="RNA-seq batch names to process (eg. -b batch1 batch2)",
        choices=set(rnaseq_sample_metadata_df['star_pipeline_batch'])
        | set(["gtex_muscle", "gtex_fibroblasts", "gtex_blood"]))
    grp.add_argument(
        "-s",
        "--rnaseq-sample-id",
        nargs="*",
        help="RNA-seq sample IDs to process (eg. -s sample1 sample2)",
        choices=set(rnaseq_sample_metadata_df['sample_id']) | set([
            'GTEX-1LG7Z-0005-SM-DKPQ6', 'GTEX-PX3G-0006-SM-5SI7E',
            'GTEX-1KXAM-0005-SM-DIPEC'
        ]))
    args = p.parse_args()

    # Generate samples_df with these columns: sample_id, bam_path, bai_path, output_dir, batch_name, sex, RIN, ancestry, etc.
    samples_df = pd.DataFrame()
    if args.rnaseq_batch_name:
        for batch_name in args.rnaseq_batch_name:
            df = rnaseq_sample_metadata_df[
                rnaseq_sample_metadata_df['star_pipeline_batch'] == batch_name]
            samples_df = transfer_metadata_columns_from_df(samples_df, df)

    elif args.rnaseq_sample_id:
        df = rnaseq_sample_metadata_df[
            rnaseq_sample_metadata_df.sample_id.isin(set(
                args.rnaseq_sample_id))]
        samples_df = transfer_metadata_columns_from_df(samples_df, df)
    else:
        p.error("Must specify -b or -s")

    logger.info(
        f"Processing {len(samples_df)} sample ids: {', '.join(samples_df.sample_id[:20])}"
    )

    # see https://hail.zulipchat.com/#narrow/stream/223457-Batch-support/topic/auth.20as.20user.20account for more details
    with batch_utils.run_batch(args) as batch:
        for sample_id in samples_df.sample_id:
            metadata_row = samples_df.loc[sample_id]

            # set job inputs & outputs
            input_bam, input_bai = metadata_row['bam_path'], metadata_row[
                'bai_path']
            output_dir = metadata_row['output_dir']

            print("Input bam: ", input_bam)
            output_filename = f"{sample_id}.bigWig"
            output_file_path = os.path.join(output_dir, output_filename)

            # check if output file already exists
            if hl.hadoop_is_file(output_file_path) and not args.force:
                logger.info(
                    f"{sample_id} output file already exists: {output_file_path}. Skipping..."
                )
                continue

            file_stats = hl.hadoop_stat(metadata_row['bam_path'])
            bam_size = int(round(file_stats['size_bytes'] / 10.**9))
            disk_size = bam_size * 2

            j = batch_utils.init_job(batch,
                                     f"bam=>bigWig: {sample_id}",
                                     cpu=args.cpu,
                                     memory=args.memory,
                                     disk_size=disk_size,
                                     image=DOCKER_IMAGE)
            batch_utils.switch_gcloud_auth_to_user_account(
                j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT,
                GCLOUD_PROJECT)

            j.command(
                f"gsutil -u {GCLOUD_PROJECT} -m cp {input_bam} {sample_id}.bam"
            )
            j.command(
                f"gsutil -u {GCLOUD_PROJECT} -m cp {input_bai} {sample_id}.bam.bai"
            )
            j.command(
                f"gsutil -u {GCLOUD_PROJECT} -m cp gs://gtex-resources/references/GRCh38.chrsizes ."
            )
            j.command(f"touch {sample_id}.bam.bai")

            j.command(f"pwd && ls && date")

            j.command(
                f"python3 /src/bam2coverage.py {sample_id}.bam GRCh38.chrsizes {sample_id}"
            )
            j.command(f"cp {output_filename} {j.output_bigWig}")
            j.command(f"echo Done: {output_file_path}")
            j.command(f"date")

            # copy output
            batch.write_output(j.output_bigWig, output_file_path)

            print("Output file path: ", output_file_path)

Example #4

def main():
    rnaseq_sample_metadata_df = get_joined_metadata_df()
    #gtex_rnaseq_sample_metadata_df = get_gtex_rnaseq_sample_metadata_df()

    p = batch_utils.init_arg_parser(
        default_cpu=4,
        gsa_key_file=os.path.expanduser(
            "~/.config/gcloud/misc-270914-cb9992ec9b25.json"))
    grp = p.add_mutually_exclusive_group(required=True)
    grp.add_argument(
        "-b",
        "--rnaseq-batch-name",
        nargs="*",
        help="RNA-seq batch names to process (eg. -b batch1 batch2)",
        choices=set(rnaseq_sample_metadata_df['star_pipeline_batch'])
        | set(["gtex_muscle", "gtex_fibroblasts", "gtex_blood"]))
    grp.add_argument(
        "-s",
        "--rnaseq-sample-id",
        nargs="*",
        help="RNA-seq sample IDs to process (eg. -s sample1 sample2)",
        choices=set(rnaseq_sample_metadata_df['sample_id']) | set([
            'GTEX-1LG7Z-0005-SM-DKPQ6', 'GTEX-PX3G-0006-SM-5SI7E',
            'GTEX-1KXAM-0005-SM-DIPEC'
        ]))
    args = p.parse_args()

    # Generate samples_df with these columns: sample_id, star_SJ_out_tab, output_dir, batch_name
    samples_df = pd.DataFrame()
    if args.rnaseq_batch_name:
        for batch_name in args.rnaseq_batch_name:
            df = rnaseq_sample_metadata_df[
                rnaseq_sample_metadata_df['star_pipeline_batch'] == batch_name]
            samples_df = transfer_metadata_columns_from_df(samples_df, df)

    elif args.rnaseq_sample_id:
        df = rnaseq_sample_metadata_df[
            rnaseq_sample_metadata_df.sample_id.isin(set(
                args.rnaseq_sample_id))]
        samples_df = transfer_metadata_columns_from_df(samples_df, df)
    else:
        p.error("Must specify -b or -s")

    logger.info(
        f"Processing {len(samples_df)} sample ids: {', '.join(samples_df.sample_id[:20])}"
    )

    # see https://hail.zulipchat.com/#narrow/stream/223457-Batch-support/topic/auth.20as.20user.20account for more details
    with batch_utils.run_batch(args) as batch:
        for sample_id in samples_df.sample_id:
            metadata_row = samples_df.loc[sample_id]

            # set job inputs & outputs
            output_dir = metadata_row['output_dir']

            print("Input file: ", metadata_row['star_SJ_out_tab'])
            output_filename = f"{sample_id}.junctions.bed.gz"
            output_bed_gz_file_path = os.path.join(output_dir, output_filename)

            # check if output file already exists
            if hl.hadoop_is_file(output_bed_gz_file_path) and not args.force:
                logger.info(
                    f"{sample_id} output file already exists: {output_bed_gz_file_path}. Skipping..."
                )
                continue

            j = batch_utils.init_job(batch,
                                     name=f"tab=>bed: {sample_id}",
                                     cpu=args.cpu,
                                     memory=args.memory,
                                     disk_size=5,
                                     image=DOCKER_IMAGE)
            batch_utils.switch_gcloud_auth_to_user_account(
                j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT,
                GCLOUD_PROJECT)

            j.command(
                f"gsutil -u {GCLOUD_PROJECT} -m cp {metadata_row['star_SJ_out_tab']} ."
            )
            j.command(
                f"gsutil -u {GCLOUD_PROJECT} -m cp gs://macarthurlab-rnaseq/ref/gencode.v26.annotation.gff3.gz ."
            )
            j.command(f"pwd && ls && date")

            j.command(
                f"python3 /convert_SJ_out_tab_to_junctions_bed.py -g gencode.v26.annotation.gff3.gz {os.path.basename(metadata_row['star_SJ_out_tab'])}"
            )
            j.command(f"cp {output_filename} {j.output_bed_gz}")
            j.command(f"cp {output_filename}.tbi {j.output_bed_gz_tbi}")
            j.command(f"echo Done: {output_bed_gz_file_path}")
            j.command(f"date")

            # copy output
            batch.write_output(j.output_bed_gz, output_bed_gz_file_path)
            batch.write_output(j.output_bed_gz_tbi,
                               f"{output_bed_gz_file_path}.tbi")

            print("Output file path: ", output_bed_gz_file_path)