Exemplo n.º 1
0
def plot_kernel_density(infile, output, tempdir, sep, colname, plot_title,
                        multipliers):
    """Render one kernel-density PDF per multiplier and merge them.

    For every multiplier, the hmmcopy metrics table
    '/hmmcopy/metrics/<multiplier>' from `infile` is plotted into a
    per-multiplier PDF under `tempdir`; all pages are then concatenated
    into `output`.
    """
    helpers.makedirs(tempdir)

    per_multiplier_pdfs = []
    for mult in multipliers:
        pdf_path = os.path.join(tempdir, "{}.pdf".format(mult))
        per_multiplier_pdfs.append(pdf_path)

        PlotKernelDensity(
            infile,
            pdf_path,
            sep,
            colname,
            '{}({})'.format(plot_title, mult),
            tablename='/hmmcopy/metrics/{}'.format(mult),
        ).main()

    pdfutils.merge_pdfs(per_multiplier_pdfs, output)
Exemplo n.º 2
0
def merge_pdf(in_filenames, outfilenames, metrics, cell_filters, tempdir,
              labels):
    """Collect per-cell plot files for passing cells into one tar per label.

    Cells passing `cell_filters` (evaluated against the multiplier-0
    hmmcopy metrics table) are copied into a per-label staging directory,
    which is then tarred into the matching output.
    """
    helpers.makedirs(tempdir)

    good_cells = get_good_cells(metrics, cell_filters, '/hmmcopy/metrics/0')

    grouped_data = group_cells_by_row(good_cells,
                                      metrics,
                                      '/hmmcopy/metrics/0',
                                      sort_by_col=True)

    for cell_inputs, tar_output, label in zip(in_filenames, outfilenames,
                                              labels):
        # all inputs for a label share one extension; sample the first cell
        extension = os.path.splitext(cell_inputs[good_cells[0]])[-1]

        staging_dir = os.path.join(tempdir, label)
        helpers.makedirs(staging_dir)

        for cell in good_cells:
            staged = os.path.join(staging_dir, cell + "_" + label + extension)
            shutil.copyfile(cell_inputs[cell], staged)

        helpers.make_tarfile(tar_output, staging_dir)
Exemplo n.º 3
0
def organism_filter(
        fastq_r1, fastq_r2, filtered_fastq_r1, filtered_fastq_r2,
        detailed_metrics, summary_metrics, tempdir, cell_id, params,
        reference, docker_image=None, filter_contaminated_reads=False,
):
    """Tag reads by organism with fastq_screen, count them, then filter or re-tag."""
    # fastq_screen skips work when outputs from an old run exist, so start clean
    if os.path.exists(tempdir):
        shutil.rmtree(tempdir)
    helpers.makedirs(tempdir)

    tagged_r1, tagged_r2 = run_fastq_screen_paired_end(
        fastq_r1, fastq_r2, tempdir, params, docker_image=docker_image
    )

    counts = fastqutils.PairedTaggedFastqReader(
        tagged_r1, tagged_r2).gather_counts()

    write_detailed_counts(counts, detailed_metrics, cell_id)
    write_summary_counts(counts, summary_metrics, cell_id)

    if not filter_contaminated_reads:
        # keep every read; organism tags stay in the read names downstream
        re_tag_reads(tagged_r1, filtered_fastq_r1)
        re_tag_reads(tagged_r2, filtered_fastq_r2)
    else:
        filter_reads(
            tagged_r1, tagged_r2, filtered_fastq_r1,
            filtered_fastq_r2, reference
        )
Exemplo n.º 4
0
def run_fastqc(fastq1, fastq2, reports, tempdir, config):
    """
    Run fastqc on both fastq files and tar the reports.

    :param fastq1: path to the R1 fastq file
    :param fastq2: path to the R2 fastq file
    :param reports: output tar file collecting all fastqc reports
    :param tempdir: scratch directory
    :param config: pipeline config with a 'containers' section
    """
    container_ctx = helpers.get_container_ctx(config['containers'],
                                              'fastqc',
                                              docker_only=True)

    reports_dir = os.path.join(tempdir, 'fastqc_reports')
    if not os.path.exists(reports_dir):
        helpers.makedirs(reports_dir)

    for fastq, read_end in ((fastq1, 'R1'), (fastq2, 'R2')):
        if os.path.getsize(fastq) == 0:
            # BUG FIX: the R2 branch previously warned with fastq1's name
            warnings.warn("fastq file %s is empty, skipping fastqc" % fastq)
            continue
        out_html = os.path.join(reports_dir,
                                'fastqc_{}.html'.format(read_end))
        out_plot = os.path.join(reports_dir,
                                'fastqc_{}.zip'.format(read_end))
        bamutils.produce_fastqc_report(fastq, out_html, out_plot, tempdir,
                                       **container_ctx)

    helpers.make_tarfile(reports, reports_dir)
Exemplo n.º 5
0
def run_hmmcopy(
        bam_file,
        corrected_reads_filename,
        segments_filename,
        parameters_filename,
        metrics_filename,
        hmmcopy_tar,
        cell_id,
        hmmparams,
        tempdir,
        docker_image
):
    """Run GC correction and the hmmcopy script on one cell's bam.

    Writes the reads/params/segs/metrics csv outputs with their declared
    dtypes and tars the raw hmmcopy working directory.
    """
    helpers.makedirs(tempdir)

    # generate wig file for hmmcopy and GC-correct the read counts
    readcount_wig = os.path.join(tempdir, 'readcounter.wig')
    corrected_reads = os.path.join(tempdir, 'corrected_reads.csv')
    run_correction_hmmcopy(
        bam_file,
        corrected_reads,
        readcount_wig,
        hmmparams,
        docker_image
    )

    hmmcopy_tempdir = os.path.join(tempdir, '{}_hmmcopy'.format(cell_id))
    helpers.makedirs(hmmcopy_tempdir)
    run_hmmcopy_script(
        corrected_reads,
        hmmcopy_tempdir,
        cell_id,
        hmmparams,
        docker_image
    )

    hmmcopy_outdir = os.path.join(hmmcopy_tempdir, str(0))

    # (raw csv name, final destination, dtypes key) for each output table
    output_tables = (
        ("reads.csv", corrected_reads_filename, 'reads'),
        ("params.csv", parameters_filename, 'params'),
        ("segs.csv", segments_filename, 'segs'),
        ("metrics.csv", metrics_filename, 'metrics'),
    )
    for raw_name, final_path, dtype_key in output_tables:
        csvutils.rewrite_csv_file(
            os.path.join(hmmcopy_outdir, raw_name), final_path,
            dtypes=dtypes()[dtype_key]
        )

    helpers.make_tarfile(hmmcopy_tar, hmmcopy_tempdir)
def bam_collect_gc_metrics(bam_filename,
                           ref_genome,
                           metrics_filename,
                           summary_filename,
                           chart_filename,
                           tempdir,
                           mem="2G",
                           docker_image=None):
    """Run picard CollectGcBiasMetrics on a bam file."""
    if not os.path.exists(tempdir):
        makedirs(tempdir)

    cmd = [
        'picard',
        '-Xmx' + mem,
        '-Xms' + mem,
        '-XX:ParallelGCThreads=1',
        'CollectGcBiasMetrics',
        'INPUT=' + bam_filename,
        'OUTPUT=' + metrics_filename,
        'REFERENCE_SEQUENCE=' + ref_genome,
        'S=' + summary_filename,
        'CHART_OUTPUT=' + chart_filename,
        'VALIDATION_STRINGENCY=LENIENT',
        'TMP_DIR=' + tempdir,
        'MAX_RECORDS_IN_RAM=150000',
    ]
    pypeliner.commandline.execute(*cmd, docker_image=docker_image)
def bam_collect_wgs_metrics(bam_filename,
                            ref_genome,
                            metrics_filename,
                            config,
                            tempdir,
                            mem="2G",
                            docker_image=None):
    """Run picard CollectWgsMetrics on a bam file.

    Quality thresholds come from `config` ('min_bqual', 'min_mqual',
    'count_unpaired').
    """
    if not os.path.exists(tempdir):
        makedirs(tempdir)

    count_unpaired = 'True' if config['count_unpaired'] else 'False'

    cmd = [
        'picard',
        '-Xmx' + mem,
        '-Xms' + mem,
        '-XX:ParallelGCThreads=1',
        'CollectWgsMetrics',
        'INPUT=' + bam_filename,
        'OUTPUT=' + metrics_filename,
        'REFERENCE_SEQUENCE=' + ref_genome,
        'MINIMUM_BASE_QUALITY=' + str(config['min_bqual']),
        'MINIMUM_MAPPING_QUALITY=' + str(config['min_mqual']),
        'COVERAGE_CAP=500',
        'VALIDATION_STRINGENCY=LENIENT',
        'COUNT_UNPAIRED=' + count_unpaired,
        'TMP_DIR=' + tempdir,
        'MAX_RECORDS_IN_RAM=150000',
    ]
    pypeliner.commandline.execute(*cmd, docker_image=docker_image)
Exemplo n.º 8
0
def run_lumpy(tumour_disc, tumour_split, tumour_hist, tumour_mean_stdev,
              tumour_id, normal_disc, normal_split, normal_hist,
              normal_mean_stdev, normal_id, vcf, tempdir):
    """Call structural variants with lumpy on tumour/normal evidence bams."""
    tumour_mean, tumour_stdev = load_metadata(tumour_mean_stdev)
    normal_mean, normal_stdev = load_metadata(normal_mean_stdev)

    helpers.makedirs(tempdir)
    # lumpy gets its own scratch subdir inside tempdir
    tempdir = tempdir + '/lumpy'

    # shared templates for the paired-end and split-read evidence specs
    pe_template = (
        'id:{},bam_file:{},histo_file:{},mean:{},'
        'stdev:{},read_length:101,min_non_overlap:101,'
        'discordant_z:5,back_distance:10,weight:1,'
        'min_mapping_threshold:20'
    )
    sr_template = (
        'id:{},bam_file:{},back_distance:10,weight:1,'
        'min_mapping_threshold:20'
    )

    tumour_pe = pe_template.format(
        tumour_id, tumour_disc, tumour_hist, tumour_mean, tumour_stdev)
    tumour_sr = sr_template.format(tumour_id, tumour_split)
    normal_pe = pe_template.format(
        normal_id, normal_disc, normal_hist, normal_mean, normal_stdev)
    normal_sr = sr_template.format(normal_id, normal_split)

    cmd = [
        'lumpy', '-e', '-b', '-mw', 4, '-tt', 0, '-pe', tumour_pe, '-sr',
        tumour_sr, '-pe', normal_pe, '-sr', normal_sr, '-t', tempdir, '>', vcf
    ]

    pypeliner.commandline.execute(*cmd)
Exemplo n.º 9
0
def split_bam_file_by_reads(bam, bai, outbams, outbais, tempspace, intervals, kwargs):
    """Collate a bam by read name and split it into per-interval bams."""
    helpers.makedirs(tempspace)

    # keep the original header so each split sam can be rebuilt with it
    headerfile = os.path.join(tempspace, "bam_header.sam")
    pypeliner.commandline.execute(
        'samtools', 'view', '-H', bam, '-o', headerfile, **kwargs)

    collate_prefix = os.path.join(
        tempspace, os.path.basename(bam) + "_collate_temp"
    )
    collated_bam = os.path.join(tempspace, "bam_file_collated_sam_format.sam")

    # group reads by name, then stream out in sam format
    pypeliner.commandline.execute(
        'samtools', 'collate', '-u', '-O', bam, collate_prefix, '|',
        'samtools', 'view', '-', '-o', collated_bam,
        **kwargs)

    tempoutputs = [
        os.path.join(tempspace, os.path.basename(outbams[interval]) + ".split.temp")
        for interval in intervals
    ]

    split(collated_bam, tempoutputs, headerfile=headerfile)

    # convert each split sam back to bam at its final destination
    for split_sam, interval in zip(tempoutputs, intervals):
        pypeliner.commandline.execute(
            'samtools', 'view', '-Sb', split_sam, '-o', outbams[interval],
            **kwargs)
Exemplo n.º 10
0
def run_fastqc(fastq1, fastq2, reports, tempdir, containers):
    """
    Run fastqc on both fastq files and tar the reports.

    :param fastq1: path to the R1 fastq file
    :param fastq2: path to the R2 fastq file
    :param reports: output tar file collecting all fastqc reports
    :param tempdir: scratch directory
    :param containers: mapping with a 'fastqc' docker image entry
    """
    reports_dir = os.path.join(tempdir, 'fastqc_reports')
    if not os.path.exists(reports_dir):
        helpers.makedirs(reports_dir)

    # both fastq files effectively empty: ship an empty report tar and stop
    if os.stat(fastq1).st_size < 100 and os.stat(fastq2).st_size < 100:
        helpers.make_tarfile(reports, reports_dir)
        return

    logger = logging.getLogger("single_cell.align.tasks")

    for fastq, read_end in ((fastq1, 'R1'), (fastq2, 'R2')):
        if os.path.getsize(fastq) == 0:
            # BUG FIX: the R2 branch previously logged fastq1's name;
            # also use warning() instead of the deprecated warn()
            logger.warning("fastq file %s is empty, skipping fastqc" % fastq)
            continue
        out_html = os.path.join(reports_dir,
                                'fastqc_{}.html'.format(read_end))
        out_plot = os.path.join(reports_dir,
                                'fastqc_{}.zip'.format(read_end))
        bamutils.produce_fastqc_report(fastq, out_html, out_plot, tempdir,
                                       docker_image=containers['fastqc'])

    helpers.make_tarfile(reports, reports_dir)
Exemplo n.º 11
0
def cell_cycle_classifier(hmmcopy_reads,
                          hmmcopy_metrics,
                          alignment_metrics,
                          output,
                          tempdir,
                          docker_image=None):
    """Predict cell-cycle state and merge the predictions into hmmcopy metrics."""
    helpers.makedirs(tempdir)
    temp_output = os.path.join(tempdir, 'cell_cycle_output.csv')

    pypeliner.commandline.execute(
        'cell_cycle_classifier', 'train-classify', hmmcopy_reads,
        hmmcopy_metrics, alignment_metrics, temp_output,
        docker_image=docker_image)

    predictions = pd.read_csv(temp_output)
    metrics_df = csvutils.read_csv_and_yaml(hmmcopy_metrics)

    # outer join keeps cells present in only one of the two tables
    merged = metrics_df.merge(predictions, on=['cell_id'], how='outer')

    csvutils.write_dataframe_to_csv_and_yaml(merged, output)
Exemplo n.º 12
0
def generate_pipeline_config_in_temp(args):
    """Materialise a pipeline config yaml and record its path in args.

    No-op for subcommands that don't need a config, or when the caller
    already supplied one via 'config_file'. Returns the (updated) args.
    """
    if args['which'] in ['clean_sentinels', 'generate_config']:
        return args

    if args.get("config_file", None):
        return args

    # prefer the pypeliner pipeline dir, fall back to tmpdir, else cwd
    pipelinedir = args.get("pipelinedir", None)
    tmpdir = args.get("tmpdir", None)
    if pipelinedir:
        base_dir = pipelinedir
    elif tmpdir:
        base_dir = tmpdir
    else:
        warnings.warn("no tmpdir specified, generating configs in working dir")
        base_dir = os.getcwd()

    config_yaml = helpers.get_incrementing_filename(
        os.path.join(base_dir, "config.yaml"))

    helpers.makedirs(config_yaml, isfile=True)

    config_params = pipeline_config.get_config_params(
        override=args["config_override"])
    config = pipeline_config.get_singlecell_pipeline_config(config_params)
    pipeline_config.write_config(config, config_yaml)

    args["config_file"] = config_yaml

    return args
Exemplo n.º 13
0
def add_quality(hmmcopy_metrics, alignment_metrics, multipliers, output, training_data, tempdir):
    """Score cell quality per multiplier and write the annotated metrics."""
    helpers.makedirs(tempdir)

    hmmcopy_tables = ['/hmmcopy/metrics/{}'.format(mult)
                      for mult in multipliers]

    model = classify.train_classifier(training_data)

    data = classify.load_data(hmmcopy_metrics, alignment_metrics,
                              hmmcopy_tables, '/alignment/metrics',
                              model.feature_names_)

    for idx, (hmmcopy_table, tabledata) in enumerate(data):
        staged_output = os.path.join(
            tempdir, '{}_metrics_with_quality.csv.gz'.format(idx)
        )

        predictions = classify.classify(model, tabledata)
        classify.write_to_output(
            hmmcopy_metrics,
            hmmcopy_table,
            staged_output,
            predictions)

        csvutils.prep_csv_files(staged_output, output,
                                dtypes=dtypes()['metrics'])
Exemplo n.º 14
0
def produce_fastqc_report(fastq_filename, output_html, output_plots, temp_dir,
                          **kwargs):
    """Run fastqc on one fastq and move its html/zip outputs into place.

    :param fastq_filename: input fastq (.fastq/.fq, optionally gzipped)
    :param output_html: final path for the fastqc html report
    :param output_plots: final path for the fastqc zip bundle
    :param temp_dir: scratch dir where fastqc writes its outputs
    :param kwargs: forwarded to pypeliner (e.g. docker_image)
    :raises Exception: if the fastq filename has an unrecognised extension
    """
    makedirs(temp_dir)

    pypeliner.commandline.execute(
        'fastqc',
        '--outdir=' + temp_dir,
        fastq_filename,
        **kwargs)

    fastq_basename = os.path.basename(fastq_filename)
    # BUG FIX: str.replace removed *every* occurrence of the extension
    # (mangling names like "a.fq.b.fq"); strip only the trailing suffix.
    for suffix in (".fastq.gz", ".fq.gz", ".fq", ".fastq"):
        if fastq_basename.endswith(suffix):
            fastq_basename = fastq_basename[:-len(suffix)]
            break
    else:
        raise Exception("Unknown file type")

    output_basename = os.path.join(temp_dir, fastq_basename)

    shutil.move(output_basename + '_fastqc.zip', output_plots)
    shutil.move(output_basename + '_fastqc.html', output_html)
Exemplo n.º 15
0
def create_chromosome_seqdata(seqdata,
                              bam_file,
                              tempdir,
                              config,
                              ref_data_dir,
                              chromosomes=None):
    """Extract per-chromosome seqdata from a bam and merge into one file.

    When `chromosomes` is not given, the remixt config's chromosome list
    is used.
    """
    helpers.makedirs(tempdir)

    if not chromosomes:
        chromosomes = remixt.config.get_chromosomes(config, ref_data_dir)

    snp_positions_filename = remixt.config.get_filename(
        config, ref_data_dir, 'snp_positions')

    # read-filtering thresholds shared by every chromosome
    max_fragment_length = remixt.config.get_param(
        config, 'bam_max_fragment_length')
    max_soft_clipped = remixt.config.get_param(
        config, 'bam_max_soft_clipped')
    check_proper_pair = remixt.config.get_param(
        config, 'bam_check_proper_pair')

    all_seqdata = {}
    for chrom in chromosomes:
        chrom_seqdata = os.path.join(tempdir, "{}_seqdata.h5".format(chrom))
        remixt.seqdataio.create_chromosome_seqdata(
            chrom_seqdata, bam_file, snp_positions_filename, chrom,
            max_fragment_length, max_soft_clipped, check_proper_pair)
        all_seqdata[chrom] = chrom_seqdata

    remixt.seqdataio.merge_seqdata(seqdata, all_seqdata)
Exemplo n.º 16
0
def trim_fastqs(fastq1, fastq2, cell_id, tempdir, config):
    """Trim adapter sequence from both fastqs with trimgalore.

    Returns the paths of the two trimmed fastq files.
    """
    trim1 = os.path.join(tempdir, "fastq_R1_trimmed.fastq.gz")
    trim2 = os.path.join(tempdir, "fastq_R2_trimmed.fastq.gz")

    reports_dir = os.path.join(tempdir, 'fastqc_reports')
    if not os.path.exists(reports_dir):
        helpers.makedirs(reports_dir)

    def _report(name):
        # per-cell report path inside the fastqc reports dir
        return os.path.join(reports_dir, '{}_{}'.format(cell_id, name))

    run_trimgalore(fastq1, fastq2, trim1, trim2, 'trim_galore', 'cutadapt',
                   tempdir, config['adapter'], config['adapter2'],
                   _report('trimgalore_R1.html'),
                   _report('trimgalore_R2.html'),
                   _report('trimgalore_qc_R1.html'),
                   _report('trimgalore_qc_R2.html'),
                   _report('trimgalore_qc_R1.zip'),
                   _report('trimgalore_qc_R2.zip'))

    return trim1, trim2
Exemplo n.º 17
0
def download_blob(blob_path, tempdir):
    """Fetch a blob from azure blob storage into tempdir; return the local path."""
    outpath = os.path.join(tempdir, blob_path)
    # the blob path may contain subdirectories; create them first
    helpers.makedirs(outpath, isfile=True)
    storageutils.download_blob(blob_path, outpath, storage='azureblob')
    return outpath
Exemplo n.º 18
0
def trim_fastqs(fastq1, fastq2, cell_id, tempdir, adapter, adapter2,
                trimgalore_docker):
    """Trim adapter sequence from both fastqs with trimgalore.

    An empty R1 causes the pair to be passed through untouched.
    Returns the paths of the two trimmed (or original) fastq files.
    """
    # nothing to trim: an empty R1 means the pair is returned as-is
    with helpers.getFileHandle(fastq1) as reader:
        if not reader.readline():
            return fastq1, fastq2

    trim1 = os.path.join(tempdir, "fastq_R1_trimmed.fastq.gz")
    trim2 = os.path.join(tempdir, "fastq_R2_trimmed.fastq.gz")

    reports_dir = os.path.join(tempdir, 'fastqc_reports')
    if not os.path.exists(reports_dir):
        helpers.makedirs(reports_dir)

    def _report(name):
        # per-cell report path inside the fastqc reports dir
        return os.path.join(reports_dir, '{}_{}'.format(cell_id, name))

    trimmer = RunTrimGalore(fastq1, fastq2, trim1, trim2, 'trim_galore',
                            'cutadapt', tempdir, adapter, adapter2,
                            _report('trimgalore_R1.html'),
                            _report('trimgalore_R2.html'),
                            _report('trimgalore_qc_R1.html'),
                            _report('trimgalore_qc_R2.html'),
                            _report('trimgalore_qc_R1.zip'),
                            _report('trimgalore_qc_R2.zip'),
                            trimgalore_docker)
    trimmer.run_trimgalore()
    trimmer.gather_outputs()

    return trim1, trim2
Exemplo n.º 19
0
def merge_postprocess_bams(inputs, output, tempdir, containers):
    """Merge lane bams, sort, mark duplicates, and index the result."""
    helpers.makedirs(tempdir)

    # merge all lanes into a single bam and index it
    merged_out = os.path.join(tempdir, 'merged_lanes.bam')
    picardutils.merge_bams(inputs, merged_out,
                           docker_image=containers['picard'])
    bamutils.bam_index(merged_out, merged_out + '.bai',
                       docker_image=containers['samtools'])

    # coordinate-sort before duplicate marking
    sorted_bam = os.path.join(tempdir, 'sorted.bam')
    picardutils.bam_sort(merged_out, sorted_bam, tempdir,
                         docker_image=containers['picard'])

    markdups_metrics = os.path.join(tempdir, 'markdups_metrics.txt')
    picardutils.bam_markdups(sorted_bam, output, markdups_metrics, tempdir,
                             docker_image=containers['picard'])

    bamutils.bam_index(output, output + '.bai',
                       docker_image=containers['samtools'])
Exemplo n.º 20
0
def cell_cycle_classifier(hmmcopy_reads, hmmcopy_metrics, alignment_metrics,
                          output, tempdir, genome_labels):
    """Predict cell-cycle state, cast prediction columns, and write metrics."""
    helpers.makedirs(tempdir)
    temp_output = os.path.join(tempdir, 'cell_cycle_output.csv')

    pypeliner.commandline.execute(
        'cell_cycle_classifier', 'train-classify', hmmcopy_reads,
        hmmcopy_metrics, alignment_metrics, temp_output)

    predictions = pd.read_csv(temp_output)
    prediction_cols = predictions.columns.values

    metrics_df = csvutils.read_csv_and_yaml(hmmcopy_metrics)

    # outer join keeps cells present in only one of the two tables
    merged = metrics_df.merge(predictions, on=['cell_id'], how='outer')

    # coerce every classifier column to its declared output dtype
    out_dtypes = dtypes(genome_labels)
    for col in prediction_cols:
        merged[col] = merged[col].astype(out_dtypes[col])

    csvutils.write_dataframe_to_csv_and_yaml(merged, output, out_dtypes)
Exemplo n.º 21
0
def concatenate_vcf(in_files,
                    out_file,
                    tempdir,
                    docker_config=None,
                    allow_overlap=False,
                    bcf_index_file=None,
                    vcf_index_file=None):
    """ Fast concatenation of VCF file using `bcftools`.

    :param in_files: dict with values being files to be concatenated. Files will be concatenated based on sorted order of keys.

    :param out_file: path where output file will be written in VCF format.

    :param tempdir: scratch directory for the intermediate merged vcf.

    :param docker_config: optional kwargs (e.g. docker_image) forwarded to
        pypeliner; defaults to no extra arguments.

    :param allow_overlap: pass `-a` to bcftools concat for overlapping files.

    :param bcf_index_file: optional path for the bcf index output.

    :param vcf_index_file: optional path for the vcf index output.

    """
    # BUG FIX: the default was the mutable `{}`, shared across calls;
    # use a None sentinel and build a fresh dict per call instead.
    if docker_config is None:
        docker_config = {}

    helpers.makedirs(tempdir)

    merged_file = os.path.join(tempdir, 'merged.vcf')
    cmd = ['bcftools', 'concat']
    if allow_overlap:
        cmd.append('-a')
    cmd += ['-O', 'z', '-o', merged_file]
    cmd += flatten_input(in_files)

    pypeliner.commandline.execute(*cmd, **docker_config)

    # sort merged vcf file
    cmd = ['bcftools', 'sort', '-O', 'z', '-o', out_file, merged_file]
    pypeliner.commandline.execute(*cmd, **docker_config)

    index_vcf(out_file, docker_config, index_file=vcf_index_file)
    index_bcf(out_file, docker_config, index_file=bcf_index_file)
Exemplo n.º 22
0
def picard_wgs_dup(
    input_bam,
    markdups_bam,
    markdups_metrics,
    tempdir,
    ref_genome,
    wgs_metrics,
    picard_wgs_params,
):
    """Mark duplicates and collect WGS metrics for one bam with picard."""
    # duplicate marking gets its own scratch subdir
    markdups_scratch = os.path.join(tempdir, 'markdups')
    helpers.makedirs(markdups_scratch)
    picardutils.bam_markdups(
        input_bam,
        markdups_bam,
        markdups_metrics,
        markdups_scratch,
    )

    # WGS metrics collection gets its own scratch subdir
    wgs_scratch = os.path.join(tempdir, 'wgs')
    helpers.makedirs(wgs_scratch)
    picardutils.bam_collect_wgs_metrics(
        input_bam,
        ref_genome,
        wgs_metrics,
        picard_wgs_params,
        wgs_scratch,
    )
Exemplo n.º 23
0
def organism_filter(fastq_r1, fastq_r2, filtered_fastq_r1, filtered_fastq_r2,
                    detailed_metrics, summary_metrics, tempdir, cell_id,
                    params):
    """Tag reads by organism with fastq_screen, count them, then filter."""
    # fastq_screen skips work when outputs from an old run exist, so start clean
    if os.path.exists(tempdir):
        shutil.rmtree(tempdir)
    helpers.makedirs(tempdir)

    tagged_r1, tagged_r2 = run_fastq_screen_paired_end(
        fastq_r1,
        fastq_r2,
        tempdir,
        params,
    )

    counts = fastqutils.PairedTaggedFastqReader(
        tagged_r1, tagged_r2).gather_counts()

    write_detailed_counts(counts, detailed_metrics, cell_id, params)
    write_summary_counts(counts, summary_metrics, cell_id, params)

    utils.filter_tag_reads(tagged_r1, tagged_r2, filtered_fastq_r1,
                           filtered_fastq_r2, params)
Exemplo n.º 24
0
def write_svtyper_annotations(csv, output_paths, tempdir):
    """
    Write each svtyper annotation column to its own finalized csv.

    :param csv: csv file containing annotations as features
    :param output_paths: mapping of annotation tag -> output file path
    :param tempdir: scratch directory for intermediate csv.gz files
    """
    helpers.makedirs(tempdir)

    annotations = [
        "AO", "AP", "AS", "ASC", "DP", "GQ", "QA", "QR", "RO", "RP", "RS",
        "SQ", "GL", "AB"
    ]

    data = pd.read_csv(csv, delimiter=",")

    for tag in annotations:
        staged = os.path.join(tempdir, '{}.csv.gz'.format(tag))
        write_svtyper_annotation(tag, data, staged)
        csvutils.finalize_csv(staged, output_paths[tag])
Exemplo n.º 25
0
def create_hmmcopy_data_tar(infiles, tar_output, tempdir):
    """Unpack each input tar under tempdir by its key, then re-tar the tree."""
    helpers.makedirs(tempdir)

    for name, tar_path in infiles.items():
        helpers.extract_tar(tar_path, os.path.join(tempdir, name))

    helpers.make_tarfile(tar_output, tempdir)
Exemplo n.º 26
0
def picard_insert_gc_flagstat(input_bam,
                              ref_genome,
                              gc_metrics,
                              gc_metrics_summary,
                              gc_metrics_pdf,
                              tempdir,
                              flagstat_metrics,
                              insert_metrics,
                              insert_pdf,
                              picard_docker=None,
                              samtools_docker=None):
    """Collect flagstat, GC bias, and insert-size metrics for one bam."""
    bamutils.bam_flagstat(input_bam,
                          flagstat_metrics,
                          docker_image=samtools_docker)

    # GC bias metrics run in their own scratch subdir
    gc_tempdir = os.path.join(tempdir, 'gc')
    helpers.makedirs(gc_tempdir)
    picardutils.bam_collect_gc_metrics(input_bam, ref_genome, gc_metrics,
                                       gc_metrics_summary, gc_metrics_pdf,
                                       gc_tempdir,
                                       docker_image=picard_docker)

    # insert-size metrics get their own scratch subdir
    insert_tempdir = os.path.join(tempdir, 'insert')
    helpers.makedirs(insert_tempdir)
    picardutils.bam_collect_insert_metrics(input_bam, flagstat_metrics,
                                           insert_metrics, insert_pdf,
                                           insert_tempdir,
                                           docker_image=picard_docker)
Exemplo n.º 27
0
def bam_sort(bam_filename, sorted_bam_filename, tempdir, mem="2G"):
    """Coordinate-sort a bam with picard SortSam."""
    if not os.path.exists(tempdir):
        makedirs(tempdir)

    cmd = [
        'picard', '-Xmx' + mem, '-Xms' + mem, '-XX:ParallelGCThreads=1',
        'SortSam',
        'INPUT=' + bam_filename,
        'OUTPUT=' + sorted_bam_filename,
        'SORT_ORDER=coordinate',
        'VALIDATION_STRINGENCY=LENIENT',
        'TMP_DIR=' + tempdir,
        'MAX_RECORDS_IN_RAM=150000',
        'QUIET=true',
    ]
    pypeliner.commandline.execute(*cmd)
Exemplo n.º 28
0
def generate_qc_report(tempdir, reference_gc, fastqscreen_training_data,
                       metrics_df, gc_metrics_df, qc_report,
                       metrics_df_annotated):
    """Classify fastqscreen contamination and build the html QC report."""
    helpers.makedirs(tempdir)

    fastqscreen_classify.classify_fastqscreen(
        fastqscreen_training_data, metrics_df, metrics_df_annotated,
        dtypes()['metrics'])

    generate_qc.generate_html_report(
        tempdir, qc_report, reference_gc, metrics_df, gc_metrics_df)
Exemplo n.º 29
0
def tar_align_data(infiles, tar_output, tempdir):
    """Stage every alignment file under tempdir and tar the directory."""
    helpers.makedirs(tempdir)

    for infile in infiles:
        for key, filepath in infile.items():
            # prefix with the key so files from different dicts don't collide
            staged = os.path.join(
                tempdir, '{}_{}'.format(key, os.path.basename(filepath)))
            helpers.copyfile(filepath, staged)

    helpers.make_tarfile(tar_output, tempdir)
Exemplo n.º 30
0
def align_pe(fastq1, fastq2, output, reports_dir, tempdir, reference, trim,
             centre, sample_info, cell_id, lane_id, library_id, aligner,
             containers, adapter, adapter2, fastqscreen_detailed_metrics,
             fastqscreen_summary_metrics, fastqscreen_params):
    """Screen, fastqc, optionally trim, align, sort and flagstat a fastq pair."""
    # organism screening first: writes filtered fastqs plus screen metrics
    fastqscreen_tempdir = os.path.join(tempdir, 'fastq_screen')
    helpers.makedirs(fastqscreen_tempdir)

    screened_r1 = os.path.join(fastqscreen_tempdir, "fastq_r1.fastq.gz")
    screened_r2 = os.path.join(fastqscreen_tempdir, "fastq_r2.fastq.gz")

    fastqscreen.organism_filter(
        fastq1,
        fastq2,
        screened_r1,
        screened_r2,
        fastqscreen_detailed_metrics,
        fastqscreen_summary_metrics,
        fastqscreen_tempdir,
        cell_id,
        fastqscreen_params,
        reference,
        docker_image=containers['fastq_screen'],
        filter_contaminated_reads=fastqscreen_params[
            'filter_contaminated_reads'],
    )

    readgroup = get_readgroup(lane_id, cell_id, library_id, centre,
                              sample_info)

    run_fastqc(screened_r1, screened_r2, reports_dir, tempdir, containers)

    # trimming only applies to bwa-aln when requested
    if aligner == "bwa-aln" and trim:
        screened_r1, screened_r2 = trim_fastqs(
            screened_r1, screened_r2, cell_id, tempdir, adapter,
            adapter2, containers['trimgalore'])

    aln_temp = os.path.join(tempdir, "temp_alignments.bam")
    align_pe_with_bwa(screened_r1,
                      screened_r2,
                      aln_temp,
                      reference,
                      readgroup,
                      tempdir,
                      containers,
                      aligner=aligner)

    picardutils.bam_sort(aln_temp,
                         output,
                         tempdir,
                         docker_image=containers['picard'])

    flagstat_out = os.path.join(reports_dir, 'flagstat_metrics.txt')
    bamutils.bam_flagstat(output, flagstat_out,
                          docker_image=containers['samtools'])