def fq_trimming(pipeline, fq1_files, fq2_files, clinseq_barcode, ref, outdir, maxcores=1):
    """
    Trim paired-end fastq files with Skewer, then concatenate the trimmed
    files into a single fastq pair named by the clinseq barcode.

    :param pipeline: Pipeline to which the Skewer and Cat jobs are added.
    :param fq1_files: List of read-1 fastq filenames.
    :param fq2_files: List of read-2 fastq filenames, pairwise matched with fq1_files.
    :param clinseq_barcode: Clinseq barcode used to name the concatenated outputs.
    :param ref: Reference sequence (currently unused here; kept for interface compatibility).
    :param outdir: Output directory.
    :param maxcores: Number of threads given to each Skewer job.
    :return: Tuple (concatenated trimmed read-1 fastq, concatenated trimmed read-2 fastq).
    """
    fq1_abs = [normpath(x) for x in fq1_files]
    fq2_abs = [normpath(x) for x in fq2_files]
    logging.debug("Trimming {} and {}".format(fq1_abs, fq2_abs))

    fq1_trimmed = []
    fq2_trimmed = []
    # Trim each read pair individually; per-library outputs go under skewer/libs/.
    for fq1, fq2 in zip(fq1_abs, fq2_abs):
        skewer = Skewer()
        skewer.input1 = fq1
        skewer.input2 = fq2
        skewer.output1 = outdir + "/skewer/libs/{}".format(os.path.basename(fq1))
        skewer.output2 = outdir + "/skewer/libs/{}".format(os.path.basename(fq2))
        skewer.stats = outdir + "/skewer/libs/skewer-stats-{}.log".format(os.path.basename(fq1))
        skewer.threads = maxcores
        skewer.jobname = "skewer/{}".format(os.path.basename(fq1))
        skewer.scratch = pipeline.scratch
        skewer.is_intermediate = True
        fq1_trimmed.append(skewer.output1)
        fq2_trimmed.append(skewer.output2)
        pipeline.add(skewer)

    # Concatenate all trimmed read-1 files into a single fastq.
    cat1 = Cat()
    cat1.input = fq1_trimmed
    cat1.output = outdir + "/skewer/{}-concatenated_1.fastq.gz".format(clinseq_barcode)
    cat1.jobname = "cat1/{}".format(clinseq_barcode)
    cat1.is_intermediate = True
    pipeline.add(cat1)

    # Concatenate all trimmed read-2 files into a single fastq.
    cat2 = Cat()
    cat2.input = fq2_trimmed
    cat2.jobname = "cat2/{}".format(clinseq_barcode)
    cat2.output = outdir + "/skewer/{}-concatenated_2.fastq.gz".format(clinseq_barcode)
    cat2.is_intermediate = True
    pipeline.add(cat2)

    return cat1.output, cat2.output
def align_se(pipeline, fq1_files, clinseq_barcode, ref, outdir, maxcores, remove_duplicates=True):
    """
    Align single-end data: trim each fastq with Skewer, concatenate the
    trimmed files, then align with bwa.

    :param pipeline: Pipeline to which the trimming/alignment jobs are added.
    :param fq1_files: List of single-end fastq filenames.
    :param clinseq_barcode: Clinseq barcode used to name outputs and the bam read group.
    :param ref: Reference sequence to align against.
    :param outdir: Output directory.
    :param maxcores: Number of threads for Skewer and bwa.
    :param remove_duplicates: Whether bwa output should have duplicates removed.
    :return: Path of the resulting bam file.
    """
    logging.debug("Aligning files: {}".format(fq1_files))
    fq1_abs = [normpath(x) for x in fq1_files]
    fq1_trimmed = []
    for fq1 in fq1_abs:
        skewer = Skewer()
        skewer.input1 = fq1
        # Single-end mode: no read-2 input, but Skewer still requires an output2 path.
        skewer.input2 = None
        skewer.output1 = outdir + "/skewer/{}".format(os.path.basename(fq1))
        skewer.output2 = outdir + "/skewer/unused-dummyfq2-{}".format(os.path.basename(fq1))
        skewer.stats = outdir + "/skewer/skewer-stats-{}.log".format(os.path.basename(fq1))
        skewer.threads = maxcores
        skewer.jobname = "skewer/{}".format(os.path.basename(fq1))
        skewer.scratch = pipeline.scratch
        skewer.is_intermediate = True
        fq1_trimmed.append(skewer.output1)
        pipeline.add(skewer)

    cat1 = Cat()
    cat1.input = fq1_trimmed
    cat1.output = outdir + "/skewer/{}_1.fastq.gz".format(clinseq_barcode)
    cat1.jobname = "cat/{}".format(clinseq_barcode)
    cat1.is_intermediate = False
    pipeline.add(cat1)

    bwa = Bwa()
    bwa.input_fastq1 = cat1.output
    bwa.input_reference_sequence = ref
    bwa.remove_duplicates = remove_duplicates
    # Read-group fields: ID = full barcode, SM = sample string, LB = library prep id.
    library_id = parse_prep_id(clinseq_barcode)
    sample_string = compose_sample_str(extract_unique_capture(clinseq_barcode))
    bwa.readgroup = "\"@RG\\tID:{rg_id}\\tSM:{rg_sm}\\tLB:{rg_lb}\\tPL:ILLUMINA\"".format(
        rg_id=clinseq_barcode, rg_sm=sample_string, rg_lb=library_id)
    bwa.threads = maxcores
    bwa.output = "{}/{}.bam".format(outdir, clinseq_barcode)
    bwa.scratch = pipeline.scratch
    bwa.jobname = "bwa/{}".format(clinseq_barcode)
    bwa.is_intermediate = False
    pipeline.add(bwa)
    return bwa.output
def __init__(self, genome_resources, outdir, maxcores=1, runner=Shellrunner()):
    """
    Set up the reference-data preparation pipeline: record paths to local
    genome resource files and remote annotation URLs, run the preparation
    steps, and write the resulting reference paths to autoseq-genome.json
    in the output directory.

    :param genome_resources: Directory containing the pre-installed genome resource files.
    :param outdir: Output directory for the prepared reference data.
    :param maxcores: Maximum number of cores available to jobs.
    :param runner: Job runner. NOTE(review): mutable default instance shared
        across calls — presumably intentional for a stateless shell runner; confirm.
    """
    PypedreamPipeline.__init__(self, normpath(outdir), runner=runner)
    self.genome_resources = genome_resources
    # Local resource files expected to exist under genome_resources.
    self.input_reference_sequence = "{}/human_g1k_v37_decoy.fasta.gz".format(
        genome_resources)
    self.cosmic_vcf = "{}/CosmicCodingMuts_v71.vcf.gz".format(
        genome_resources)
    self.qdnaseq_background = "{}/qdnaseq_background.Rdata".format(
        genome_resources)
    self.swegene_common_vcf = "{}/swegen_common.vcf.gz".format(
        genome_resources)
    self.thousand_genome_vcf = "{}/1000G_phase1.indels.b37.vcf.gz".format(
        genome_resources)
    self.mills_and_1000g_gold_standard = "{}/Mills_and_1000G_gold_standard.indels.b37.vcf.gz".format(
        genome_resources)
    self.brca_exchange = "{}/BrcaExchangeClinvar_15Jan2019_v26_hg19.vcf.gz".format(
        genome_resources)
    self.oncokb = "{}/OncoKB_6Mar19_v1.9.txt".format(genome_resources)
    self.outdir = outdir
    self.maxcores = maxcores
    # Mapping of reference-data keys to prepared file paths; filled in by the
    # prepare_*() steps and serialized to JSON at the end of this constructor.
    self.reference_data = dict()
    # Remote URLs for annotation resources downloaded during preparation.
    self.exac_remote = "ftp://ftp.broadinstitute.org/pub/ExAC_release/release0.3.1/ExAC.r0.3.1.sites.vep.vcf.gz"
    self.dbsnp_remote = "ftp://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606_b149_GRCh37p13/VCF/All_20161121.vcf.gz"
    self.clinvar_remote = "ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/archive_1.0/2016/clinvar_20160203.vcf.gz"
    self.icgc_somatic_remote = "https://dcc.icgc.org/api/v1/download?fn=/release_20/Summary/simple_somatic_mutation.aggregated.vcf.gz"
    self.ensembl_version = "75"
    self.ensembl_gtf_remote = "ftp://ftp.ensembl.org/pub/release-" + self.ensembl_version + \
        "/gtf/homo_sapiens/Homo_sapiens.GRCh37." \
        + self.ensembl_version + ".gtf.gz"
    self.mitranscriptome_remote = "http://mitranscriptome.org/download/mitranscriptome.gtf.tar.gz"
    # Run the preparation steps; these populate self.reference_data.
    self.prepare_reference_genome()
    self.prepare_genes()
    self.prepare_sveffect_regions()
    self.prepare_intervals()
    self.prepare_variants()
    # VEP cache installation job is configured but not added to the pipeline
    # (the add() call below is commented out) — only its output dir is recorded.
    fetch_vep_cache = InstallVep()
    fetch_vep_cache.output_dir = "{}/vep/".format(self.outdir)
    #self.add(fetch_vep_cache)
    self.reference_data['vep_dir'] = fetch_vep_cache.output_dir
    self.make_ref_paths_relative()
    # Persist the collected reference paths for downstream pipelines.
    with open("{}/autoseq-genome.json".format(self.outdir), "w") as output_file:
        json.dump(self.reference_data, output_file, indent=4, sort_keys=True)
def find_fastqs(library, libdir):
    """Find fastq files for a given library id in a given directory.

    Returns a tuple with two lists:
    (['foo_1.fastq.gz', 'bar_1.fastq.gz'],  # read 1
     ['foo_2.fastq.gz', 'bar_2.fastq.gz'])  # read 2

    Supports the following file naming conventions:
    *_1.fastq.gz / *_2.fastq.gz
    *_1.fq.gz / *_2.fq.gz
    *R1_nnn.fastq.gz / *R2_nnn.fastq.gz

    :param library: Library id; if falsy, (None, None) is returned.
    :param libdir: Directory containing one subdirectory per library.
    :rtype: tuple[list[str], list[str]]
    """
    if not library:
        return (None, None)

    # Raw strings with all dots escaped: previously '.fastq.gz' let '.'
    # match any character (e.g. 'xfastqxgz' would have matched).
    regex_fq1 = r'(.+)(_1\.fastq\.gz|_1\.fq\.gz|R1_\d{3}\.fastq\.gz)'
    regex_fq2 = r'(.+)(_2\.fastq\.gz|_2\.fq\.gz|R2_\d{3}\.fastq\.gz)'

    d = normpath(os.path.join(libdir, library))
    logging.debug(
        "Looking for fastq files for library {library} in {libdir}".format(
            library=library, libdir=libdir))

    fq1s = []
    fq2s = []
    for f in os.listdir(d):
        match1 = re.search(regex_fq1, f)
        if match1:
            # Rebuild the filename from the matched prefix + suffix groups.
            fn = "".join(match1.groups())
            fq1s.append(os.path.join(libdir, library, fn))
        match2 = re.search(regex_fq2, f)
        if match2:
            fn = "".join(match2.groups())
            fq2s.append(os.path.join(libdir, library, fn))

    # Sort so read-1 and read-2 lists stay pairwise aligned for matched names.
    fq1s.sort()
    fq2s.sort()
    logging.debug("Found {}".format((fq1s, fq2s)))
    return fq1s, fq2s
from autoseq.util.path import normpath

# Output directories for the test pipelines; "~" is presumably expanded by
# autoseq's normpath wrapper — TODO confirm it calls os.path.expanduser.
alascca_test_outdir = normpath("~/tmp/alascca-test")
alascca_purity_test_outdir = normpath("~/tmp/alascca-purity-test")
liqbio_test_outdir = normpath("~/tmp/liqbio-test")
def align_pe(pipeline, fq1_files, fq2_files, clinseq_barcode, ref, outdir, maxcores=1,
             remove_duplicates=True):
    """
    Align paired-end data: trim each read pair with Skewer, concatenate the
    trimmed fastqs per read, then align with bwa.

    NOTE: the trimming/concatenation portion duplicates fq_trimming();
    consider delegating to it in a future refactor.

    :param pipeline: Pipeline to which the trimming/alignment jobs are added.
    :param fq1_files: List of read-1 fastq filenames.
    :param fq2_files: List of read-2 fastq filenames, pairwise matched with fq1_files.
    :param clinseq_barcode: Clinseq barcode used to name outputs and the bam read group.
    :param ref: Reference sequence to align against.
    :param outdir: Output directory.
    :param maxcores: Number of threads for Skewer and bwa.
    :param remove_duplicates: Whether bwa output should have duplicates removed.
    :return: Path of the resulting bam file.
    """
    fq1_abs = [normpath(x) for x in fq1_files]
    fq2_abs = [normpath(x) for x in fq2_files]
    logging.debug("Trimming {} and {}".format(fq1_abs, fq2_abs))

    fq1_trimmed = []
    fq2_trimmed = []
    # Trim each read pair individually; per-library outputs go under skewer/libs/.
    for fq1, fq2 in zip(fq1_abs, fq2_abs):
        skewer = Skewer()
        skewer.input1 = fq1
        skewer.input2 = fq2
        skewer.output1 = outdir + "/skewer/libs/{}".format(os.path.basename(fq1))
        skewer.output2 = outdir + "/skewer/libs/{}".format(os.path.basename(fq2))
        skewer.stats = outdir + "/skewer/libs/skewer-stats-{}.log".format(os.path.basename(fq1))
        skewer.threads = maxcores
        skewer.jobname = "skewer/{}".format(os.path.basename(fq1))
        skewer.scratch = pipeline.scratch
        skewer.is_intermediate = True
        fq1_trimmed.append(skewer.output1)
        fq2_trimmed.append(skewer.output2)
        pipeline.add(skewer)

    # Concatenate all trimmed read-1 files into a single fastq.
    cat1 = Cat()
    cat1.input = fq1_trimmed
    cat1.output = outdir + "/skewer/{}-concatenated_1.fastq.gz".format(clinseq_barcode)
    cat1.jobname = "cat1/{}".format(clinseq_barcode)
    cat1.is_intermediate = True
    pipeline.add(cat1)

    # Concatenate all trimmed read-2 files into a single fastq.
    cat2 = Cat()
    cat2.input = fq2_trimmed
    cat2.jobname = "cat2/{}".format(clinseq_barcode)
    cat2.output = outdir + "/skewer/{}-concatenated_2.fastq.gz".format(clinseq_barcode)
    cat2.is_intermediate = True
    pipeline.add(cat2)

    bwa = Bwa()
    bwa.input_fastq1 = cat1.output
    bwa.input_fastq2 = cat2.output
    bwa.input_reference_sequence = ref
    bwa.remove_duplicates = remove_duplicates
    # Read-group fields: ID = full barcode, SM = sample string, LB = library prep id.
    library_id = parse_prep_id(clinseq_barcode)
    sample_string = compose_sample_str(extract_unique_capture(clinseq_barcode))
    bwa.readgroup = "\"@RG\\tID:{rg_id}\\tSM:{rg_sm}\\tLB:{rg_lb}\\tPL:ILLUMINA\"".format(
        rg_id=clinseq_barcode, rg_sm=sample_string, rg_lb=library_id)
    bwa.threads = maxcores
    bwa.output = "{}/{}.bam".format(outdir, clinseq_barcode)
    bwa.jobname = "bwa/{}".format(clinseq_barcode)
    bwa.scratch = pipeline.scratch
    bwa.is_intermediate = False
    pipeline.add(bwa)
    return bwa.output