def split_bams(cnf, samples, vcf_fpath):
    """Extract per-variant BAMs for every sample and combine them per chromosome.

    For each chromosome returned by ``parse_variants(vcf_fpath)``, every variant
    is passed to ``extract_variant_from_bams``, which returns a mapping of sample
    name to BAM path. The paths collected for a sample are then handed to
    ``generate_combined_bam`` to produce ``<chrom>-<sample_name>.bam`` in
    ``cnf.output_dir``, where ``chrom`` has every 'chr' substring removed.
    Intermediate files are placed in ``<cnf.work_dir>/temp``, which is removed
    once all chromosomes have been processed.

    :param cnf: run configuration; ``work_dir``, ``output_dir``, ``genome.seq``
        and ``features`` are read here.
    :param samples: samples, forwarded unchanged to ``extract_variant_from_bams``.
    :param vcf_fpath: path of the VCF file handed to ``parse_variants``.
    """
    variants_by_chrom = parse_variants(vcf_fpath)
    temp_output_dirpath = join(cnf.work_dir, 'temp')
    safe_mkdir(temp_output_dirpath)
    info('Splitting BAM files...')

    # Neither the chromosome lengths nor the features path depend on the
    # chromosome being processed, so resolve them once instead of once per
    # chromosome. Guarded so that nothing is read when there are no variants.
    chr_lengths_dict = None
    features_fpath = None
    if variants_by_chrom:
        chr_lengths_dict = dict(
            (c, l) for (c, l) in get_chr_lengths_from_seq(cnf.genome.seq))
        features_fpath = verify_file(cnf.features, is_critical=True)

    for chrom, variants in variants_by_chrom.iteritems():
        chr_length = chr_lengths_dict[chrom]
        transcripts = get_transcipts_with_exons_from_features(features_fpath, cur_chrom=chrom)
        # BAMs produced for earlier variants of this chromosome; passed back
        # into every subsequent extraction call.
        bams_created_before = []
        bams_by_sample = defaultdict(list)
        info('Extracting variant coverage for all samples for ' + chrom + ', ' + str(len(variants)) + ' variants')
        for variant in variants:
            variant_bams_by_sample = extract_variant_from_bams(
                cnf, temp_output_dirpath, transcripts, chr_length, samples,
                chrom, variant, bams_created_before)
            bams_created_before.extend(variant_bams_by_sample.values())
            for sample_name, bam_fpath in variant_bams_by_sample.iteritems():
                bams_by_sample[sample_name].append(bam_fpath)

        # Output files are named without 'chr'; the log message adds it back.
        chrom = chrom.replace('chr', '')
        info()
        for sample_name, bam_fpaths in bams_by_sample.iteritems():
            info('Making combined BAMs for chr' + chrom + ' for sample ' + sample_name)
            bam_fname = '{chrom}-{sample_name}.bam'.format(
                chrom=chrom, sample_name=sample_name)
            temp_combined_bam_fpath = join(temp_output_dirpath, bam_fname)
            combined_bam_fpath = join(cnf.output_dir, bam_fname)
            generate_combined_bam(cnf, bam_fpaths, temp_combined_bam_fpath, combined_bam_fpath)
            info()

    info('Removing BAM files...')
    shutil.rmtree(temp_output_dirpath, ignore_errors=True)
예제 #2
0
def main():
    """Print one ``name<TAB>length`` line per entry of a sequence file.

    The path to the sequence file (``path_to_.fa`` in the usage message) is
    taken from the first command-line argument. The entries are the pairs
    returned by ``get_chr_lengths_from_seq`` and are written to stdout in the
    order returned.
    """
    # Exactly one positional argument is required. The previous `<= 2` check
    # rejected the documented invocation that passes only the sequence path.
    if len(sys.argv) < 2:
        critical('Usage: ' + __file__ + ' path_to_.fa')

    seq_fpath = verify_file(sys.argv[1], is_critical=True)

    for c, l in get_chr_lengths_from_seq(seq_fpath):
        sys.stdout.write(c + '\t' + str(l) + '\n')
예제 #3
0
def sort_bed(cnf, input_bed_fpath, output_bed_fpath=None):
    """Write a copy of a BED file with its regions sorted in reference order.

    Regions are ordered by (position of the chromosome in the listing returned
    by ``get_chr_lengths_from_seq(cnf.genome.seq)``, start, end, remaining
    columns). Chromosomes missing from that listing get order -1 and therefore
    sort first. Blank lines are dropped; lines starting with '#' are copied to
    the output as they are read, so they precede all sorted regions.

    :param cnf: run configuration; ``genome.seq``, ``reuse_intermediate`` and
        ``work_dir`` are read here.
    :param input_bed_fpath: BED file to sort (checked with ``verify_bed``).
    :param output_bed_fpath: destination path; when empty, an intermediate
        file name with the 'sorted' suffix is derived from the input.
    :return: path of the sorted BED file.
    """
    input_bed_fpath = verify_bed(input_bed_fpath)
    if output_bed_fpath:
        output_bed_fpath = adjust_path(output_bed_fpath)
    else:
        output_bed_fpath = intermediate_fname(cnf, input_bed_fpath, 'sorted')

    class Region(SortableByChrom):
        def __init__(self, chrom, start, end, other_fields, chrom_ref_order):
            SortableByChrom.__init__(self, chrom, chrom_ref_order)
            self.start, self.end = start, end
            self.chrom_ref_order = chrom_ref_order
            self.other_fields = tuple(other_fields)

        def get_key(self):
            return (self.chrom_ref_order, self.start, self.end,
                    self.other_fields)

    parsed_regions = []
    # Rank of every chromosome = its index in the reference listing.
    chrom_rank = dict(
        (name, rank)
        for rank, (name, _) in enumerate(
            get_chr_lengths_from_seq(cnf.genome.seq)))

    info('Sorting regions in ' + input_bed_fpath)
    can_reuse = (cnf.reuse_intermediate
                 and isfile(output_bed_fpath)
                 and verify_bed(output_bed_fpath))
    if can_reuse:
        info(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    with open(input_bed_fpath) as bed_in, \
            file_transaction(cnf.work_dir, output_bed_fpath) as tx, \
            open(tx, 'w') as bed_out:
        for raw_line in bed_in:
            stripped = raw_line.strip()
            if not stripped:
                continue
            if stripped.startswith('#'):
                # Comment lines are passed through untouched, in input order.
                bed_out.write(raw_line)
                continue

            columns = stripped.split('\t')
            chrom = columns[0]
            start, end = int(columns[1]), int(columns[2])
            extra_columns = columns[3:]
            rank = chrom_rank.get(chrom, -1)
            parsed_regions.append(
                Region(chrom, start, end, extra_columns, rank))

        for region in sorted(parsed_regions, key=lambda r: r.get_key()):
            out_columns = [region.chrom, str(region.start), str(region.end)]
            out_columns += list(region.other_fields)
            bed_out.write('\t'.join(out_columns) + '\n')

    info('Sorted ' + str(len(parsed_regions)) + ' regions, saved to ' +
         output_bed_fpath + '\n')
    return output_bed_fpath
예제 #4
0
def main():
    """Run the ``_preprocess`` -> ``_annotate`` -> ``_postprocess`` BED pipeline.

    Paths and configuration come from ``_read_args(sys.argv[1:])``. Unless
    ``cnf.debug`` is set, the preprocessed and annotated intermediate files
    are deleted afterwards. The work directory is always removed on a
    best-effort basis.
    """
    input_bed_fpath, output_bed_fpath, work_dirpath, cnf = _read_args(sys.argv[1:])

    # Map every chromosome name to its index in the reference listing.
    chrom_order = dict(
        (chrom, rank)
        for rank, (chrom, _) in enumerate(get_chr_lengths_from_seq(cnf.genome.seq)))

    preprocessed_fpath, bed_params = _preprocess(
        cnf, input_bed_fpath, work_dirpath, chrom_order)
    annotated_fpaths = _annotate(preprocessed_fpath, work_dirpath, cnf)
    _postprocess(
        preprocessed_fpath, annotated_fpaths, bed_params,
        output_bed_fpath, cnf, chrom_order)

    if not cnf.debug:
        intermediate_fpaths = [preprocessed_fpath] + annotated_fpaths
        for fpath in intermediate_fpaths:
            os.remove(fpath)

    # Failing to remove the work directory is not an error for the caller.
    try:
        shutil.rmtree(work_dirpath)
    except OSError:
        pass
def main():
    """Annotate a BED file: preprocess it, annotate it and post-process the result.

    Arguments are parsed by ``_read_args`` from the command line. The
    intermediate files (the preprocessed file and the annotated files) are
    deleted unless ``cnf.debug`` is truthy. An ``OSError`` raised while
    removing the work directory is ignored.
    """
    cli_args = sys.argv[1:]
    input_bed_fpath, output_bed_fpath, work_dirpath, cnf = _read_args(cli_args)

    # Position of each chromosome in the reference listing.
    chrom_order = {}
    for position, (chrom_name, _length) in enumerate(
            get_chr_lengths_from_seq(cnf.genome.seq)):
        chrom_order[chrom_name] = position

    preprocessed_fpath, bed_params = _preprocess(
        cnf, input_bed_fpath, work_dirpath, chrom_order)
    annotated_fpaths = _annotate(preprocessed_fpath, work_dirpath, cnf)

    _postprocess(preprocessed_fpath, annotated_fpaths, bed_params,
                 output_bed_fpath, cnf, chrom_order)

    if not cnf.debug:
        for leftover_fpath in [preprocessed_fpath] + annotated_fpaths:
            os.remove(leftover_fpath)

    try:
        shutil.rmtree(work_dirpath)
    except OSError:
        # Best-effort cleanup only.
        pass
예제 #6
0
def main():
    """Convert a gene annotation database into BED files of all and canonical regions.

    Command line::

        <script> genome_name input_db output [synonyms_file] [not_approved_output]

    * ``genome_name`` -- 'hg19' selects the hg19 sequence and canonical
      transcript list; any other value selects the hg38 ones.
    * ``input_db`` -- annotation file, opened with ``open_gzipsafe``. Files
      ending in .gtf/.gtf.gz go to ``_proc_ensembl_gtf``, .gff3/.gff3.gz to
      ``_proc_refseq_gff3``, everything else to ``_proc_ucsc``.
    * ``output`` -- path handed to the parser and to ``print_genes``; the
      canonical-only regions go to the same path with the 'canon' suffix.
    * ``synonyms_file`` -- optional; when given, genes are passed through
      ``_approve``.
    * ``not_approved_output`` -- optional path where the names rejected by
      ``_approve`` are written. NOTE(review): the usage text calls this
      argument ``additional_feature_list`` -- confirm which is intended.

    With fewer than three arguments the usage text is printed and the process
    exits with status 1.
    """
    if len(sys.argv) < 4:
        usage_width = 86  # short usage lines are right-padded with spaces to this width

        def padded(text=''):
            return text.ljust(usage_width)

        usage_lines = [
            'The script writes all CDS, stop codon, and ncRNA exon regions for all known Ensembl genes, with associated gene symbols.',
            padded(),
            padded('Usage:'),
            '    ' + __file__ +
            ' hg19 db.gtf output.bed [HGNC_gene_synonyms.txt=' + us_syn_path +
            '] [additional_feature_list]',
            padded(),
            '     where HGNC_gene_synonyms.txt (from http://www.genenames.org/cgi-bin/download) is:',
            '     #Approved Symbol  Previous Symbols                    Synonyms                          Chromosome   Ensembl Gene ID   UCSC ID(supplied by UCSC)',
            '     OR7E26P           OR7E67P, OR7E69P, OR7E70P, OR7E68P  OR1-51, OR1-72, OR1-73, OR912-95  19q13.43\t    ENSG00000121410   uc002qsg.3',
            padded('     ...'),
            padded(),
            '     or DB is Ensembl GTF ftp://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/Homo_sapiens.GRCh37.75.gtf.gz',
            '     1  pseudogene            gene        11869  14412  .  +  .  gene_id "ENSG00000223972"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene";',
            '     1  processed_transcript  transcript  11869  14409  .  +  .  gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene"; transcript_name "DDX11L1-002"; transcript_source "havana";',
            padded('     ...'),
            padded(),
            '     or DB is RefSeq GTF ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/H_sapiens/GFF/ref_GRCh38.p2_top_level.gff3.gz',
            '     NC_000001.10    RefSeq          region       1       249250621       .       +       .       ID=id0;Name=1;Dbxref=taxon:9606;chromosome=1;gbkey=Src;genome=chromosome;mol_type=genomic DNA',
            '     NC_000001.10    BestRefSeq      gene         11874   14409           .       +       .       ID=gene0;Name=DDX11L1;Dbxref=GeneID:100287102,HGNC:37102;description=DEAD%2FH %28Asp-Glu-Ala-Asp%2FHis%29 box helicase 11 like 1;gbkey=Gene;gene=DDX11L1;part=1%2F1;pseudo=true',
            '     NC_000001.10    BestRefSeq      transcript   11874   14409           .       +       .       ID=rna0;Name=NR_046018.2;Parent=gene0;Dbxref=GeneID:100287102,Genbank:NR_046018.2,HGNC:37102;gbkey=misc_RNA;gene=DDX11L1;product=DEAD%2FH %28Asp-Glu-Ala-Asp%2FHis%29 box helicase 11 like 1;transcript_id=NR_046018.2',
            '     NC_000001.10    BestRefSeq      exon         11874   12227           .       +       .       ID=id1;Parent=rna0;Dbxref=GeneID:100287102,Genbank:NR_046018.2,HGNC:37102;gbkey=misc_RNA;gene=DDX11L1;product=DEAD%2FH %28Asp-Glu-Ala-Asp%2FHis%29 box helicase 11 like 1;transcript_id=NR_046018.2',
            padded('     ...'),
            padded(),
            '     or either RefSeq_knownGene.txt or UCSC_knownGene.txt (from http://genome.ucsc.edu/cgi-bin/hgTables) is:',
            '     #hg19.knownGene.name  hg19.knownGene.chrom  hg19.knownGene.strand  hg19.knownGene.txStart  hg19.knownGene.txEnd  hg19.knownGene.exonCount  hg19.knownGene.exonStarts  hg19.knownGene.exonEnds  hg19.kgXref.geneSymbol',
            '     uc001aaa.3\t         chr1\t               +\t                  11873                   14409                 3                         11873,12612,13220,\t      12227,12721,14409,\t   DDX11L1',
            padded('     ...'),
            padded(),
            padded('     Writes to Exons.bed'),
            padded(),
            'See more info in http://wiki.rd.astrazeneca.net/display/NG/SOP+-+Making+the+full+list+of+UCSC+exons+with+approved+HUGO+gene+symbols',
        ]
        for usage_line in usage_lines:
            info(usage_line)
        sys.exit(1)

    genome_name = sys.argv[1]
    seq_fpath = hg19_seq_fpath if genome_name == 'hg19' else hg38_seq_fpath
    canonical_transcripts_fpath = canonical_hg19_transcripts_fpath if genome_name == 'hg19' else canonical_hg38_transcripts_fpath
    chr_lengths = get_chr_lengths_from_seq(seq_fpath)
    chr_order = {c: i for i, (c, l) in enumerate(chr_lengths)}

    input_fpath = verify_file(sys.argv[2])
    output_fpath = adjust_path(sys.argv[3])

    synonyms_fpath = None
    if len(sys.argv) > 4:
        synonyms_fpath = verify_file(sys.argv[4])
        info('Synonyms file provided ' + synonyms_fpath + '')
    else:
        info('No synonyms file provided, skipping approving')

    not_approved_fpath = None
    if len(sys.argv) > 5:
        not_approved_fpath = adjust_path(sys.argv[5])

    # Canonical transcript IDs are stored without the part after the first '.'.
    with open(verify_file(canonical_transcripts_fpath)) as f:
        canonical_transcripts_ids = set(l.strip().split('.')[0] for l in f)

    info('Reading the features...')
    with open_gzipsafe(input_fpath) as inp:
        # The first line is consumed before the parser sees the stream
        # (presumably a header line -- confirm for GTF/GFF3 inputs).
        inp.readline()
        # Pick the parser from the INPUT file's extension. The output path was
        # inspected here before, so with the documented `db.gtf output.bed`
        # invocation every input ended up in the UCSC parser.
        if input_fpath.endswith(('.gtf', '.gtf.gz')):
            gene_by_name_and_chrom = _proc_ensembl_gtf(inp, output_fpath,
                                                       chr_order)
        elif input_fpath.endswith(('.gff3', '.gff3.gz')):
            gene_by_name_and_chrom = _proc_refseq_gff3(inp, output_fpath,
                                                       chr_order)
        else:
            gene_by_name_and_chrom = _proc_ucsc(inp, output_fpath, chr_order)

    # "''" is also treated as "no synonyms file" (a literal pair of quotes).
    if synonyms_fpath and synonyms_fpath != "''":
        gene_by_name_and_chrom, not_approved_gene_names = _approve(
            gene_by_name_and_chrom, synonyms_fpath)

        info('')
        info('Not approved by HGNC - ' + str(len(not_approved_gene_names)) +
             ' genes.')
        if not_approved_fpath:
            with open(not_approved_fpath, 'w') as f:
                f.write('#Searched as\tStatus\n')
                f.writelines((l + '\n' for l in not_approved_gene_names))
            info('Saved not approved to ' + not_approved_fpath)

    info('Found:')
    info('  ' + str(len(gene_by_name_and_chrom)) + ' genes')

    genes = gene_by_name_and_chrom.values()

    def has_biotype(gene, biotype):
        # True when at least one transcript of the gene has this biotype.
        return any(t.biotype == biotype for t in gene.transcripts)

    coding_and_mirna_genes = [
        g for g in genes
        if any(t.biotype in ['protein_coding', 'miRNA'] for t in g.transcripts)
    ]
    coding_genes = [
        g for g in coding_and_mirna_genes if has_biotype(g, 'protein_coding')
    ]
    coding_transcripts = [
        t for g in coding_and_mirna_genes for t in g.transcripts
        if t.biotype == 'protein_coding'
    ]
    mirna_genes = [
        g for g in coding_and_mirna_genes if has_biotype(g, 'miRNA')
    ]
    mirna_transcripts = [
        t for g in coding_and_mirna_genes for t in g.transcripts
        if t.biotype == 'miRNA'
    ]
    codingmiRNA_genes = [
        g for g in coding_and_mirna_genes
        if has_biotype(g, 'miRNA') and has_biotype(g, 'protein_coding')
    ]
    info('  ' + str(len(coding_genes)) + ' coding genes')
    info('  ' + str(len(coding_transcripts)) + ' coding transcripts')
    info('  ' + str(len(mirna_genes)) + ' miRNA genes')
    info('  ' + str(len(mirna_transcripts)) + ' miRNA transcripts')
    info('  ' + str(len(codingmiRNA_genes)) +
         ' genes with both coding and miRNA transcripts')

    info()
    info('Choosing canonical...')
    canon_genes = choose_canonical(genes, canonical_transcripts_ids)

    info()
    info('Sorting and printing all regions...')
    print_genes(genes, output_fpath, canon_only=False)

    info()
    info('Sorting and printing canonical regions...')
    canon_output_fpath = add_suffix(output_fpath, 'canon')
    print_genes(canon_genes, canon_output_fpath, canon_only=True)

    info()
    info('Saved all regions to\n   ' + output_fpath + '\n   ' +
         canon_output_fpath)