def split_bams(cnf, samples, vcf_fpath):
    """Extract per-variant BAMs for every sample and combine them per chromosome.

    Variants parsed from ``vcf_fpath`` are processed chromosome by chromosome.
    For each variant, ``extract_variant_from_bams`` returns a mapping of sample
    name to BAM path; the BAMs collected for one sample and one chromosome are
    then handed to ``generate_combined_bam``, which is given a temporary path
    under ``<cnf.work_dir>/temp`` and a final path under ``cnf.output_dir``.
    The temporary directory is removed at the end.

    :param cnf: run configuration; ``work_dir``, ``output_dir``, ``genome.seq``
        and ``features`` are read here.
    :param samples: passed through unchanged to ``extract_variant_from_bams``.
    :param vcf_fpath: path given to ``parse_variants``.
    :raises KeyError: if a chromosome of the variants is absent from the
        lengths reported for ``cnf.genome.seq``.
    """
    variants_by_chrom = parse_variants(vcf_fpath)

    temp_output_dirpath = join(cnf.work_dir, 'temp')
    safe_mkdir(temp_output_dirpath)

    info('Splitting BAM files...')

    # The reference lengths and the features path do not depend on the
    # chromosome, so resolve them once instead of once per chromosome.
    # They are still skipped entirely when there is nothing to process.
    chr_length_by_chrom = None
    features_fpath = None
    if variants_by_chrom:
        chr_length_by_chrom = dict(get_chr_lengths_from_seq(cnf.genome.seq))
        features_fpath = verify_file(cnf.features, is_critical=True)

    for chrom, variants in variants_by_chrom.iteritems():
        chr_length = chr_length_by_chrom[chrom]
        transcripts = get_transcipts_with_exons_from_features(
            features_fpath, cur_chrom=chrom)

        # Every BAM produced so far for this chromosome is passed to the
        # extraction of the following variants.
        bams_created_before = []
        bams_by_sample = defaultdict(list)

        info('Extracting variant coverage for all samples for ' + chrom +
             ', ' + str(len(variants)) + ' variants')
        for variant in variants:
            variant_bams_by_sample = extract_variant_from_bams(
                cnf, temp_output_dirpath, transcripts, chr_length, samples,
                chrom, variant, bams_created_before)
            bams_created_before.extend(variant_bams_by_sample.values())
            for sample_name, bam_fpath in variant_bams_by_sample.iteritems():
                bams_by_sample[sample_name].append(bam_fpath)

        # Output file names use the chromosome name without 'chr'.
        chrom_short = chrom.replace('chr', '')
        info()
        for sample_name, bam_fpaths in bams_by_sample.iteritems():
            info('Making combined BAMs for chr' + chrom_short +
                 ' for sample ' + sample_name)
            bam_fname = '{chrom}-{sample_name}.bam'.format(
                chrom=chrom_short, sample_name=sample_name)
            temp_combined_bam_fpath = join(temp_output_dirpath, bam_fname)
            combined_bam_fpath = join(cnf.output_dir, bam_fname)
            generate_combined_bam(cnf, bam_fpaths, temp_combined_bam_fpath,
                                  combined_bam_fpath)
        info()

    info('Removing BAM files...')
    shutil.rmtree(temp_output_dirpath, ignore_errors=True)
def main():
    """Print one ``name<TAB>length`` line per sequence of a reference file.

    Command line: ``<script> path_to_.fa``.  The single positional argument is
    checked with ``verify_file`` (critical on failure) and passed to
    ``get_chr_lengths_from_seq``; each (name, length) pair it yields is
    written to standard output.
    """
    # Only sys.argv[1] is used, so exactly one argument is required.  The
    # previous check (`<= 2`) rejected the documented one-argument call.
    if len(sys.argv) < 2:
        critical('Usage: ' + __file__ + ' path_to_.fa')

    seq_fpath = verify_file(sys.argv[1], is_critical=True)

    for chrom, length in get_chr_lengths_from_seq(seq_fpath):
        sys.stdout.write(chrom + '\t' + str(length) + '\n')
def sort_bed(cnf, input_bed_fpath, output_bed_fpath=None):
    """Write a copy of a BED file with its regions sorted in reference order.

    Regions are ordered by the position of their chromosome in the list
    returned by ``get_chr_lengths_from_seq(cnf.genome.seq)``, then by start,
    end and the remaining columns.  Chromosomes missing from that list get
    order ``-1``.  Lines starting with ``#`` are copied to the output as they
    are read, so they end up before all regions; blank lines are dropped.

    :param cnf: run configuration; ``genome.seq``, ``work_dir`` and
        ``reuse_intermediate`` are read here.
    :param input_bed_fpath: BED file to sort (passed through ``verify_bed``).
    :param output_bed_fpath: destination path; when empty or ``None`` the path
        is built by ``intermediate_fname(cnf, input_bed_fpath, 'sorted')``.
    :return: path of the sorted BED file.
    """
    input_bed_fpath = verify_bed(input_bed_fpath)
    if output_bed_fpath:
        output_bed_fpath = adjust_path(output_bed_fpath)
    else:
        output_bed_fpath = intermediate_fname(cnf, input_bed_fpath, 'sorted')

    class Region(SortableByChrom):
        # One BED record together with the rank of its chromosome.
        def __init__(self, chrom, start, end, other_fields, chrom_ref_order):
            SortableByChrom.__init__(self, chrom, chrom_ref_order)
            self.start = start
            self.end = end
            self.chrom_ref_order = chrom_ref_order
            self.other_fields = tuple(other_fields)

        def get_key(self):
            return (self.chrom_ref_order, self.start, self.end,
                    self.other_fields)

    regions = []

    chr_lengths = get_chr_lengths_from_seq(cnf.genome.seq)
    chr_order = dict(
        (name, rank) for rank, (name, _) in enumerate(chr_lengths))

    info('Sorting regions in ' + input_bed_fpath)
    already_sorted = (cnf.reuse_intermediate
                      and isfile(output_bed_fpath)
                      and verify_bed(output_bed_fpath))
    if already_sorted:
        info(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    with open(input_bed_fpath) as bed_in, \
            file_transaction(cnf.work_dir, output_bed_fpath) as tx:
        with open(tx, 'w') as out:
            for line in bed_in:
                stripped = line.strip()
                if not stripped:
                    continue
                if stripped.startswith('#'):
                    # Comment lines are copied unchanged, ahead of the regions.
                    out.write(line)
                    continue

                fields = stripped.split('\t')
                chrom, start, end = fields[0], int(fields[1]), int(fields[2])
                regions.append(Region(chrom, start, end, fields[3:],
                                      chr_order.get(chrom, -1)))

            regions.sort(key=lambda reg: reg.get_key())
            for reg in regions:
                columns = [reg.chrom, str(reg.start), str(reg.end)]
                columns += list(reg.other_fields)
                out.write('\t'.join(columns) + '\n')

    info('Sorted ' + str(len(regions)) + ' regions, saved to ' +
         output_bed_fpath + '\n')
    return output_bed_fpath
def main():
    """Run the preprocess / annotate / postprocess steps on a BED file.

    Arguments are parsed by ``_read_args`` from ``sys.argv[1:]``.  The
    chromosome order used by the pre- and post-processing steps is the order
    in which ``get_chr_lengths_from_seq(cnf.genome.seq)`` lists chromosomes.
    Unless ``cnf.debug`` is set, the preprocessed and annotated files are
    deleted afterwards and removal of the work directory is attempted
    (an ``OSError`` from that removal is ignored).
    """
    input_bed_fpath, output_bed_fpath, work_dirpath, cnf = \
        _read_args(sys.argv[1:])

    chrom_order = dict(
        (name, rank)
        for rank, (name, _)
        in enumerate(get_chr_lengths_from_seq(cnf.genome.seq)))

    preprocessed_fpath, bed_params = _preprocess(
        cnf, input_bed_fpath, work_dirpath, chrom_order)
    annotated_fpaths = _annotate(preprocessed_fpath, work_dirpath, cnf)
    _postprocess(preprocessed_fpath, annotated_fpaths, bed_params,
                 output_bed_fpath, cnf, chrom_order)

    # Intermediate files are kept in debug mode.
    if cnf.debug:
        return

    for tmp_fpath in [preprocessed_fpath] + annotated_fpaths:
        os.remove(tmp_fpath)
    try:
        shutil.rmtree(work_dirpath)
    except OSError:
        pass
def main():
    """Annotate a BED file: parse arguments, preprocess, annotate, postprocess.

    The rank of each chromosome is its index in the sequence returned by
    ``get_chr_lengths_from_seq(cnf.genome.seq)``; that mapping is handed to
    ``_preprocess`` and ``_postprocess``.  When ``cnf.debug`` is false the
    intermediate files are removed and the work directory is deleted on a
    best-effort basis.
    """
    args = _read_args(sys.argv[1:])
    input_bed_fpath, output_bed_fpath, work_dirpath, cnf = args

    chrom_order = {}
    for rank, (chrom, _length) in enumerate(
            get_chr_lengths_from_seq(cnf.genome.seq)):
        chrom_order[chrom] = rank

    preprocessed_fpath, bed_params = _preprocess(
        cnf, input_bed_fpath, work_dirpath, chrom_order)
    annotated_fpaths = _annotate(preprocessed_fpath, work_dirpath, cnf)
    _postprocess(
        preprocessed_fpath, annotated_fpaths, bed_params,
        output_bed_fpath, cnf, chrom_order)

    if not cnf.debug:
        leftovers = [preprocessed_fpath] + annotated_fpaths
        for fpath in leftovers:
            os.remove(fpath)
        # Failing to remove the work directory is not an error.
        try:
            shutil.rmtree(work_dirpath)
        except OSError:
            pass
def _print_usage():
    """Log the command-line help, including examples of the accepted inputs."""
    info('The script writes all CDS, stop codon, and ncRNA exon regions for all known Ensembl genes, with associated gene symbols.')
    info(' ')
    info('Usage: ')
    info(' ' + __file__ + ' hg19 db.gtf output.bed [HGNC_gene_synonyms.txt=' + us_syn_path + '] [additional_feature_list]')
    info(' ')
    info(' where HGNC_gene_synonyms.txt (from http://www.genenames.org/cgi-bin/download) is:')
    info(' #Approved Symbol Previous Symbols Synonyms Chromosome Ensembl Gene ID UCSC ID(supplied by UCSC)')
    info(' OR7E26P OR7E67P, OR7E69P, OR7E70P, OR7E68P OR1-51, OR1-72, OR1-73, OR912-95 19q13.43 ENSG00000121410 uc002qsg.3')
    info(' ... ')
    info(' ')
    info(' or DB is Ensembl GTF ftp://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/Homo_sapiens.GRCh37.75.gtf.gz')
    info(' 1 pseudogene gene 11869 14412 . + . gene_id "ENSG00000223972"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene";')
    info(' 1 processed_transcript transcript 11869 14409 . + . gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene"; transcript_name "DDX11L1-002"; transcript_source "havana";')
    info(' ... ')
    info(' ')
    info(' or DB is RefSeq GTF ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/H_sapiens/GFF/ref_GRCh38.p2_top_level.gff3.gz')
    info(' NC_000001.10 RefSeq region 1 249250621 . + . ID=id0;Name=1;Dbxref=taxon:9606;chromosome=1;gbkey=Src;genome=chromosome;mol_type=genomic DNA')
    info(' NC_000001.10 BestRefSeq gene 11874 14409 . + . ID=gene0;Name=DDX11L1;Dbxref=GeneID:100287102,HGNC:37102;description=DEAD%2FH %28Asp-Glu-Ala-Asp%2FHis%29 box helicase 11 like 1;gbkey=Gene;gene=DDX11L1;part=1%2F1;pseudo=true')
    info(' NC_000001.10 BestRefSeq transcript 11874 14409 . + . ID=rna0;Name=NR_046018.2;Parent=gene0;Dbxref=GeneID:100287102,Genbank:NR_046018.2,HGNC:37102;gbkey=misc_RNA;gene=DDX11L1;product=DEAD%2FH %28Asp-Glu-Ala-Asp%2FHis%29 box helicase 11 like 1;transcript_id=NR_046018.2')
    info(' NC_000001.10 BestRefSeq exon 11874 12227 . + . ID=id1;Parent=rna0;Dbxref=GeneID:100287102,Genbank:NR_046018.2,HGNC:37102;gbkey=misc_RNA;gene=DDX11L1;product=DEAD%2FH %28Asp-Glu-Ala-Asp%2FHis%29 box helicase 11 like 1;transcript_id=NR_046018.2')
    info(' ... ')
    info(' ')
    info(' or either RefSeq_knownGene.txt or UCSC_knownGene.txt (from http://genome.ucsc.edu/cgi-bin/hgTables) is:')
    info(' #hg19.knownGene.name hg19.knownGene.chrom hg19.knownGene.strand hg19.knownGene.txStart hg19.knownGene.txEnd hg19.knownGene.exonCount hg19.knownGene.exonStarts hg19.knownGene.exonEnds hg19.kgXref.geneSymbol')
    info(' uc001aaa.3 chr1 + 11873 14409 3 11873,12612,13220, 12227,12721,14409, DDX11L1')
    info(' ... ')
    info(' ')
    info(' Writes to Exons.bed ')
    info(' ')
    info('See more info in http://wiki.rd.astrazeneca.net/display/NG/SOP+-+Making+the+full+list+of+UCSC+exons+with+approved+HUGO+gene+symbols')


def _log_biotype_stats(genes):
    """Log how many genes and transcripts are protein-coding and/or miRNA."""
    coding_and_mirna_genes = [
        g for g in genes
        if any(t.biotype in ['protein_coding', 'miRNA'] for t in g.transcripts)]

    coding_genes = [
        g for g in coding_and_mirna_genes
        if any(t.biotype == 'protein_coding' for t in g.transcripts)]
    coding_transcripts = [
        t for g in coding_and_mirna_genes for t in g.transcripts
        if t.biotype == 'protein_coding']
    mirna_genes = [
        g for g in coding_and_mirna_genes
        if any(t.biotype == 'miRNA' for t in g.transcripts)]
    mirna_transcripts = [
        t for g in coding_and_mirna_genes for t in g.transcripts
        if t.biotype == 'miRNA']
    codingmiRNA_genes = [
        g for g in coding_and_mirna_genes
        if any(t.biotype == 'miRNA' for t in g.transcripts)
        and any(t.biotype == 'protein_coding' for t in g.transcripts)]

    info(' ' + str(len(coding_genes)) + ' coding genes')
    info(' ' + str(len(coding_transcripts)) + ' coding transcripts')
    info(' ' + str(len(mirna_genes)) + ' miRNA genes')
    info(' ' + str(len(mirna_transcripts)) + ' miRNA transcripts')
    info(' ' + str(len(codingmiRNA_genes)) + ' genes with both coding and miRNA transcripts')


def main():
    """Read a gene annotation database and write its regions to a file.

    Positional command-line arguments:

    1. genome name; ``'hg19'`` selects the hg19 reference and canonical
       transcript list, any other value selects the hg38 ones;
    2. input database (Ensembl GTF, RefSeq GFF3, or a UCSC/RefSeq knownGene
       table; may be gzipped, as it is opened with ``open_gzipsafe``);
    3. output path;
    4. optional synonyms file; when given, genes are passed through
       ``_approve``;
    5. optional path where the gene names not approved are written.

    Two files are written through ``print_genes``: the output path with all
    genes, and the same path with a ``canon`` suffix for the genes returned by
    ``choose_canonical``.  With fewer than three arguments the usage text is
    logged and the process exits with status 1.
    """
    if len(sys.argv) < 4:
        _print_usage()
        sys.exit(1)

    genome_name = sys.argv[1]
    is_hg19 = genome_name == 'hg19'
    seq_fpath = hg19_seq_fpath if is_hg19 else hg38_seq_fpath
    canonical_transcripts_fpath = (
        canonical_hg19_transcripts_fpath if is_hg19
        else canonical_hg38_transcripts_fpath)

    chr_lengths = get_chr_lengths_from_seq(seq_fpath)
    chr_order = {chrom: rank for rank, (chrom, _) in enumerate(chr_lengths)}

    input_fpath = verify_file(sys.argv[2])
    output_fpath = adjust_path(sys.argv[3])

    synonyms_fpath = None
    if len(sys.argv) > 4:
        synonyms_fpath = verify_file(sys.argv[4])
        info('Synonyms file provided ' + synonyms_fpath + '')
    else:
        info('No synonyms file provided, skipping approving')

    not_approved_fpath = None
    if len(sys.argv) > 5:
        not_approved_fpath = adjust_path(sys.argv[5])

    # Transcript IDs are stored without their version suffix
    # (everything from the first '.' on is dropped).
    with open(verify_file(canonical_transcripts_fpath)) as f:
        canonical_transcripts_ids = set(
            line.strip().split('.')[0] for line in f)

    info('Reading the features...')
    with open_gzipsafe(input_fpath) as inp:
        # The first line is consumed before parsing, as before.
        # NOTE(review): presumably a header line - confirm for GTF/GFF3 input.
        inp.readline()

        # The parser is chosen from the INPUT file's extension.  It used to be
        # chosen from the output path, which per the usage text is a .bed
        # file, so the GTF and GFF3 parsers could never be selected.
        if input_fpath.endswith(('.gtf', '.gtf.gz')):
            gene_by_name_and_chrom = _proc_ensembl_gtf(
                inp, output_fpath, chr_order)
        elif input_fpath.endswith(('.gff3', '.gff3.gz')):
            gene_by_name_and_chrom = _proc_refseq_gff3(
                inp, output_fpath, chr_order)
        else:
            gene_by_name_and_chrom = _proc_ucsc(inp, output_fpath, chr_order)

    if synonyms_fpath and synonyms_fpath != "''":
        gene_by_name_and_chrom, not_approved_gene_names = _approve(
            gene_by_name_and_chrom, synonyms_fpath)

        info('')
        info('Not approved by HGNC - ' + str(len(not_approved_gene_names)) + ' genes.')
        if not_approved_fpath:
            with open(not_approved_fpath, 'w') as f:
                f.write('#Searched as\tStatus\n')
                f.writelines(name + '\n' for name in not_approved_gene_names)
            info('Saved not approved to ' + not_approved_fpath)

    info('Found:')
    info(' ' + str(len(gene_by_name_and_chrom)) + ' genes')

    genes = gene_by_name_and_chrom.values()
    _log_biotype_stats(genes)

    info()
    info('Choosing canonical...')
    canon_genes = choose_canonical(genes, canonical_transcripts_ids)

    info()
    info('Sorting and printing all regions...')
    print_genes(genes, output_fpath, canon_only=False)

    info()
    info('Sorting and printing canonical regions...')
    canon_output_fpath = add_suffix(output_fpath, 'canon')
    print_genes(canon_genes, canon_output_fpath, canon_only=True)

    info()
    info('Saved all regions to\n ' + output_fpath + '\n ' + canon_output_fpath)