def test_write_to_read_grouped_sorted(self):
    write_path = './data/write_test_rg.bam'
    read_groups = set()
    with pysam.AlignmentFile('./data/mini_nla_test.bam') as f:
        input_header = f.header.as_dict()
        write_program_tag(
            input_header,
            program_name='test_bam_util_test1',
            command_line=" ".join(sys.argv),
            version=singlecellmultiomics.__version__,
            description='a description')
        write_program_tag(
            input_header,
            program_name='test_bam_util_test2',
            command_line=" ".join(sys.argv),
            version=singlecellmultiomics.__version__,
            description='a description')
        # print([x for x in input_header['PG'] if not 'bwa mem' in x.get('CL', '')])
        with sorted_bam_file(write_path, header=input_header, read_groups=read_groups) as out:
            for molecule in singlecellmultiomics.molecule.MoleculeIterator(
                    alignments=f,
                    molecule_class=singlecellmultiomics.molecule.NlaIIIMolecule,
                    fragment_class=singlecellmultiomics.fragment.NlaIIIFragment,
                    fragment_class_args={'umi_hamming_distance': 0},
                    pooling_method=0,
                    yield_invalid=True):
                molecule.write_pysam(out)
                for frag in molecule:
                    read_groups.add(frag.get_read_group())

    self.assertTrue(os.path.exists(write_path))

    # Now test if the program tags are present:
    with pysam.AlignmentFile(write_path) as f:
        self.assertEqual(
            1, len([x for x in f.header['PG'] if 'test_bam_util_test1' in x.get('PN', '')]))
        self.assertEqual(
            1, len([x for x in f.header['PG'] if 'test_bam_util_test2' in x.get('PN', '')]))

        # Test if the file has reads:
        i = 0
        for read in f:
            if read.is_read1:
                i += 1
        self.assertEqual(i, 293)

    try:
        os.remove(write_path)
    except Exception:
        pass
    try:
        os.remove(write_path + '.bai')
    except Exception:
        pass
def run_tagging_tasks(args: tuple):
    """ Run tagging for one or more tasks

    Args:
        args (tuple): ((alignments_path, temp_dir, timeout_time), arglist)
    """
    (alignments_path, temp_dir, timeout_time), arglist = args

    target_file = f"{temp_dir}/{uuid4()}.bam"

    timeout_tasks = []
    total_molecules = 0
    read_groups = dict()

    with AlignmentFile(alignments_path) as alignments:
        with sorted_bam_file(target_file,
                             origin_bam=alignments,
                             mode='wb',
                             fast_compression=False,
                             read_groups=read_groups) as output:
            for task in arglist:
                try:
                    statistics = run_tagging_task(alignments, output,
                                                  read_groups=read_groups,
                                                  timeout_time=timeout_time,
                                                  **task)
                    total_molecules += statistics.get('total_molecules_written', 0)
                except TimeoutError:
                    timeout_tasks.append(task)

    meta = {
        'timeout_tasks': timeout_tasks,
        'total_molecules': total_molecules,
    }

    if total_molecules > 0:
        return target_file, meta

    # Nothing was written; clean up the empty output bam and index.
    try:
        remove(target_file)
        remove(f'{target_file}.bai')
    except Exception as e:
        print(f'Cleaning up failed for {target_file}')
        print(e)
    return None, meta
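
# A minimal usage sketch (not part of the source): how run_tagging_tasks might
# be dispatched over a worker pool. The keys of each task dict are consumed by
# run_tagging_task and are assumed here; 'contig' is a hypothetical example key.
def sketch_dispatch(alignments_path, temp_dir, contigs, timeout_time=None):
    from multiprocessing import Pool

    # One packed argument tuple per worker: shared settings first, task list
    # second, matching the ((alignments_path, temp_dir, timeout_time), arglist)
    # layout documented above.
    packed = [((alignments_path, temp_dir, timeout_time), [{'contig': c}])
              for c in contigs]
    with Pool() as workers:
        for bam_path, meta in workers.imap_unordered(run_tagging_tasks, packed):
            if bam_path is None:
                continue  # nothing was written for these tasks
            print(bam_path, meta['total_molecules'], len(meta['timeout_tasks']))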
def test_write_to_sorted_custom_compression(self):
    write_path = './data/write_test.bam'
    with pysam.AlignmentFile('./data/mini_nla_test.bam') as f:
        with sorted_bam_file(write_path, origin_bam=f, fast_compression=True) as out:
            for molecule in singlecellmultiomics.molecule.MoleculeIterator(
                    alignments=f,
                    molecule_class=singlecellmultiomics.molecule.NlaIIIMolecule,
                    fragment_class=singlecellmultiomics.fragment.NlaIIIFragment,
                    fragment_class_args={'umi_hamming_distance': 0},
                    pooling_method=0,
                    yield_invalid=True):
                molecule.write_pysam(out)

    self.assertTrue(os.path.exists(write_path))
    try:
        os.remove(write_path)
        os.remove(write_path + '.bai')
    except Exception:
        pass
def test_write_to_sorted_non_existing_folder(self):
    write_folder = './data/non_yet_existing_folder/'
    write_path = write_folder + 'write_test.bam'
    if os.path.exists(write_path):
        os.remove(write_path)
    rmtree(write_folder, ignore_errors=True)

    with pysam.AlignmentFile('./data/mini_nla_test.bam') as f:
        with sorted_bam_file(write_path, origin_bam=f) as out:
            for molecule in singlecellmultiomics.molecule.MoleculeIterator(
                    alignments=f,
                    molecule_class=singlecellmultiomics.molecule.NlaIIIMolecule,
                    fragment_class=singlecellmultiomics.fragment.NlaIIIFragment,
                    fragment_class_args={'umi_hamming_distance': 0},
                    pooling_method=0,
                    yield_invalid=True):
                molecule.write_pysam(out)

    self.assertTrue(os.path.exists(write_path))
    with pysam.AlignmentFile(write_path) as f:
        # Test if the file has reads.
        i = 0
        for read in f:
            if read.is_read1:
                i += 1
        self.assertEqual(i, 293)

    try:
        os.remove(write_path)
        os.remove(write_path + '.bai')
    except Exception:
        pass
    rmtree(write_folder, ignore_errors=True)
def main():
    parser = argparse.ArgumentParser(
        description='Dual signal unmixing: given a probability matrix giving, per cell '
                    'and genomic bin, the probability that a read is assigned to signal 1, '
                    'split a bam file into signal 1 and signal 2.')
    parser.add_argument('-inbam', metavar='INFILE', help='Input bam file')
    parser.add_argument(
        '-inprobmat',
        metavar='INFILE',
        help='Tab separated matrix file. Columns are cell names (first column is ""). '
             'Rows are genomic bins. Values are the probability that reads in the bin '
             'are assigned to mark1.')
    parser.add_argument(
        '-outdir',
        metavar='OUTDIR',
        help='Output directory for bams. Full name to be specified in script')
    parser.add_argument('-mapq',
                        metavar='INTEGER 0 to 60',
                        default=0,
                        type=int,
                        help='Minimum quality of read to be considered')
    parser.add_argument(
        '-binsize',
        metavar='Genomic binsize',
        default=50000,
        type=int,
        help='Binsize of genomic bins to consider (assumes row names are defined by '
             'nearest 50kb bins)')
    parser.add_argument(
        '--interpolation',
        action='store_true',
        help='Linearly interpolate the bins in the probability matrix '
             '(no interpolation across chromosomes).')
    parser.add_argument('--quiet', '-q',
                        action='store_true',
                        help='Suppress some print statements')
    parser.add_argument('--logfile', '-l',
                        metavar='LOGFILE',
                        default=None,
                        help='Write arguments to logfile')
    args = parser.parse_args()

    # Store command line arguments for reproducibility
    CMD_INPUTS = ' '.join(['python'] + sys.argv)  # easy printing later
    # Store argparse inputs for reproducibility / debugging purposes
    args_dic = vars(args)
    ARG_INPUTS = ' '.join('%s=%s' % (key, val) for key, val in args_dic.items())

    # Print arguments supplied by user
    if not args.quiet:
        if args.logfile is not None:
            sys.stdout = open(args.logfile, "w+")
        print(datetime.datetime.now().strftime('Code output on %c'))
        print('Command line inputs:')
        print(CMD_INPUTS)
        print('Argparse variables:')
        print(ARG_INPUTS)

    p = pd.read_csv(args.inprobmat, sep="\t", index_col=0)

    def parse_bin_name(binname):
        chrname, coords = binname.split(':')
        start, end = coords.split('-')
        return chrname, int(start), int(end)

    if not args.interpolation:
        prob = p
    else:
        def interpolate_prob_mat(p):
            new_rows = []
            for index, (binA_orign, binB_orign) in enumerate(windowed(p.index, 2)):
                # The index is converted to (contig, start, end) tuples below,
                # so the bins can be compared directly.
                binA = binA_orign
                binB = binB_orign
                if binA[0] != binB[0]:
                    # Never interpolate across chromosomes
                    continue
                if binA[2] > binB[1]:
                    raise ValueError('The input is not sorted')

                contig = binA[0]
                binSize = binA[2] - binA[1]
                new_rows.append(p.loc[binA_orign, :])

                start, end = binA[2], binB[1]
                for new_bin_start in range(binA[2], binB[1], binSize):
                    new_bin_end = new_bin_start + binSize
                    new_bin_centroid = new_bin_start + binSize * 0.5
                    # Linear interpolation for every cell:
                    dx = end - start
                    d = new_bin_centroid - start
                    dy = p.loc[binB_orign, :] - p.loc[binA_orign, :]
                    interpolated = (dy / dx) * d + p.loc[binA_orign, :]
                    interpolated.name = (contig, new_bin_start, new_bin_end)
                    new_rows.append(interpolated)

            prob = pd.DataFrame(new_rows)
            indexNames = [
                f'{contig}:{start}-{end}'
                for contig, start, end in prob.index
            ]
            prob.index = indexNames
            return prob

        p.index = pd.MultiIndex.from_tuples([parse_bin_name(t) for t in p.index])
        p = p.sort_index(axis=0)
        prob = interpolate_prob_mat(p)
        prob.to_csv(os.path.join(args.outdir,
                                 "probabilityMatrix_linearInterpolated.csv"),
                    sep='\t')
    # ========== End interpolation ==========
    prob.index = pd.MultiIndex.from_tuples(
        [parse_bin_name(t.replace('chr', '')) for t in prob.index])
    prob.index.set_names(["chr", "start", "end"], inplace=True)

    bamFile = args.inbam
    wrote = 0
    infboth = os.path.join(args.outdir, "both.bam")
    infA = os.path.join(args.outdir, "splitted_A.bam")
    infB = os.path.join(args.outdir, "splitted_B.bam")
    with pysam.AlignmentFile(bamFile) as f:
        with sorted_bam_file(infboth, origin_bam=f) as both, \
                sorted_bam_file(infA, origin_bam=f) as a, \
                sorted_bam_file(infB, origin_bam=f) as b:
            for readId, (R1, R2) in enumerate(pysamiterators.MatePairIterator(f)):
                # One of the two reads should have sufficient MAPQ;
                # requiring both would be more stringent.
                if R1.mapping_quality < args.mapq and R2.mapping_quality < args.mapq:
                    continue
                if R1.is_duplicate:
                    continue
                bin_start, bin_end = coordinate_to_bins(
                    R1.get_tag('DS'), args.binsize, args.binsize)[0]
                # Obtain the probability for this bin and cell:
                bin_name = (R1.reference_name, bin_start, bin_end)
                if bin_name not in prob.index:
                    continue
                if R1.get_tag('SM') not in prob.columns:
                    continue
                p = prob.loc[bin_name, R1.get_tag('SM')]
                wrote += 1
                group = 'A' if np.random.random() <= p else 'B'
                R1.set_tag('Gr', group)
                R2.set_tag('Gr', group)
                if group == 'A':
                    a.write(R1)
                    a.write(R2)
                else:
                    b.write(R1)
                    b.write(R2)
                both.write(R1)
                both.write(R2)
    print("Number of reads written: " + str(wrote))
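
# A worked sketch (illustration only) of the per-cell linear interpolation
# performed by interpolate_prob_mat above: each missing bin gets the value of
# binA plus the slope (dy / dx) evaluated at the new bin's centroid. The
# numbers are made up; y_a / y_b stand for one pair of rows of the matrix.
import numpy as np

y_a = np.array([0.2, 0.8])     # probabilities of two cells in binA
y_b = np.array([0.6, 0.4])     # probabilities of the same cells in binB
start, end = 100_000, 250_000  # binA end and binB start: the gap to fill
bin_size = 50_000

for new_bin_start in range(start, end, bin_size):
    centroid = new_bin_start + bin_size * 0.5
    interpolated = y_a + (y_b - y_a) * (centroid - start) / (end - start)
    print(new_bin_start, interpolated)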
def main():
    parser = argparse.ArgumentParser(
        description='After downstream unmixing, we obtain a probability matrix giving, '
                    'per cell and genomic bin, the probability that a read is assigned '
                    'to mark1. Use this matrix with a bam file to split the bam file '
                    'into mark1 and mark2.')
    parser.add_argument('-inbam', metavar='INFILE', help='Input bam file')
    # Example input:
    # /hpc/hub_oudenaarden/jyeung/data/dblchic/double_staining_output_downstream/unfixed_louvain2/SplitReads/MF_BM_unfixed_louvain2_clstr_by_louvain_K4m1_K27m3.removeNA_FALSE-prob_mat.K4m1-K27m3_to_K4m1.txt
    parser.add_argument(
        '-inprobmat',
        metavar='INFILE',
        help='Tab separated matrix file. Columns are cell names (first column is ""). '
             'Rows are genomic bins. Values are the probability that reads in the bin '
             'are assigned to mark1.')
    parser.add_argument(
        '-outdir',
        metavar='OUTDIR',
        help='Output directory for bams. Full name to be specified in script')
    parser.add_argument('-mapq',
                        metavar='INTEGER 0 to 60',
                        default=40,
                        type=int,
                        help='Minimum quality of read to be considered')
    parser.add_argument(
        '-binsize',
        metavar='Genomic binsize',
        default=50000,
        type=int,
        help='Binsize of genomic bins to consider (assumes row names are defined by '
             'nearest 50kb bins)')
    parser.add_argument('--quiet', '-q',
                        action='store_true',
                        help='Suppress some print statements')
    parser.add_argument('--logfile', '-l',
                        metavar='LOGFILE',
                        default=None,
                        help='Write arguments to logfile')
    args = parser.parse_args()

    # Store command line arguments for reproducibility
    CMD_INPUTS = ' '.join(['python'] + sys.argv)  # easy printing later
    # Store argparse inputs for reproducibility / debugging purposes
    args_dic = vars(args)
    ARG_INPUTS = ' '.join('%s=%s' % (key, val) for key, val in args_dic.items())

    # Print arguments supplied by user
    if not args.quiet:
        if args.logfile is not None:
            sys.stdout = open(args.logfile, "w+")
        print(datetime.datetime.now().strftime('Code output on %c'))
        print('Command line inputs:')
        print(CMD_INPUTS)
        print('Argparse variables:')
        print(ARG_INPUTS)

    prob = pd.read_csv(args.inprobmat, sep="\t")
    # Split row names like "chr1:0-50000" into a (chr, start, end) MultiIndex:
    new = prob["Unnamed: 0"].str.split(':|-', n=3, expand=True)
    prob['chr'] = new[0]
    prob['start'] = new[1]
    prob['end'] = new[2]
    prob.set_index(['chr', 'start', 'end'], inplace=True)
    prob.drop(["Unnamed: 0"], axis=1, inplace=True)

    bamFile = args.inbam
    wrote = 0
    infboth = os.path.join(args.outdir, "both.bam")
    infA = os.path.join(args.outdir, "splitted_A.bam")
    infB = os.path.join(args.outdir, "splitted_B.bam")
    with pysam.AlignmentFile(bamFile) as f:
        with sorted_bam_file(infboth, origin_bam=f) as both, \
                sorted_bam_file(infA, origin_bam=f) as a, \
                sorted_bam_file(infB, origin_bam=f) as b:
            for readId, (R1, R2) in enumerate(pysamiterators.MatePairIterator(f)):
                # One of the two reads should have sufficient MAPQ;
                # requiring both would be more stringent.
                if R1.mapping_quality < args.mapq and R2.mapping_quality < args.mapq:
                    continue
                if R1.is_duplicate:
                    continue
                bin_start, bin_end = coordinate_to_bins(
                    R1.get_tag('DS'), args.binsize, args.binsize)[0]
                # Obtain the probability for this bin and cell:
                bin_name = (f'chr{R1.reference_name}', str(bin_start), str(bin_end))
                if bin_name not in prob.index:
                    continue
                if R1.get_tag('SM') not in prob.columns:
                    continue
                p = prob.loc[bin_name, R1.get_tag('SM')]
                wrote += 1
                group = 'A' if np.random.random() <= p else 'B'
                R1.set_tag('Gr', group)
                R2.set_tag('Gr', group)
                if group == 'A':
                    a.write(R1)
                    a.write(R2)
                else:
                    b.write(R1)
                    b.write(R2)
                both.write(R1)
                both.write(R2)
    print("Number of reads written: " + str(wrote))
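
# A hedged sketch (not part of the source) isolating the assignment rule used
# by both splitting scripts above: with probability p the read pair goes to
# signal/mark A, otherwise to B, decided by one Bernoulli draw per pair.
import numpy as np


def assign_group(p: float) -> str:
    # E.g. a bin/cell with p = 0.7 sends ~70% of its read pairs to 'A'.
    return 'A' if np.random.random() <= p else 'B'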
def obtain_conversions(contig: str):
    """ Create conversion dictionary for the supplied contig

    Args:
        contig (str)

    Returns:
        conversions_per_library (defaultdict(conversion_dict_stranded)): per-library conversion dictionary
        n_molecules_per_library (Counter): observed molecules per library
        contig (str): the contig passed to the method
        temp_bam_path (str): path to the tagged bam file, tagged with gene annotations and 4sU mutation count
    """
    conversions_per_library = defaultdict(conversion_dict_stranded)
    n_molecules_per_library = Counter()

    from singlecellmultiomics.molecule import might_be_variant

    # Create temp directory to write the tagged bam file to:
    temp_dir = args.temp_dir
    temp_bam_path = f'{temp_dir}/{contig}.bam'
    if not os.path.exists(temp_dir):
        try:
            os.makedirs(temp_dir)
        except Exception:
            pass

    # Load gene annotations for the selected contig:
    transcriptome_features = FeatureContainer()
    transcriptome_features.loadGTF(path=exons_gtf_path,
                                   select_feature_type=['exon'],
                                   identifierFields=('exon_id', 'gene_id'),
                                   store_all=True,
                                   contig=contig,
                                   head=None)
    transcriptome_features.loadGTF(path=introns_gtf_path,
                                   select_feature_type=['intron'],
                                   identifierFields=['transcript_id'],
                                   store_all=True,
                                   contig=contig,
                                   head=None)

    colormap = plt.get_cmap('RdYlBu_r')
    colormap.set_bad((0, 0, 0))

    read_groups = {}
    try:
        with pysam.AlignmentFile(single_cell_bam_path, threads=4) as alignments, \
                pysam.VariantFile(known_vcf_path) as known, \
                sorted_bam_file(temp_bam_path,
                                origin_bam=single_cell_bam_path,
                                read_groups=read_groups,
                                fast_compression=True) as out, \
                pysam.FastaFile(reference_path) as reference_handle:

            # Cache the sequence of the contig (faster):
            reference = CachedFasta(reference_handle)

            for n_molecules, molecule in enumerate(MoleculeIterator(
                    alignments,
                    TranscriptMolecule,
                    SingleEndTranscriptFragment,
                    fragment_class_args={
                        'stranded': True,
                        'features': transcriptome_features
                    },
                    molecule_class_args={
                        'reference': reference,
                        'features': transcriptome_features,
                        'auto_set_intron_exon_features': True
                    },
                    contig=contig)):

                # Read out the mutation spectrum:
                consensus = molecule.get_consensus()
                if args.R2_based:
                    molecule.strand = not molecule.strand  # Invert because it is R2 based.
                n_molecules_per_library[molecule.library] += 1

                n_4su_mutations = 0
                n_4su_contexts = 0

                for (chrom, pos), base in consensus.items():
                    context = reference.fetch(chrom, pos - 1, pos + 2).upper()
                    if len(context) != 3:
                        continue

                    if ((context[1] == 'A' and not molecule.strand) or
                            (context[1] == 'T' and molecule.strand)):
                        n_4su_contexts += 1

                    # Skip if the base matches the reference or the reference contains N's:
                    if context[1] == base or 'N' in context:
                        continue

                    # Ignore germline variants:
                    if might_be_variant(chrom, pos, known):
                        continue

                    if not molecule.strand:  # reverse template
                        context = reverse_complement(context)
                        base = complement(base)

                    # Count 4sU specific mutations, and write them to the molecule later:
                    if context[1] == 'T' and base == 'C':
                        n_4su_mutations += 1
                    conversions_per_library[molecule.library][(context, base)] += 1

                # Write the 4sU modification counts to the molecule:
                molecule.set_meta('4S', n_4su_mutations)
                molecule.set_meta('4c', n_4su_contexts)

                # Set the read color based on the conversion rate:
                try:
                    # The max color value corresponds to a 10% modification rate
                    cfloat = colormap(
                        np.clip(10 * (n_4su_mutations / n_4su_contexts), 0, 1))[:3]
                except Exception:
                    cfloat = colormap._rgba_bad[:3]
                molecule.set_meta(
                    'YC', '%s,%s,%s' % tuple(int(x * 255) for x in cfloat))

                molecule.write_tags()

                # Register the read groups of all fragments:
                for fragment in molecule:
                    rgid = fragment.get_read_group()
                    if rgid not in read_groups:
                        read_groups[rgid] = fragment.get_read_group(True)[1]

                # Write the tagged molecule to the output file:
                molecule.write_pysam(out)
    except KeyboardInterrupt:
        # This allows cancelling the analysis (CTRL+C) and still obtaining the current result
        pass

    return conversions_per_library, n_molecules_per_library, contig, temp_bam_path
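
# A hedged sketch (illustration only) isolating the 4sU calling rule from the
# loop above. 'strand' has the same truthiness as molecule.strand in the loop:
# when it is falsy the template is reversed, so the context is
# reverse-complemented and the consensus base complemented before testing for
# a T>C conversion.
_COMPLEMENT = str.maketrans('ACGT', 'TGCA')


def is_4su_conversion(context: str, base: str, strand: bool) -> bool:
    if len(context) != 3 or 'N' in context or context[1] == base:
        return False  # unusable context, or no mismatch with the reference
    if not strand:
        context = context.translate(_COMPLEMENT)[::-1]  # reverse complement
        base = base.translate(_COMPLEMENT)
    return context[1] == 'T' and base == 'C'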