def test_getOperationAtRefPos(self):
    """Check getOperationAtRefPos against the CIGAR string "20M77D12I77M".

    Each case supplies the first two positional arguments of the call and
    the (operation, length) tuple that is expected back; the last two
    positional arguments are always 1 and 1.
    """
    cigar = "20M77D12I77M"
    # (first argument, second argument, expected result)
    # NOTE(review): the first two arguments are presumably the queried
    # reference position and the alignment start - confirm against
    # SharedFunctions.getOperationAtRefPos.
    cases = (
        (17, 15, ("M", 20)),
        (37, 15, ("D", 77)),
        (97, 1, ("I", 12)),
        (98, 1, ("M", 77)),
    )
    for first_arg, second_arg, expected in cases:
        observed = SharedFunctions.getOperationAtRefPos(
            first_arg, second_arg, cigar, 1, 1)
        self.assertEqual(observed, expected)
def test_report_list_modes(self):
    """Check report_list_modes on numeric, string and tied inputs.

    The expected results are lists: a single most frequent value, or both
    values when two values are equally frequent.
    """
    cases = (
        ([1, 2, 2, 2, 2, 3, 4], [2]),
        (['+', '+', '+', '-'], ['+']),
        # Tie: both values occur twice
        ([1, 1, 2, 2], [1, 2]),
    )
    for values, expected_modes in cases:
        self.assertEqual(
            SharedFunctions.report_list_modes(values), expected_modes)
def test_get_not_gaps(self):
    """Check that get_not_gaps returns the intervals of 0-1000 outside the gaps."""
    gap_intervals = [(47, 51), (400, 640)]
    ungapped = SharedFunctions.get_not_gaps(0, 1000, gap_intervals)
    self.assertEqual(ungapped, [(0, 47), (51, 400), (640, 1000)])
def test_updateFeatureGff(self):
    """Check the feature table rows produced by Insert.updateFeatureGff.

    The features in TestInsert.oldfeatures are updated with and without
    ``addsuffix``; rows are selected by a regex on the attributes column
    (index 8) and compared as lists of strings.
    """
    iesgff = SharedFunctions.Gff()
    iesgff.list2gff(TestInsert.gfflist)
    inserter = Insert.Insert(TestInsert.ref, iesgff, TestInsert.ies)
    inserter._filterInserts()
    inserter._updatePositionsInserts()
    feature_rows = [record.split("\t") for record in TestInsert.oldfeatures]

    def rows_matching(table, pattern):
        # Stringify every field of the rows whose attributes column matches
        return [
            [str(field) for field in row]
            for row in table
            if re.match(pattern, row[8])
        ]

    with_suffix = inserter.updateFeatureGff(feature_rows, addsuffix=True)
    self.assertEqual(
        rows_matching(with_suffix, 'ID=gene1.'),
        [
            ['ctg1', '.', 'gene', '3', '5', '.', '.', '.',
             'ID=gene1.seg_0;key1=attr1;key2=attr2'],
            ['ctg1', '.', 'gene', '10', '11', '.', '.', '.',
             'ID=gene1.seg_1;key1=attr1;key2=attr2'],
        ])
    self.assertEqual(
        rows_matching(with_suffix, 'ID=gene2'),
        [['ctg1', '.', 'gene', '21', '24', '.', '.', '.', 'ID=gene2']])
    # Multi-segment feature spanning several lines
    self.assertEqual(
        rows_matching(with_suffix, 'ID=cds1'),
        [
            ['ctg1', '.', 'CDS', '3', '5', '.', '+', '0',
             'ID=cds1.seg_0;key1=attr1;key2=attr2'],
            ['ctg1', '.', 'CDS', '10', '11', '.', '+', '0',
             'ID=cds1.seg_1;key1=attr1;key2=attr2'],
            ['ctg1', '.', 'CDS', '21', '24', '.', '+', '0', 'ID=cds1'],
        ])

    # Same features, but without a suffix added to the ID
    without_suffix = inserter.updateFeatureGff(feature_rows, addsuffix=False)
    self.assertEqual(
        rows_matching(without_suffix, 'ID=gene1;'),
        [
            ['ctg1', '.', 'gene', '3', '5', '.', '.', '.',
             'ID=gene1;key1=attr1;key2=attr2'],
            ['ctg1', '.', 'gene', '10', '11', '.', '.', '.',
             'ID=gene1;key1=attr1;key2=attr2'],
        ])
def test_getCigarOpQuerySeqs(self):
    """Check getCigarOpQuerySeqs for target_op="S" on a short query.

    The expected result is one 5-tuple per matching operation: the query
    subsequence followed by four coordinates.
    NOTE(review): operation code 4 in the tuples presumably denotes soft
    clips (pysam convention), and the coordinates presumably are query
    start/end and reference start/end - confirm against SharedFunctions.
    """
    query = "AATACCCATTA"
    cigar_ops = [(4, 2), (0, 3), (1, 3), (2, 3), (4, 3)]
    ref_start = 10
    observed = SharedFunctions.getCigarOpQuerySeqs(
        query, cigar_ops, ref_start, target_op="S")
    expected = [("AA", 0, 2, 10, 10), ("TTA", 8, 11, 16, 16)]
    self.assertEqual(observed, expected)
def test_reportInsertedReference(self):
    """Check the contig sequences returned by Insert.reportInsertedReference."""
    gff = SharedFunctions.Gff()
    gff.list2gff(TestInsert.gfflist)
    inserter = Insert.Insert(TestInsert.ref, gff, TestInsert.ies)
    # Second return value (the updated GFF) is not checked in this test
    newfasta, _newgff = inserter.reportInsertedReference()
    expected_sequences = {
        'ctg1': 'AAAAATTTTAAAAGGGGGAAAAAAAAAAA',
        'ctg2': 'GGGGGGGGGCCCCCGGGGGGGGGGG',
    }
    for contig, sequence in expected_sequences.items():
        self.assertEqual(str(newfasta[contig].seq), sequence)
def insert(args):
    """Run BleTIES Insert: add IESs to, or remove IESs from, a reference.

    Reads the reference Fasta ``args.ref`` and the IES GFF ``args.ies`` and,
    depending on ``args.mode``:

    * mode starting with "ins": reads IES sequences from ``args.iesfasta``,
      inserts them into the reference and writes ``{args.out}.iesplus.fasta``
      and ``{args.out}.iesplus.gff``. If ``args.featuregff`` is set, the
      coordinates of that feature table are updated as well (passing
      ``args.addsuffix`` to ``Insert.updateFeatureGff``) and written to
      ``{args.out}.iesplus.feature_table.gff``.
    * mode starting with "del": removes the IESs from the reference and
      writes ``{args.out}.iesminus.fasta`` and ``{args.out}.iesminus.gff``.

    Parameters
    ----------
    args : argparse.Namespace-like
        Must provide ``ref``, ``ies``, ``mode`` and ``out``; in insert mode
        also ``iesfasta``, ``featuregff`` and ``addsuffix``.

    Raises
    ------
    ValueError
        If ``args.mode`` starts with neither "ins" nor "del". Previously
        this case fell through and failed with an UnboundLocalError when the
        output file names were first used.
    """
    logger = logging.getLogger("main.insert")
    run_boilerplate(logger)
    logger.info("Started BleTIES Insert")

    logger.info("Reading input files")
    refgenome = SeqIO.to_dict(SeqIO.parse(args.ref, "fasta"))
    gff = SharedFunctions.Gff()
    gff.file2gff(args.ies)

    if re.match(r"ins", args.mode):
        ies = SeqIO.to_dict(SeqIO.parse(args.iesfasta, "fasta"))
        logger.info(
            "Inserting IESs to MAC reference to make MAC+IES hybrid reference")
        ins = Insert.Insert(refgenome, gff, ies)
        newrefgenome, newgff = ins.reportInsertedReference()
        if args.featuregff:
            logger.info(f"Updating coords in feature table {args.featuregff}")
            # Slurp file into memory as list of lists, skipping comment lines
            # and lines that do not have exactly 9 tab-separated columns.
            # Each line is split only once.
            with open(args.featuregff, 'r') as fh:
                candidate_rows = (
                    line.rstrip().split("\t")
                    for line in fh
                    if line[0] != '#'
                )
                feats = [row for row in candidate_rows if len(row) == 9]
            # Update coordinates of feature table to account for inserted IESs
            newfeaturegff = ins.updateFeatureGff(feats, args.addsuffix)
            outfeaturegff = f"{args.out}.iesplus.feature_table.gff"
            logger.info(f"Writing new feature table to {outfeaturegff}")
            with open(outfeaturegff, 'w') as fh:
                fh.write("##gff-version 3\n")
                for line in newfeaturegff:
                    fh.write("\t".join([str(i) for i in line]))
                    fh.write("\n")
        outfasta = f"{args.out}.iesplus.fasta"
        outgff = f"{args.out}.iesplus.gff"
    elif re.match(r"del", args.mode):
        logger.info(
            "Removing IESs from MAC+IES reference to make MAC-IES reference")
        dels = Insert.Insert(refgenome, gff, None)
        newrefgenome, newgff = dels.reportDeletedReference()
        outfasta = f"{args.out}.iesminus.fasta"
        outgff = f"{args.out}.iesminus.gff"
    else:
        # Without this branch the output names below would be undefined
        raise ValueError(
            f"Unrecognized mode {args.mode!r}: "
            "expected a mode starting with 'ins' or 'del'")

    logger.info(f"Writing output files to {outfasta}, {outgff}")
    with open(outfasta, "w") as fh:
        SeqIO.write(list(newrefgenome.values()), fh, "fasta")
    newgff.gff2file(outgff, header=True)

    logger.info("Finished Insert")
def test_updateFeatureGff_tapointer(self):
    """Check coordinates and TA pointer attributes in the updated IES GFF.

    Runs the private Insert steps (_filterInserts, _updatePositionsInserts,
    _updatePointerPositionsInserts, _addSequences) and then compares the
    start/end values and ta_pointer_start/ta_pointer_end attributes of
    features 'ies6' and 'ies7' in ``_newgff`` as strings.
    """
    iesgff = SharedFunctions.Gff()
    iesgff.list2gff(TestInsert.gfflist)
    inserter = Insert.Insert(TestInsert.ref, iesgff, TestInsert.ies)
    inserter._filterInserts()
    inserter._updatePositionsInserts()
    inserter._updatePointerPositionsInserts()
    inserter._addSequences()

    updated = inserter._newgff
    # (accessor, feature ID, key, expected value as string)
    checks = (
        (updated.getValue, 'ies6', 'start', '15'),
        (updated.getValue, 'ies6', 'end', '19'),
        (updated.getAttr, 'ies6', 'ta_pointer_start', '16'),
        (updated.getAttr, 'ies6', 'ta_pointer_end', '20'),
        (updated.getValue, 'ies7', 'start', '11'),
        (updated.getValue, 'ies7', 'end', '15'),
        (updated.getAttr, 'ies7', 'ta_pointer_start', '11'),
        (updated.getAttr, 'ies7', 'ta_pointer_end', '15'),
    )
    for accessor, feature_id, key, expected in checks:
        self.assertEqual(str(accessor(feature_id, key)), expected)
def test_reportDeletedReference(self):
    """Check that deleting inserted IESs restores the original reference.

    IESs are first inserted with reportInsertedReference, then removed again
    with reportDeletedReference; the resulting contig sequences must equal
    those in TestInsert.ref, and both TA pointer attributes of 'ies6' must
    be '10'.
    """
    gff = SharedFunctions.Gff()
    gff.list2gff(TestInsert.gfflist)

    # Insert sequences into reference
    inserter = Insert.Insert(TestInsert.ref, gff, TestInsert.ies)
    hybrid_fasta, hybrid_gff = inserter.reportInsertedReference()

    # Take them out again
    remover = Insert.Insert(hybrid_fasta, hybrid_gff, None)
    restored_fasta, restored_gff = remover.reportDeletedReference()

    # Sequences must be identical to the starting reference
    for contig in ('ctg1', 'ctg2', 'ctg3'):
        self.assertEqual(
            str(restored_fasta[contig].seq),
            str(TestInsert.ref[contig].seq))

    for pointer_attr in ('ta_pointer_start', 'ta_pointer_end'):
        self.assertEqual(
            str(restored_gff.getAttr('ies6', pointer_attr)), '10')
type=int, help="Minimum length to show in histogram of IES lengths (detail)") parser.add_argument( "--hist_len_max", default=400, type=int, help="Maximum length to show in histogram of IES lengths (detail)") parser.add_argument( "--hist_style", default="facet", help="Style for histograms of IES lengths: 'facet', 'bar', or 'barstacked'" ) args = parser.parse_args() # Import GFF records gff_obj = SharedFunctions.Gff() gff_obj.file2gff(args.gff) # Get relevant fields out = [] for gff_id in gff_obj: row = {} row['id'] = gff_id if int(gff_obj.getValue(gff_id, "start")) == int(gff_obj.getValue(gff_id, "end")): row['type'] = "ins" elif int(gff_obj.getValue(gff_id, "start")) < int( gff_obj.getValue(gff_id, "end")): row['type'] = "del" else: raise Exception(f"Invalid GFF3, start > end: check {row['id']}")
def test_report_summary_string(self):
    """Check report_summary_string on a small list with a space separator.

    For the input below the expected summary lists each value with its
    number of occurrences as ``value*count`` (5 occurs three times, 2 twice,
    3 once).
    """
    observations = [3, 5, 5, 5, 2, 2]
    summary = SharedFunctions.report_summary_string(observations, " ")
    self.assertEqual(summary, "5*3 2*2 3*1")
def test_mean_of_number_list(self):
    """Check mean_of_number_list on the "_"-delimited string "12_13_14"."""
    delimited_numbers = "12_13_14"
    observed_mean = SharedFunctions.mean_of_number_list(delimited_numbers, "_")
    self.assertEqual(observed_mean, 13)
def _rounded_mean_or_na(values):
    """Return the mean of *values* rounded to 2 decimals, or "NA" if empty."""
    # stats.mean raises on empty input, so guard explicitly
    if len(values) > 0:
        return round(stats.mean(values), 2)
    return "NA"


def _miser_assess(bpid, ins_mm, non_mm, args, num_tests):
    """Build the MISER report row (without diagnosis) and the diagnosis.

    Parameters
    ----------
    bpid : str
        ID of the putative IES.
    ins_mm, non_mm : sequence of numbers
        Mismatch percentages of reads with and without the indel.
    args : argparse.Namespace-like
        Provides ``spurious_ies_test`` and ``spurious_ies_pvalue``.
    num_tests : int
        Number of GFF entries, used for Bonferroni correction.

    Returns
    -------
    tuple
        ``(row, diagnosis)`` where ``row`` is the list of the first nine
        output columns and ``diagnosis`` is one of "ok", "high_error",
        "misassembly", "paralog" or "low_coverage".
    """
    # Perform test of mismatch % only if more than 2 reads in each group
    # (otherwise stdev meaningless)
    if len(ins_mm) > 2 and len(non_mm) > 2:
        if args.spurious_ies_test == 'mann-whitney':
            # Mann-Whitney U test for whether mismatch % with indel of
            # interest is greater than without
            mwstat, mwpval = mannwhitneyu(
                ins_mm, non_mm, alternative='greater')
        else:
            # Welch's t-test (non-equal population variances)
            mwstat, mwpval = ttest_ind(ins_mm, non_mm, equal_var=False)
        # Means are needed several times below; compute them once
        ins_mean = stats.mean(ins_mm)
        non_mean = stats.mean(non_mm)
        row = [
            bpid,
            round(ins_mean, 2),
            round(non_mean, 2),
            round(stats.stdev(ins_mm), 2),
            round(stats.stdev(non_mm), 2),
            round(mwstat, 2),
            '%.2E' % mwpval,  # scientific notation
            len(ins_mm),
            len(non_mm),
        ]
        # Diagnosis: later checks override earlier ones
        diagnosis = "ok"
        if ins_mean > 5 or non_mean > 5:
            diagnosis = "high_error"
        if len(ins_mm) > len(non_mm):
            diagnosis = 'misassembly'
        pval_corr = args.spurious_ies_pvalue / num_tests  # Bonferroni correction
        if mwpval < pval_corr and ins_mean > non_mean:
            diagnosis = "paralog"
    elif len(non_mm) < 1:
        # No reads without the indel
        row = [
            bpid, _rounded_mean_or_na(ins_mm),
            "NA", "NA", "NA", "NA", "NA",
            len(ins_mm), len(non_mm),
        ]
        diagnosis = "misassembly"  # or scrambling
    else:
        # Too few reads in at least one group for a meaningful test
        row = [
            bpid, _rounded_mean_or_na(ins_mm), _rounded_mean_or_na(non_mm),
            "NA", "NA", "NA", "NA",
            len(ins_mm), len(non_mm),
        ]
        diagnosis = "low_coverage"
    return row, diagnosis


def miser(args):
    """Run BleTIES MISER: report putative IESs that may be spurious.

    For each entry of the IES GFF file ``args.gff``, the mismatch percentages
    of reads with and without the indel (from
    ``IesRecords.reportIndelReadMismatchPc``) are compared, and one
    tab-separated row with summary statistics and a diagnosis is written to
    ``args.out``. If ``args.split_gff`` is set, the input GFF entries are
    additionally written to one file per diagnosis, named
    ``{args.gff}.{diagnosis}.gff3``.

    Diagnosis rules:

    * More than 2 reads in both groups: a Mann-Whitney U test
      (``args.spurious_ies_test == 'mann-whitney'``) or otherwise Welch's
      t-test is run. The diagnosis starts as "ok", becomes "high_error" if
      either mean mismatch percentage exceeds 5, "misassembly" if there are
      more reads with the indel than without, and "paralog" if the p-value
      is below ``args.spurious_ies_pvalue`` divided by the number of GFF
      entries and the mean with indel is higher than without.
    * No reads without the indel: "misassembly".
    * Otherwise: "low_coverage".

    A mean that cannot be computed because a read group is empty is reported
    as "NA" instead of aborting the run.

    Parameters
    ----------
    args : argparse.Namespace-like
        Must provide ``bam``, ``ref``, ``gff``, ``out`` (object with a
        ``write`` method), ``min_ies_length``, ``spurious_ies_test``,
        ``spurious_ies_pvalue`` and ``split_gff``.
    """
    logger = logging.getLogger("main.miser")
    run_boilerplate(logger)
    logger.info("Started BleTIES MISER")

    # Read input files
    alnfile, refgenome = read_bam_ref(args.bam, args.ref)
    # Make sure the alignment filehandle is closed even if a step below fails
    try:
        # Initialize new IesRecords object to store putative IESs
        iesrecords = Milraa.IesRecords(alnfile, "bam", refgenome)

        # Read in IES GFF file produced by Milraa
        logger.info("Reading GFF file containing putative IESs")
        iesgff = SharedFunctions.Gff()
        iesgff.file2gff(args.gff)
        num_tests = len(iesgff)

        # Compare mean mismatch percentage of reads with and without putative
        # IES for each putative IES
        logger.info(""" Reporting possibly spurious IESs due to misassembly or mapped paralogs """)
        # dict to hold split GFF file keyed by diagnosis
        out_gff_split = defaultdict(list)
        args.out.write("\t".join([
            'ID', 'mean_mismatch_pc_with_indel', 'mean_mismatch_pc_no_indel',
            'stdev_with_indel', 'stdev_no_indel', 'statistic', 'p-value',
            'num_reads_with_indel', 'num_reads_no_indel', 'diagnosis'
        ]))
        args.out.write("\n")

        for bpid in iesgff:
            ins_mm, non_mm = iesrecords.reportIndelReadMismatchPc(
                iesgff.getValue(bpid, 'seqid'),
                int(iesgff.getValue(bpid, 'start')),
                int(iesgff.getValue(bpid, 'end')),
                int(iesgff.getAttr(bpid, 'IES_length')),
                int(args.min_ies_length))
            outarr, diagnosis = _miser_assess(
                bpid, ins_mm, non_mm, args, num_tests)
            outarr.append(diagnosis)
            # Split the input GFF entries into each diagnosis group
            out_gff_split[diagnosis].append(iesgff.getEntry(bpid))
            # Write output
            args.out.write("\t".join([str(i) for i in outarr]))
            args.out.write("\n")

        # Output split GFF files
        if args.split_gff:
            logger.info("Splitting input GFF entries into inferred categories")
            for diag, entries in out_gff_split.items():
                outfile = f"{args.gff}.{diag}.gff3"
                with open(outfile, "w") as fh_spl:
                    # Write gff version header and comment some info on this
                    # file
                    fh_spl.write("##gff-version 3\n")
                    fh_spl.write("# " + " ".join(sys.argv) + "\n")
                    fh_spl.write(
                        f"# BleTIES MISER putative IESs classified as {diag}\n")
                    for line in entries:
                        fh_spl.write("\t".join([str(i) for i in line]))
                        fh_spl.write("\n")
    finally:
        # Close alignment filehandle
        alnfile.close()

    logger.info("Finished MISER")