def test_sj_corr_off(self):
    """ Splice reference provided, but correction set to off.

        Expected behavior is to skip SJ ref initialization because it
        would be a waste of time. """

    # Initialize options etc.
    sam = "input_files/sams/perfectReferenceMatch_noIntrons.sam"
    tmp_dir = "scratch/prep_refs/sj_off/TC_tmp/"
    # Create the scratch directory in-process rather than via a shell
    # command, so a failure raises here instead of being silently ignored.
    os.makedirs(tmp_dir, exist_ok=True)

    options = dstruct.Struct()
    options.refGenome = "input_files/hg38_chr1.fa"
    options.tmp_dir = tmp_dir
    options.maxLenIndel = options.maxSJOffset = 5
    options.correctSJs = "false"
    options.variantFile = None
    options.sjAnnotFile = "input_files/test_junctions.txt"

    header, chroms, sam_chunks = TC.split_SAM(sam, 1)
    refs = TC.prep_refs(options, sam_chunks[0], header)

    # Check that variant dicts are empty
    assert refs.snps == refs.insertions == refs.deletions == {}

    # Check that SJ bedtools and annot lookup are empty. Identity checks
    # are used so that an object that merely compares equal to None
    # cannot satisfy the test.
    assert refs.donors is None
    assert refs.acceptors is None
    assert refs.sjAnnot == set()
def test_noncanonical(self):
    """ A transcript that is noncanonical and un-annotated before
        correction should be canonical and annotated afterwards. """

    sam = "input_files/sams/deletion_insertion_mismatch_nc.sam"
    sjFile = "input_files/GM12878_SJs_chr1.tab"
    tmp_dir = "scratch/test_jIjM/TC_tmp/"

    # Assemble the reference struct: genome first, then splice annotation
    refs = dstruct.Struct()
    refs.genome = Fasta("input_files/hg38_chr1.fa")
    splice_refs = TC.processSpliceAnnotation(sjFile, tmp_dir, {"chr1"})
    refs.donors, refs.acceptors, refs.sjAnnot = splice_refs

    # Only the first line of the SAM file is used
    with open(sam, 'r') as sam_file:
        first_record = sam_file.readline().strip()

    transcript, logInfo = TC.transcript_init(first_record, refs.genome,
                                             refs.sjAnnot)

    # State before correction
    assert transcript.allJnsAnnotated == False
    assert transcript.isCanonical == False

    # Now correct the junction and retest
    corrected, _ = TC.cleanNoncanonical(transcript, refs, 5, logInfo)

    assert corrected.allJnsAnnotated == True
    assert corrected.isCanonical == True
def test_variants(self):
    """ A variant file is provided.

        The SNP lookup should be populated, while the insertion and
        deletion lookups and the splice junction references stay empty. """

    # Initialize options etc.
    sam = "input_files/vcf_test/read_with_snps.sam"
    tmp_dir = "scratch/prep_refs/variant/TC_tmp/"
    # Create the scratch directory in-process rather than via a shell
    # command, so a failure raises here instead of being silently ignored.
    os.makedirs(tmp_dir, exist_ok=True)

    options = dstruct.Struct()
    options.refGenome = "input_files/hg38_chr11.fa"
    options.tmp_dir = tmp_dir
    options.maxLenIndel = options.maxSJOffset = 5
    options.correctSJs = "false"
    options.variantFile = "input_files/vcf_test/snps.vcf"
    options.sjAnnotFile = None

    header, chroms, sam_chunks = TC.split_SAM(sam, 1)
    refs = TC.prep_refs(options, sam_chunks[0], header)

    # Check that variant deletion and insertion dicts are empty,
    # and that at least one SNP was loaded
    assert len(refs.insertions) == 0
    assert len(refs.deletions) == 0
    assert len(refs.snps) > 0

    # Check that SJ bedtools and annot lookup are empty. Identity checks
    # are used so that an object that merely compares equal to None
    # cannot satisfy the test.
    assert refs.donors is None
    assert refs.acceptors is None
    assert refs.sjAnnot == set()
def test_sjs(self):
    """ Genome and splice junction reference provided.

        Variant structs should still be empty. """

    # Initialize options etc.
    sam = "input_files/sams/perfectReferenceMatch_noIntrons.sam"
    tmp_dir = "scratch/prep_refs/sjs/TC_tmp/"
    # Create the scratch directory in-process rather than via a shell
    # command, so a failure raises here instead of being silently ignored.
    os.makedirs(tmp_dir, exist_ok=True)

    options = dstruct.Struct()
    options.refGenome = "input_files/hg38_chr1.fa"
    options.tmp_dir = tmp_dir
    options.maxLenIndel = options.maxSJOffset = 5
    options.correctSJs = "true"
    options.variantFile = None
    options.sjAnnotFile = "input_files/test_junctions.txt"

    header, chroms, sam_chunks = TC.split_SAM(sam, 1)
    refs = TC.prep_refs(options, sam_chunks[0], header)

    # Check that variant dicts are empty
    assert refs.snps == refs.insertions == refs.deletions == {}

    # Check SJ bedtools and annot lookup
    assert refs.donors.count() == 3
    assert refs.acceptors.count() == 2  # Same acceptor appears in 2 jns
    assert len(refs.sjAnnot) == 3
def test_genome_only(self):
    """ Make sure that the prep_refs function works under the simplest
        possible option setting: no variants or SJs provided. """

    # Initialize options etc.
    sam = "input_files/sams/perfectReferenceMatch_noIntrons.sam"
    tmp_dir = "scratch/prep_refs/genome-only/TC_tmp/"
    # Create the scratch directory in-process rather than via a shell
    # command, so a failure raises here instead of being silently ignored.
    os.makedirs(tmp_dir, exist_ok=True)

    options = dstruct.Struct()
    options.refGenome = "input_files/hg38_chr1.fa"
    options.tmp_dir = tmp_dir
    options.maxLenIndel = options.maxSJOffset = 5
    options.correctSJs = "false"
    options.variantFile = None
    options.sjAnnotFile = None

    header, chroms, sam_chunks = TC.split_SAM(sam, 1)
    refs = TC.prep_refs(options, sam_chunks[0], header)

    # Check that variant dicts are empty
    assert refs.snps == refs.insertions == refs.deletions == {}

    # Check that SJ bedtools and annot lookup are empty. Identity checks
    # are used so that an object that merely compares equal to None
    # cannot satisfy the test.
    assert refs.donors is None
    assert refs.acceptors is None
    assert refs.sjAnnot == set()
def test_crash_dmel(self):
    """ This is a Drosophila junction that borders a small match preceded
        by a 7 bp deletion. It is also supposed to crash correction, but
        did not in TC v2.0.1.

        The test asserts that the junction stays noncanonical, that the
        transcript's CIGAR, SEQ and MD are unchanged, and that a single
        'Uncorrected' TE log entry is produced. """

    # Process references
    sjFile = "input_files/drosophila_example/chr3R_SJs.tsv"
    tmp_dir = "scratch/dmel/TC"
    # Create the scratch directory in-process rather than via a shell
    # command, so a failure raises here instead of being silently ignored.
    os.makedirs(tmp_dir, exist_ok=True)

    refs = dstruct.Struct()
    chroms = {"chr3R"}
    refs.donors, refs.acceptors, refs.sjAnnot = TC.processSpliceAnnotation(
        sjFile, tmp_dir, chroms)
    refs.genome = Fasta("input_files/drosophila_example/chr3R.fa")

    sam = "input_files/drosophila_example/no_SJ_corr.sam"
    with open(sam, 'r') as f:
        # Skip '@' header lines; after the loop, sam_line holds the fields
        # of the last line read from the file.
        for sam_line in f:
            if not sam_line.startswith("@"):
                sam_line = sam_line.strip().split('\t')

    # Init transcript object
    transcript = t2.Transcript(sam_line, refs.genome, refs.sjAnnot)
    maxDist = 5
    logInfo = TC.init_log_info(sam_line)

    # Snapshot the pre-correction state for comparison afterwards
    orig_CIGAR = transcript.CIGAR
    orig_seq = transcript.SEQ
    orig_MD = transcript.MD
    expected_TE = "\t".join([
        "m160713_133433_42182_c101000162550000001823232709161620_s1_p0/121139/11291_13013",
        "chr3R_14890436_14890699",
        "NC_SJ_boundary",
        "5",
        "Uncorrected",
        "Other",
    ]) + "\n"

    assert transcript.isCanonical == False

    # Attempt to correct the splice junction
    new_transcript, TE_entries = TC.cleanNoncanonical(
        transcript, refs, maxDist, logInfo)

    print(TE_entries)
    assert new_transcript.isCanonical == False
    assert TE_entries == expected_TE
    assert new_transcript.MD == orig_MD
    assert logInfo.corrected_NC_SJs == 0
    assert logInfo.uncorrected_NC_SJs == 1
    assert new_transcript.CIGAR == orig_CIGAR
    assert new_transcript.SEQ == orig_seq
def test_crash_correction(self):
    """ This is a case that is supposed to crash the NCSJ correction
        process, resulting in no correction. This is because the mapping
        has created a 7-bp micro-exon with a canonical but likely
        incorrect junction to its left, and a non-canonical junction on
        its right. Post-correction, we end up with two introns next to
        each other with a zero-length exon, which is not valid. """

    # Process references
    sjFile = "input_files/chr11_sjs.txt"
    tmp_dir = "scratch/test/TC_tmp/"
    # Create the scratch directory in-process rather than via a shell
    # command, so a failure raises here instead of being silently ignored.
    os.makedirs(tmp_dir, exist_ok=True)

    refs = dstruct.Struct()
    chroms = {"chr11"}
    refs.donors, refs.acceptors, refs.sjAnnot = TC.processSpliceAnnotation(
        sjFile, tmp_dir, chroms)
    refs.genome = Fasta("input_files/hg38_chr11.fa")

    # Only the first line of the SAM file is used
    sam = "input_files/sams/microexon.sam"
    with open(sam, 'r') as f:
        sam_line = f.readline().strip().split('\t')

    # Init transcript object
    transcript = t2.Transcript(sam_line, refs.genome, refs.sjAnnot)
    maxDist = 5
    logInfo = TC.init_log_info(sam_line)

    assert transcript.isCanonical == False

    # Attempt to correct the splice junction
    transcript, TE_entries = TC.cleanNoncanonical(transcript, refs,
                                                  maxDist, logInfo)

    # The CIGAR is expected to come back exactly as it went in
    orig_CIGAR = ("1211M5612N57M464N30M2717N120M1097N23M2632N146M1225N"
                  "140M4770N72M5051N132M1513N87M567N142M3780N100M2160N"
                  "59M864N31M9891N69M1711N7M1341N47M13S")

    assert transcript.isCanonical == False
    assert transcript.MD == "MD:Z:2473"
    assert logInfo.corrected_NC_SJs == 0
    assert logInfo.uncorrected_NC_SJs == 1
    assert transcript.CIGAR == orig_CIGAR
def test_correct_ncsj(self):
    """ Toy transcript with sequence A|GAA, where the splice motif is
        noncanonical but located 2 bp from a canonical splice donor.
        chr1: 23,071,357 - 23,072,126 """

    # Process references
    sjFile = "input_files/test_junctions.txt"
    tmp_dir = "scratch/test_ncsj/TC_tmp/"
    # Create the scratch directory in-process rather than via a shell
    # command, so a failure raises here instead of being silently ignored.
    os.makedirs(tmp_dir, exist_ok=True)

    refs = dstruct.Struct()
    chroms = {"chr1"}
    refs.donors, refs.acceptors, refs.sjAnnot = TC.processSpliceAnnotation(
        sjFile, tmp_dir, chroms)
    refs.genome = Fasta("input_files/hg38_chr1.fa")

    # Init transcript object
    sam_fields = [
        "test_read", "0", "chr1", "23071357", "255", "1M766N3M", "*",
        "0", "0", "AGAA", "*", "NM:i:0", "MD:Z:4",
    ]
    transcript = t2.Transcript(sam_fields, refs.genome, refs.sjAnnot)
    jnNumber = 0
    maxDist = 5
    logInfo = TC.init_log_info(sam_fields)

    assert transcript.isCanonical == False

    # Attempt to correct the splice junction
    transcript, TE_entries = TC.cleanNoncanonical(transcript, refs,
                                                  maxDist, logInfo)

    # After correction the junction is canonical, and the sequence, CIGAR
    # and MD tag reflect the 2 bp shift
    assert transcript.isCanonical == True
    assert transcript.spliceJunctions[jnNumber].isCanonical == True
    assert transcript.SEQ == "AAGGAA"
    assert transcript.CIGAR == "3M764N3M"
    assert transcript.MD == "MD:Z:6"
    assert logInfo.corrected_NC_SJs == 1
def make_novelty_type_struct(database, datasets):
    """ Create a data structure where it is possible to look up whether a
        gene or transcript belongs to a particular category of novelty.

        Args:
            database: Passed to sqlite3.connect() to open the database.
            datasets: Forwarded unchanged to every qutils fetch function
                (presumably the datasets to restrict the queries to;
                verify against qutils).

        Returns:
            A dstruct.Struct with one set-valued attribute per novelty
            category (known_genes, antisense_genes, intergenic_genes,
            known_transcripts, ISM_transcripts, ISM_prefix, ISM_suffix,
            NIC_transcripts, NNC_transcripts, antisense_transcripts,
            intergenic_transcripts, genomic_transcripts). """

    conn = sqlite3.connect(database)
    try:
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()

        novelty_type = dstruct.Struct()

        # Gene-level categories
        novelty_type.known_genes = set(
            qutils.fetch_all_known_genes_detected(cursor, datasets))
        novelty_type.antisense_genes = set(
            qutils.fetch_antisense_genes(cursor, datasets))
        novelty_type.intergenic_genes = set(
            qutils.fetch_intergenic_novel_genes(cursor, datasets))

        # Transcript-level categories
        novelty_type.known_transcripts = set(
            qutils.fetch_all_known_transcripts_detected(cursor, datasets))
        novelty_type.ISM_transcripts = set(
            qutils.fetch_all_ISM_transcripts(cursor, datasets))
        novelty_type.ISM_prefix = set(
            qutils.fetch_prefix_ISM_transcripts(cursor, datasets))
        novelty_type.ISM_suffix = set(
            qutils.fetch_suffix_ISM_transcripts(cursor, datasets))
        novelty_type.NIC_transcripts = set(
            qutils.fetch_NIC_transcripts(cursor, datasets))
        novelty_type.NNC_transcripts = set(
            qutils.fetch_NNC_transcripts(cursor, datasets))
        novelty_type.antisense_transcripts = set(
            qutils.fetch_antisense_transcripts(cursor, datasets))
        novelty_type.intergenic_transcripts = set(
            qutils.fetch_intergenic_transcripts(cursor, datasets))
        novelty_type.genomic_transcripts = set(
            qutils.fetch_genomic_transcripts(cursor, datasets))
    finally:
        # Release the connection even if one of the queries raises
        conn.close()

    return novelty_type
def test_DIM_nc(self):
    """ Correct a transcript containing a deletion, insertion, mismatch,
        and noncanonical splice junction.

        Runs TC.batch_correct on the first line of the input SAM file,
        then checks the corrected SAM record and the first line of the
        log file against expected values. """

    # Initialize options etc.
    sam = "input_files/sams/deletion_insertion_mismatch_nc.sam"
    genome = Fasta("input_files/hg38_chr1.fa")
    sjFile = "input_files/GM12878_SJs_chr1.tab"
    tmp_dir = "scratch/example/TC_tmp/"
    # Create the scratch directory in-process rather than via a shell
    # command, so a failure raises here instead of being silently ignored.
    os.makedirs(tmp_dir, exist_ok=True)

    chroms = {"chr1"}
    donors, acceptors, sjAnnot = TC.processSpliceAnnotation(
        sjFile, tmp_dir, chroms)

    refs = dstruct.Struct()
    refs.sjAnnot = sjAnnot
    refs.genome = genome
    refs.donors = donors
    refs.acceptors = acceptors
    refs.snps = {}
    refs.deletions = {}
    refs.insertions = {}

    options = dstruct.Struct()
    options.maxLenIndel = 5
    options.maxSJOffset = 5
    options.correctMismatches = "true"
    options.correctIndels = "true"
    options.correctSJs = "true"
    options.primaryOnly = True
    options.canonOnly = False

    outfiles = dstruct.Struct()
    outfiles.TElog = open(tmp_dir + "DIM_nc_clean.TE.log", 'w')
    outfiles.sam = open(tmp_dir + "DIM_nc_clean.sam", 'w')
    outfiles.fasta = open(tmp_dir + "DIM_nc_clean.fasta", 'w')
    outfiles.log = open(tmp_dir + "DIM_nc_clean.log", 'w')

    try:
        # Correct the transcript (only the first SAM line is used)
        with open(sam, 'r') as f:
            transcripts = [f.readline().strip()]
        TC.batch_correct(transcripts, options, refs, outfiles)
    finally:
        # Close the output files even if correction raises, so the
        # handles are not leaked and buffered output is flushed.
        for handle in outfiles.values():
            handle.close()

    # Expected transcript attributes post-correction
    correct_CIGAR = ("12M1134N126M163N202M866N74M924N191M1777N127M2109N"
                     "157M88N159M932N633M274N117M7696N170M1215N629M938N"
                     "29M428N133M254N166M390N212M253N89M163N483M")
    correct_MD = "MD:Z:3709"
    correct_NM = "NM:i:0"
    correct_jI = (
        "jI:B:i,150941429,150942562,150942689,150942851,150943054,"
        "150943919,150943994,150944917,150945109,150946885,150947013,"
        "150949121,150949279,150949366,150949526,150950457,150951091,"
        "150951364,150951482,150959177,150959348,150960562,150961192,"
        "150962129,150962159,150962586,150962720,150962973,150963140,"
        "150963529,150963742,150963994,150964084,150964246")
    correct_jM = "jM:B:c,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21"

    # Read in transcript from outfile
    with open(tmp_dir + "DIM_nc_clean.sam", 'r') as f:
        sam_line = f.readline().strip().split('\t')
    transcript = t2.Transcript(sam_line, genome, sjAnnot)

    assert transcript.CIGAR == correct_CIGAR
    assert transcript.MD == correct_MD
    assert transcript.NM == correct_NM
    assert transcript.jI == correct_jI
    assert transcript.jM == correct_jM

    # Read logs and make sure they are OK
    expected_log = "\t".join([
        "c34150/f1p1/3707", "primary", "2", "0", "0", "1", "0", "0",
        "2", "0", "1", "0",
    ])

    with open(tmp_dir + "DIM_nc_clean.log", 'r') as f:
        log = f.readline().strip()
    assert log == expected_log