def exclude_and_sort(homeDir):
    """
    Input: Regions.03.txt
    Output: Regions.04.txt

    Filter the scored regions, rank them, and write the survivors.

    - Exclude recs with NumCodons < MinCodons (very short regions), because
      SvmScore was weird for these, presumably because of insufficient
      training vectors of that size.
    - Exclude recs with SvmAntisenseScore < 0.3. That would keep around 99% of
      CodingOverlap regions, and exclude around 94% of AntisenseOverlap
      regions (in hg38).
    - Keep only NoOverlap and Extension recs, since we only used the others
      for training.
    - Sort by decreasing SvmScore.
    - Add 1-based Rank field (written as the first column).
    """
    inFileName = get_input_fileName(homeDir, 3)
    err_msg('Reading %s.' % inFileName)
    inDFR = get_reader(inFileName)
    inRecs = list(inDFR)

    outFileName = get_output_fileName(homeDir, 4)
    outRecs = [
        rec for rec in inRecs
        if rec.RegType in [NoOverlap, Extension]
        and rec.NumCodons >= MinCodons
        and rec.SvmAntisenseScore >= 0.3
    ]
    # Negated key (rather than reverse=True) keeps the original tie order.
    outRecs.sort(key=lambda rec: -rec.SvmScore)

    # Same progress message the other pipeline steps emit before writing.
    err_msg('Writing %s.' % outFileName)
    outDFW = DFW(outFileName, [Rank] + inDFR.get_fieldNames())
    try:
        for rank, rec in enumerate(outRecs, 1):
            rec.Rank = rank
            outDFW.write_line(rec)
    finally:
        # Close the writer even if a record fails to serialize.
        outDFW.close()
def make_bed(homeDir):
    """
    Input: Regions.04.txt
    Output: PhyloCSFNovel.bed.

    Output PhyloCSF Candidate Coding Regions in bed format for browser tracks.
    One bed line is written per input record, in input order; the bed name is
    the 1-based position of the record in the input file, and the item color
    encodes strand and position (see the comments below).
    """
    err_msg('Writing .bed file.')
    inFileName = get_input_fileName(homeDir, 4)
    novelFileName = pjoin(homeDir, 'PhyloCSFNovel.bed')
    recs = list(get_reader(inFileName))

    # Color regions on +/- strands green/red to match PhyloCSF tracks, and dim
    # ones with higher ranks. Ranks matter more at the start so use a
    # logarithmicish scale. Put the middle of the range at the somewhat
    # arbitrary rank 5000, which is sort of where they aren't as useful.
    # UCSC says to limit to 8 colors to keep browser working well.
    numBins = 8
    midInd = 5000
    colorStrs = {'+': '0,175,0', '-': '200,0,0'}
    whiteRGB = (255, 255, 255)
    numRecs = len(recs)  # Slightly more than the largest index

    def scale_rank(recInd):
        # 0 -> 0, midInd -> 0.5, numRecs - 1 -> 1 - epsilon
        if numRecs == midInd:
            # 1 + a * numRecs would be 0 here, so the log in the denominator
            # is undefined; the limiting value of the ratio is 0.
            return 0.0
        if numRecs == 2 * midInd:
            # a would be 0 here (0 / 0 below); the limiting scale is linear.
            return float(recInd) / numRecs
        a = (numRecs - 2 * midInd) / midInd**2
        return math.log(1 + a * recInd) / math.log(1 + a * numRecs)

    def color_str(recInd, strand):
        binInd = int(numBins * scale_rank(recInd))
        # bin 0 -> colorStrs.
        # bin numBins -> white = (255,255,255) (never happens cause bin < numBins)
        # Build a real list: map() returns a non-indexable iterator on
        # Python 3, and the channels are indexed below.
        fullRGB = [int(channel) for channel in colorStrs[strand].split(',')]
        return ','.join(
            '%d' % (fullRGB[ii] * (1 - binInd / numBins) +
                    whiteRGB[ii] * binInd / numBins)
            for ii in range(3))

    with myopen(novelFileName, 'w') as novelFile:
        for recInd, rec in enumerate(recs):
            chrom = rec.Chrom
            bedLine = intervals_to_bed_line(chrom, [(rec.Start, rec.End)],
                                            rec.Strand,
                                            recInd + 1,
                                            rec.Start,
                                            rec.End,
                                            color=color_str(
                                                recInd, rec.Strand))
            print(bedLine, file=novelFile)
def add_phyloCSF_fields(homeDir):
    """
    Input: Regions.01.txt
           Regions.pcsf.out
    Output: Regions.02.txt

    Fill fields ScorePerCodon, NumCodons, Bls, AntiScorePerCodon, and ScoreDiff.

    All records are read into memory, updated in place by
    _fill_mle_pcsf_fields using the dictionary from _get_pcsf_dict, and
    written back out with the same field names as the input.
    """
    inFileName = get_input_fileName(homeDir, 1)
    err_msg('Reading %s.' % inFileName)
    inDFR = get_reader(inFileName)
    inRecs = list(inDFR)

    err_msg('Reading PhyloCSF scores.')
    pcsfDict = _get_pcsf_dict(homeDir)

    err_msg('Setting PhyloCSF-related fields.')
    _fill_mle_pcsf_fields(inRecs, pcsfDict)

    outFileName = get_output_fileName(homeDir, 2)
    err_msg('Writing %s.' % outFileName)
    outDFW = DFW(outFileName, inDFR.get_fieldNames())
    try:
        for rec in inRecs:
            outDFW.write_line(rec)
    finally:
        # Close the writer even if a record fails to serialize.
        outDFW.close()
def splice_example():
    """
    Score one donor and one acceptor splice site with the example
    coefficient files and check both scores against precomputed values.

    Paths are relative, so this expects ExampleFiles/ to be reachable from
    the current working directory.
    """
    tolerance = 1e-9
    predictDonor = DonorPredictor(
        'ExampleFiles/SpliceExample/Hsap.donor.mecoef')
    predictAcceptor = AcceptorPredictor(
        'ExampleFiles/SpliceExample/Hsap.acceptor.mecoef')

    # Predict donor score for TCA-GT-AAGG
    donorBases = 'TCAGTAAGG'
    beforeGT, afterGT = donorBases[:3], donorBases[5:9]
    score = predictDonor(beforeGT, afterGT)
    expectedDonorScore = 5.685921922927156
    assert abs(score - expectedDonorScore) < tolerance, score

    # Predict acceptor score for CAATGGTTAGTTTCAGTA-AG-GAA
    acceptorBases = 'CAATGGTTAGTTTCAGTAAGGAA'
    beforeAG, afterAG = acceptorBases[:18], acceptorBases[20:23]
    score = predictAcceptor(beforeAG, afterAG)
    expectedAcceptorScore = -3.033034001453567
    assert abs(score - expectedAcceptorScore) < tolerance, score

    err_msg('\nYay! Results match precomputed values.')
def svm_example():
    """
    Run the region-classification / SVM pipeline on the chr17 example data and
    check every result file against the precomputed files in
    ExampleFiles/Results.

    The working directory is switched to the directory containing this file
    for the duration of the run and is restored afterwards, even if a step or
    a comparison fails. Result files are removed only after they have been
    compared successfully.
    """
    # abspath guards against __file__ being a bare file name, whose dirname
    # would be the empty string.
    pushd(os.path.dirname(os.path.abspath(__file__)))
    try:
        homeDir = 'ExampleFiles/SVMexample'
        resultsDir = 'ExampleFiles/Results'
        regionsDir = homeDir  # Where previously computed PhyloCSF Regions bed files are

        # The following bed files have all GENCODE v29 coding and pseudogene
        # transcripts on chr17
        codingBedFileName = 'ExampleFiles/SVMexample/CodingTranscripts.GENCODEv29.chr17.bed'
        pseudoBedFileName = 'ExampleFiles/SVMexample/PseudogeneTranscripts.GENCODEv29.chr17.bed'

        # Classify regions based on overlap with annotations and write input
        # file for PhyloCSF
        classify_regions(homeDir, regionsDir, codingBedFileName,
                         pseudoBedFileName)

        # Simulate running PhyloCSF with strategy=mle by copying the resulting
        # file
        err_msg(
            'Simulating running PhyloCSF with strategy=mle by copying the resulting file'
        )
        cp('ExampleFiles/SVMexample/SimulatedPhyloCSFOutput/Regions.pcsf.out',
           homeDir)

        # Fill in PhyloCSF fields, train and run SVMs, prune and sort regions,
        # make bed file
        do_other_steps(
            homeDir,
            100)  # Real SVM used 10000 training vectors, but this is faster

        for fileName in [
                'Regions.01.txt', 'Regions.02.txt', 'Regions.04.txt',
                'Regions.pcsf.in', 'Regions.pcsf.out', 'PhyloCSFNovel.bed'
        ]:
            assert equal_files(pjoin(homeDir, fileName),
                               pjoin(resultsDir, fileName)
                               ), 'Files differ: %s' % fileName
            rm(pjoin(homeDir, fileName))

        # Some values in Regions.03.txt differ slightly on linux and mac, so
        # can't use equal_files.
        for lineNew, lineOld in izip_longest(
                get_reader('ExampleFiles/SVMexample/Regions.03.txt'),
                get_reader('ExampleFiles/Results/Regions.03.txt')):
            # izip_longest pads the shorter reader with None; report that as
            # a clear failure instead of an AttributeError on None.
            assert lineNew is not None and lineOld is not None, \
                'Regions.03.txt files have different numbers of records'
            for key in lineNew.keys():
                if isinstance(lineNew[key], float):
                    assert abs(lineNew[key] - lineOld[key]) < 1e-9, \
                        'Regions.03.txt differs in field %s' % key
                else:
                    assert lineNew[key] == lineOld[key], \
                        'Regions.03.txt differs in field %s' % key
        rm('ExampleFiles/SVMexample/Regions.03.txt')
        err_msg('\nYay! Result files match precomputed files.')
    finally:
        # Always restore the caller's working directory.
        popd()
def hmm_example():
    """
    Calculate HMM parameters and then run HMM for one reading frame of one
    scaffold.

    The resulting regions file is compared with the precomputed file in
    ExampleFiles/Results and removed if it matches. The working directory is
    switched to the directory containing this file for the duration of the
    run and is restored afterwards, even if a step or the comparison fails.
    """
    # abspath guards against __file__ being a bare file name, whose dirname
    # would be the empty string.
    pushd(os.path.dirname(os.path.abspath(__file__)))
    try:
        # This file has information about every annotated coding exon in
        # human GENCODE v29
        codingExonsFileName = 'ExampleFiles/EstimateHMMparamsExample/HumanCodingExonsV29.txt'
        humanGenomeLength = 3252208893  # Sum of chromosome/scaffold lengths in hg38 assembly

        err_msg('Estimating HMM parameters.')
        humanHMMparams = estimate_hmm_params_for_genome(
            codingExonsFileName, humanGenomeLength)
        print(
            'Here are the HMM parameters computed for the Human Genome from GENCODE v29.'
        )
        print(humanHMMparams)
        print()
        # Value recorded in the source for reference:
        # (0.0018352187276509814, 56.07319200374541,
        #  [0.6533594654727364, 0.20618155683835557, 0.140458977688905],
        #  [3299.5676063682404, 118314.79205686758, 314.61620563310396])

        print(
            'For comparison, here are the ones used for the paper (based on v16).'
        )
        print(HMMparams['Human'])
        print()
        # Value recorded in the source for reference:
        # (0.0018577729491349902, 56.83017481932195,
        #  [0.6417673350475721, 0.2150725662530122, 0.1431600986994177],
        #  [3376.1207312280044, 122153.58921094926, 328.20142626163494])

        # The result of running PhyloCSF on every codon in frame 1 on the
        # minus strand of chr4_GL000008v2_random is in
        # ExampleFiles/HMMexample/chr4_GL000008v2_random.Strand-.Frame1.fixed.out
        phyloCSFoutputDir = phyloCSFregionDir = 'ExampleFiles/HMMexample'

        err_msg('Computing PhyloCSF Regions.')
        create_PhyloCSF_Regions(humanHMMparams,
                                phyloCSFoutputDir,
                                phyloCSFregionDir,
                                chrom='chr4_GL000008v2_random',
                                strand='-',
                                frame=1)

        phyloCSFregionFileName = 'ExampleFiles/HMMexample/chr4_GL000008v2_random.Strand-.Frame1.coding.bed'
        assert equal_files(
            phyloCSFregionFileName,
            'ExampleFiles/Results/chr4_GL000008v2_random.Strand-.Frame1.coding.bed'
        ), 'Files differ: %s' % phyloCSFregionFileName
        err_msg('\nYay! Result file matches precomputed file.')
        rm(phyloCSFregionFileName)
    finally:
        # Always restore the caller's working directory.
        popd()
def add_svm_scores(homeDir, numTraining=None):
    """
    Input: Regions.02.txt
    Output: Regions.03.txt

    Add fields SvmAntisenseScore, SvmScore.

    numTraining is passed to _get_svm_training_recs; when it is None, 10000
    is used.

    Two classifiers are trained and applied to every input record:
    - SvmAntisenseScore: features NumCodons, ScorePerCodon, ScoreDiff;
      coding training recs are class 1 and antisense training recs class 0.
    - SvmScore: features NumCodons, ScorePerCodon, Bls, ScoreDiff;
      coding training recs are class 1 and the "other" training recs class 0.
    The value ClassSVM returns for each record is stored in the field.
    """
    if numTraining is None:
        numTraining = 10000

    inFileName = get_input_fileName(homeDir, 2)
    err_msg('Reading %s.' % inFileName)
    inDFR = get_reader(inFileName)
    inRecs = list(inDFR)

    err_msg('Getting training recs.')
    codingTrainingRecs, antiTrainingRecs, otherTrainingRecs = \
        _get_svm_training_recs(inRecs, numTraining)

    # (feature fields, output field, [class-1 recs, class-0 recs])
    for features, svmField, trainingRecs in [
        ([NumCodons, ScorePerCodon, ScoreDiff], SvmAntisenseScore,
         [codingTrainingRecs, antiTrainingRecs]),
        ([NumCodons, ScorePerCodon, Bls, ScoreDiff], SvmScore,
         [codingTrainingRecs, otherTrainingRecs]),
    ]:
        err_msg('Making training and test vectors for %s.' % svmField)
        trainingClasses = [1] * len(trainingRecs[0]) + [0] * len(
            trainingRecs[1])
        trainingVecs = [[rec[feature] for feature in features]
                        for rec in trainingRecs[0] + trainingRecs[1]]
        testVecs = [[rec[feature] for feature in features] for rec in inRecs]

        err_msg('Training and using SVM for %s.' % svmField)
        svmProbs = ClassSVM.ClassSVM(trainingVecs, trainingClasses)(testVecs)

        err_msg('Setting %s fields.' % svmField)
        for rec, prob in zip(inRecs, svmProbs):
            rec[svmField] = prob

    outFileName = get_output_fileName(homeDir, 3)
    err_msg('Writing %s.' % outFileName)
    outDFW = DFW(outFileName, inDFR.get_fieldNames())
    try:
        for rec in inRecs:
            outDFW.write_line(rec)
    finally:
        # Close the writer even if a record fails to serialize.
        outDFW.close()
def classify_regions(homeDir, regionsDir, codingBedFileName,
                     pseudoBedFileName):
    """
    Inputs:
        - PhyloCSF Regions bed files in regionsDir, produced by the PhyloCSF
          HMM. The file names should have the format:
              {CHROMOSOME}.Strand{STRAND}.Frame{FRAME}.coding.bed.gz
          where STRAND is + or - and FRAME is 0, 1, or 2.
          Other files in regionsDir are ignored.
        - Coding and pseudogene annotated transcripts (bed files).
    Output: Regions.01.txt, plus the PhyloCSF input file written by
            _write_phyloCSF_in.

    Find overlapping transcripts, classify regions, and create extension
    regions. Also create input file for running PhyloCSF using the
    strategy=mle option.

    Each input region becomes one record whose RegType is PseudoOverlap,
    CodingOverlap, AntisenseOverlap or NoOverlap (checked in that order).
    For every region that is not NoOverlap, the parts left after subtracting
    the overlapping transcripts become additional Extension records whose
    Parent is the name of the original region.

    Raises AssertionError if an input bed line has more than one interval or
    if a region's length is not a multiple of 3.
    """
    assure_dir(homeDir)
    outFileName = get_output_fileName(homeDir, 1)

    err_msg('Reading transcripts')
    # Use context managers so the annotation files are closed promptly.
    with myopen(codingBedFileName) as codingBedFile:
        codingTrs = [bed_line_to_tr(line) for line in codingBedFile]
    with myopen(pseudoBedFileName) as pseudoBedFile:
        pseudoTrs = [bed_line_to_tr(line) for line in pseudoBedFile]

    err_msg('Creating overlap checkers')
    codingOverlapChecker = OverlapChecker(codingTrs, onlyCDS=True)
    pseudoOverlapChecker = OverlapChecker(pseudoTrs, onlyCDS=False)

    outRecs = []

    def get_rec_name(rec):
        return '%s:%d-%d%s' % (rec.Chrom, rec.Start, rec.End, rec.Strand)

    def make_rec(chrom, interval, strand, parent, regType=None):
        # Build one region record. RegType is set first when it is already
        # known (extension records); otherwise the caller sets it later.
        rec = DictClass()
        if regType is not None:
            rec.RegType = regType
        rec.Chrom = chrom
        rec.Start = interval[0]
        rec.End = interval[1]
        rec.NumCodons = (rec.End - rec.Start + 1) // 3
        rec.Strand = strand
        rec.Name = get_rec_name(rec)
        rec.Parent = parent
        assert (rec.End - rec.Start + 1) % 3 == 0, rec.Name
        return rec

    err_msg('Processing input PhyloCSF Regions.')
    for bedFileName in ls(regionsDir):
        if not bedFileName.endswith('.coding.bed.gz'):
            continue
        with myopen(pjoin(regionsDir, bedFileName)) as bedFile:
            for line in bedFile:
                dummyName, chrom, bedIntervals, strand = \
                    bed_line_to_intervals(line)
                assert len(bedIntervals) == 1
                interval = bedIntervals[0]

                rec = make_rec(chrom, interval, strand, 'NA')

                # Find overlaps and set up RegType
                codingOverlapTrs = codingOverlapChecker.overlapping_trs(
                    rec.Chrom, '+-', interval)
                pseudoOverlapTrs = pseudoOverlapChecker.overlapping_trs(
                    rec.Chrom, '+-', interval)

                if len(pseudoOverlapTrs) > 0:
                    rec.RegType = PseudoOverlap
                elif any_same_frame(interval, rec.Strand, codingOverlapTrs):
                    rec.RegType = CodingOverlap
                elif any_anti_frame(interval, rec.Strand, codingOverlapTrs):
                    rec.RegType = AntisenseOverlap
                else:
                    rec.RegType = NoOverlap
                outRecs.append(rec)

                # Subtract from intervals overlaps with pseudogenes in any
                # frame or coding in same or antisense frame.
                # Make a new record for each resulting segment
                if rec.RegType != NoOverlap:
                    remaining = subtract_trs(interval, rec.Strand,
                                             pseudoOverlapTrs,
                                             codingOverlapTrs)
                    # Distinct loop name: do not shadow the region's own
                    # interval.
                    for subInterval in remaining:
                        outRecs.append(
                            make_rec(rec.Chrom, subInterval, rec.Strand,
                                     rec.Name, regType=Extension))

    outRecs.sort(key=lambda rec: (RegTypes.index(rec.RegType), rec.Chrom,
                                  rec.Strand, rec.Start, rec.End))

    fields = list(Fields)
    fields.remove(Rank)  # We'll insert it at the beginning later
    outDFW = DFW(outFileName, fields)
    try:
        for rec in outRecs:
            outDFW.write_line(rec)
    finally:
        # Close the writer even if a record fails to serialize.
        outDFW.close()

    err_msg('Writing PhyloCSF input file.')
    _write_phyloCSF_in(homeDir)