def exclude_and_sort(homeDir):
    """
    Input:  Regions.03.txt
    Output: Regions.04.txt
    Filter the scored regions, rank the survivors, and write them out.
    A rec is kept only if all of the following hold:
        - RegType is NoOverlap or Extension (the other types were only used for
          training);
        - NumCodons >= MinCodons (SvmScore was weird for recs with NumCodons <= 8,
          presumably because of insufficient training vectors of that size);
        - SvmAntisenseScore >= 0.3 (that would keep around 99% of CodingOverlap
          regions and exclude around 94% of AntisenseOverlap regions in hg38).
    Kept recs are ordered by decreasing SvmScore and each gets a 1-based Rank
        field, which is listed first among the output fields.
    """
    srcName = get_input_fileName(homeDir, 3)
    err_msg('Reading %s.' % srcName)
    reader = get_reader(srcName)
    allRecs = list(reader)
    dstName = get_output_fileName(homeDir, 4)

    keptTypes = [NoOverlap, Extension]

    def is_candidate(rec):
        # Tests run in the same order as the original combined condition.
        if rec.RegType not in keptTypes:
            return False
        if not rec.NumCodons >= MinCodons:
            return False
        return rec.SvmAntisenseScore >= 0.3

    # Negated key gives descending order; the sort is stable, so ties keep
    # their input order.
    ranked = sorted((rec for rec in allRecs if is_candidate(rec)),
                    key=lambda rec: -rec.SvmScore)

    writer = DFW(dstName, [Rank] + reader.get_fieldNames())
    for rank, rec in enumerate(ranked, 1):
        rec.Rank = rank
        writer.write_line(rec)
    writer.close()
def make_bed(homeDir):
    """
    Input: Regions.04.txt
    Output: PhyloCSFNovel.bed.
    Output PhyloCSF Candidate Coding Regions in bed format for browser tracks.

    Every input rec becomes one single-interval bed line built by
    intervals_to_bed_line. The rec's 1-based position in the input file is passed
    as the fourth argument (presumably the bed name field -- confirm against
    intervals_to_bed_line), and the color depends on the rec's strand and position.
    """
    err_msg('Writing .bed file.')
    inFileName = get_input_fileName(homeDir, 4)
    novelFileName = pjoin(homeDir, 'PhyloCSFNovel.bed')
    recs = list(get_reader(inFileName))

    # Color regions on +/- strands green/red to match PhyloCSF tracks, and dim ones
    #    with higher ranks. Ranks matter more at the start so use a logarithmicish
    #    scale. Put the middle of the range at the somewhat arbitrary rank 5000,
    #    which is sort of where they aren't as useful.
    # UCSC says to limit to 8 colors to keep browser working well.
    numBins = 8
    midInd = 5000

    # Undimmed (bin 0) color for each strand, and the color we fade towards.
    fullRGBs = {'+': (0, 175, 0), '-': (200, 0, 0)}
    whiteRGB = (255, 255, 255)

    numRecs = len(recs)  # Slightly more than the largest index

    # scale(x) = log(1 + a*x) / log(1 + a*numRecs) with a chosen so that
    #     0 -> 0, midInd -> 0.5, numRecs - 1 -> 1 - epsilon.
    # float() keeps this a true division even without "from __future__ import division".
    logCoef = float(numRecs - 2 * midInd) / midInd**2
    logArgAtEnd = 1 + logCoef * numRecs
    # The log scale is undefined when numRecs == 2 * midInd (logCoef == 0, giving
    #     0 / 0) and when numRecs == midInd (log(0)). Use a linear scale then, which
    #     is the limit of the log scale as logCoef -> 0.
    useLogScale = logArgAtEnd > 0 and logArgAtEnd != 1

    def scale_rank(recInd):
        # Map a 0-based rec index into [0, 1).
        if not useLogScale:
            return float(recInd) / numRecs
        return math.log(1 + logCoef * recInd) / math.log(logArgAtEnd)

    def color_str(recInd, strand):
        # bin 0 -> full strand color.
        # bin numBins -> white (never happens because bin < numBins; the min()
        #     only guards against floating point rounding up to numBins).
        binInd = min(int(numBins * scale_rank(recInd)), numBins - 1)
        whiteFrac = float(binInd) / numBins
        # '%d' truncates the blended channel value to an integer.
        return ','.join(
            '%d' % (full * (1 - whiteFrac) + white * whiteFrac)
            for full, white in zip(fullRGBs[strand], whiteRGB))

    with myopen(novelFileName, 'w') as novelFile:
        for recInd, rec in enumerate(recs):
            bedLine = intervals_to_bed_line(rec.Chrom,
                                            [(rec.Start, rec.End)],
                                            rec.Strand,
                                            recInd + 1,
                                            rec.Start,
                                            rec.End,
                                            color=color_str(
                                                recInd, rec.Strand))
            print(bedLine, file=novelFile)
def add_phyloCSF_fields(homeDir):
    """
    Input:  Regions.01.txt
            Regions.pcsf.out
    Output: Regions.02.txt
    Read every region rec, fill in the PhyloCSF-derived fields (ScorePerCodon,
        NumCodons, Bls, AntiScorePerCodon, and ScoreDiff), and write the recs
        back out with the same field list as the input.
    """
    srcName = get_input_fileName(homeDir, 1)
    err_msg('Reading %s.' % srcName)
    reader = get_reader(srcName)
    recs = list(reader)

    err_msg('Reading PhyloCSF scores.')
    pcsfScores = _get_pcsf_dict(homeDir)

    # The helper updates the recs in place; its return value is not used here.
    err_msg('Setting PhyloCSF-related fields.')
    _fill_mle_pcsf_fields(recs, pcsfScores)

    dstName = get_output_fileName(homeDir, 2)
    err_msg('Writing %s.' % dstName)
    writer = DFW(dstName, reader.get_fieldNames())
    for rec in recs:
        writer.write_line(rec)
    writer.close()
# Example no. 4
# 0
def splice_example():
    """
    Score one donor site and one acceptor site with the human coefficient files
    and check both scores against precomputed values (to within 1e-9).
    """
    tolerance = 1e-9
    scoreDonor = DonorPredictor(
        'ExampleFiles/SpliceExample/Hsap.donor.mecoef')
    scoreAcceptor = AcceptorPredictor(
        'ExampleFiles/SpliceExample/Hsap.acceptor.mecoef')

    # Donor site TCA-GT-AAGG: pass the 3 bases before and the 4 bases after the GT.
    donorSeq = 'TCAGTAAGG'
    donorBefore, donorAfter = donorSeq[:3], donorSeq[5:]
    expectedDonor = 5.685921922927156
    donorScore = scoreDonor(donorBefore, donorAfter)
    assert abs(donorScore - expectedDonor) < tolerance, donorScore

    # Acceptor site CAATGGTTAGTTTCAGTA-AG-GAA: pass the 18 bases before and the
    # 3 bases after the AG.
    acceptorSeq = 'CAATGGTTAGTTTCAGTAAGGAA'
    acceptorBefore, acceptorAfter = acceptorSeq[:18], acceptorSeq[20:]
    expectedAcceptor = -3.033034001453567
    acceptorScore = scoreAcceptor(acceptorBefore, acceptorAfter)
    assert abs(acceptorScore - expectedAcceptor) < tolerance, acceptorScore

    err_msg('\nYay! Results match precomputed values.')
# Example no. 5
# 0
def svm_example():
    """
    Run the region-classification / SVM pipeline on the chr17 example data and
    check every generated file against the precomputed copies in
    ExampleFiles/Results. Generated files are removed once they have been checked.

    Runs with the directory of this source file as the working directory; the
    previous working directory is restored even if a step or a check fails.
    Raises AssertionError if any generated file differs from the expected one.
    """
    pushd(os.path.dirname(__file__))
    try:
        homeDir = 'ExampleFiles/SVMexample'
        resultsDir = 'ExampleFiles/Results'
        regionsDir = homeDir  # Where previously computed PhyloCSF Regions bed files are

        # The following bed files have all GENCODE v29 coding and pseudogene transcripts on chr17
        codingBedFileName = 'ExampleFiles/SVMexample/CodingTranscripts.GENCODEv29.chr17.bed'
        pseudoBedFileName = 'ExampleFiles/SVMexample/PseudogeneTranscripts.GENCODEv29.chr17.bed'

        # Classify regions based on overlap with annotations and write input file for PhyloCSF
        classify_regions(homeDir, regionsDir, codingBedFileName,
                         pseudoBedFileName)

        # Simulate running PhyloCSF with strategy=mle by copying the resulting file
        err_msg(
            'Simulating running PhyloCSF with strategy=mle by copying the resulting file'
        )
        cp('ExampleFiles/SVMexample/SimulatedPhyloCSFOutput/Regions.pcsf.out',
           homeDir)

        # Fill in PhyloCSF fields, train and run SVMs, prune and sort regions, make bed file
        do_other_steps(
            homeDir,
            100)  # Real SVM used 10000 training vectors, but this is faster

        for fileName in [
                'Regions.01.txt', 'Regions.02.txt', 'Regions.04.txt',
                'Regions.pcsf.in', 'Regions.pcsf.out', 'PhyloCSFNovel.bed'
        ]:
            assert equal_files(pjoin(homeDir, fileName),
                               pjoin(resultsDir, fileName)), \
                'Files differ: %s' % fileName
            rm(pjoin(homeDir, fileName))

        # Some values in Regions.03.txt differ slightly on linux and mac, so can't
        # use equal_files. Compare field by field, allowing a tolerance on floats.
        scoredName = 'Regions.03.txt'
        differMsg = 'Files differ: %s' % scoredName
        for lineNew, lineOld in izip_longest(
                get_reader(pjoin(homeDir, scoredName)),
                get_reader(pjoin(resultsDir, scoredName))):
            # izip_longest pads the shorter input with None, so a None on either
            # side means the two files have different numbers of records.
            assert lineNew is not None and lineOld is not None, differMsg
            for key in lineNew.keys():
                if isinstance(lineNew[key], float):
                    assert abs(lineNew[key] - lineOld[key]) < 1e-9, differMsg
                else:
                    assert lineNew[key] == lineOld[key], differMsg
        rm(pjoin(homeDir, scoredName))

        err_msg('\nYay! Result files match precomputed files.')
    finally:
        popd()
# Example no. 6
# 0
def hmm_example():
    """
    Calculate HMM parameters and then run HMM for one reading frame of one scaffold.

    Prints the parameters estimated from the GENCODE v29 coding exons next to the
    ones stored in HMMparams['Human'], creates the PhyloCSF Regions bed file for
    frame 1 on the minus strand of chr4_GL000008v2_random, checks it against the
    precomputed copy in ExampleFiles/Results, and then removes it.

    Runs with the directory of this source file as the working directory; the
    previous working directory is restored even if a step or the check fails.
    Raises AssertionError if the generated bed file differs from the expected one.
    """
    pushd(os.path.dirname(__file__))
    try:
        # This file has information about every annotated coding exon in human GENCODE v29
        codingExonsFileName = 'ExampleFiles/EstimateHMMparamsExample/HumanCodingExonsV29.txt'

        humanGenomeLength = 3252208893  # Sum of chromosome/scaffold lengths in hg38 assembly
        err_msg('Estimating HMM parameters.')
        humanHMMparams = estimate_hmm_params_for_genome(codingExonsFileName,
                                                        humanGenomeLength)

        print(
            'Here are the HMM parameters computed for the Human Genome from GENCODE v29.'
        )
        print(humanHMMparams)
        print()
        # Expected output:
        # (0.0018352187276509814, 56.07319200374541,
        #     [0.6533594654727364, 0.20618155683835557, 0.140458977688905],
        #     [3299.5676063682404, 118314.79205686758, 314.61620563310396])

        print(
            'For comparison, here are the ones used for the paper (based on v16).')
        print(HMMparams['Human'])
        print()
        # Expected output:
        # (0.0018577729491349902, 56.83017481932195,
        #     [0.6417673350475721, 0.2150725662530122, 0.1431600986994177],
        #     [3376.1207312280044, 122153.58921094926, 328.20142626163494])

        # The result of running PhyloCSF on every codon in frame 1 on the minus
        # strand of chr4_GL000008v2_random is in
        # ExampleFiles/HMMexample/chr4_GL000008v2_random.Strand-.Frame1.fixed.out

        phyloCSFoutputDir = phyloCSFregionDir = 'ExampleFiles/HMMexample'

        err_msg('Computing PhyloCSF Regions.')
        create_PhyloCSF_Regions(humanHMMparams,
                                phyloCSFoutputDir,
                                phyloCSFregionDir,
                                chrom='chr4_GL000008v2_random',
                                strand='-',
                                frame=1)

        phyloCSFregionFileName = 'ExampleFiles/HMMexample/chr4_GL000008v2_random.Strand-.Frame1.coding.bed'
        assert equal_files(
            phyloCSFregionFileName,
            'ExampleFiles/Results/chr4_GL000008v2_random.Strand-.Frame1.coding.bed'
        )
        err_msg('\nYay! Result file matches precomputed file.')
        rm(phyloCSFregionFileName)
    finally:
        popd()
def add_svm_scores(homeDir, numTraining=None):
    """
    Input:  Regions.02.txt
    Output: Regions.03.txt
    Add fields SvmAntisenseScore, SvmScore

    Two SVMs are trained and applied to every input rec:
        - SvmAntisenseScore: coding vs antisense training recs, using features
          NumCodons, ScorePerCodon, ScoreDiff.
        - SvmScore: coding vs other training recs, using features NumCodons,
          ScorePerCodon, Bls, ScoreDiff.
    numTraining is passed on to _get_svm_training_recs; None means 10000.
    """
    if numTraining is None:
        numTraining = 10000
    inFileName = get_input_fileName(homeDir, 2)
    err_msg('Reading %s.' % inFileName)
    inDFR = get_reader(inFileName)
    inRecs = list(inDFR)

    err_msg('Getting training recs.')
    codingTrainingRecs, antiTrainingRecs, otherTrainingRecs = \
        _get_svm_training_recs(inRecs, numTraining)

    # Each entry: (feature fields, field to fill, [class-1 recs, class-0 recs]).
    for features, svmField, trainingRecs in [
        ([NumCodons, ScorePerCodon, ScoreDiff], SvmAntisenseScore,
         [codingTrainingRecs, antiTrainingRecs]),
        ([NumCodons, ScorePerCodon, Bls,
          ScoreDiff], SvmScore, [codingTrainingRecs, otherTrainingRecs]),
    ]:
        err_msg('Making training and test vectors for %s.' % svmField)
        # Labels line up with the concatenated training recs below: 1 for the
        # first group (coding), 0 for the second.
        trainingClasses = [1] * len(trainingRecs[0]) + [0] * len(
            trainingRecs[1])
        trainingVecs = [[rec[feature] for feature in features]
                        for rec in trainingRecs[0] + trainingRecs[1]]
        # Every input rec is scored, including the ones used for training.
        testVecs = [[rec[feature] for feature in features] for rec in inRecs]

        err_msg('Training and using SVM for %s.' % svmField)
        # The trained object is called on the test vectors and yields one value
        # per test vector (presumably a class-1 probability -- confirm in ClassSVM).
        svmProbs = ClassSVM.ClassSVM(trainingVecs, trainingClasses)(testVecs)

        err_msg('Setting %s fields.' % svmField)
        for rec, prob in zip(inRecs, svmProbs):
            rec[svmField] = prob

    outFileName = get_output_fileName(homeDir, 3)
    err_msg('Writing %s.' % outFileName)
    outDFW = DFW(outFileName, inDFR.get_fieldNames())
    for rec in inRecs:
        outDFW.write_line(rec)
    outDFW.close()
def classify_regions(homeDir, regionsDir, codingBedFileName,
                     pseudoBedFileName):
    """
    Inputs: - PhyloCSF Regions bed files in regionsDir, produced by the PhyloCSF HMM.
                  The file names should have the format:
                      {CHROMOSOME}.Strand{STRAND}.Frame{FRAME}.coding.bed.gz
                  where STRAND is + or - and FRAME is 0, 1, or 2.
                  Files in regionsDir not ending with .coding.bed.gz are ignored.
            - Coding and pseudogene annotated transcripts.
    Output: Regions.01.txt
    Find overlapping transcripts, classify regions, and create extension regions.
    Also create input file for running PhyloCSF using the strategy=mle option.

    Each input region gets one rec whose RegType is, in order of precedence,
        PseudoOverlap, CodingOverlap, AntisenseOverlap, or NoOverlap.
    Each region that is not NoOverlap also gets one Extension rec for every
        segment that remains after subtracting the overlapping transcripts; the
        Parent field of such a rec is the Name of the region it came from.
    Recs are written sorted by RegType (in RegTypes order), Chrom, Strand, Start,
        and End.
    """
    assure_dir(homeDir)
    outFileName = get_output_fileName(homeDir, 1)
    err_msg('Reading transcripts')
    with myopen(codingBedFileName) as codingBedFile:
        codingTrs = [bed_line_to_tr(line) for line in codingBedFile]
    with myopen(pseudoBedFileName) as pseudoBedFile:
        pseudoTrs = [bed_line_to_tr(line) for line in pseudoBedFile]

    err_msg('Creating overlap checkers')
    codingOverlapChecker = OverlapChecker(codingTrs, onlyCDS=True)
    pseudoOverlapChecker = OverlapChecker(pseudoTrs, onlyCDS=False)

    outRecs = []

    def get_rec_name(rec):
        return '%s:%d-%d%s' % (rec.Chrom, rec.Start, rec.End, rec.Strand)

    err_msg('Processing input PhyloCSF Regions.')
    for bedFileName in ls(regionsDir):
        if not bedFileName.endswith('.coding.bed.gz'):
            continue
        with myopen(pjoin(regionsDir, bedFileName)) as bedFile:
            for line in bedFile:
                # The name in the bed line is not used; recs get their own names.
                _bedName, chrom, bedIntervals, strand = bed_line_to_intervals(
                    line)
                assert len(bedIntervals) == 1
                interval = bedIntervals[0]
                rec = DictClass()
                rec.Chrom = chrom
                rec.Start = interval[0]
                rec.End = interval[1]
                # Start and End are both included in the length.
                rec.NumCodons = (rec.End - rec.Start + 1) // 3
                rec.Strand = strand
                rec.Name = get_rec_name(rec)
                rec.Parent = 'NA'
                assert (rec.End - rec.Start + 1) % 3 == 0, rec.Name

                # Find overlaps and set up RegType. Overlaps are looked up with
                #     '+-' (presumably meaning either strand -- confirm in
                #     OverlapChecker).
                codingOverlapTrs = codingOverlapChecker.overlapping_trs(
                    rec.Chrom, '+-', interval)
                pseudoOverlapTrs = pseudoOverlapChecker.overlapping_trs(
                    rec.Chrom, '+-', interval)
                if len(pseudoOverlapTrs) > 0:
                    rec.RegType = PseudoOverlap
                elif any_same_frame(interval, rec.Strand, codingOverlapTrs):
                    rec.RegType = CodingOverlap
                elif any_anti_frame(interval, rec.Strand, codingOverlapTrs):
                    rec.RegType = AntisenseOverlap
                else:
                    rec.RegType = NoOverlap
                outRecs.append(rec)

                if rec.RegType == NoOverlap:
                    continue

                # Subtract from intervals overlaps with pseudogenes in any frame
                #     or coding in same or antisense frame.
                # Make a new record for each resulting segment
                segments = subtract_trs(interval, rec.Strand,
                                        pseudoOverlapTrs, codingOverlapTrs)
                for segment in segments:
                    subRec = DictClass()
                    subRec.RegType = Extension
                    subRec.Chrom = rec.Chrom
                    subRec.Start = segment[0]
                    subRec.End = segment[1]
                    subRec.NumCodons = (subRec.End - subRec.Start + 1) // 3
                    subRec.Strand = rec.Strand
                    subRec.Name = get_rec_name(subRec)
                    subRec.Parent = rec.Name
                    assert (subRec.End - subRec.Start +
                            1) % 3 == 0, subRec.Name
                    outRecs.append(subRec)

    outRecs.sort(key=lambda rec: (RegTypes.index(rec.RegType), rec.Chrom,
                                  rec.Strand, rec.Start, rec.End))

    fields = list(Fields)
    fields.remove(Rank)  # We'll insert it at the beginning later
    outDFW = DFW(outFileName, fields)
    for rec in outRecs:
        outDFW.write_line(rec)
    outDFW.close()

    err_msg('Writing PhyloCSF input file.')
    _write_phyloCSF_in(homeDir)