Пример #1
0
def do_find_repeats(alignment_file, paramSeq, model, mathType, _stats, separator):
    """Find repeats in every alignment stored in ``alignment_file``.

    ``TRFDriver`` is run once over the whole file; for each alignment the
    distinct consensus strings of the repeats reported for its sequences are
    handed to ``find_repeats_in_alignment``.

    Args:
        alignment_file: path given both to ``TRFDriver.run`` and ``Fasta.load``.
        paramSeq: second argument forwarded unchanged to ``TRFDriver.run``.
        model: stored under ``"modelFactory"`` in the model parameters.
        mathType: stored under ``"mathType"`` in the model parameters.
        _stats: statistics are aggregated only when this is not ``None``;
            its value is otherwise unused.
        separator: forwarded to ``Fasta.load``.

    Returns:
        ``(D, stats)`` where ``D`` is the union of the per-alignment results
        (later alignments overwrite equal keys) and ``stats`` is a
        ``defaultdict(int)`` with the summed values of ``compute_statistics``
        (empty when ``_stats`` is ``None``).
    """
    modelParam = {
        "mathType": mathType,
        "modelFactory": model,
    }
    driver = TRFDriver()
    trf_repeats = driver.run(alignment_file, paramSeq)
    alignments = Fasta.load(alignment_file, separator, Alignment)

    D = dict()
    stats = defaultdict(int)
    # Progress messages are numbered from 1.
    for count, alignment in enumerate(alignments, 1):
        print("Annotating alignment {0}".format(count))
        # De-duplicate the consensus strings of all repeats found in any
        # sequence of this alignment.
        consensus_list = list(set(
            repeat.consensus
            for name in alignment.names
            for repeat in trf_repeats[name]
        ))
        repeats = find_repeats_in_alignment(alignment, consensus_list, modelParam)
        print(repeats)
        D.update(repeats)
        if _stats is not None:
            for key, value in compute_statistics(repeats).items():
                stats[key] += value
    return D, stats
Пример #2
0
def main(input_file, output_file, trf):
    """Compute the annotation track for ``input_file`` and dump it as JSON.

    Args:
        input_file: alignment file read by ``Fasta.load`` and by the TRF driver.
        output_file: destination of the JSON dump (opened through ``Open``).
        trf: iterable of candidate TRF executable paths; the first one that
            exists on disk is used.

    Raises:
        RuntimeError: if none of the candidate paths exists.
    """
    # This is only a generator: alignments are wrapped lazily, one at a time,
    # while ``compute_annotation_track`` consumes them.
    alns = (Alignment(a)
            for a in Fasta.load(input_file, '[.][0-9]+$', Alignment))

    # 1. Run TRF with the first executable that exists.  A separate name is
    # used for the driver so that a failed lookup is detectable instead of
    # calling ``.run`` on the list of paths.
    driver = None
    for trf_executable in trf:
        if os.path.exists(trf_executable):
            driver = TRFDriver(trf_executable)
            break
    if driver is None:
        raise RuntimeError(
            "No TRF executable found among: {0!r}".format(trf))
    repeats = driver.run(input_file)

    # 2. Materialise the annotation track and write it out; the ``with``
    # block guarantees the output is flushed and closed.
    # NOTE(review): assumes ``Open`` returns a context manager -- confirm.
    A = list(compute_annotation_track(alns, repeats))
    with Open(output_file, 'w') as f:
        json.dump(A, f, indent=4)
Пример #3
0
def main(files, trf, alignment_regexp, sequence_regexp):
    """Produce the per-file statistic files and report where they were written.

    For every input file the emission/transition statistics are aggregated,
    TRF is run (unparsed), its output is aggregated, and the TRF cover is
    computed.  Every statistic is written next to the input file as
    ``<filename>.<kind>.stat``.

    Args:
        files: iterable of input file names.
        trf: passed to ``TRFDriver`` and to ``TrfCover``.
        alignment_regexp: forwarded to ``TrfCover``.
        sequence_regexp: forwarded to ``TrfCover``.

    Returns:
        dict mapping each statistic kind (``'emission'``, ``'transition'``,
        ``'trf_length'``, ``'trf_consensus'``, ``'trf_fulllength'``,
        ``'trf_cover'``) to the list of generated file names, in the order
        of ``files``.
    """
    output_files = {
        'emission': [],
        'transition': [],
        'trf_length': [],
        'trf_consensus': [],
        'trf_fulllength': [],
        'trf_cover': [],
    }
    for filename in files:
        em_file = filename + '.emission.stat'
        tr_file = filename + '.transition.stat'
        le_file = filename + '.trf_length.stat'
        co_file = filename + '.trf_consensus.stat'
        lef_file = filename + '.trf_fulllength.stat'
        cover_file = filename + '.trf_cover.stat'

        # Aggregate annotation statistics.
        AggregateAnnotation(
            filename, 0, 1,
            em_file,
            tr_file
        )

        # Run TRF; with dont_parse=True the driver returns the name of its
        # raw output file, which is then aggregated.
        TRF = TRFDriver(trf)
        trf_output_filename = TRF.run(filename, dont_parse=True)
        AggregateTRFOutput(
            trf_output_filename,
            le_file,
            co_file,
            lef_file,
        )

        TrfCover(filename, cover_file, alignment_regexp, sequence_regexp, trf)

        output_files['emission'].append(em_file)
        output_files['transition'].append(tr_file)
        output_files['trf_length'].append(le_file)
        output_files['trf_consensus'].append(co_file)
        # Fixed: this list used to receive co_file (the consensus file) again.
        output_files['trf_fulllength'].append(lef_file)
        output_files['trf_cover'].append(cover_file)
    return output_files
Пример #4
0
def _last_available_trf_driver(paths, mathType):
    """Return a ``TRFDriver`` for the last existing path in ``paths``, or None."""
    driver = None
    for trf_executable in paths:
        if os.path.exists(trf_executable):
            # No early exit (the original ``break`` was commented out), so a
            # later existing path overrides an earlier one.
            driver = TRFDriver(trf_executable, mathType=mathType)
    return driver


def compute_annotations(args, alignment_filename, model):
    """Compute the annotation tracks requested in ``args.tracks``.

    Supported tracks:
        ``'trf'``: raw result of ``TRFDriver.run`` on the alignment file.
        ``'original_repeats'``: repeats loaded from
            ``alignment_filename + '.repeats'`` (JSON), each record turned
            into a ``Repeat`` built from its first five fields.
        ``'trf_cons'``: for every sequence name, the set of consensus
            strings of its TRF repeats.
        ``'hmm'``: result of ``HMMDriver.run`` on the alignment file.

    The TRF-based tracks are silently skipped when no path in ``args.trf``
    exists.

    Args:
        args: object providing ``tracks``, ``trf`` and ``mathType``.
        alignment_filename: path of the alignment file.
        model: forwarded to ``HMMDriver``.

    Returns:
        dict mapping track name to its annotation.
    """
    annotations = dict()

    if 'trf' in args.tracks:
        trf = _last_available_trf_driver(args.trf, args.mathType)
        if trf:
            annotations['trf'] = trf.run(alignment_filename)

    if 'original_repeats' in args.tracks:
        # Close the file as soon as it has been parsed.
        # NOTE(review): assumes ``Open`` returns a context manager -- confirm.
        with Open(alignment_filename + '.repeats', 'r') as repeats_file:
            repeats = json.load(repeats_file)
        for k, v in list(repeats.items()):
            repeats[k] = [Repeat(_v[0], _v[1], _v[2], _v[3], _v[4])
                          for _v in v]
        annotations['original_repeats'] = repeats

    if 'trf_cons' in args.tracks:
        trf = _last_available_trf_driver(args.trf, args.mathType)
        if trf:
            repeats = trf.run(alignment_filename)
            annotations['trf_cons'] = {}
            for seq_name in repeats:
                cons = set(repeat.consensus for repeat in repeats[seq_name])
                annotations['trf_cons'][seq_name] = cons

    if 'hmm' in args.tracks:
        # Only a non-empty path list is handed to the HMM driver.
        paths = None
        if args.trf is not None and len(args.trf) > 0:
            paths = args.trf
        driver = HMMDriver(paths, args.mathType, model)
        if driver:
            annotations['hmm'] = driver.run(alignment_filename)

    perf.msg("Hints computed in {time} seconds.")
    perf.replace()
    return annotations
Пример #5
0
def _paint_repeats(repeats, own_ann, both_ann):
    """Mark the positions covered by ``repeats`` with 'R' in both lists."""
    for repeat in repeats:
        # Clamp repeats that run past the last position.
        if repeat.end >= len(own_ann):
            repeat.end = len(own_ann) - 1
        rlen = 1 + repeat.end - repeat.start
        own_ann[repeat.start:repeat.end + 1] = list("R" * rlen)
        both_ann[repeat.start:repeat.end + 1] = list("R" * rlen)


def _repeat_segments(ann):
    """Return the ``(start, stop)`` half-open ranges of maximal 'R' runs."""
    segments = []
    start = None
    for i, symbol in enumerate(ann):
        if symbol == 'R':
            if start is None:
                start = i
        elif start is not None:
            segments.append((start, i))
            start = None
    if start is not None:
        segments.append((start, len(ann)))
    return segments


def main(inp, out, alignment_regexp, sequence_regexp, trf=trf_paths):
    """Count how TRF repeats cover the two sequences of each alignment.

    For every alignment loaded from ``inp`` the repeats of the first two
    sequences are painted onto per-sequence annotations (``X``, ``Y``) and
    onto a combined one.  The accumulated statistics are:

        ``M_count`` / ``R_count``: combined positions outside / inside repeats.
        ``R_segment_count``: number of maximal repeat runs in the combined
            annotation.
        ``RM`` / ``MR`` / ``RR``: runs that contain a repeat only in X, only
            in Y, or in both.

    Args:
        inp: alignment file, read by ``Fasta.load`` and by the TRF driver.
        out: path of the JSON file receiving the statistics.
        alignment_regexp: forwarded to ``Fasta.load``.
        sequence_regexp: forwarded to ``Fasta.load`` as ``sequence_selectors``.
        trf: iterable of candidate TRF executable paths; the first existing
            one is used.

    Raises:
        RuntimeError: if none of the candidate paths exists.
        AssertionError: if the two annotations end up with different lengths.
    """
    driver = None
    for trf_executable in trf:
        if os.path.exists(trf_executable):
            driver = TRFDriver(trf_executable, mathType=float)
            break
    if driver is None:
        # Previously ``.run`` was called on the path list itself here.
        raise RuntimeError(
            "No TRF executable found among: {0!r}".format(trf))
    repeats = driver.run(inp)

    stats = defaultdict(int)

    for aln in Fasta.load(inp,
                          alignment_regexp,
                          Alignment,
                          sequence_selectors=sequence_regexp):
        X_index = 0
        Y_index = 1

        X_trf = list(
            translate_repeat_to_annotation(repeats[aln.names[X_index]],
                                           aln.seq_to_aln[X_index]))
        Y_trf = list(
            translate_repeat_to_annotation(repeats[aln.names[Y_index]],
                                           aln.seq_to_aln[Y_index]))

        X_ann = list("M" * len(aln.sequences[X_index]))
        Y_ann = list("M" * len(aln.sequences[Y_index]))
        B_ann = list("M" * len(aln.sequences[Y_index]))
        _paint_repeats(X_trf, X_ann, B_ann)
        _paint_repeats(Y_trf, Y_ann, B_ann)
        assert (len(X_ann) == len(Y_ann) and len(B_ann) == len(Y_ann))

        R_segments = _repeat_segments(B_ann)
        stats['M_count'] += B_ann.count('M')
        stats['R_count'] += B_ann.count('R')
        stats['R_segment_count'] += len(R_segments)

        # Classify every combined run by which sequence(s) contribute to it.
        for start, stop in R_segments:
            XX = 'R' if 'R' in X_ann[start:stop] else 'M'
            YY = 'R' if 'R' in Y_ann[start:stop] else 'M'
            stats[XX + YY] += 1

    with Open(out, 'w') as f:
        json.dump(stats, f, indent=4)
Пример #6
0
from adapters.TRFDriver import TRFDriver, trf_paths
import os
import json
import sys

# Usage: <script> ALIGNMENT_FILE OUTPUT_JSON
# Runs TRF on the alignment file and stores, per sequence name, the list of
# (start, end, repetitions, consensus, sequence) records as JSON.
if len(sys.argv) < 3:
    # The message is Slovak for "Not enough arguments."
    print("Nedostatok argumentov.")
    exit(1)
alignment_filename, output_file = sys.argv[1], sys.argv[2]

# Every existing candidate path gets a driver; the last one is kept.
trf = None
for trf_executable in trf_paths:
    if not os.path.exists(trf_executable):
        continue
    trf = TRFDriver(trf_executable, mathType=float)

# Without a usable TRF executable nothing is written.
if trf:
    repeats = trf.run(alignment_filename)
    # Replace the repeat objects in place by plain tuples that JSON can store.
    for k, v in list(repeats.items()):
        repeats[k] = [
            (r.start, r.end, r.repetitions, r.consensus, r.sequence)
            for r in v
        ]
    with open(output_file, 'w') as f:
        json.dump(repeats, f, indent=4)

Пример #7
0
def main(input_file, output_file):
    """Write each pairwise alignment together with its TRF repeat annotation.

    For every two-sequence alignment in ``input_file`` five rows are saved to
    ``output_file``: the first sequence, the consensus letters of repeats
    found in it, a combined track with ``'R'`` on every column covered by a
    repeat of either sequence, the consensus letters for the second
    sequence, and the second sequence.  Alignments that do not have exactly
    two sequences are reported with ``error`` and skipped.

    Args:
        input_file: alignment file, read by ``Fasta.load`` and by TRF.
        output_file: path of the file the annotated alignments are written to.

    Raises:
        RuntimeError: if no path in ``trf_paths`` exists.
    """
    # Initialise first so a failed lookup is reported cleanly instead of as
    # an unbound local variable.
    trf = None
    for trf_executable in trf_paths:
        if os.path.exists(trf_executable):
            # No break: the last existing path wins.
            trf = TRFDriver(trf_executable)
    if not trf:
        # String exceptions are not valid; raise a real exception object.
        raise RuntimeError("No trf found")
    repeats = trf.run(input_file)

    with open(output_file, 'w') as f:
        for alignment in Fasta.load(input_file, r'\.[0-9]*$', Alignment):
            if len(alignment.sequences) != 2:
                print('error')
                continue
            width = len(alignment.sequences[0])
            annotation = ['.'] * width
            annotationX = ['.'] * width
            annotationY = ['.'] * width
            # Iterate rows by position: the previous name lookup mapped both
            # rows to the last index when the two sequences shared a name.
            for index, seq_name in enumerate(alignment.names):
                translator = alignment.seq_to_aln[index]
                revtranslator = alignment.aln_to_seq[index]
                consensus_row = annotationX if index == 0 else annotationY
                for repeat in repeats[seq_name]:
                    first = translator[repeat.start]
                    for i in range(first, translator[repeat.end]):
                        annotation[i] = 'R'
                        # NOTE(review): revtranslator is indexed by the offset
                        # from the repeat start, not by the column ``i`` --
                        # confirm this is intended.
                        j = i - first
                        consensus_row[i] = repeat.consensus[
                            revtranslator[j] % len(repeat.consensus)]

            nm = alignment.names[0]
            aln = [
                (alignment.names[0], alignment.sequences[0].replace('.', '-')),
                ('consensusX' + nm, ''.join(annotationX)),
                ('annotation' + nm, ''.join(annotation)),
                ('consensusY' + nm, ''.join(annotationY)),
                (alignment.names[1], alignment.sequences[1].replace('.', '-')),
            ]
            Fasta.saveAlignmentPiece(aln, f, -1)
Пример #8
0
def _paint_repeats(repeats, own_ann, both_ann):
    """Mark the positions covered by ``repeats`` with 'R' in both lists."""
    for repeat in repeats:
        # Clamp repeats that run past the last position.
        if repeat.end >= len(own_ann):
            repeat.end = len(own_ann) - 1
        rlen = 1 + repeat.end - repeat.start
        own_ann[repeat.start:repeat.end + 1] = list("R" * rlen)
        both_ann[repeat.start:repeat.end + 1] = list("R" * rlen)


def _repeat_segments(ann):
    """Return the ``(start, stop)`` half-open ranges of maximal 'R' runs."""
    segments = []
    start = None
    for i, symbol in enumerate(ann):
        if symbol == 'R':
            if start is None:
                start = i
        elif start is not None:
            segments.append((start, i))
            start = None
    if start is not None:
        segments.append((start, len(ann)))
    return segments


def main(inp, out, alignment_regexp, sequence_regexp, trf=trf_paths):
    """Count how TRF repeats cover the two sequences of each alignment.

    For every alignment loaded from ``inp`` the repeats of the first two
    sequences are painted onto per-sequence annotations (``X``, ``Y``) and
    onto a combined one.  The accumulated statistics are:

        ``M_count`` / ``R_count``: combined positions outside / inside repeats.
        ``R_segment_count``: number of maximal repeat runs in the combined
            annotation.
        ``RM`` / ``MR`` / ``RR``: runs that contain a repeat only in X, only
            in Y, or in both.

    Args:
        inp: alignment file, read by ``Fasta.load`` and by the TRF driver.
        out: path of the JSON file receiving the statistics.
        alignment_regexp: forwarded to ``Fasta.load``.
        sequence_regexp: forwarded to ``Fasta.load`` as ``sequence_selectors``.
        trf: iterable of candidate TRF executable paths; the first existing
            one is used.

    Raises:
        RuntimeError: if none of the candidate paths exists.
        AssertionError: if the two annotations end up with different lengths.
    """
    driver = None
    for trf_executable in trf:
        if os.path.exists(trf_executable):
            driver = TRFDriver(trf_executable, mathType=float)
            break
    if driver is None:
        # Previously ``.run`` was called on the path list itself here.
        raise RuntimeError(
            "No TRF executable found among: {0!r}".format(trf))
    repeats = driver.run(inp)

    stats = defaultdict(int)

    for aln in Fasta.load(
        inp,
        alignment_regexp,
        Alignment,
        sequence_selectors=sequence_regexp
    ):
        X_index = 0
        Y_index = 1

        X_trf = list(translate_repeat_to_annotation(
            repeats[aln.names[X_index]], aln.seq_to_aln[X_index]))
        Y_trf = list(translate_repeat_to_annotation(
            repeats[aln.names[Y_index]], aln.seq_to_aln[Y_index]))

        X_ann = list("M" * len(aln.sequences[X_index]))
        Y_ann = list("M" * len(aln.sequences[Y_index]))
        B_ann = list("M" * len(aln.sequences[Y_index]))
        _paint_repeats(X_trf, X_ann, B_ann)
        _paint_repeats(Y_trf, Y_ann, B_ann)
        assert(len(X_ann) == len(Y_ann) and len(B_ann) == len(Y_ann))

        R_segments = _repeat_segments(B_ann)
        stats['M_count'] += B_ann.count('M')
        stats['R_count'] += B_ann.count('R')
        stats['R_segment_count'] += len(R_segments)

        # Classify every combined run by which sequence(s) contribute to it.
        for start, stop in R_segments:
            XX = 'R' if 'R' in X_ann[start:stop] else 'M'
            YY = 'R' if 'R' in Y_ann[start:stop] else 'M'
            stats[XX + YY] += 1

    with Open(out, 'w') as f:
        json.dump(stats, f, indent=4)
Пример #9
0
def main(input_file, output_file):
    """Write each pairwise alignment together with its TRF repeat annotation.

    For every two-sequence alignment in ``input_file`` five rows are saved to
    ``output_file``: the first sequence, the consensus letters of repeats
    found in it, a combined track with ``'R'`` on every column covered by a
    repeat of either sequence, the consensus letters for the second
    sequence, and the second sequence.  Alignments that do not have exactly
    two sequences are reported with ``error`` and skipped.

    Args:
        input_file: alignment file, read by ``Fasta.load`` and by TRF.
        output_file: path of the file the annotated alignments are written to.

    Raises:
        RuntimeError: if no path in ``trf_paths`` exists.
    """
    # Initialise first so a failed lookup is reported cleanly instead of as
    # an unbound local variable.
    trf = None
    for trf_executable in trf_paths:
        if os.path.exists(trf_executable):
            # No break: the last existing path wins.
            trf = TRFDriver(trf_executable)
    if not trf:
        # String exceptions are not valid; raise a real exception object.
        raise RuntimeError("No trf found")
    repeats = trf.run(input_file)

    with open(output_file, 'w') as f:
        for alignment in Fasta.load(input_file, r'\.[0-9]*$', Alignment):
            if len(alignment.sequences) != 2:
                print('error')
                continue
            width = len(alignment.sequences[0])
            annotation = ['.'] * width
            annotationX = ['.'] * width
            annotationY = ['.'] * width
            # Iterate rows by position: the previous name lookup mapped both
            # rows to the last index when the two sequences shared a name.
            for index, seq_name in enumerate(alignment.names):
                translator = alignment.seq_to_aln[index]
                revtranslator = alignment.aln_to_seq[index]
                consensus_row = annotationX if index == 0 else annotationY
                for repeat in repeats[seq_name]:
                    first = translator[repeat.start]
                    for i in range(first, translator[repeat.end]):
                        annotation[i] = 'R'
                        # NOTE(review): revtranslator is indexed by the offset
                        # from the repeat start, not by the column ``i`` --
                        # confirm this is intended.
                        j = i - first
                        consensus_row[i] = repeat.consensus[
                            revtranslator[j] % len(repeat.consensus)]

            nm = alignment.names[0]
            aln = [
                (alignment.names[0], alignment.sequences[0].replace('.', '-')),
                ('consensusX' + nm, ''.join(annotationX)),
                ('annotation' + nm, ''.join(annotation)),
                ('consensusY' + nm, ''.join(annotationY)),
                (alignment.names[1], alignment.sequences[1].replace('.', '-')),
            ]
            Fasta.saveAlignmentPiece(aln, f, -1)
Пример #10
0
from adapters.TRFDriver import TRFDriver, trf_paths
import os
import json
import sys

# Usage: <script> ALIGNMENT_FILE OUTPUT_JSON
# Runs TRF on the alignment file and stores, per sequence name, the list of
# (start, end, repetitions, consensus, sequence) records as JSON.
if len(sys.argv) < 3:
    # The message is Slovak for "Not enough arguments."
    print("Nedostatok argumentov.")
    exit(1)
alignment_filename, output_file = sys.argv[1], sys.argv[2]

# Every existing candidate path gets a driver; the last one is kept.
trf = None
for trf_executable in trf_paths:
    if not os.path.exists(trf_executable):
        continue
    trf = TRFDriver(trf_executable, mathType=float)

# Without a usable TRF executable nothing is written.
if trf:
    repeats = trf.run(alignment_filename)
    # Replace the repeat objects in place by plain tuples that JSON can store.
    for k, v in list(repeats.items()):
        repeats[k] = [
            (r.start, r.end, r.repetitions, r.consensus, r.sequence)
            for r in v
        ]
    with open(output_file, "w") as f:
        json.dump(repeats, f, indent=4)