def _find_trf_driver(paths, math_type):
    """Build a TRFDriver for the last path in ``paths`` that exists on disk.

    The last existing path wins (there is deliberately no early exit), but
    only one driver is constructed. ``paths`` may be None or empty.

    Returns:
        A TRFDriver instance, or None when no listed path exists.
    """
    chosen = None
    for candidate in paths or ():
        if os.path.exists(candidate):
            chosen = candidate
    if chosen is None:
        return None
    return TRFDriver(chosen, mathType=math_type)


def compute_annotations(args, alignment_filename, model):
    """Compute the annotation tracks requested in ``args.tracks``.

    Args:
        args: object providing ``tracks`` (container of track names),
            ``trf`` (candidate paths of the TRF executable, may be None)
            and ``mathType``.
        alignment_filename: path of the alignment file to annotate. The
            'original_repeats' track reads ``alignment_filename + '.repeats'``.
        model: passed through to HMMDriver for the 'hmm' track.

    Returns:
        dict keyed by track name. Possible keys:
          * 'trf'              -- result of running TRF on the alignment file,
          * 'original_repeats' -- the JSON content of the '.repeats' file with
                                  every record turned into a Repeat,
          * 'trf_cons'         -- per sequence name, the set of consensus
                                  strings of the repeats TRF reported,
          * 'hmm'              -- result of HMMDriver.run.
        The TRF based tracks are silently omitted when no TRF executable
        is found.
    """
    annotations = dict()

    if 'trf' in args.tracks:
        trf = _find_trf_driver(args.trf, args.mathType)
        if trf:
            annotations['trf'] = trf.run(alignment_filename)

    if 'original_repeats' in args.tracks:
        # Close the file as soon as it is parsed instead of leaking it.
        with Open(alignment_filename + '.repeats', 'r') as repeats_file:
            stored = json.load(repeats_file)
        annotations['original_repeats'] = dict(
            (name, [Repeat(rec[0], rec[1], rec[2], rec[3], rec[4])
                    for rec in records])
            for name, records in stored.items()
        )

    if 'trf_cons' in args.tracks:
        trf = _find_trf_driver(args.trf, args.mathType)
        if trf:
            repeats = trf.run(alignment_filename)
            annotations['trf_cons'] = dict(
                (seq_name, set(repeat.consensus for repeat in found))
                for seq_name, found in repeats.items()
            )

    if 'hmm' in args.tracks:
        # HMMDriver gets None rather than an empty collection of paths.
        paths = None
        if args.trf is not None and len(args.trf) > 0:
            paths = args.trf
        driver = HMMDriver(paths, args.mathType, model)
        if driver:
            annotations['hmm'] = driver.run(alignment_filename)

    perf.msg("Hints computed in {time} seconds.")
    perf.replace()
    return annotations
def do_find_repeats(alignment_file, paramSeq, model, mathType, _stats,
                    separator):
    """Find repeats in every alignment of ``alignment_file``.

    TRF is run once on the whole file; for each alignment the distinct
    consensus strings TRF reported for its sequences are handed to
    ``find_repeats_in_alignment``.

    Args:
        alignment_file: path of the FASTA alignment file.
        paramSeq: passed through to ``TRFDriver.run``.
        model: stored under "modelFactory" in the model parameters.
        mathType: stored under "mathType" in the model parameters.
        _stats: statistics are accumulated only when this is not None.
        separator: passed to ``Fasta.load`` to split the file into alignments.

    Returns:
        Tuple ``(D, stats)``: ``D`` is the union (``dict.update``) of the
        per-alignment results, ``stats`` a defaultdict(int) summing the
        values returned by ``compute_statistics`` for each alignment.
    """
    modelParam = {
        "mathType": mathType,
        "modelFactory": model,
    }
    trf_repeats = TRFDriver().run(alignment_file, paramSeq)
    alignments = Fasta.load(alignment_file, separator, Alignment)

    D = dict()
    stats = defaultdict(int)
    for count, alignment in enumerate(alignments, 1):
        print("Annotating alignment {0}".format(count))
        # Distinct consensus strings over all sequences of this alignment.
        consensus_list = list(set(
            repeat.consensus
            for repeat in itertools.chain.from_iterable(
                [trf_repeats[name] for name in alignment.names])
        ))
        repeats = find_repeats_in_alignment(alignment, consensus_list,
                                            modelParam)
        print(repeats)
        D.update(repeats)
        if _stats is not None:
            for key, value in compute_statistics(repeats).items():
                stats[key] += value
    return D, stats
def main(input_file, output_file, trf):
    """Compute the annotation track of ``input_file`` and dump it as JSON.

    Args:
        input_file: path of the FASTA alignment file.
        output_file: path the JSON list is written to (indent=4).
        trf: iterable of candidate paths of the TRF executable; the first
            one that exists is used.

    Raises:
        RuntimeError: when none of the candidate paths exists.
    """
    # NOTE: this is a generator -- it can be consumed only once.
    alns = (Alignment(a)
            for a in Fasta.load(input_file, '[.][0-9]+$', Alignment))

    # 1. run trf
    for trf_executable in trf:
        if os.path.exists(trf_executable):
            driver = TRFDriver(trf_executable)
            break
    else:
        # Previously the path list itself was used as the driver and failed
        # with an unrelated AttributeError.
        raise RuntimeError(
            "No TRF executable found among: {0!r}".format(trf))
    repeats = driver.run(input_file)

    A = list(compute_annotation_track(alns, repeats))
    # Context manager guarantees the output is flushed and closed.
    with Open(output_file, 'w') as f:
        json.dump(A, f, indent=4)
def main(files, trf, alignment_regexp, sequence_regexp):
    """Produce the per-file statistic files and report their names.

    For every input file this runs AggregateAnnotation, TRF (without
    parsing its output), AggregateTRFOutput and TrfCover; each writes to a
    file named ``<input>.<kind>.stat``.

    Args:
        files: iterable of input file names.
        trf: passed to ``TRFDriver`` and to ``TrfCover``.
        alignment_regexp: passed to ``TrfCover``.
        sequence_regexp: passed to ``TrfCover``.

    Returns:
        dict mapping each statistic kind ('emission', 'transition',
        'trf_length', 'trf_consensus', 'trf_fulllength', 'trf_cover') to
        the list of generated file names, in the order of ``files``.
    """
    output_files = {
        'emission': [],
        'transition': [],
        'trf_length': [],
        'trf_consensus': [],
        'trf_fulllength': [],
        'trf_cover': [],
    }
    for filename in files:
        em_file = filename + '.emission.stat'
        tr_file = filename + '.transition.stat'
        le_file = filename + '.trf_length.stat'
        co_file = filename + '.trf_consensus.stat'
        lef_file = filename + '.trf_fulllength.stat'
        cover_file = filename + '.trf_cover.stat'

        # AggregateAnnotation
        AggregateAnnotation(filename, 0, 1, em_file, tr_file)

        # Run TRF
        TRF = TRFDriver(trf)
        trf_output_filename = TRF.run(filename, dont_parse=True)

        # Aggregate TRF output
        AggregateTRFOutput(trf_output_filename, le_file, co_file, lef_file)

        TrfCover(filename, cover_file, alignment_regexp, sequence_regexp,
                 trf)

        output_files['emission'].append(em_file)
        output_files['transition'].append(tr_file)
        output_files['trf_length'].append(le_file)
        output_files['trf_consensus'].append(co_file)
        # Fixed: this used to record co_file (the consensus file) a second
        # time instead of the full-length file.
        output_files['trf_fulllength'].append(lef_file)
        output_files['trf_cover'].append(cover_file)
    return output_files
def _mark_repeats(repeat_annotations, own_track, shared_track):
    """Write 'R' over the span of every repeat in both annotation tracks.

    A repeat whose ``end`` lies past the last index of ``own_track`` is
    clamped to that index (the repeat object itself is updated).
    """
    for repeat in repeat_annotations:
        if repeat.end >= len(own_track):
            repeat.end = len(own_track) - 1
        painted = ["R"] * (1 + repeat.end - repeat.start)
        own_track[repeat.start:repeat.end + 1] = painted
        shared_track[repeat.start:repeat.end + 1] = painted


def main(inp, out, alignment_regexp, sequence_regexp, trf=trf_paths):
    """Count how TRF repeats cover the two sequences of each alignment.

    Every alignment column is labelled 'M' or 'R' for the first sequence
    (X), the second sequence (Y) and their union (B). The counters written
    to ``out`` as JSON are:
      * 'M_count' / 'R_count'  -- columns labelled M / R in the union track,
      * 'R_segment_count'      -- maximal runs of R in the union track,
      * XX + YY (e.g. 'RM')    -- per union run, whether X and whether Y
                                  carry an R somewhere inside it.

    Args:
        inp: path of the FASTA alignment file.
        out: path of the JSON output file.
        alignment_regexp: passed to ``Fasta.load`` as the alignment separator.
        sequence_regexp: passed to ``Fasta.load`` as ``sequence_selectors``.
        trf: iterable of candidate TRF executable paths; the first existing
            one is used.

    Raises:
        RuntimeError: when none of the candidate paths exists.
    """
    for trf_executable in trf:
        if os.path.exists(trf_executable):
            driver = TRFDriver(trf_executable, mathType=float)
            break
    else:
        # Previously the path list itself was used as the driver and failed
        # with an unrelated AttributeError.
        raise RuntimeError(
            "No TRF executable found among: {0!r}".format(trf))
    repeats = driver.run(inp)

    stats = defaultdict(int)
    for aln in Fasta.load(inp, alignment_regexp, Alignment,
                          sequence_selectors=sequence_regexp):
        X_index = 0
        Y_index = 1
        X_trf = list(
            translate_repeat_to_annotation(repeats[aln.names[X_index]],
                                           aln.seq_to_aln[X_index]))
        Y_trf = list(
            translate_repeat_to_annotation(repeats[aln.names[Y_index]],
                                           aln.seq_to_aln[Y_index]))

        X_ann = ["M"] * len(aln.sequences[X_index])
        Y_ann = ["M"] * len(aln.sequences[Y_index])
        B_ann = ["M"] * len(aln.sequences[Y_index])
        _mark_repeats(X_trf, X_ann, B_ann)
        _mark_repeats(Y_trf, Y_ann, B_ann)
        assert (len(X_ann) == len(Y_ann) and len(B_ann) == len(Y_ann))

        # Pair every column with its predecessor; the 'M' sentinels on both
        # ends make a run touching either border still open and close.
        joined = ''.join(B_ann)
        transitions = list(zip('M' + joined, joined + 'M'))

        M_count = B_ann.count('M')
        R_count = B_ann.count('R')
        R_segments_count = len([
            pair for pair in transitions
            if pair[0] != 'R' and pair[1] == 'R'
        ])
        stats['M_count'] += M_count
        stats['R_count'] += R_count
        stats['R_segment_count'] += R_segments_count

        # Positions where the label flips; consecutive pairs of flips
        # delimit the half-open R runs [start, stop).
        changes = [
            i for i, pair in enumerate(transitions) if pair[0] != pair[1]
        ]
        R_segments = list(zip(changes[0::2], changes[1::2]))
        assert (R_segments_count == len(R_segments))

        for start, stop in R_segments:
            XX = 'M'
            YY = 'M'
            for i in range(start, stop):
                if X_ann[i] == 'R':
                    XX = 'R'
                if Y_ann[i] == 'R':
                    YY = 'R'
                assert (B_ann[i] == 'R')
            stats[XX + YY] += 1

    with Open(out, 'w') as f:
        json.dump(stats, f, indent=4)
import json
import os
import sys

from adapters.TRFDriver import TRFDriver, trf_paths

# Usage: <script> ALIGNMENT_FILE OUTPUT_FILE
# Runs TRF on the alignment file and stores the repeats it reports as JSON.
if len(sys.argv) < 3:
    print("Nedostatok argumentov.")
    # sys.exit instead of the interactive-only ``exit`` helper.
    sys.exit(1)

alignment_filename = sys.argv[1]
output_file = sys.argv[2]

# No early exit: the last existing path in trf_paths is the one used.
trf = None
for trf_executable in trf_paths:
    if os.path.exists(trf_executable):
        trf = TRFDriver(trf_executable, mathType=float)

# Nothing is written when no TRF executable is available.
if trf:
    repeats = trf.run(alignment_filename)
    # Replace every repeat object by a plain tuple so json can serialise it.
    for k, v in list(repeats.items()):
        repeats[k] = [(r.start, r.end, r.repetitions, r.consensus, r.sequence)
                      for r in v]
    with open(output_file, 'w') as f:
        json.dump(repeats, f, indent=4)
def main(input_file, output_file):
    """Write each pairwise alignment together with its TRF annotation rows.

    For every alignment of exactly two sequences five rows are saved with
    ``Fasta.saveAlignmentPiece``: the first sequence, the repeat consensus
    letters for the first sequence, an 'R'/'.' repeat mask, the repeat
    consensus letters for the second sequence, and the second sequence.
    Alignments with a different number of sequences print 'error' and are
    skipped.

    Args:
        input_file: path of the FASTA alignment file.
        output_file: path of the file the annotated alignments go to.

    Raises:
        RuntimeError: when no path in ``trf_paths`` exists.
    """
    # No early exit: the last existing path in trf_paths is the one used.
    executable = None
    for trf_executable in trf_paths:
        if os.path.exists(trf_executable):
            executable = trf_executable
    if executable is None:
        # Was ``raise "No trf found"`` on a possibly unbound variable.
        raise RuntimeError("No trf found")
    repeats = TRFDriver(executable).run(input_file)

    with open(output_file, 'w') as f:
        for alignment in Fasta.load(input_file, r'\.[0-9]*$', Alignment):
            if len(alignment.sequences) != 2:
                print('error')
                continue

            width = len(alignment.sequences[0])
            annotation = ['.'] * width
            annotationX = ['.'] * width
            annotationY = ['.'] * width

            for seq_name in alignment.names:
                # Last position carrying this name (differs from the loop
                # position only if names repeat).
                index = None
                for position, name in enumerate(alignment.names):
                    if name == seq_name:
                        index = position
                translator = alignment.seq_to_aln[index]
                revtranslator = alignment.aln_to_seq[index]
                consensus_row = annotationX if index == 0 else annotationY

                for repeat in repeats[seq_name]:
                    first = translator[repeat.start]
                    for i in range(first, translator[repeat.end]):
                        annotation[i] = 'R'
                        # NOTE(review): revtranslator is indexed by the
                        # offset inside the repeat, not by the column i --
                        # confirm this is intended.
                        j = i - first
                        consensus_row[i] = repeat.consensus[
                            revtranslator[j] % len(repeat.consensus)]

            nm = alignment.names[0]
            aln = [
                (alignment.names[0],
                 alignment.sequences[0].replace('.', '-')),
                ('consensusX' + nm, ''.join(annotationX)),
                ('annotation' + nm, ''.join(annotation)),
                ('consensusY' + nm, ''.join(annotationY)),
                (alignment.names[1],
                 alignment.sequences[1].replace('.', '-')),
            ]
            Fasta.saveAlignmentPiece(aln, f, -1)