def run(blocks, print_table=False): species = sorted(list(get_set_entries(blocks))) threaded_genomes = {} for sp in species: entries = utils.get_specie_entries(blocks, sp) threaded_genomes[sp] = utils.thread_specie_genome(entries) index = create_indices(species, threaded_genomes) blocks_ids = get_blocks_ids(blocks) blocks_num = 0 features = {} if not print_table: features = build_features_index(blocks) if print_table: header = '\t'.join(['breakpoint block']+species) print header for b in blocks_ids: blocks_num += 1 block_inds = filter(lambda x: x[0] == b, index) if not print_table: print 'block_id:', b for ind in block_inds: print index[ind] dupls_num, entries_num, neighbours, species_status = build_neighbours( block_inds, index) process_block_neighborhood(neighbours, block_inds, species_status, print_table, features, index) if not print_table: print if print_table: l = str(b) if not species_status.keys(): # in case breakpoint is caused by assembly incompleteness continue for e in species: if e not in species_status.keys(): l += '\t' + 'not-in-block' else: l += '\t' + species_status[e] print l if not print_table: print 'STAT number of blocks:', blocks_num print 'STAT number of entries:', entries_num print 'STAT number of dupls (among entries):', dupls_num print 'STAT rate of duplications:', float(dupls_num)/entries_num
f_blocks = utils.filter_bed(blocks, args.filter) elif args.classify_breakpoints: if args.print_table: breakpoints_classifier.run(blocks, True) else: breakpoints_classifier.run(blocks, False) #for k in breakpoints.keys(): # k[0].print_out() # k[1].print_out() # print breakpoints[k] elif args.report_duplications: #TODO: this should be updated because its a rude estimation #some blocks should be counted as as one duplication for sp in args.species: entries = utils.get_specie_entries(blocks, sp) entries = utils.thread_specie_genome(entries) for c in entries: count_dup = 0 dup = rearrangements_type.check_duplications(c, blocks, sp) for e in dup: this_prev = e[0] this_dup = e[1] if not this_prev in map(lambda x: x[1], dup): count_dup += 1 print 'duplication:', this_dup.print_out() if count_dup: print 'overall duplications', count_dup elif args.report_translocations or args.report_transpositions or args.report_reversals\ or args.report_duplications:
def run(blocks, print_table=False):
    """Classify breakpoints between synteny blocks across all species.

    Inline (pre-refactor) variant: for every block id it tallies each
    species' (prev, next) neighbour pair, finds the two most popular
    non-sentinel neighbours, and labels each species as DUP / NA / END /
    '-' / BR accordingly.

    Sentinel neighbour values (inferred from the filters below -- confirm
    against create_indices): -1/-2 appear to mark scaffold ends ("nodef"),
    -3 marks a collapsed duplicated entry.

    blocks       -- parsed block entries (project format)
    print_table  -- True: one tab-separated status row per block;
                    False: verbose breakpoint diagnostics plus STAT totals.

    Side effects: prints to stdout and mutates index[ind] in place for
    duplicated entries; returns None.
    """
    species = sorted(list(get_set_entries(blocks)))
    threaded_genomes = {}
    for sp in species:
        entries = utils.get_specie_entries(blocks, sp)
        threaded_genomes[sp] = utils.thread_specie_genome(entries)
    index = create_indices(species, threaded_genomes)
    blocks_ids = get_blocks_ids(blocks)
    # totals accumulated over every block, reported in the STAT footer
    dupls_num = 0
    blocks_num = 0
    entries_num = 0
    if print_table:
        header = '\t'.join(['breakpoint block']+species)
        print header
    for b in blocks_ids:
        blocks_num += 1
        # index entries whose first component is this block id
        block_inds = filter(lambda x: x[0] == b, index)
        neighbours = []
        species_status = {}
        if not print_table:
            for ind in block_inds:
                print index[ind]
        #just linearize two-dimensional data
        #[(prev1, next1), (prev2, next2)] -> [prev1, next1, prev2, next2]
        for ind in block_inds:
            entries_num += len(index[ind])
            if len(index[ind]) > 1:
                # more than one occurrence in this species: count as
                # duplication and collapse to the -3 sentinel pair
                dupls_num += len(index[ind])
                index[ind] = [(-3,-3)]
                species_status[ind[1]] = 'DUP'
                #TODO solve duplications!!
            #beware of dupl!
            neighb = index[ind][0]
            neighbours.append(neighb[0])
            neighbours.append(neighb[1])
        #sort by popularity in descending order
        #and leave only non-ending
        c = Counter(neighbours).most_common()
        c = filter(lambda x: x[0] != -2 and x[0] != -1 and x[0] != -3, c)
        #print c
        #if len is less or equal than two (most popular from left and from right),
        #then breakpoint is likely to be caused by assembly incompleteness
        if len(c) > 2:
            # NOTE(review): c2 is never used after this line -- dead code?
            c2 = Counter(dict(c)).most_common()
            if c[2][1] == c[1][1]:
                # tie between 2nd and 3rd most common neighbour: cannot
                # pick the canonical left/right pair for this block
                if not print_table:
                    print 'cant distinguish two most common!'
                    print
                for ind in block_inds:
                    if not ind[1] in species_status.keys():
                        #could not resolve breakpoint
                        species_status[ind[1]] = 'NA'
                # presumably skips to the next block id -- placement
                # reconstructed from mangled source, confirm
                continue
            first_common = c[0][0]
            second_common = c[1][0]
            # -1/-2 appear to denote undefined/scaffold-end neighbours
            nodef = set([-1,-2])
            allowable = set([-1,-2,first_common,second_common])
            br = False
            for ind in block_inds:
                #beware of dupl!
                # NOTE: 'next' shadows the builtin inside this loop
                prev,next = index[ind][0]
                if prev in nodef and next in nodef:
                    #the whole block is a full scaffold in the specie
                    species_status[ind[1]] = 'END'
                    # print 'possible breakpoint', ind[1], prev, '-', ind[0], '-', next
                    continue
                if prev in allowable and next in allowable:
                    # both neighbours match the consensus: no breakpoint
                    species_status[ind[1]] = '-'
                    continue
                if not prev in allowable:
                    species_status[ind[1]] = 'BR'
                    if not print_table:
                        print 'breakpoint', ind[1], prev, '-', ind[0]
                    br = True
                if not next in allowable:
                    species_status[ind[1]] = 'BR'
                    if not print_table:
                        print 'breakpoint', ind[1], ind[0], '-', next
                    br = True
            #if br:
            #    print
        if not print_table:
            print
        if print_table:
            # emit one tab-separated row: block id then per-species status
            l = str(b)
            if not species_status.keys():
                #in case breakpoint is caused by assembly incompleteness
                continue
            for e in species:
                if not e in species_status.keys():
                    l += '\t'+'not in block'
                else:
                    l += '\t'+species_status[e]
            print l
    if not print_table:
        print 'STAT Also:'
        print 'STAT number of blocks:', blocks_num
        print 'STAT number of entries:', entries_num
        print 'STAT number of dupls (among entries):', dupls_num
        # NOTE(review): raises ZeroDivisionError when entries_num == 0
        print 'STAT rate of duplications:', float(dupls_num)/entries_num
f_blocks = utils.filter_bed(blocks, args.filter) elif args.classify_breakpoints: if args.print_table: breakpoints_classifier.run(blocks, True) else: breakpoints_classifier.run(blocks, False) #for k in breakpoints.keys(): # k[0].print_out() # k[1].print_out() # print breakpoints[k] elif args.report_duplications: #TODO: this should be updated because its a rude estimation #some blocks should be counted as as one duplication for sp in args.species: entries = utils.get_specie_entries(blocks, sp) entries = utils.thread_specie_genome(entries) for c in entries: count_dup = 0 dup = rearrangements_type.check_duplications(c, blocks, sp) for e in dup: this_prev = e[0] this_dup = e[1] if not this_prev in map(lambda x:x[1], dup): count_dup += 1 print 'duplication:', this_dup.print_out() if count_dup: print 'overall duplications', count_dup elif args.report_translocations or args.report_transpositions or args.report_reversals\ or args.report_duplications: