Пример #1
0
def check_duplications(c, blocks, specie):
    """Pair the duplicated entries of `c` with their previous entries.

    An entry of `c` counts as duplicated when its ``block_id`` occurs more
    than once among the entries that utils.get_specie_entries() returns for
    `specie` in `blocks`.

    Returns a list of ``(previous, duplicated)`` tuples, where ``previous``
    comes from get_previous_entries(); the list is empty when `c` holds no
    duplicated entry.
    """
    specie_entries = utils.get_specie_entries(blocks, specie)
    occurrences = Counter(entry.block_id for entry in specie_entries)
    # A real list (not a lazy filter object) so that the emptiness check
    # below is meaningful under Python 3 as well as Python 2.
    dups = [entry for entry in c if occurrences[entry.block_id] > 1]
    if not dups:
        return []
    dup_prev = get_previous_entries(dups, c)
    # Materialise the pairs: callers may iterate the result more than once,
    # which a Python 3 zip iterator would not survive.
    return list(zip(dup_prev, dups))
Пример #2
0
def check_duplications(c, blocks, specie):
    """Pair the duplicated entries of `c` with their previous entries.

    An entry of `c` counts as duplicated when its ``block_id`` occurs more
    than once among the entries that utils.get_specie_entries() returns for
    `specie` in `blocks`.

    Returns a list of ``(previous, duplicated)`` tuples, where ``previous``
    comes from get_previous_entries(); the list is empty when `c` holds no
    duplicated entry.
    """
    specie_entries = utils.get_specie_entries(blocks, specie)
    occurrences = Counter(entry.block_id for entry in specie_entries)
    # Build a list rather than a lazy filter object: the emptiness check
    # below would always pass for an iterator under Python 3.
    dups = [entry for entry in c if occurrences[entry.block_id] > 1]
    if not dups:
        return []
    dup_prev = get_previous_entries(dups, c)
    # Return a list so the pairs can be iterated repeatedly by the caller
    # on both Python 2 and Python 3.
    return list(zip(dup_prev, dups))
Пример #3
0
def run(blocks, print_table=False):
    """Report the breakpoint status of every block in `blocks`.

    Every species found by get_set_entries() is threaded into a genome and
    indexed with create_indices(). For each block id the index keys of that
    block are passed to build_neighbours() and then to
    process_block_neighborhood().

    print_table=False: verbose report. The block id and the raw index
        entries are printed per block, a features index is built with
        build_features_index() and handed to process_block_neighborhood(),
        and 'STAT' summary lines are printed at the end.
    print_table=True: a tab-separated table is printed instead: a header
        row, then one row per block holding one status per species
        ('not-in-block' when the species has no status). Blocks with no
        status at all are left out.

    All output goes through single-argument ``print(...)`` so that it is
    byte-identical under the Python 2 print statement and the Python 3
    print function.
    """
    species = sorted(get_set_entries(blocks))
    threaded_genomes = {}
    for sp in species:
        entries = utils.get_specie_entries(blocks, sp)
        threaded_genomes[sp] = utils.thread_specie_genome(entries)
    index = create_indices(species, threaded_genomes)
    blocks_ids = get_blocks_ids(blocks)

    # Totals over all blocks; initialised up front so that the summary is
    # well defined even when there are no blocks at all.
    blocks_num = 0
    entries_num = 0
    dupls_num = 0

    features = {}
    if print_table:
        print('\t'.join(['breakpoint block'] + species))
    else:
        features = build_features_index(blocks)

    for b in blocks_ids:
        blocks_num += 1
        # A list (not a lazy filter object): it is traversed here and again
        # inside the helpers below.
        block_inds = [key for key in index if key[0] == b]
        if not print_table:
            print('block_id: %s' % (b,))
            for ind in block_inds:
                print(index[ind])
        block_dupls, block_entries, neighbours, species_status = \
            build_neighbours(block_inds, index)
        # NOTE(review): assumes build_neighbours() returns counts for this
        # block only (it receives just this block's index keys) - confirm.
        # They are summed so the STAT lines cover every block instead of
        # only the last one processed.
        dupls_num += block_dupls
        entries_num += block_entries
        process_block_neighborhood(neighbours, block_inds, species_status,
                                   print_table, features, index)
        if not print_table:
            # blank line between blocks in the verbose report
            print('')
            continue
        if not species_status:
            # in case breakpoint is caused by assembly incompleteness
            continue
        row = [str(b)]
        row.extend(species_status.get(name, 'not-in-block')
                   for name in species)
        print('\t'.join(row))

    if not print_table:
        # No entries means no meaningful rate; report 0.0 instead of
        # failing with ZeroDivisionError.
        if entries_num:
            dupl_rate = float(dupls_num) / entries_num
        else:
            dupl_rate = 0.0
        print('STAT number of blocks: %s' % (blocks_num,))
        print('STAT number of entries: %s' % (entries_num,))
        print('STAT number of dupls (among entries): %s' % (dupls_num,))
        print('STAT rate of duplications: %s' % (dupl_rate,))
    if args.filter:
        f_blocks = utils.filter_bed(blocks, args.filter)
    elif args.classify_breakpoints:
        if args.print_table:
            breakpoints_classifier.run(blocks, True)
        else:
            breakpoints_classifier.run(blocks, False)
        #for k in breakpoints.keys():
        #    k[0].print_out()
        #    k[1].print_out()
        #    print breakpoints[k]
    elif args.report_duplications:
        # TODO: this should be updated because it is only a rough estimate:
        # some blocks should be counted together as one duplication
        for sp in args.species:
            entries = utils.get_specie_entries(blocks, sp)
            entries = utils.thread_specie_genome(entries)
            for c in entries:
                count_dup = 0
                dup = rearrangements_type.check_duplications(c, blocks, sp)
                for e in dup:
                    this_prev = e[0]
                    this_dup = e[1]
                    if not this_prev in map(lambda x: x[1], dup):
                        count_dup += 1
                    print 'duplication:',
                    this_dup.print_out()
                if count_dup:
                    print 'overall duplications', count_dup

    elif args.report_translocations or args.report_transpositions or args.report_reversals\
Пример #5
0
def _collect_block_neighbours(block_inds, index, species_status):
    """Flatten the neighbour pairs of one block and count its entries.

    [(prev1, next1), (prev2, next2)] -> [prev1, next1, prev2, next2]

    An index entry with more than one occurrence is a duplication: it is
    replaced in `index` by the sentinel [(-3, -3)] and its species is
    marked 'DUP' in `species_status` (both are modified in place).

    Returns (neighbours, entries_num, dupls_num) for this block.
    """
    neighbours = []
    entries_num = 0
    dupls_num = 0
    for ind in block_inds:
        occurrences = len(index[ind])
        entries_num += occurrences
        if occurrences > 1:
            dupls_num += occurrences
            # TODO solve duplications!!
            index[ind] = [(-3, -3)]
            species_status[ind[1]] = 'DUP'
        # beware of dupl! only the first pair is used
        first_pair = index[ind][0]
        neighbours.append(first_pair[0])
        neighbours.append(first_pair[1])
    return neighbours, entries_num, dupls_num


def _mark_breakpoints(block_inds, index, first_common, second_common,
                      species_status, print_table):
    """Set the status of every species of one block in `species_status`.

    'END' - both neighbours are -1/-2, i.e. the whole block is a full
            scaffold in the species;
    '-'   - both neighbours are -1/-2 or one of the two most common
            neighbours;
    'BR'  - at least one neighbour is anything else; the offending side(s)
            are printed unless `print_table` is set.
    """
    nodef = set([-1, -2])
    allowable = set([-1, -2, first_common, second_common])
    for ind in block_inds:
        # beware of dupl! only the first pair is used
        # NOTE(review): a species marked 'DUP' carries the (-3, -3) sentinel
        # here and is therefore re-marked 'BR' - confirm this is intended.
        prev_id, next_id = index[ind][0]
        if prev_id in nodef and next_id in nodef:
            species_status[ind[1]] = 'END'
            continue
        if prev_id in allowable and next_id in allowable:
            species_status[ind[1]] = '-'
            continue
        # at least one side has an unexpected neighbour
        species_status[ind[1]] = 'BR'
        if print_table:
            continue
        if prev_id not in allowable:
            print('breakpoint %s %s - %s' % (ind[1], prev_id, ind[0]))
        if next_id not in allowable:
            print('breakpoint %s %s - %s' % (ind[1], ind[0], next_id))


def run(blocks, print_table=False):
    """Classify the neighbourhood of every block in `blocks` and report it.

    Every species found by get_set_entries() is threaded into a genome and
    indexed with create_indices(). For each block the neighbours seen in
    all species are ranked by popularity; species whose neighbours are not
    among the two most popular ones are reported as breakpoints.

    print_table=False: verbose report (raw index entries, breakpoint lines,
        a blank line per block, 'STAT' summary lines at the end).
    print_table=True: a tab-separated table: a header row, then one row per
        block with one status per species ('not in block' when the species
        has no status). Blocks with no status at all are left out. Blocks
        whose second and third most common neighbours tie are reported with
        'NA' for every species not already marked.

    All output goes through single-argument ``print(...)`` so that it is
    byte-identical under the Python 2 print statement and the Python 3
    print function.
    """
    species = sorted(get_set_entries(blocks))
    threaded_genomes = {}
    for sp in species:
        entries = utils.get_specie_entries(blocks, sp)
        threaded_genomes[sp] = utils.thread_specie_genome(entries)
    index = create_indices(species, threaded_genomes)
    blocks_ids = get_blocks_ids(blocks)
    dupls_num = 0
    blocks_num = 0
    entries_num = 0
    if print_table:
        print('\t'.join(['breakpoint block'] + species))
    for b in blocks_ids:
        blocks_num += 1
        # A list (not a lazy filter object): it is traversed several times.
        block_inds = [key for key in index if key[0] == b]
        species_status = {}
        if not print_table:
            for ind in block_inds:
                print(index[ind])
        neighbours, block_entries, block_dupls = _collect_block_neighbours(
            block_inds, index, species_status)
        entries_num += block_entries
        dupls_num += block_dupls
        # sort by popularity in descending order and leave only non-ending
        # neighbours (-1/-2) that are not the duplication sentinel (-3)
        ranked = [item for item in Counter(neighbours).most_common()
                  if item[0] not in (-1, -2, -3)]
        # if len is less or equal than two (most popular from left and from
        # right), then breakpoint is likely to be caused by assembly
        # incompleteness and nothing is classified
        if len(ranked) > 2:
            if ranked[2][1] == ranked[1][1]:
                # The second and third candidates are equally popular, so
                # the breakpoint cannot be resolved.
                if not print_table:
                    print('cant distinguish two most common!')
                for ind in block_inds:
                    # keep an earlier 'DUP' mark
                    species_status.setdefault(ind[1], 'NA')
                # Fall through (instead of skipping the block) so that the
                # 'NA' statuses actually reach the table.
            else:
                _mark_breakpoints(block_inds, index, ranked[0][0],
                                  ranked[1][0], species_status, print_table)
        if not print_table:
            # blank line between blocks in the verbose report
            print('')
            continue
        if not species_status:
            # in case breakpoint is caused by assembly incompleteness
            continue
        row = [str(b)]
        row.extend(species_status.get(name, 'not in block')
                   for name in species)
        print('\t'.join(row))
    if not print_table:
        # No entries means no meaningful rate; report 0.0 instead of
        # failing with ZeroDivisionError.
        if entries_num:
            dupl_rate = float(dupls_num) / entries_num
        else:
            dupl_rate = 0.0
        print('STAT Also:')
        print('STAT number of blocks: %s' % (blocks_num,))
        print('STAT number of entries: %s' % (entries_num,))
        print('STAT number of dupls (among entries): %s' % (dupls_num,))
        print('STAT rate of duplications: %s' % (dupl_rate,))
    if args.filter:
        f_blocks = utils.filter_bed(blocks, args.filter)
    elif args.classify_breakpoints:
        if args.print_table:
            breakpoints_classifier.run(blocks, True)
        else:
            breakpoints_classifier.run(blocks, False)
        #for k in breakpoints.keys():
        #    k[0].print_out()
        #    k[1].print_out()
        #    print breakpoints[k]
    elif args.report_duplications:
    # TODO: this should be updated because it is only a rough estimate:
    # some blocks should be counted together as one duplication
            for sp in args.species:
                entries = utils.get_specie_entries(blocks, sp)
                entries = utils.thread_specie_genome(entries)
                for c in entries:
                    count_dup = 0
                    dup = rearrangements_type.check_duplications(c, blocks, sp)
                    for e in dup:
                        this_prev = e[0]
                        this_dup = e[1]
                        if not this_prev in map(lambda x:x[1], dup):
                            count_dup += 1
                        print 'duplication:',
                        this_dup.print_out()
                    if count_dup:
                        print 'overall duplications', count_dup

    elif args.report_translocations or args.report_transpositions or args.report_reversals\