示例#1
0
文件: merge.py 项目: gturco/co-anno
def write_new_bed(gene_list, old_bed, missed_genes, out_file):
    """Write the kept rows of old_bed, then every new gene, to out_file.

    Rows of old_bed whose "accn" equals the first element of any pair in
    missed_genes are skipped.  Each entry of gene_list is then appended;
    an entry with more than one loc is passed through merge_feats first.

    gene_list    -- indexable by the keys it yields when iterated
                    (presumably a dict of feature dicts -- confirm with caller)
    old_bed      -- iterable of feature dicts with an "accn" key
    missed_genes -- iterable of (hit, qaccn) pairs
    out_file     -- path of the file to create or overwrite
    """
    # A set makes the membership test below O(1) instead of a list scan.
    hit_accns = set(hit for hit, qaccn in missed_genes)
    # The context manager flushes and closes the handle even if a row fails
    # to serialise; the handle used to be left open.
    with open(out_file, "wb") as merge_fh:
        for gene in old_bed:
            if gene["accn"] in hit_accns:
                continue
            merge_fh.write("{0}\n".format(Bed.row_string(gene)))
        for new_gene in gene_list:
            updated_feat = gene_list[new_gene]
            # Features with several locs are merged before being written.
            if len(updated_feat["locs"]) > 1:
                updated_feat = merge_feats(updated_feat)
            merge_fh.write("{0}\n".format(Bed.row_string(updated_feat)))
示例#2
0
def write_new_bed(gene_list, old_bed, missed_genes, out_file):
    """Write the kept rows of old_bed, then every new gene, to out_file.

    Rows of old_bed whose "accn" equals the first element of any pair in
    missed_genes are skipped.  Each entry of gene_list is then appended;
    an entry with more than one loc is passed through merge_feats first.

    gene_list    -- indexable by the keys it yields when iterated
                    (presumably a dict of feature dicts -- confirm with caller)
    old_bed      -- iterable of feature dicts with an "accn" key
    missed_genes -- iterable of (hit, qaccn) pairs
    out_file     -- path of the file to create or overwrite
    """
    # A set makes the membership test below O(1) instead of a list scan.
    hit_accns = set(hit for hit, qaccn in missed_genes)
    # The context manager flushes and closes the handle even if a row fails
    # to serialise; the handle used to be left open.
    with open(out_file, "wb") as merge_fh:
        for gene in old_bed:
            if gene["accn"] in hit_accns:
                continue
            merge_fh.write("{0}\n".format(Bed.row_string(gene)))
        for new_gene in gene_list:
            updated_feat = gene_list[new_gene]
            # Features with several locs are merged before being written.
            if len(updated_feat["locs"]) > 1:
                updated_feat = merge_feats(updated_feat)
            merge_fh.write("{0}\n".format(Bed.row_string(updated_feat)))
def print_bed(flist, old_path):
    """Write the distinct features of flist next to old_path and load them.

    The output path is old_path with ".with_new" inserted before its
    extension; the path is also announced on stderr.  Each item of flist is
    indexed as: [0] seqid, [1] start, [2] end, [3] accn, [5] strand,
    [6] locs.  Items that are equal once their locs are converted to a
    tuple are written only once.  score, rgb, thickstart and thickend are
    always written as ".".

    Returns Bed(path) for the file that was written.
    """
    ipath, ext = op.splitext(old_path)
    path = "%s.with_new%s" % (ipath, ext)

    sys.stderr.write("writing to: %s\n" % path)
    seen = set()

    # The context manager closes the handle even if a row fails to format.
    with open(path, 'wb') as fh:
        for item in flist:
            # locs must be a tuple so the whole item is hashable for `seen`.
            item = list(item)
            item[6] = tuple(item[6])
            item = tuple(item)
            if item in seen:
                continue
            seen.add(item)

            row = dict(accn=item[3],
                       start=item[1],
                       end=item[2],
                       seqid=item[0],
                       locs=item[6],
                       score='.',
                       strand=item[5],
                       rgb='.',
                       thickstart='.',
                       thickend=".")
            fh.write("{0}\n".format(Bed.row_string(row)))
    return Bed(path)
示例#4
0
文件: merge.py 项目: gturco/find_cns
def _merge_missed_locs(org_locs, missed_locs):
    """Return the sorted locs obtained by folding missed_locs into org_locs.

    A missed loc for which Intersecter.find reports one or more current locs
    replaces those locs with a single (min start, max end) loc that also
    covers the missed loc.  A missed loc that intersects nothing is kept
    unchanged.  Neither input list is modified.
    """
    def build_tree(locs):
        tree = Intersecter()
        for start, stop in locs:
            tree.add_interval(Feature(start, stop))
        return tree

    merged = list(org_locs)
    untouched = []
    tree = build_tree(merged)
    # Iterating the caller's list directly (never a list that is being
    # edited) guarantees that no missed loc is skipped.
    for missed_start, missed_end in missed_locs:
        hits = [(l.start, l.stop) for l in tree.find(missed_start, missed_end)]
        if not hits:
            untouched.append((missed_start, missed_end))
            continue
        for hit in hits:
            merged.remove(hit)
        hits.append((missed_start, missed_end))
        merged.append((min(start for start, end in hits),
                       max(end for start, end in hits)))
        # Rebuild so the tree never reports a loc that was already replaced;
        # a stale hit would make the remove() above raise ValueError.
        tree = build_tree(merged)
    merged.extend(untouched)
    merged.sort()
    return merged


def merge(org_bed, missed, merge_file):
    """Merge the rows of missed into org_bed and write them to merge_file.

    For each row of missed (first occurrence of an accn only):
      * if org_bed.accn(accn) raises KeyError the row is a new gene and is
        kept as is;
      * otherwise its locs are folded into the existing row's locs and the
        existing row's start/end are widened to cover them.
    Rows of org_bed that were not touched are then added, everything is
    sorted by (seqid, start) and written one Bed.row_string per line.

    org_bed    -- iterable of row dicts that also offers accn(name)
    missed     -- iterable of row dicts with 'accn' and 'locs'
    merge_file -- path of the file to create or overwrite
    """
    new_rows = []
    seen_accns = set()
    for row_missed in missed:
        if row_missed['accn'] in seen_accns:
            continue
        try:
            org_bed_row = org_bed.accn(row_missed['accn'])
        except KeyError:
            # Not present in org_bed: it is a new gene.
            new_rows.append(row_missed)
            seen_accns.add(row_missed['accn'])
            continue

        # Present in org_bed: extend the existing gene with the missed locs.
        locs = _merge_missed_locs(org_bed_row['locs'], row_missed['locs'])
        org_bed_row['locs'] = locs
        org_bed_row['start'] = min(
            [start for start, end in locs] + [org_bed_row['start']])
        org_bed_row['end'] = max(
            [end for start, end in locs] + [org_bed_row['end']])
        new_rows.append(org_bed_row)
        seen_accns.add(org_bed_row['accn'])

    for org_bed_rw in org_bed:
        if org_bed_rw['accn'] not in seen_accns:
            new_rows.append(org_bed_rw)
            seen_accns.add(org_bed_rw['accn'])

    # A key function orders rows exactly like the former cmp-based sort
    # (seqid first, then start) and also works where sort(cmp=...) is gone.
    new_rows.sort(key=lambda row: (row['seqid'], row['start']))
    with open(merge_file, "w") as merge_fh:
        for row in new_rows:
            merge_fh.write("{0}\n".format(Bed.row_string(row)))
示例#5
0
文件: merge.py 项目: gturco/find_cns
def merge_same_hits(missed, fh_match, org_bed):
    """Group missed genes that hit the same subject and write them out.

    fh_match is the path of a text file with one "qaccn<TAB>saccn" pair per
    line.  Rows are looked up with missed.accn(qaccn); pairs whose lookup
    raises KeyError are ignored.  The first row seen for a (seqid, saccn)
    key starts a group.  A later row with the same key is handled by
    position relative to the group:
      * entirely before or after it: its locs are appended and the group's
        start or end is moved, but only if get_intervening_genes(...)
        returns False; otherwise the row is stored on its own under
        (seqid, qaccn);
      * otherwise: its locs are simply appended.
    The result is written, one Bed.row_string per line, to
    "missed_from_<basename of org_bed.path>" in the directory of
    org_bed.path.
    """
    import os  # function-scope import: the rest of the file does not need it

    grouped = {}
    # os.path keeps a bare file name in the current directory; joining the
    # pieces by hand with '/' pointed such a path at the filesystem root.
    org_dir, org_name = os.path.split(org_bed.path)
    out_path = os.path.join(org_dir, 'missed_from_{0}'.format(org_name))

    with open(fh_match) as handle:
        # Strip the newline per line instead of dropping the last split
        # element, so a final line without a trailing newline is not lost.
        matches = [line.rstrip('\n') for line in handle]

    with open(out_path, "wb") as fh:
        for match in matches:
            if not match:
                continue
            qaccn, saccn = match.split('\t')
            try:
                missed_row = missed.accn(qaccn)
                seqid = missed_row['seqid']
            except KeyError:
                continue

            key = (seqid, saccn)
            if key not in grouped:
                # First hit for this subject: the row becomes the group.
                grouped[key] = missed_row
                continue

            gene = grouped[key]
            gene_start = min(gene['locs'])[0]
            gene_end = max(gene['locs'])[1]
            missed_start = missed_row['locs'][0][0]
            missed_end = missed_row['locs'][0][1]

            if missed_end < gene_start:
                # The new piece lies before the group.
                intervening_genes = get_intervening_genes(
                    missed_end, gene_start, seqid, org_bed, gene['accn'])
                if intervening_genes is False:
                    gene['locs'] = gene['locs'] + missed_row['locs']
                    gene['start'] = missed_start
                    # NOTE(review): 'Os' presumably marks a preferred
                    # accession prefix -- confirm with the author.
                    if 'Os' in qaccn:
                        gene['accn'] = qaccn
                else:
                    grouped[(seqid, qaccn)] = missed_row
            elif gene_end < missed_start:
                # The new piece lies after the group.
                intervening_genes = get_intervening_genes(
                    gene_end, missed_start, seqid, org_bed, gene['accn'])
                if intervening_genes is False:
                    gene['locs'] = gene['locs'] + missed_row['locs']
                    gene['end'] = missed_end
                    if 'Os' in qaccn:
                        gene['accn'] = qaccn
                else:
                    grouped[(seqid, qaccn)] = missed_row
            else:
                # NOTE(review): start/end are left untouched in this branch
                # even if the new locs extend past them -- confirm intended.
                gene['locs'] = gene['locs'] + missed_row['locs']

        for row in grouped.values():
            row['locs'].sort()
            fh.write("{0}\n".format(Bed.row_string(row)))
示例#6
0
def merge_flat(new_name, aflat, bflat):
    """Write the union of two flat files to new_name and return it as a Bed.

    Rows are taken from aflat first, then bflat; a row whose
    (seqid, accn) pair was already seen is dropped.  The kept rows are
    sorted by (seqid, start) and written one Bed.row_string per line.

    Returns Bed(<name of the written file>).
    """
    seen = set()
    both = []
    for flat in (aflat, bflat):
        for row in flat:
            key = row['seqid'], row['accn']
            if key in seen:
                continue
            seen.add(key)
            both.append(row)
    # One stable sort after collecting gives the same order as re-sorting
    # after every append, without the repeated work.
    both.sort(key=lambda a: (a['seqid'], a['start']))
    # The context manager closes the handle even if a row fails to format.
    with open(new_name, "w") as fh:
        for b in both:
            fh.write("{0}\n".format(Bed.row_string(b)))
    return Bed(fh.name)
示例#7
0
def merge_flat(new_name, aflat, bflat):
    """Write the union of two flat files to new_name and return it as a Bed.

    Rows are taken from aflat first, then bflat; a row whose
    (seqid, accn) pair was already seen is dropped.  The kept rows are
    sorted by (seqid, start) and written one Bed.row_string per line.

    Returns Bed(<name of the written file>).
    """
    seen = set()
    both = []
    for flat in (aflat, bflat):
        for row in flat:
            key = row['seqid'], row['accn']
            if key in seen:
                continue
            seen.add(key)
            both.append(row)
    # One stable sort after collecting gives the same order as re-sorting
    # after every append, without the repeated work.
    both.sort(key=lambda a: (a['seqid'], a['start']))
    # The context manager closes the handle even if a row fails to format.
    with open(new_name, "w") as fh:
        for b in both:
            fh.write("{0}\n".format(Bed.row_string(b)))
    return Bed(fh.name)
示例#8
0
def print_bed(flist, old_path):
    """Write the distinct features of flist next to old_path and load them.

    The output path is old_path with ".with_new" inserted before its
    extension; the path is also announced on stderr.  Each item of flist is
    indexed as: [0] seqid, [1] start, [2] end, [3] accn, [5] strand,
    [6] locs.  Items that are equal once their locs are converted to a
    tuple are written only once.  score, rgb, thickstart and thickend are
    always written as ".".

    Returns Bed(path) for the file that was written.
    """
    ipath, ext = op.splitext(old_path)
    path = "%s.with_new%s" % (ipath, ext)

    sys.stderr.write("writing to: %s\n" % path)
    seen = set()

    # The context manager closes the handle even if a row fails to format.
    with open(path, 'wb') as fh:
        for item in flist:
            # locs must be a tuple so the whole item is hashable for `seen`.
            item = list(item)
            item[6] = tuple(item[6])
            item = tuple(item)
            if item in seen:
                continue
            seen.add(item)

            row = dict(accn=item[3], start=item[1], end=item[2],
                       seqid=item[0], locs=item[6], score='.',
                       strand=item[5], rgb='.', thickstart='.',
                       thickend=".")
            fh.write("{0}\n".format(Bed.row_string(row)))
    return Bed(path)
示例#9
0
def _merge_missed_locs(org_locs, missed_locs):
    """Return the sorted locs obtained by folding missed_locs into org_locs.

    A missed loc for which Intersecter.find reports one or more current locs
    replaces those locs with a single (min start, max end) loc that also
    covers the missed loc.  A missed loc that intersects nothing is kept
    unchanged.  Neither input list is modified.
    """
    def build_tree(locs):
        tree = Intersecter()
        for start, stop in locs:
            tree.add_interval(Feature(start, stop))
        return tree

    merged = list(org_locs)
    untouched = []
    tree = build_tree(merged)
    # Iterating the caller's list directly (never a list that is being
    # edited) guarantees that no missed loc is skipped.
    for missed_start, missed_end in missed_locs:
        hits = [(l.start, l.stop) for l in tree.find(missed_start, missed_end)]
        if not hits:
            untouched.append((missed_start, missed_end))
            continue
        for hit in hits:
            merged.remove(hit)
        hits.append((missed_start, missed_end))
        merged.append((min(start for start, end in hits),
                       max(end for start, end in hits)))
        # Rebuild so the tree never reports a loc that was already replaced;
        # a stale hit would make the remove() above raise ValueError.
        tree = build_tree(merged)
    merged.extend(untouched)
    merged.sort()
    return merged


def merge(org_bed, missed, merge_file):
    """Merge the rows of missed into org_bed and write them to merge_file.

    For each row of missed (first occurrence of an accn only):
      * if org_bed.accn(accn) raises KeyError the row is a new gene and is
        kept as is;
      * otherwise its locs are folded into the existing row's locs and the
        existing row's start/end are widened to cover them.
    Rows of org_bed that were not touched are then added, everything is
    sorted by (seqid, start) and written one Bed.row_string per line.

    org_bed    -- iterable of row dicts that also offers accn(name)
    missed     -- iterable of row dicts with 'accn' and 'locs'
    merge_file -- path of the file to create or overwrite
    """
    new_rows = []
    seen_accns = set()
    for row_missed in missed:
        if row_missed['accn'] in seen_accns:
            continue
        try:
            org_bed_row = org_bed.accn(row_missed['accn'])
        except KeyError:
            # Not present in org_bed: it is a new gene.
            new_rows.append(row_missed)
            seen_accns.add(row_missed['accn'])
            continue

        # Present in org_bed: extend the existing gene with the missed locs.
        locs = _merge_missed_locs(org_bed_row['locs'], row_missed['locs'])
        org_bed_row['locs'] = locs
        org_bed_row['start'] = min(
            [start for start, end in locs] + [org_bed_row['start']])
        org_bed_row['end'] = max(
            [end for start, end in locs] + [org_bed_row['end']])
        new_rows.append(org_bed_row)
        seen_accns.add(org_bed_row['accn'])

    for org_bed_rw in org_bed:
        if org_bed_rw['accn'] not in seen_accns:
            new_rows.append(org_bed_rw)
            seen_accns.add(org_bed_rw['accn'])

    # A key function orders rows exactly like the former cmp-based sort
    # (seqid first, then start) and also works where sort(cmp=...) is gone.
    new_rows.sort(key=lambda row: (row['seqid'], row['start']))
    with open(merge_file, "w") as merge_fh:
        for row in new_rows:
            merge_fh.write("{0}\n".format(Bed.row_string(row)))
示例#10
0
def merge_same_hits(missed, fh_match, org_bed):
    """Group missed genes that hit the same subject and write them out.

    fh_match is the path of a text file with one "qaccn<TAB>saccn" pair per
    line.  Rows are looked up with missed.accn(qaccn); pairs whose lookup
    raises KeyError are ignored.  The first row seen for a (seqid, saccn)
    key starts a group.  A later row with the same key is handled by
    position relative to the group:
      * entirely before or after it: its locs are appended and the group's
        start or end is moved, but only if get_intervening_genes(...)
        returns False; otherwise the row is stored on its own under
        (seqid, qaccn);
      * otherwise: its locs are simply appended.
    The result is written, one Bed.row_string per line, to
    "missed_from_<basename of org_bed.path>" in the directory of
    org_bed.path.
    """
    import os  # function-scope import: the rest of the file does not need it

    grouped = {}
    # os.path keeps a bare file name in the current directory; joining the
    # pieces by hand with '/' pointed such a path at the filesystem root.
    org_dir, org_name = os.path.split(org_bed.path)
    out_path = os.path.join(org_dir, 'missed_from_{0}'.format(org_name))

    with open(fh_match) as handle:
        # Strip the newline per line instead of dropping the last split
        # element, so a final line without a trailing newline is not lost.
        matches = [line.rstrip('\n') for line in handle]

    with open(out_path, "wb") as fh:
        for match in matches:
            if not match:
                continue
            qaccn, saccn = match.split('\t')
            try:
                missed_row = missed.accn(qaccn)
                seqid = missed_row['seqid']
            except KeyError:
                continue

            key = (seqid, saccn)
            if key not in grouped:
                # First hit for this subject: the row becomes the group.
                grouped[key] = missed_row
                continue

            gene = grouped[key]
            gene_start = min(gene['locs'])[0]
            gene_end = max(gene['locs'])[1]
            missed_start = missed_row['locs'][0][0]
            missed_end = missed_row['locs'][0][1]

            if missed_end < gene_start:
                # The new piece lies before the group.
                intervening_genes = get_intervening_genes(
                    missed_end, gene_start, seqid, org_bed, gene['accn'])
                if intervening_genes is False:
                    gene['locs'] = gene['locs'] + missed_row['locs']
                    gene['start'] = missed_start
                    # NOTE(review): 'Os' presumably marks a preferred
                    # accession prefix -- confirm with the author.
                    if 'Os' in qaccn:
                        gene['accn'] = qaccn
                else:
                    grouped[(seqid, qaccn)] = missed_row
            elif gene_end < missed_start:
                # The new piece lies after the group.
                intervening_genes = get_intervening_genes(
                    gene_end, missed_start, seqid, org_bed, gene['accn'])
                if intervening_genes is False:
                    gene['locs'] = gene['locs'] + missed_row['locs']
                    gene['end'] = missed_end
                    if 'Os' in qaccn:
                        gene['accn'] = qaccn
                else:
                    grouped[(seqid, qaccn)] = missed_row
            else:
                # NOTE(review): start/end are left untouched in this branch
                # even if the new locs extend past them -- confirm intended.
                gene['locs'] = gene['locs'] + missed_row['locs']

        for row in grouped.values():
            row['locs'].sort()
            fh.write("{0}\n".format(Bed.row_string(row)))
示例#11
0
def write_bed(gene, merge_fh):
    """Append gene to merge_fh as one Bed.row_string line."""
    merge_fh.write("{0}\n".format(Bed.row_string(gene)))
示例#12
0
def write_bed(gene, merge_fh):
    """Append gene to merge_fh as one Bed.row_string line."""
    merge_fh.write("{0}\n".format(Bed.row_string(gene)))