Exemplo n.º 1
0
def find_closest(peak, starts_plus, stops_minus, cds_dict, ann_dict, d_threshold, inside, if_bed):
    """Annotate a peak with its closest gene start, if one is close enough.

    Parameters
    ----------
    peak : interval-like
        ``peak.name`` is the peak position as an integer string.  When
        ``if_bed`` is false, ``peak.attrs`` must provide ``Name``,
        ``maxcov``, ``zscores`` and ``peakpos``.
    starts_plus, stops_minus : iterable of (gene_name, position)
        Candidate gene boundaries for the '+' and '-' strand.  The distance
        is ``position - center`` for '+' and ``center - position + 1`` for '-'.
    cds_dict : mapping gene_name -> object with ``start`` and ``end``
    ann_dict : mapping gene id -> (genesymbol, annotation, function)
        Looked up with the second "-"-separated field of the gene name.
    d_threshold : number
        Largest absolute distance that still counts as a hit.
    inside : number
        Candidates with a distance of ``-inside`` or lower are discarded.
    if_bed : bool
        Selects which peak fields are copied into the output attributes.

    Returns
    -------
    The interval built by ``construct_gff_interval``, or ``None`` when no
    candidate survives the ``inside`` filter or the closest one is farther
    than ``d_threshold``.
    """
    center = int(peak.name)
    candidates = [('+', x[0], x[1] - center) for x in starts_plus]
    candidates.extend(('-', x[0], center - x[1] + 1) for x in stops_minus)
    candidates = [x for x in candidates if x[2] > -1 * inside]

    # min() raises ValueError on an empty sequence; "nothing nearby" is
    # reported the same way as "too far away".
    if not candidates:
        return None

    strand, gene_name, mindistance = min(candidates, key=lambda x: abs(x[2]))
    if not (abs(mindistance) <= d_threshold):
        return None

    cds = cds_dict[gene_name]
    # strand is always '+' or '-' because of how candidates are built above.
    if strand == '+':
        atg = cds.start - center
    else:
        atg = center - cds.end + 1

    gene_name = gene_name.split("-")[1]
    ann = ann_dict.get(gene_name)
    if ann:
        genesymbol = ann[0]
        annotation = ann[1]
        function = ann[2]
    else:
        genesymbol = gene_name
        annotation = 'Unknown'
        function = "Unknown"

    if if_bed:
        peak_attrs = [("Name", peak.name)]
    else:
        peak_attrs = [("Name", peak.attrs['Name']), ("maxcov", peak.attrs['maxcov']), ("zscores", peak.attrs['zscores']), ("peakpos", peak.attrs['peakpos'])]

    gene_attrs = [("annotation", annotation), ("function", function), ("gene", gene_name), ("genesymbol", genesymbol), ("tss", mindistance), ("atg", abs(atg))]

    return construct_gff_interval(peak.chrom, peak.start, peak.stop, 'consensus_region', score='0', strand=strand, source='un', frame='.', attrs=peak_attrs + gene_attrs)
Exemplo n.º 2
0
def print_compiled(compiled, size):
    """Write one consensus interval for a group of peaks to stdout.

    Parameters
    ----------
    compiled : sequence of (index, peak) pairs
        ``index`` is looked up in ``range(size)``.  Each peak provides
        ``chrom``, ``start``, ``stop``, ``name`` (integer position as text)
        and the string attributes ``topcoverage`` and ``area_coverage``.
        The sequence is iterated more than once.
    size : int
        Number of slots in the comma-separated coverage attributes; slots
        without a peak are reported as '0'.

    The consensus position is the ``topcoverage``-weighted mean of the peak
    positions, truncated to an integer.  The region spans from the smallest
    start to the largest stop of the contributing peaks.

    Raises
    ------
    ValueError / ZeroDivisionError
        If no peak falls into ``range(size)`` or all ``topcoverage`` values
        are zero.
    """
    by_index = dict(compiled)
    slots = [by_index.get(i) for i in range(size)]

    area_coverage = ",".join(
        p.attrs['area_coverage'] if p else '0' for p in slots)
    topcoverage = ",".join(
        p.attrs['topcoverage'] if p else '0' for p in slots)
    present = [p for p in slots if p]

    # The chromosome is taken from the entry with the lowest index.
    chrom = min(compiled, key=lambda x: x[0])[1].chrom

    weights = [float(p.attrs['topcoverage']) for p in present]
    pos = int(
        sum(int(p.name) * w for p, w in zip(present, weights)) / sum(weights))

    start = min(p.start for p in present)
    # Fixed: the original took min() here as well, which made the consensus
    # region end at the earliest stop instead of covering all peaks.
    stop = max(p.stop for p in present)

    consensus = construct_gff_interval(chrom,
                                       start,
                                       stop,
                                       'consensus',
                                       score='0',
                                       strand='.',
                                       source='.',
                                       frame='.',
                                       attrs=[('Name', pos),
                                              ('topcoverage', topcoverage),
                                              ('area_coverage', area_coverage)
                                              ])

    sys.stdout.write(str(consensus))
Exemplo n.º 3
0
def annotate_position(peak, tr_dict, maxd, inside):
    """Annotate a peak with the closest transcript on the opposite strand.

    ``tr_dict`` maps a chromosome name to a ``(plus, minus)`` pair of
    transcript collections.  A peak on '+' is compared against the stops of
    the minus-strand transcripts; any other peak is compared against the
    starts of the plus-strand transcripts.  Candidates whose distance is
    ``-inside`` or lower are ignored.

    On success the annotation fields are written into ``peak.attrs`` (the
    peak is modified in place) and a new 'annotated' interval carrying those
    attributes is returned.  ``None`` is returned when the chromosome is
    unknown, no candidate remains, or the closest one is farther than
    ``maxd``.
    """
    if peak.chrom not in tr_dict:
        return None

    center = int(peak.name)
    plus_transcripts, minus_transcripts = tr_dict[peak.chrom]

    candidates = []
    if peak.strand == '+':
        for tr in minus_transcripts:
            candidates.append((tr, center - tr.stop + 1))
    else:
        for tr in plus_transcripts:
            candidates.append((tr, tr.start - center))

    candidates = [pair for pair in candidates if pair[1] > -1 * inside]
    if not candidates:
        return None

    transcript, mindistance = min(candidates, key=lambda pair: abs(pair[1]))
    if not (abs(mindistance) <= maxd):
        return None

    gtype = 'upstream'
    atg = mindistance + float(transcript.attrs['distance'])
    tr_attrs = transcript.attrs
    additions = [
        ("annotation", tr_attrs['annotation']),
        ("function", tr_attrs['function']),
        ("gene", transcript.name),
        ("genesymbol", tr_attrs['genesymbol']),
        ("cg", tr_attrs.get('cg', 'unknown')),
        ("tss", mindistance),
        ("atg", atg),
        ("gtype", gtype),
        ("anti", "1"),
    ]

    # Item-by-item assignment on the peak's own attribute mapping, so the
    # caller's peak carries the annotation as well.
    merged = peak.attrs
    for key, value in additions:
        merged[key] = value

    return construct_gff_interval(peak.chrom, peak.start, peak.stop, 'annotated', score=peak.score, strand=transcript.strand, source='annotate.py', frame='.', attrs=merged.items())
Exemplo n.º 4
0
def annotate_position(peak, tr_dict, maxd, inside):
    """Classify a peak as 'upstream', 'gene' or 'intergenic' and annotate it.

    Parameters
    ----------
    peak : interval-like
        ``peak.name`` is the peak position as an integer string.
    tr_dict : mapping chrom -> (plus_transcripts, minus_transcripts)
        Both members are lists (they are concatenated with ``+``).
    maxd : number
        Largest absolute distance to a transcript start that still counts
        as 'upstream'.
    inside : number
        Transcripts whose distance is ``-inside`` or lower are not
        considered as upstream candidates.

    Reads the module-level names ``STUB_TR`` (placeholder transcript) and
    ``if_bed`` (output mode).

    Returns
    -------
    A new 'annotated' interval when ``if_bed`` is true; otherwise the input
    ``peak`` with the annotation written into ``peak.attrs`` as strings.

    Compared with the previous version, a chromosome on which no transcript
    passes the ``inside`` filter no longer raises ``ValueError`` from
    ``min()``: the placeholder transcript is used instead, and the peak can
    still be classified as 'gene'.  A transcript without a ``cg`` attribute
    (such as the placeholder) is reported as 'unknown' instead of raising
    ``KeyError``.
    """
    center = int(peak.name)

    # Defaults describe a peak with no usable transcript nearby.
    mindistance = float("NaN")
    gtype = "intergenic"
    transcript = STUB_TR

    if peak.chrom in tr_dict:
        tr_plus, tr_minus = tr_dict[peak.chrom]
        distances = [(tr, tr.start - center) for tr in tr_plus]
        distances.extend((tr, center - tr.stop + 1) for tr in tr_minus)
        distances = [x for x in distances if x[1] > -1 * inside]
        if distances:
            transcript, mindistance = min(distances, key=lambda x: abs(x[1]))

        # A NaN distance compares false here and falls through to the
        # gene-body check.
        if abs(mindistance) <= maxd:
            gtype = 'upstream'
        else:
            pairs = [(tr, center - tr.start, tr.stop - center - 1)
                     for tr in tr_plus + tr_minus]
            pairs = [x for x in pairs if x[1] >= 0 and x[2] >= 0]
            if pairs:
                # The peak lies within the first overlapping transcript; the
                # distance is reported as a negative offset from its
                # strand-aware start.
                gtype = 'gene'
                transcript = pairs[0][0]
                if transcript.strand == '+':
                    mindistance = -1 * pairs[0][1]
                else:
                    mindistance = -1 * pairs[0][2]

    atg = mindistance + float(transcript.attrs['distance'])
    if str(atg) != "nan":
        atg = "%d" % atg

    attrs = [("Name", peak.name),
             ("annotation", transcript.attrs['annotation']),
             ("function", transcript.attrs['function']),
             ("gene", transcript.name),
             ("genesymbol", transcript.attrs['genesymbol']),
             ("cg", transcript.attrs.get('cg', 'unknown')),
             ("tss", mindistance),
             ("atg", atg), ("gtype", gtype)]

    if if_bed:
        return construct_gff_interval(peak.chrom,
                                      peak.start,
                                      peak.stop,
                                      'annotated',
                                      score=peak.score,
                                      strand=transcript.strand,
                                      source='annotate.py',
                                      frame='.',
                                      attrs=attrs)

    for attr_name, attr_value in attrs:
        peak.attrs[attr_name] = str(attr_value)
    return peak
Exemplo n.º 5
0
def merge(intervals, name):
    """Collapse several intervals into one 'interaction' interval.

    Parameters
    ----------
    intervals : non-empty sequence of interval-like objects
        Each provides ``chrom``, ``start``, ``stop``, ``score``, ``strand``
        and the attributes ``gap`` and ``chscore`` (``n_uniq`` is optional
        and counts as 1 when missing).
    name : value stored as the ``ID`` attribute of the result.

    The result takes chromosome and strand from the first interval, spans
    from the smallest start to the largest stop, and carries the highest
    score, the gap with the smallest absolute value, the highest chscore and
    the summed ``n_uniq``.

    Raises
    ------
    IndexError
        If ``intervals`` is empty.
    """
    first = intervals[0]
    chrom = first.chrom
    strand = first.strand

    start = min(int(x.start) for x in intervals)
    stop = max(int(x.stop) for x in intervals)
    score = str(max(float(x.score) for x in intervals))

    gap = str(min((int(x.attrs['gap']) for x in intervals), key=abs))
    unique_reads = sum(int(x.attrs.get('n_uniq', 1)) for x in intervals)
    # Compare chscore numerically: as text, "9.5" would rank above "10.2".
    # The winning value keeps its original textual form.
    chscore = str(max((x.attrs['chscore'] for x in intervals), key=float))

    return construct_gff_interval(chrom, start, stop, 'interaction', score=score, strand=strand, source='chiflex', frame='.', attrs=[("ID", name), ('gap', gap), ('chscore', chscore), ('n_uniq', unique_reads)])
Exemplo n.º 6
0
def find_closest(peak, starts_plus, stops_minus, pam_plus, pam_minus, d_threshold, inside):
    """Annotate a peak with its closest gene start and nearby PAM distances.

    Parameters
    ----------
    peak : interval-like
        ``peak.name`` is the peak position as an integer string.
    starts_plus, stops_minus : iterable of (gene_name, position)
        Candidate gene boundaries for the '+' and '-' strand.  The distance
        is ``position - center`` for '+' and ``center - position + 1`` for '-'.
    pam_plus, pam_minus
        Passed through unchanged to ``get_pams_around_peak``.
    d_threshold : number
        Largest absolute distance that still counts as a hit.
    inside : number
        Candidates with a distance of ``-inside`` or lower are discarded.

    Returns
    -------
    The 'consensus_region' interval, or ``False`` when no candidate survives
    the ``inside`` filter or the closest one is farther than ``d_threshold``.
    """
    center = int(peak.name)
    candidates = [('+', x[0], x[1] - center) for x in starts_plus]
    candidates.extend(('-', x[0], center - x[1] + 1) for x in stops_minus)
    candidates = [x for x in candidates if x[2] > -1 * inside]

    # min() raises ValueError on an empty sequence; report "no hit" the same
    # way the distance check below does.
    if not candidates:
        return False

    strand, gene_name, mindistance = min(candidates, key=lambda x: abs(x[2]))
    if not (abs(mindistance) <= d_threshold):
        return False

    pam_sense, pam_antisense = get_pams_around_peak(center, strand, pam_plus, pam_minus)
    pam_min = min(abs(pam_sense), abs(pam_antisense))

    return construct_gff_interval(peak.chrom, peak.start, peak.stop, 'consensus_region', score=str(peak.score), strand=strand, source='un', frame='.', attrs=[("Name", peak.name), ("tss", mindistance), ("gene", gene_name), ('pam_sense', pam_sense), ('pam_antisense', pam_antisense), ('pam_min', pam_min)])
Exemplo n.º 7
0
def area2interval(area, pflank, clear):
    """Turn a run of per-position records into a 'long_at' interval.

    Each record in ``area`` is indexed as ``[0]`` position, ``[1]`` peak,
    ``[2]`` distance, ``[3]`` score and ``[4]`` chromosome.  The interval
    runs from the first record's position to one past the last record's
    position and carries the highest score.  The absolute mean distance,
    truncated to an integer, decides the type: 'in' below ``pflank``, 'unk'
    below ``clear``, otherwise 'out'.
    """
    head, tail = area[0], area[-1]
    chrom, start, peak = head[4], head[0], head[1]
    stop = tail[0] + 1

    score = str(max(record[3] for record in area))
    mean_distance = np.mean([record[2] for record in area])
    distance = int(abs(mean_distance))

    _type = 'out'
    if distance < pflank:
        _type = 'in'
    elif distance < clear:
        _type = 'unk'

    attributes = [
        ("peak", peak),
        ("distance", distance),
        ("type", _type),
        ("ID", str((stop + start) // 2)),
    ]
    return construct_gff_interval(chrom, start, stop, 'long_at', score=score, strand='.', source='un', frame='.', attrs=attributes)
Exemplo n.º 8
0
def annotate_position(peak, tr_dict, maxd, inside):
    """Annotate a peak with its closest transcript and the closest one on the other strand.

    Parameters
    ----------
    peak : interval-like
        ``peak.name`` is the peak position as an integer string.
    tr_dict : mapping chrom -> (plus_transcripts, minus_transcripts)
    maxd : number
        Largest absolute distance to a transcript start that still counts
        as 'upstream'.
    inside : number
        Transcripts whose distance is ``-inside`` or lower are not
        considered as upstream candidates.

    Reads the module-level names ``STUB_TR`` (placeholder transcript),
    ``get_anti`` and ``if_bed`` (output mode).

    Returns
    -------
    A new 'annotated' interval when ``if_bed`` is true; otherwise the input
    ``peak`` with the annotation written into ``peak.attrs`` as strings and
    its strand set to the transcript's strand.

    Two changes compared with the previous version:

    * A chromosome with transcripts on the minus strand only is no longer
      treated as having no transcripts at all.
    * The plus and minus candidates are compared by absolute distance, the
      same criterion used to pick the best candidate within each strand.
      Comparing signed values preferred a transcript the peak lies inside
      of over a closer upstream one.
    """
    center = int(peak.name)
    tr_plus, tr_minus = tr_dict.get(peak.chrom, [None, None])
    tr_plus = tr_plus or []
    tr_minus = tr_minus or []

    if not tr_plus and not tr_minus:
        mindistance = float("NaN")
        gtype = "intergenic"
        transcript = STUB_TR
        anti = get_anti(STUB_TR, 'NaN')

    else:
        distances_plus = [(tr, tr.start - center) for tr in tr_plus]
        distances_minus = [(tr, center - tr.stop + 1) for tr in tr_minus]

        distances_plus = [x for x in distances_plus if x[1] > -1 * inside]
        distances_minus = [x for x in distances_minus if x[1] > -1 * inside]

        # A strand without candidates is represented by the placeholder
        # transcript at infinite distance, so it never wins the comparison
        # against a real candidate.
        if distances_plus:
            transcript_plus, mindistance_plus = min(distances_plus,
                                                    key=lambda x: abs(x[1]))
        else:
            transcript_plus, mindistance_plus = STUB_TR, float("inf")

        if distances_minus:
            transcript_minus, mindistance_minus = min(distances_minus,
                                                      key=lambda x: abs(x[1]))
        else:
            transcript_minus, mindistance_minus = STUB_TR, float("inf")

        # The winner annotates the peak; the loser is reported through the
        # anti_* attributes.
        if abs(mindistance_plus) < abs(mindistance_minus):
            transcript, mindistance = transcript_plus, mindistance_plus
            anti = get_anti(transcript_minus, mindistance_minus)
        else:
            transcript, mindistance = transcript_minus, mindistance_minus
            anti = get_anti(transcript_plus, mindistance_plus)

        if abs(mindistance) <= maxd:
            gtype = 'upstream'
        else:
            pairs = [(tr, center - tr.start, tr.stop - center - 1)
                     for tr in list(tr_plus) + list(tr_minus)]
            pairs = [x for x in pairs if x[1] >= 0 and x[2] >= 0]
            if pairs:
                # The peak lies within the first overlapping transcript; the
                # distance is reported as a negative offset from its
                # strand-aware start.
                gtype = 'gene'
                transcript = pairs[0][0]
                if transcript.strand == '+':
                    mindistance = -1 * pairs[0][1]
                else:
                    mindistance = -1 * pairs[0][2]
            else:
                gtype = 'intergenic'

    atg = mindistance + float(transcript.attrs['distance'])
    if str(atg) != "nan":
        atg = "%d" % atg

    attrs = [("Name", peak.name),
             ("annotation", transcript.attrs['annotation']),
             ("function", transcript.attrs['function']),
             ("gene", transcript.name),
             ("genesymbol", transcript.attrs['genesymbol']),
             ("cg", transcript.attrs.get('cg', 'unknown')),
             ("tss", mindistance), ("atg", atg), ("gtype", gtype)] + anti

    if if_bed:
        return construct_gff_interval(peak.chrom,
                                      peak.start,
                                      peak.stop,
                                      'annotated',
                                      score=peak.score,
                                      strand=transcript.strand,
                                      source='annotate.py',
                                      frame='.',
                                      attrs=attrs)

    for attr_name, attr_value in attrs:
        peak.attrs[attr_name] = str(attr_value)
    peak.strand = transcript.strand
    return peak
Exemplo n.º 9
0
                    nargs='?',
                    required=False,
                    type=str,
                    help="Path to the output plot directory")
args = parser.parse_args()

# Nothing to annotate in an empty input file: stop here.  sys.exit() with a
# string prints that string to stderr and exits with status 1.
if not os.path.getsize(args.path):
    sys.exit("###annotate\nInput file is empty, empty output is produced\n")

# Placeholder transcript used when no real transcript can be assigned to a
# peak.  Its 'distance' is "NaN", so values derived from it are NaN as well.
STUB_TR = construct_gff_interval(
    "unknown",
    0,
    2,
    'fake',
    score='0',
    strand="+",
    source='ff',
    frame='.',
    attrs=[
        ("Name", "fake"),
        ("annotation", "None"),
        ("function", "None"),
        ("genesymbol", "fake"),
        ("distance", "NaN"),
    ],
)

### Annotate genomically ###


def get_anti(tr, mind):
    """Return the three ``anti_*`` attribute pairs describing transcript ``tr``.

    The result is a list of ``(key, value)`` tuples holding the transcript
    name, its ``genesymbol`` attribute and ``mind`` converted to a string.
    """
    keys = ('anti_gene', 'anti_genesymbol', 'anti_tss')
    values = (tr.name, tr.attrs['genesymbol'], str(mind))
    return list(zip(keys, values))
Exemplo n.º 10
0
def clarify(i1, i2, breakpoint, antisense):
    """Rebuild two intervals with adjusted boundaries and strands.

    The gap is read from ``i1.attrs['gap']``.  Boundaries move as follows:

    * ``i1.strand == '+'``: the stop of ``i1`` grows by ``gap + breakpoint``
      and the start of ``i2`` moves forward by ``breakpoint``.
    * otherwise: the start of ``i1`` moves back by ``gap + breakpoint`` and
      the stop of ``i2`` shrinks by ``breakpoint``.

    Without ``antisense`` the result is ``(adjusted i1, adjusted i2)``, each
    keeping its own strand.  With ``antisense`` the order is swapped to
    ``(adjusted i2, adjusted i1)`` and both get the strand opposite to
    ``i1.strand``.
    """
    gap = int(i1.attrs['gap'])
    i1_is_plus = i1.strand == '+'

    def rebuild(template, span, strand):
        # Copy of `template` with new coordinates and strand; every other
        # field is carried over or fixed.
        return construct_gff_interval(chrom=template.chrom,
                                      start=span[0],
                                      stop=span[1],
                                      feature='ch',
                                      score=template.score,
                                      strand=strand,
                                      source='.',
                                      frame='.',
                                      attrs=template.attrs.items())

    if i1_is_plus:
        span1 = (i1.start, i1.stop + gap + breakpoint)
        span2 = (i2.start + breakpoint, i2.stop)
    else:
        span1 = (i1.start - gap - breakpoint, i1.stop)
        span2 = (i2.start, i2.stop - breakpoint)

    if antisense:
        flipped = '-' if i1_is_plus else '+'
        cs1 = rebuild(i2, span2, flipped)
        cs2 = rebuild(i1, span1, flipped)
    else:
        cs1 = rebuild(i1, span1, i1.strand)
        cs2 = rebuild(i2, span2, i2.strand)

    return cs1, cs2
Exemplo n.º 11
0
            
# Emit one GFF line per gene, combining the gene record with its companion
# (non-gene) record.
for (chrom, geneid), features in gene_id_dict.items():
    # Key 1 holds the feature whose type is 'gene', key 0 the other one; if
    # several features share a key, the last one wins.
    fd = {int(x.type == 'gene'): x for x in features}
    transcript = fd[0]
    gene = fd[1]

    genesymbol = gene.qualifiers.get('gene', [geneid])[0]
    # ';' separates attributes in the output, so it is replaced in free text.
    annotation = transcript.qualifiers['product'][0].replace(";", ' ')
    function = transcript.qualifiers.get('note', [''])[0].replace(";", ' ')
    # Fixed: the strand was read from `feature`, a name that is not bound by
    # this loop, so every gene received the strand of whatever feature that
    # name last referred to.
    strand = strand_dict[gene.location.strand]
    cds = "%d:%d" % (transcript.location.start, transcript.location.end)
    # Offset between the gene boundary and the companion feature boundary,
    # measured from the strand-aware start of the gene.
    if strand == '+':
        distance = transcript.location.start - gene.location.start
    else:
        distance = gene.location.end - transcript.location.end

    newint = construct_gff_interval(chrom, gene.location.start, gene.location.end, transcript.type, score='0', strand=strand, attrs=[('ID', geneid), ('genesymbol', genesymbol), ('annotation', annotation), ('function', function), ('cds', cds), ('distance', distance), ('tss_variants', '1')])
    sys.stdout.write(str(newint))
    
    
    
    #if('CDS' in [x.type for x in v]):
        
        #print()
        #print("-"*140)
        #print()
        #for el in v:
            #print(el)


#print(len(name2gene))

# Write one 'gene' line for every annotation record whose Parent is a known
# gene; records without a Parent, or with an unknown one, are skipped.
for ga in gene_annotation_list:
    parent_name = ga.attrs.get('Parent')
    if not parent_name:
        continue

    parent = name2gene.get(parent_name)
    if not parent:
        continue

    if parent.strand == '+':
        distance = ga.start - parent.start
    else:
        distance = parent.stop - ga.stop
    # NOTE(review): the strand-aware value computed above is discarded and 0
    # is always reported; confirm whether this override is intentional.
    distance = 0

    gene_attrs = [
        ('ID', parent.attrs['ID']),
        ('genesymbol', parent.attrs.get('gene', 'None')),
        ('annotation', ga.attrs.get('Note', 'None')),
        ('product', ga.attrs.get('product', 'None')),
        ('cds', "%d:%d" % (ga.start + 1, ga.stop)),
        ('tss_variants', '1'),
        ('distance', str(distance)),
    ]
    aint = construct_gff_interval(parent.chrom,
                                  parent.start,
                                  parent.stop,
                                  'gene',
                                  score='0',
                                  strand=parent.strand,
                                  source='ncbi_af',
                                  frame='.',
                                  attrs=gene_attrs)

    sys.stdout.write(str(aint))
Exemplo n.º 13
0
                    required=True,
                    type=str,
                    help="Path to the genome, fasta format")
# NOTE(review): with type=bool argparse calls bool() on the given string, so
# any non-empty value (including "False") enables gff output; a bare --gff
# uses const=True.
parser.add_argument(
    '--gff',
    nargs='?',
    default=False,
    const=True,
    type=bool,
    help="If set, output is in gff format",
)
args = parser.parse_args()

# Only the first record of the fasta file is used; every interval is cut
# from that sequence.
genome = next(SeqIO.parse(args.genome, 'fasta')).seq

for interval in BedTool(args.path):
    seq = str(genome[interval.start:interval.end].upper())

    if not args.gff:
        # Plain mode: the input fields followed by the sequence, tab-separated.
        print("\t".join(map(str, list(interval) + [seq])))
        continue

    gff_record = construct_gff_interval(interval.chrom,
                                        interval.start,
                                        interval.end,
                                        'un',
                                        score=interval.score,
                                        strand=interval.strand,
                                        source='un',
                                        frame='.',
                                        attrs=[('Name', interval.name),
                                               ('seq', seq)])
    sys.stdout.write(str(gff_record))
Exemplo n.º 14
0
    bacteria2prophages[prophage.bacteria].append(prophage)

# For every bacterium, report its prophages ordered by start.  Coordinates
# are also rescaled to a host of length 1000 ('host_start' / 'host_end');
# prophages whose rescaled range is not strictly inside (0, 1000) are
# written to stderr instead of stdout.
for bname, bannotation, blength in bacterial_annotation:
    ordered_prophages = sorted(bacteria2prophages[bname],
                               key=lambda x: x.start)
    for prophage in ordered_prophages:
        scale = 1000 / blength
        host_start = round(prophage.start * scale)
        host_end = round(prophage.end * scale) + 1

        prophage_attrs = [
            ('host_length', str(blength)),
            ('host_family', bannotation),
            ('Name',
             "_".join(str(x)
                      for x in (bname, prophage.start, prophage.end))),
            ('rlength',
             "%1.4f" % ((prophage.end - prophage.start) / blength)),
            ('integrase', str(prophage.integrase)),
            ('host_start', "%d" % host_start),
            ('host_end', "%d" % host_end),
        ]
        interval = construct_gff_interval(bname,
                                          prophage.start,
                                          prophage.end,
                                          'prophage',
                                          score='0',
                                          strand=prophage.strand,
                                          source='ovidiu',
                                          frame='.',
                                          attrs=prophage_attrs)

        if not (host_start > 0 and host_end < 1000):
            sys.stderr.write(
                "\nProphage region is outside bacterial borders\n%s" %
                interval)
        else:
            sys.stdout.write(str(interval))
Exemplo n.º 15
0


parser = argparse.ArgumentParser(
    description='Converts transcripts raw file into ncbi format')
parser.add_argument('--transcripts',
                    nargs='?',
                    required=True,
                    type=str,
                    help="Path to the transcripts, raw format")
parser.add_argument('--genome',
                    nargs='?',
                    required=True,
                    type=str,
                    help="Path to the genome, fasta format")
args = parser.parse_args()

# The chromosome name is the first line of the genome file with surrounding
# whitespace and its first character removed.
with open(args.genome) as genome_handle:
    chrom = next(genome_handle).strip()[1:]

with open(args.transcripts) as transcripts_handle:
    # The first line of the transcripts file is skipped.
    next(transcripts_handle)
    for line in transcripts_handle:
        # Tab-separated fields with quotes and semicolons removed; used
        # below as [0] id, [1] product, [2] start, [3] stop, [4] strand.
        fields = [
            cell.strip().replace('"', '').replace(';', '')
            for cell in line.strip().split("\t")
        ]
        gene_id = 'gene-%s' % fields[0]
        aint = construct_gff_interval(chrom,
                                      int(fields[2]),
                                      int(fields[3]),
                                      'gene',
                                      score='0',
                                      strand=fields[4],
                                      source='ncbi_af',
                                      frame='.',
                                      attrs=[('ID', gene_id),
                                             ('gene_biotype',
                                              'protein_coding'),
                                             ('Name', gene_id),
                                             ('product', fields[1])])
        sys.stdout.write(str(aint))
        
#453-Cg-phage-CL31_S7_L001_R1_001_contig_1       un      gene    2       2422    0       +       .       ID=gene-CL_1_Hyp.; gene_biotype=protein_coding; Name=gene-CL_1_Hyp.; product=Protein (A. faecalis)
# ['CL_53', 'Hyp. Protein (Corynebacterium)', '44425', '44796', '-']


#aint = construct_gff_interval(parent.chrom, parent.start, parent.stop, 'gene', score='0', strand=parent.strand, source='ncbi_af', frame='.', attrs=[('ID', parent.attrs['ID']), ('genesymbol', parent.attrs.get('gene', 'None')), ('annotation', ga.attrs.get('Note', 'None')), ('product', ga.attrs.get('product', 'None')), ('cds', "%d:%d" % (ga.start+1, ga.stop)), ('tss_variants', '1'), ('distance', str(distance))  ] );
    
    
    
    
    
    
    
    
Exemplo n.º 16
0
                    help="adfsa")
args = parser.parse_args()

if (os.stat(args.path).st_size != 0):
    coverage_dict = coverage2dict(args.coverage)
    convolution = list(coverage2dict(args.convolution).values())[0]
    for peak in BedTool(args.path):
        top = int(peak.name)
        topcoverage = coverage_dict[peak.chrom][top]
        total_coverage = coverage_dict[peak.chrom][peak.start:peak.stop]
        newint = construct_gff_interval(peak.chrom,
                                        peak.start,
                                        peak.stop,
                                        'binding_peak',
                                        score=peak.score,
                                        strand=peak.strand,
                                        source='un',
                                        frame='.',
                                        attrs=[("Name", peak.name),
                                               ("topcoverage", topcoverage),
                                               ("total_coverage",
                                                total_coverage)])
        if (max(total_coverage) >= 1.1 * topcoverage):
            fontsize = 24
            fig, ax1 = plt.subplots(figsize=(16, 9))
            plt.tight_layout(rect=[0.1, 0.1, 0.95, 0.95])

            ax1.plot(total_coverage, 'b-')
            ax1.plot(top - peak.start, topcoverage, 'r*', linewidth=5)
            ax1.set_xlabel("position (nt)", fontsize=fontsize)
            ax1.set_ylabel('coverage', color='b', fontsize=fontsize)
            ax1.tick_params('y', colors='b')
Exemplo n.º 17
0
    else:
        d = d_plus
        s_strand = '+'
        cs = cs_plus

    peak2genestarts[interval.name] = start2genes[cs], d, s_strand

# Write every peak as a 'peak' GFF line, combining the coverage statistics,
# the associated genes and the gene-start information that were stored per
# peak name.
for interval in peaks:
    topcoverage, ldrop, rdrop, start, end = peak2cov[interval.name]
    lg = peak2genes[interval.name]
    sg, dg, s_strand = peak2genestarts[interval.name]

    peak_attrs = [
        ('Name', interval.name),
        ('topcoverage', "%1.3f" % topcoverage),
        ('ldrop', "%1.3f" % ldrop),
        ('rdrop', "%1.3f" % rdrop),
        ('genes', ",".join(lg)),
        ('start_genes', ",".join(sg)),
        ('start_gene_distance', dg),
        ('start_gene_strand', s_strand),
    ]
    # The region comes from the stored coverage record, not from the
    # interval's own coordinates.
    anint = construct_gff_interval(interval.chrom,
                                   start,
                                   end,
                                   'peak',
                                   score=interval.score,
                                   strand=interval.strand,
                                   source='af_peak_detection',
                                   frame='.',
                                   attrs=peak_attrs)
    sys.stdout.write(str(anint))
#construct_gff_interval(chrom, start, stop, feature, score='0', strand='.', source='un', frame='.', attrs=[]);
Exemplo n.º 18
0
parser.add_argument(
    'path',
    metavar='N',
    nargs='?',
    type=str,
    help="Path to the genbank file",
)
args = parser.parse_args()

# Maps the numeric strand of a feature location to its GFF symbol.
strand_dict = {1: '+', -1: '-'}

#NC_003450.3     RefSeq  gene    4766    5302    .       +       .       ID=gene-NCgl0004;Dbxref=GeneID:1021292;Name=NCgl0004;gbkey=Gene;gene_biotype=protein_coding;gene_synonym=Cgl0005;locus_tag=NCgl0004

for seqrecord in SeqIO.parse(args.path, 'genbank'):
    chrom = seqrecord.name
    # NOTE(review): the record name is immediately replaced by a hard-coded
    # contig name; confirm whether this override is still wanted.
    chrom = '453-Cg-phage-CL31_S7_L001_R1_001_contig_1'
    for feature in seqrecord.features:
        if feature.type != 'CDS':
            continue

        # The label is "<id> <product words...>"; ';' is replaced because it
        # separates attributes in the output.
        label = feature.qualifiers['label'][0].replace(";", ":")
        label_id, _, product = label.partition(" ")
        geneid = "gene-%s" % label_id

        newint = construct_gff_interval(
            chrom,
            feature.location.start,
            feature.location.end,
            'gene',
            score='0',
            strand=strand_dict[feature.location.strand],
            attrs=[
                ('ID', geneid),
                ('gene_biotype', 'protein_coding'),
                ('Name', geneid),
                ('product', product),
            ])
        sys.stdout.write(str(newint))
Exemplo n.º 19
0
#NC_003888.3     RefSeq  gene    14044   14454   .       -       .       ID=gene-SCO0012;Dbxref=GeneID:1095452;Name=SCO0012;gbkey=Gene;gene_biotype=protein_coding;gene_synonym=SCJ30.07c;locus_tag=SCO0012

#NC_003888.3     RefSeq  CDS     14044   14454   .       -       0       ID=cds-NP_624373.1;Parent=gene-SCO0012;Dbxref=Genbank:NP_624373.1,GeneID:1095452;Name=NP_624373.1;Note=SCJ30.07c%2C unknown%2C len: 136 aa%3B predicted by GC Frameplot%2C Hidden Markov model and amino acid usage.;gbkey=CDS;locus_tag=SCO0012;product=hypothetical protein;protein_id=NP_624373.1;transl_table=11

# phiSco2_Bielefeld       Prodigal:2.6    CDS     1400    2647    .       -       0       ID=phiSco2_00003;Parent=phiSco2_00003_gene;inference=ab initio prediction:Prodigal:2.6,similar to AA sequence:Viruses2.aa:YP_009208303.1;locus_tag=phiSco2_00003;product=hypothetical protein SEA_AMELA_25

for interval in BedTool(args.path):
    if (interval[2] == 'CDS'):
        name = 'gene-' + interval.attrs['ID']
        gene = construct_gff_interval(interval.chrom,
                                      interval.start,
                                      interval.stop,
                                      'gene',
                                      score='0',
                                      strand=interval.strand,
                                      source='custom',
                                      frame='.',
                                      attrs=[('ID', name),
                                             ('gene_biotype', 'protein_coding')
                                             ])
        sys.stdout.write(str(gene))
        cds = construct_gff_interval(
            interval.chrom,
            interval.start,
            interval.stop,
            'CDS',
            score='0',
            strand=interval.strand,
            source='custom',
            frame='.',
Exemplo n.º 20
0
Arquivo: ncbi2cds.py Projeto: xjyx/afp
                    nargs='?',
                    type=str,
                    help="Path to the ncbi annotation file, gff format")
args = parser.parse_args()

# Most recent 'gene' record seen; each CDS takes its gene symbol from it.
# NOTE(review): a CDS that appears before any gene record fails on
# curgene.attrs because curgene is still None.
curgene = None
cds_list = []

for interval in BedTool(args.path):
    if interval[2] == 'gene':
        curgene = interval
    elif interval[2] == 'CDS':
        cds_attrs = [
            ('ID', interval.attrs['Parent']),
            ('genesymbol', curgene.attrs['Name']),
            ('annotation', interval.attrs.get('Note', 'None')),
            ('product', interval.attrs.get('product', 'None')),
        ]
        cds_list.append(
            construct_gff_interval(interval.chrom,
                                   interval.start,
                                   interval.stop,
                                   'cds',
                                   score='0',
                                   strand=interval.strand,
                                   source='.',
                                   frame='.',
                                   attrs=cds_attrs))

# Output is ordered by start coordinate only.
cds_list.sort(key=lambda x: x.start)
for aint in cds_list:
    sys.stdout.write(str(aint))