Пример #1
0
def Augustus(tool_pred, genome):
    """Collect CDS predictions from an Augustus output file.

    A line is used only when it splits (on whitespace) into exactly 12
    fields and the third field contains "CDS". Fields 4 and 5 are read as
    the 1-based start and stop, field 7 as the strand.

    Args:
        tool_pred: Path of the Augustus prediction file to read.
        genome: Genome sequence the coordinates refer to.

    Returns:
        The result of ``sortORFs`` applied to an ``OrderedDict`` that maps
        ``"start,stop"`` to ``[strand, start_codon, stop_codon]``.

    Raises:
        ValueError: If a start or stop field of a CDS line is not an integer.
    """
    augustus_ORFs = collections.OrderedDict()
    genome_size = len(genome)
    genome_rev = revCompIterative(genome)
    with open(tool_pred, 'r') as Augustus_input:
        for line in Augustus_input:
            fields = line.split()
            if len(fields) != 12 or "CDS" not in fields[2]:
                continue
            start = int(fields[3])
            stop = int(fields[4])
            strand = fields[6]
            if '-' in strand:
                # Mirror the coordinates so the codons can be read from the
                # reverse-complement sequence.
                r_start = genome_size - stop
                r_stop = genome_size - start
                startCodon = genome_rev[r_start:r_start + 3]
                stopCodon = genome_rev[r_stop - 2:r_stop + 1]
            elif '+' in strand:
                startCodon = genome[start - 1:start + 2]
                stopCodon = genome[stop - 3:stop]
            else:
                # No usable strand: skip the record instead of crashing on
                # unbound codons or reusing the previous record's codons.
                continue
            po = str(start) + ',' + str(stop)
            augustus_ORFs[po] = [strand, startCodon, stopCodon]

    return sortORFs(augustus_ORFs)
Пример #2
0
def FGENESB(tool_pred, genome):
    """Collect gene predictions from an FGENESB output file.

    Only lines containing ">GENE" are considered. Such a line is used when
    it splits (on whitespace) into exactly 10 fields and the first field
    contains ">GENE". Field 3 is read as the start, field 5 as the stop and
    field 10 as the strand.

    Args:
        tool_pred: Path of the FGENESB prediction file to read.
        genome: Genome sequence the coordinates refer to.

    Returns:
        The result of ``sortORFs`` applied to an ``OrderedDict`` that maps
        ``"start,stop"`` to ``[strand, start_codon, stop_codon]``.

    Raises:
        ValueError: If a start or stop field of a gene line is not an integer.
    """
    FGENESB_ORFs = collections.OrderedDict()
    genome_size = len(genome)
    genome_rev = revCompIterative(genome)
    with open(tool_pred, 'r') as FGENESB_input:
        for line in FGENESB_input:
            if '>GENE' not in line:
                continue
            fields = line.split()
            if len(fields) != 10 or ">GENE" not in fields[0]:
                continue
            start = int(fields[2])
            stop = int(fields[4])
            strand = fields[9]
            if '-' in strand:
                # Mirror the coordinates so the codons can be read from the
                # reverse-complement sequence.
                r_start = genome_size - stop
                r_stop = genome_size - start
                startCodon = genome_rev[r_start:r_start + 3]
                stopCodon = genome_rev[r_stop - 2:r_stop + 1]
            elif '+' in strand:
                startCodon = genome[start - 1:start + 2]
                stopCodon = genome[stop - 3:stop]
            else:
                # No usable strand: skip the record instead of crashing on
                # unbound codons or reusing the previous record's codons.
                continue
            po = str(start) + ',' + str(stop)
            FGENESB_ORFs[po] = [strand, startCodon, stopCodon]

    return sortORFs(FGENESB_ORFs)
Пример #3
0
def GeneMark_HA(tool_pred, genome):
    """Collect CDS predictions from a GeneMark_HA output file.

    A line is used when it splits (on whitespace) into at least 10 fields
    and the sixth field contains "CDS". Fields 7 and 8 are read as the
    start and stop, field 10 as the strand.

    Args:
        tool_pred: Path of the GeneMark_HA prediction file to read.
        genome: Genome sequence the coordinates refer to.

    Returns:
        The result of ``sortORFs`` applied to an ``OrderedDict`` that maps
        ``"start,stop"`` to ``[strand, start_codon, stop_codon]``.

    Raises:
        ValueError: If a start or stop field of a CDS line is not an integer.
    """
    geneMark_HA_ORFs = collections.OrderedDict()
    genome_size = len(genome)
    genome_rev = revCompIterative(genome)
    with open(tool_pred, 'r') as GeneMark_HA_input:
        for line in GeneMark_HA_input:
            fields = line.split()
            # The strand is read from index 9, so at least 10 fields are
            # required; shorter lines are ignored rather than raising.
            if len(fields) < 10 or "CDS" not in fields[5]:
                continue
            start = int(fields[6])
            stop = int(fields[7])
            strand = fields[9]
            if '-' in strand:
                # Mirror the coordinates so the codons can be read from the
                # reverse-complement sequence.
                r_start = genome_size - stop
                r_stop = genome_size - start
                startCodon = genome_rev[r_start:r_start + 3]
                stopCodon = genome_rev[r_stop - 2:r_stop + 1]
            elif '+' in strand:
                startCodon = genome[start - 1:start + 2]
                stopCodon = genome[stop - 3:stop]
            else:
                # No usable strand: skip the record instead of crashing on
                # unbound codons or reusing the previous record's codons.
                continue
            po = str(start) + ',' + str(stop)
            geneMark_HA_ORFs[po] = [strand, startCodon, stopCodon]

    return sortORFs(geneMark_HA_ORFs)
Пример #4
0
def GLIMMER_3(tool_pred, genome):
    """Collect ORF predictions from a GLIMMER 3 output file.

    Lines containing ">" are ignored (so multiple contigs are not
    distinguished). A remaining line is used when it splits (on whitespace)
    into exactly 5 fields and the first field contains "orf". The fourth
    field carries the strand sign.

    GLIMMER 3 lists the start and stop of negative-strand ORFs in reversed
    order, so they are swapped back to sense-strand order for the key.

    Args:
        tool_pred: Path of the GLIMMER 3 prediction file to read.
        genome: Genome sequence the coordinates refer to.

    Returns:
        The result of ``sortORFs`` applied to an ``OrderedDict`` that maps
        ``"start,stop"`` to ``[strand, start_codon, stop_codon]``, where
        strand is normalised to ``'+'`` or ``'-'``.

    Raises:
        ValueError: If a coordinate field of an ORF line is not an integer.
    """
    GLIMMER_ORFs = collections.OrderedDict()
    genome_size = len(genome)
    genome_rev = revCompIterative(genome)
    with open(tool_pred, 'r') as glimmer_input:
        for line in glimmer_input:
            if '>' in line:  # Header line; this will not work with multiple contigs
                continue
            fields = line.split()
            if len(fields) != 5 or "orf" not in fields[0]:
                continue
            frame = fields[3]
            if '-' in frame:
                # Coordinates are listed stop-first on the negative strand.
                start = int(fields[2])
                stop = int(fields[1])
                strand = '-'
                r_start = genome_size - stop
                r_stop = genome_size - start
                startCodon = genome_rev[r_start:r_start + 3]
                stopCodon = genome_rev[r_stop - 2:r_stop + 1]
            elif '+' in frame:
                start = int(fields[1])
                stop = int(fields[2])
                strand = '+'
                # Three bases, matching the reverse-strand branch (the
                # previous slice end of start + 3 returned four).
                startCodon = genome[start - 1:start + 2]
                stopCodon = genome[stop - 3:stop]
            else:
                # No strand sign: skip the record instead of crashing on
                # unbound values or reusing the previous record's values.
                continue
            po = str(start) + ',' + str(stop)
            GLIMMER_ORFs[po] = [strand, startCodon, stopCodon]

    return sortORFs(GLIMMER_ORFs)
Пример #5
0
def GFF(tool_pred, genome):
    """Collect CDS features from a tab-separated GFF file.

    Lines containing "#" anywhere are ignored. A remaining line is split on
    tabs; it is used when its third column contains "CDS". Columns 4 and 5
    are read as the start and stop, column 7 as the strand.

    Args:
        tool_pred: Path of the GFF file to read.
        genome: Genome sequence the coordinates refer to.

    Returns:
        The result of ``sortORFs`` applied to an ``OrderedDict`` that maps
        ``"start,stop"`` to ``[strand, start_codon, stop_codon]``.

    Raises:
        SystemExit: If a CDS row does not have exactly 9 columns.
        ValueError: If a start or stop column of a CDS row is not an integer.
    """
    GFF_ORFs = collections.OrderedDict()
    genome_size = len(genome)
    genome_rev = revCompIterative(genome)
    with open(tool_pred, 'r') as gff_input:
        for line in gff_input:
            if '#' in line:
                continue
            fields = line.split('\t')
            # Check the length before indexing so blank or short lines are
            # skipped instead of raising IndexError.
            if len(fields) < 3 or "CDS" not in fields[2]:
                continue
            if len(fields) != 9:
                # Malformed CDS row: abort, as before.
                sys.exit("SAS")
            start = int(fields[3])
            stop = int(fields[4])
            strand = fields[6]
            if '-' in strand:
                # Mirror the coordinates so the codons can be read from the
                # reverse-complement sequence.
                r_start = genome_size - stop
                r_stop = genome_size - start
                startCodon = genome_rev[r_start:r_start + 3]
                stopCodon = genome_rev[r_stop - 2:r_stop + 1]
            elif '+' in strand:
                startCodon = genome[start - 1:start + 2]
                stopCodon = genome[stop - 3:stop]
            else:
                # No usable strand (e.g. '.'): skip the record instead of
                # crashing on unbound codons or reusing the previous
                # record's codons.
                continue
            po = str(start) + ',' + str(stop)
            GFF_ORFs[po] = [strand, startCodon, stopCodon]

    return sortORFs(GFF_ORFs)
Пример #6
0
def GeneMark(tool_pred, genome):
    """Collect ORF predictions from a GeneMark output file.

    Rows are whitespace-split lines with exactly 7 fields whose third field
    contains "direct" or "complement". The first two fields are read as the
    start and stop.

    GeneMark lists several candidate rows per ORF. A "complement" row is
    kept only if its start differs from the preceding row's start, and a
    "direct" row only if its stop differs from the preceding row's stop.
    The remembered coordinates are reset by an empty line once at least one
    7-field line has been seen. (The original author's note says this
    selects the longest ORF from each set.)

    Args:
        tool_pred: Path of the GeneMark prediction file to read.
        genome: Genome sequence the coordinates refer to.

    Returns:
        The result of ``sortORFs`` applied to an ``OrderedDict`` that maps
        ``"start,stop"`` to ``[strand, start_codon, stop_codon]``, where
        strand is ``'+'`` or ``'-'``.
    """
    geneMark_ORFs = collections.OrderedDict()
    genome_size = len(genome)
    genome_rev = revCompIterative(genome)
    last_start = 0
    last_stop = 0
    seen_row = False
    with open(tool_pred, 'r') as GeneMark_input:
        for raw in GeneMark_input:
            tokens = raw.split()
            if not tokens:
                # A blank line after the table has begun ends the current set.
                if seen_row:
                    last_stop = 0
                    last_start = 0
                continue
            if len(tokens) != 7:
                continue
            seen_row = True
            label = tokens[2]
            on_reverse = 'complement' in label
            if not on_reverse and 'direct' not in label:
                continue
            start = int(tokens[0])
            stop = int(tokens[1])
            key = str(start) + ',' + str(stop)
            if on_reverse:
                if start != last_start:
                    # Mirror the coordinates onto the reverse complement.
                    mirrored_start = genome_size - stop
                    mirrored_stop = genome_size - start
                    first_codon = genome_rev[mirrored_start:mirrored_start + 3]
                    last_codon = genome_rev[mirrored_stop - 2:mirrored_stop + 1]
                    geneMark_ORFs[key] = ['-', first_codon, last_codon]
            elif stop != last_stop:
                first_codon = genome[start - 1:start + 2]
                last_codon = genome[stop - 3:stop]
                geneMark_ORFs[key] = ['+', first_codon, last_codon]
            # Remember this row, kept or not, for the next comparison.
            last_start, last_stop = start, stop

    return sortORFs(geneMark_ORFs)