示例#1
0
def sanitize_gff_file(gff_fname,
                      in_memory=True,
                      in_place=False):
    """
    Sanitize a GFF file.

    Accepts either a plain GFF file or a gffutils database file; writes
    the sanitized gene records back into the source file (in_place) or
    to stdout.
    """
    if is_gff_db(gff_fname):
        # Already a gffutils database file: open it directly.
        db = gffutils.FeatureDB(gff_fname)
    elif in_memory:
        # Plain GFF file: build a throwaway in-memory database.
        db = gffutils.create_db(gff_fname, ":memory:", verbose=False)
    else:
        # Plain GFF file: use an on-disk database.
        db = get_gff_db(gff_fname)

    # Choose the output target: overwrite the source file or stream to stdout.
    if in_place:
        gff_out = gffwriter.GFFWriter(gff_fname, in_place=in_place)
    else:
        gff_out = gffwriter.GFFWriter(sys.stdout)

    sanitized_db = sanitize_gff_db(db)
    for gene_rec in sanitized_db.all_features(featuretype="gene"):
        gff_out.write_gene_recs(sanitized_db, gene_rec.id)
    gff_out.close()
示例#2
0
def add_EVM(final_update, wd, consensus_mapped_gff3):
    """
    Merge EVM/PASA models with GMAP consensus-mapped models.

    Writes the genes of the mRNAs from *final_update* whose IDs are not
    already present among the GMAP models in *consensus_mapped_gff3*
    (children first, then the gene record), followed by every GMAP gene,
    into a temporary GFF3 file created in *wd*.

    Returns the path of the temporary GFF3 file.
    """
    db_evm = gffutils.create_db(final_update, ':memory:', merge_strategy='create_unique', keep_order=True)
    ids_evm = [gene.attributes["ID"][0] for gene in db_evm.features_of_type("mRNA")]

    db_gmap = gffutils.create_db(consensus_mapped_gff3, ':memory:', merge_strategy='create_unique', keep_order=True)
    ids_gmap_full = [gene.attributes["ID"][0] for gene in db_gmap.features_of_type("gene")]
    # GMAP gene IDs carry a "_<n>" suffix; strip it so they can be compared
    # against the EVM mRNA IDs.  A set makes each membership test O(1)
    # instead of scanning a list per query.
    ids_gmap = {gene.attributes["ID"][0].split("_")[0] for gene in db_gmap.features_of_type("gene")}

    uniq_evm = [evm for evm in ids_evm if evm not in ids_gmap]

    # Collect the parent (gene) IDs of the EVM mRNAs that GMAP lacks.
    mRNA = []
    for evm in uniq_evm:
        for line in db_evm.parents(evm, order_by='start'):
            mRNA.append(line.attributes["ID"][0])
    mRNA_uniq = list(set(mRNA))
    outfile = tempfile.NamedTemporaryFile(delete=False, prefix="additional.1.", suffix=".gff3", dir=wd)
    gff_out_s = gffwriter.GFFWriter(outfile.name)

    # Write the unique EVM genes (children first, then the record itself),
    # then every GMAP gene in the same layout.
    for name in mRNA_uniq:
        for i in db_evm.children(name, order_by='start'):
            gff_out_s.write_rec(i)
        gff_out_s.write_rec(db_evm[name])
    for name in ids_gmap_full:
        for i in db_gmap.children(name, order_by='start'):
            gff_out_s.write_rec(i)
        gff_out_s.write_rec(db_gmap[name])
    gff_out_s.close()

    return outfile.name
示例#3
0
def add_removed_evm(pasa, exon, wd):
    """
    Prepare the clusters of sequences that come from the same locus.

    Collects the genes of the PASA mRNAs (from *pasa*) whose IDs are not
    present among the mRNAs of *exon* (GMAP models), and writes them --
    followed by every GMAP gene -- into a temporary GFF3 file in *wd*.

    Returns the path of the temporary GFF3 file.
    """
    db_evm = gffutils.create_db(pasa, ':memory:', merge_strategy='create_unique', keep_order=True)
    ids_evm = [gene.attributes["ID"][0] for gene in db_evm.features_of_type("mRNA")]

    db_gmap = gffutils.create_db(exon, ':memory:', merge_strategy='create_unique', keep_order=True)
    ids_gmap_full = [gene.attributes["ID"][0] for gene in db_gmap.features_of_type("gene")]
    # GMAP mRNA IDs carry a "_<n>" suffix; strip it before comparison.
    # A set makes each membership test O(1) instead of O(n) per query.
    ids_gmap = {mrna.attributes["ID"][0].split("_")[0] for mrna in db_gmap.features_of_type("mRNA")}

    uniq_evm = [evm for evm in ids_evm if evm not in ids_gmap]
    # Map the unique mRNAs back to their parent gene IDs and de-duplicate.
    uniq_gene = [gene.attributes["ID"][0] for mrna in uniq_evm for gene in db_evm.parents(mrna)]
    uniq = list(set(uniq_gene))

    outfile = tempfile.NamedTemporaryFile(delete=False, prefix="additional.", suffix=".gff3", dir=wd)
    gff_out_s = gffwriter.GFFWriter(outfile.name)

    # Children first, then the gene record, for both sources.
    for name in uniq:
        for i in db_evm.children(name, order_by='start'):
            gff_out_s.write_rec(i)
        gff_out_s.write_rec(db_evm[name])
    for name in ids_gmap_full:
        for i in db_gmap.children(name, order_by='start'):
            gff_out_s.write_rec(i)
        gff_out_s.write_rec(db_gmap[name])
    gff_out_s.close()

    return outfile.name
示例#4
0
def longest_cds(gff_file, gff_filerc, verbose, wd, filename):
    """
    Compare each mRNA's CDS layout between the forward annotation
    (*gff_file*) and its reverse-complement counterpart (*gff_filerc*),
    and write whichever version wins to *filename*.

    Returns *filename*.
    """
    db = gffutils.create_db(gff_file,
                            ':memory:',
                            merge_strategy='create_unique',
                            keep_order=True,
                            transform=transform)
    dbrc = gffutils.create_db(gff_filerc,
                              ':memory:',
                              merge_strategy='create_unique',
                              keep_order=True,
                              transform=transform)
    list_mrna = [rec.attributes["ID"][0] for rec in db.features_of_type('mRNA')]
    list_mrna_rc = [rec.attributes["ID"][0] for rec in dbrc.features_of_type('mRNA')]

    winners_fwd = []
    winners_rc = []
    for mrna_id in set(list_mrna + list_mrna_rc):
        fwd_lens = [
            int(c.end) - int(c.start)
            for c in db.children(mrna_id, featuretype='CDS', order_by='start')
        ]
        rc_lens = [
            int(c.end) - int(c.start)
            for c in dbrc.children(mrna_id, featuretype='CDS', order_by='start')
        ]
        # Lists compare lexicographically; ties go to the forward set,
        # so ">=" reproduces the original equal/greater branches exactly.
        if fwd_lens >= rc_lens:
            winners_fwd.append(mrna_id)
        else:
            winners_rc.append(mrna_id)

    gff_out = gffwriter.GFFWriter(filename)

    def _dump(src_db, mrna_id):
        # Emit CDS records, the mRNA itself, its gene parent(s), then exons.
        for rec in src_db.children(mrna_id, featuretype='CDS', order_by='start'):
            gff_out.write_rec(rec)
        gff_out.write_rec(src_db[mrna_id])
        for rec in src_db.parents(mrna_id, featuretype='gene', order_by='start'):
            gff_out.write_rec(rec)
        for rec in src_db.children(mrna_id, featuretype='exon', order_by='start'):
            gff_out.write_rec(rec)

    for mrna_id in winners_fwd:
        if mrna_id in list_mrna:
            _dump(db, mrna_id)
    for mrna_id in winners_rc:
        if mrna_id in list_mrna_rc:
            _dump(dbrc, mrna_id)
    gff_out.close()
    if verbose:
        print(filename)
    return filename
示例#5
0
def genename_last(gff_filename, prefix, verbose, wd, dict_ref_name, step):
    """
    Produce the final annotation GFF3 with original reference names.

    Tidies *gff_filename* with GT_GFF3, then rewrites every mRNA (CDS,
    mRNA record, its gene parent once, exons) mapping each record's
    chromosome through *dict_ref_name* when present.  The rewritten file
    is passed through GT_RETAINID into "Final.evm.update.gff3" when
    "pasa" is in *step* and/or "Final.LoReAn.update.gff3" when "lorean"
    is in *step* (if both match, both files are written and the lorean
    path is returned).

    Returns the path of the last output file written.
    NOTE(review): if *step* contains neither "pasa" nor "lorean",
    `out_name` is never bound and the final `return` raises -- confirm
    callers always pass one of the two.
    """
    global prefix_name
    prefix_name = prefix
    out = tempfile.NamedTemporaryFile(delete=False, mode="w", dir=wd)
    err = tempfile.NamedTemporaryFile(delete=False, mode="w")
    gt_com = GT_GFF3 % gff_filename
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % gt_com)
    gt_call = subprocess.Popen(gt_com, stdout=out, stderr=err, shell=True)
    gt_call.communicate()

    db1 = gffutils.create_db(out.name, ':memory:', merge_strategy='create_unique', keep_order=True, transform=transform_name)
    list_mrna = [mRNA.attributes["ID"][0] for mRNA in db1.features_of_type('mRNA')]
    out_gff = tempfile.NamedTemporaryFile(delete=False, prefix="gffread", suffix=".gff3", dir=wd)
    gff_out = gffwriter.GFFWriter(out_gff.name)
    # Gene IDs already written; a set gives O(1) membership tests
    # (was a list, scanned linearly for every mRNA).
    gene_name = set()
    for evm in list_mrna:
        for i in db1.children(evm, featuretype='CDS', order_by='start'):
            if i.chrom in dict_ref_name:
                i.chrom = dict_ref_name[i.chrom]
            gff_out.write_rec(i)
        i = db1[evm]
        if i.chrom in dict_ref_name:
            i.chrom = dict_ref_name[i.chrom]
        gff_out.write_rec(i)
        for i in db1.parents(evm, featuretype='gene', order_by='start'):
            if i.chrom in dict_ref_name:
                i.chrom = dict_ref_name[i.chrom]
            id_gene = i.attributes['ID'][0]
            # Write each gene only once even if it has several mRNAs.
            if id_gene not in gene_name:
                gff_out.write_rec(i)
                gene_name.add(id_gene)
        for i in db1.children(evm, featuretype='exon', order_by='start'):
            if i.chrom in dict_ref_name:
                i.chrom = dict_ref_name[i.chrom]
            gff_out.write_rec(i)
    gff_out.close()
    if "pasa" in step:
        out_name = os.path.join(wd, "Final.evm.update.gff3")
        with open(out_name, "w") as fh:
            gt_com = GT_RETAINID % out_gff.name
            if verbose:
                sys.stderr.write('Executing: %s\n\n' % gt_com)
            gt_call = subprocess.Popen(gt_com, stdout=fh, stderr=err, shell=True)
            gt_call.communicate()
    if "lorean" in step:
        out_name = os.path.join(wd, "Final.LoReAn.update.gff3")
        with open(out_name, "w") as fh:
            gt_com = GT_RETAINID % out_gff.name
            if verbose:
                sys.stderr.write('Executing: %s\n\n' % gt_com)
            gt_call = subprocess.Popen(gt_com, stdout=fh, stderr=err, shell=True)
            gt_call.communicate()
    if verbose:
        print(out_name)
    return out_name
示例#6
0
def test_gffwriter():
    """
    Test GFFWriter: in-place rewriting of an existing file and writing
    to a brand-new file, checking for the gffutils "#GFF3" header.
    """
    print("Testing GFF writer..")
    fn = gffutils.example_filename("unsanitized.gff")
    # Make a copy of it as temporary named file
    temp_f = tempfile.NamedTemporaryFile(delete=False)
    temp_fname_source = temp_f.name
    shutil.copy(fn, temp_fname_source)
    # Now write file in place
    # (use a context manager -- the original leaked the file handle)
    with open(temp_fname_source, "r") as source_fh:
        source_first_line = source_fh.readline().strip()
    assert (not source_first_line.startswith("#GFF3")), \
           "unsanitized.gff should not have a gffutils-style header."
    db_in = gffutils.create_db(fn, ":memory:", keep_order=True)
    # Fetch first record
    rec = six.next(db_in.all_features())
    ##
    ## Write GFF file in-place test
    ##
    print("Testing in-place writing")
    gff_out = gffwriter.GFFWriter(temp_fname_source,
                                  in_place=True,
                                  with_header=True)
    gff_out.write_rec(rec)
    gff_out.close()
    # Ensure that the file was written with header
    with open(temp_fname_source, "r") as rewritten:
        new_header = rewritten.readline().strip()
    assert new_header.startswith("#GFF3"), \
           "GFFWriter serialized files should have a #GFF3 header."
    print("  - Wrote GFF file in-place successfully.")
    ##
    ## Write GFF file to new file test
    ##
    print("Testing writing to new file")
    new_file = tempfile.NamedTemporaryFile(delete=False)
    gff_out = gffwriter.GFFWriter(new_file.name)
    gff_out.write_rec(rec)
    gff_out.close()
    with open(new_file.name, "r") as new_fh:
        new_line = new_fh.readline().strip()
    assert new_line.startswith("#GFF3"), \
           "GFFWriter could not write to a new GFF file."
    print("  - Wrote to new file successfully.")
def output_combined_gff_events(sg_gff_fname,
                               sg_events,
                               new_gff_fname,
                               output_gff_fname,
                               genome,
                               sg_label="sg2008",
                               source_attr="source"):
    """
    Output the given events from sg_gff_fname and all of
    the entries from new_gff_fname into a single file.

    Mark SG events with sg_label (stored under *source_attr* on each
    SG record).  The merged file is then sanitized via gffutils-cli
    and annotated with gffutils_helpers.annotate_gff.

    Raises Exception if the sanitize step exits non-zero.
    """
    gff_out_file = open(output_gff_fname, "w")
    gff_out = gffwriter.GFFWriter(gff_out_file)
    # Load up gffutils databases for SG and new events
    new_db = gffutils.create_db(new_gff_fname, ":memory:", verbose=False)
    sg_db = gffutils.create_db(sg_gff_fname, ":memory:", verbose=False)
    new_gff_genes = new_db.features_of_type("gene")
    # Output new events first, remembering their IDs so duplicate
    # SG events can be skipped below.
    new_ids = {}
    for gene_rec in new_gff_genes:
        gene_id = gene_rec.id
        gff_out.write_gene_recs(new_db, gene_id)
        new_ids[gene_id] = True
    # Output SG events
    for sg_gene_id in sg_events:
        if sg_gene_id in new_ids:
            # If this has an identical ID to one of the new annotation
            # events, skip it.  (print() call form: the original used
            # Python-2-only print statements.)
            print("Skipping %s" % (sg_gene_id))
            continue
        # Get all SG event records
        sg_recs = get_event_gff_recs(sg_gene_id, sg_db)
        # Add source attribute to each record
        for rec in sg_recs:
            rec.attributes[source_attr] = [sg_label]
            gff_out.write_rec(rec)
    gff_out.close()
    # Sanitize the file
    sanitize_cmd = \
        "gffutils-cli sanitize %s --in-memory --in-place" % (output_gff_fname)
    print("Sanitizing merged GFF...")
    ret_val = os.system(sanitize_cmd)
    if ret_val != 0:
        # "raise Exception, msg" is Python-2-only syntax; the call form
        # works on both Python 2 and 3.
        raise Exception("Sanitization failed on %s" % (output_gff_fname))
    # Annotate the file
    print("Annotating merged GFF...")
    gffutils_helpers.annotate_gff(output_gff_fname, genome)
示例#8
0
def pep_seq(myFasta, final_evm):
    """
    Translate each mRNA's CDS into protein and rewrite the annotation,
    keeping one representative mRNA per unique protein sequence.

    :param myFasta: path to the genome FASTA used to extract CDS sequence
    :param final_evm: input GFF3 file; output is written to
        ``final_evm + ".gff3"``
    :return: path of the output GFF3 file
    """
    # Keyed by translated protein string -> [SeqRecord, mRNA feature];
    # identical proteins therefore collapse to a single entry.
    fasta = {}
    db = gffutils.create_db(final_evm, ':memory:', merge_strategy="create_unique", keep_order=True)
    gff_file = final_evm + ".gff3"
    gff_out = gffwriter.GFFWriter(gff_file)
    for t in db.features_of_type('mRNA', order_by='start'):
        position = []
        seq_combined = ''
        j = 0
        for i in db.children(t, featuretype='CDS', order_by='start'):
            j += 1
            if j == 1:
                # Phase (GFF column 8) of the first CDS in file order.
                pphase = i[7]
            seq = i.sequence(myFasta, use_strand=False)
            seq_combined += seq
            position = position + [i.start,i.stop]
        # NOTE(review): if an mRNA has no CDS children, `pphase` and `i`
        # are stale from the previous iteration (or unbound on the first)
        # -- confirm inputs always contain CDS features.
        seq_combined = SeqRecord(Seq(seq_combined, generic_dna))
        #print(t.attributes["ID"][0])
        #print(seq_combined.seq)
        if t.strand == '-':
            # On the minus strand the transcript's first CDS is the last
            # one in file order, so take the phase from the final CDS.
            pphase = i[7]
            seq_combined = seq_combined.reverse_complement()
        # Shift the reading frame according to the phase before translating.
        # NOTE(review): an unexpected phase value would leave `seq_transl`
        # unbound -- presumably inputs only ever carry "0", "1", "2" or ".".
        if pphase == "0" or pphase == ".":
            seq_transl = seq_combined.translate()
        elif pphase == "1":
            seq_transl = seq_combined[1:].translate()
        elif pphase == "2":
            seq_transl = seq_combined[2:].translate()
        seq_transl.description = position
        seq_transl.id = t.id
        # Shrink the mRNA span to the extremes of its CDS coordinates.
        position = sorted(position)
        t.start = position[0]
        t.stop = position[-1]
        fasta[str(seq_transl.seq)] = [seq_transl, t]
    count = 0
    for key in fasta:
        for i in db.parents(fasta[key][1], featuretype='gene', order_by='start'):
            gff_out.write_rec(i)
            # NOTE(review): these assignments happen AFTER the record is
            # written, so they never reach the output -- confirm whether
            # the gene span was meant to be updated before write_rec.
            i.start = fasta[key][1].start
            i.stop = fasta[key][1].stop
        gff_out.write_rec(fasta[key][1])
        for i in db.children(fasta[key][1], featuretype='CDS', order_by='start'):
            gff_out.write_rec(i)
        # Re-emit each CDS a second time as an exon with a fresh unique ID.
        for i in db.children(fasta[key][1], featuretype='CDS', order_by='start'):
            count += 1
            i.featuretype="exon"
            if "ID" in i.attributes:
                i.attributes["ID"][0] = i.attributes["ID"][0] + "-" + str(count)
            else:
                i.attributes["ID"] = ["exon" + "-" + str(count)]
            gff_out.write_rec(i)
    return (gff_file)
示例#9
0
def genename_lorean(gff_filename, verbose, wd):
    """
    Assign unique IDs to LoReAn/PASA models.

    Runs gffread (GFFREAD_M) on *gff_filename*, tidies/sorts the result
    with `gt gff3`, rewrites each gene followed by its children via
    gffutils (transform_cds), then tidies/sorts once more.

    Returns the path of the final GFF3 file.
    """
    outfile_gff = tempfile.NamedTemporaryFile(delete=False, prefix="additional.", suffix=".gff3", dir=wd)
    log = tempfile.NamedTemporaryFile(delete=False, prefix="uniq.ID.pasa.", suffix=".log", dir=wd)
    err = tempfile.NamedTemporaryFile(delete=False, prefix="uniq.ID.pasa.", suffix=".err", dir=wd)

    cmd = GFFREAD_M % (outfile_gff.name, gff_filename)
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % cmd)
    gffread = subprocess.Popen(cmd, cwd=wd, shell=True, stdout=log, stderr=err)
    gffread.communicate()

    out_final = tempfile.NamedTemporaryFile(delete=False, mode="w", prefix="gt_gff3.", suffix=".gff3", dir=wd)
    log = tempfile.NamedTemporaryFile(delete=False, mode="w", suffix=".log", dir=wd)
    err = tempfile.NamedTemporaryFile(delete=False, mode="w", dir=wd, suffix=".last.gt_gff3.err")

    gt_com = 'gt gff3 -retainids -sort -force -tidy -o %s %s' % (out_final.name, outfile_gff.name)
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % gt_com)
    gt_call = subprocess.Popen(gt_com, stdout=log, stderr=err, shell=True)
    gt_call.communicate()

    db_gffread = gffutils.create_db(out_final.name, ':memory:', merge_strategy='create_unique', keep_order=True, transform=transform_cds)
    outfile_out = tempfile.NamedTemporaryFile(delete=False, prefix="uniq.ID.pasa.final.", suffix=".gff3", dir=wd)
    gff_out_s = gffwriter.GFFWriter(outfile_out.name)

    # Each gene record first, then all of its children ordered by start.
    for gene in db_gffread.features_of_type("gene"):
        gff_out_s.write_rec(db_gffread[gene])
        for i in db_gffread.children(gene, order_by='start'):
            gff_out_s.write_rec(i)
    # BUGFIX: close (and flush) the writer before `gt` re-reads the file;
    # the original never closed it, risking truncated/buffered output.
    gff_out_s.close()

    if verbose:
        print(outfile_out.name)

    out_1 = tempfile.NamedTemporaryFile(delete=False, mode="w", prefix="gt_gff3.", suffix=".gff3", dir=wd)
    log = tempfile.NamedTemporaryFile(delete=False, mode="w", suffix=".log", dir=wd)
    err = tempfile.NamedTemporaryFile(delete=False, mode="w", dir=wd, suffix=".last.gt_gff3.err")

    gt_com = 'gt gff3 -retainids -sort -force -tidy -o %s %s' % (out_1.name, outfile_out.name)
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % gt_com)
    gt_call = subprocess.Popen(gt_com, stdout=log, stderr=err, shell=True)
    gt_call.communicate()
    return out_1.name
示例#10
0
def exonerate(ref, gff_file, proc, wd, verbose):
    """
    Recover complete ORFs for incomplete gene models using exonerate.

    Extracts exon and protein FASTA from *gff_file* with gffread, finds
    the longest ATG-started ORF (forward and reverse-complement) for
    proteins that do not begin with "M", realigns those candidates to
    their genomic loci with exonerate in a worker pool, and writes the
    models that were already complete (or not recovered) straight back
    out.  All per-model GFF outputs are concatenated into
    ``wd + '/oriented.oldname.gff3'``, whose path is returned.
    """

    # Suppress Biopython warnings raised while probing for the longest
    # protein; if Biopython's behaviour changes, this could hide real problems.

    warnings.filterwarnings("ignore")

    exon_file_out = gff_file + ".exons.fasta"
    prot_file_out = gff_file + ".prot.fasta"
    errorFile = gff_file + ".gffread_err.log"
    logFile = exon_file_out + "gffread_log.log"
    # gffread: write exon and protein FASTA for every model in gff_file.
    com = GFFREAD_W % (ref, exon_file_out, prot_file_out, gff_file)
    fasta_file_outfile = open(logFile, "w")
    errorFilefile = open(errorFile, "w")
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % com)
    call = subprocess.Popen(com,
                            stdout=fasta_file_outfile,
                            cwd=wd,
                            stderr=errorFilefile,
                            shell=True)
    call.communicate()
    fasta_file_outfile.close()
    errorFilefile.close()

    listComplete = []       # protein IDs that already start with "M"
    dictIncomplete = {}     # protein IDs that need ORF recovery
    dictFastaProt = {}      # id -> record holding the best recovered ORF
    longestProt = []
    listTotal = []          # every protein ID seen
    listSingleExons = []
    testDict = {}           # NOTE(review): never used in this function

    # Split proteins into complete (start with methionine) and incomplete.
    for record in SeqIO.parse(prot_file_out, "fasta"):
        listTotal.append(record.id)
        if record.seq.startswith("M"):  # and record.seq.endswith("."):
            listComplete.append(record.id)
        else:
            dictIncomplete[record.id] = record.id
    # For each incomplete model, scan its exon sequence (both strands)
    # for the longest ORF beginning at an ATG.
    for record in SeqIO.parse(exon_file_out, "fasta"):
        listFields = record.description.split(' ')
        for elem in listFields:
            if elem.startswith('exons'):
                exonNumber = elem.split(",")
                ## changed here for all genes
                if (len(exonNumber)) > 0:
                    listSingleExons.append(record.id)
                    if record.id in dictIncomplete:
                        newrecord = record.reverse_complement()
                        # --- forward-strand ORF scan ---
                        input_seq = str(record.seq)
                        startP = re.compile('ATG')
                        nuc = input_seq.replace('\n', '')
                        longest = (0, )
                        for m in startP.finditer(nuc):
                            if len(
                                    Seq.Seq(nuc)[m.start():].translate(
                                        to_stop=True)) > longest[0]:
                                pro = Seq.Seq(nuc)[m.start():].translate(
                                    to_stop=True)
                                longest = [
                                    len(pro),
                                    m.start(),
                                    str(pro),
                                    nuc[m.start():m.start() + len(pro) * 3 + 3]
                                ]
                                # NOTE(review): both branches below are
                                # identical except that only the first
                                # updates record.seq; len(longest) is
                                # always 4 here -- the else looks dead.
                                if len(longest) == 4:
                                    record.seq = Seq.Seq(longest[2])
                                    dictFastaProt[record.id] = record
                                else:
                                    dictFastaProt[record.id] = record
                        # --- reverse-complement ORF scan; only replaces
                        # the forward hit when the new ORF is longer ---
                        input_seq = str(newrecord.seq)
                        startP = re.compile('ATG')
                        nuc = input_seq.replace('\n', '')
                        longest = (0, )
                        for m in startP.finditer(nuc):
                            if len(
                                    Seq.Seq(nuc)[m.start():].translate(
                                        to_stop=True)) > longest[0]:
                                pro = Seq.Seq(nuc)[m.start():].translate(
                                    to_stop=True)
                                longest = [
                                    len(pro),
                                    m.start(),
                                    str(pro),
                                    nuc[m.start():m.start() + len(pro) * 3 + 3]
                                ]
                                if len(longest) == 4:
                                    if record.id in dictFastaProt:
                                        if (len((dictFastaProt[record.id]).seq)
                                            ) < (len(longest[2])):
                                            record.seq = Seq.Seq(longest[2])
                                            dictFastaProt[record.id] = record
                                    # NOTE(review): this elif duplicates the
                                    # enclosing `len(longest) == 4` test, so
                                    # the trailing else is unreachable.
                                    elif len(longest) == 4:
                                        record.seq = Seq.Seq(longest[2])
                                        dictFastaProt[record.id] = record
                                    else:
                                        dictFastaProt[record.id] = record

    # Write all recovered proteins to a single FASTA.
    for mod in dictFastaProt:
        longestProt.append(dictFastaProt[mod])
    prot_file_out_mod = prot_file_out + ".mod.fasta"
    SeqIO.write(longestProt, prot_file_out_mod, "fasta")

    # Worker pool for the exonerate realignments; the queue tracks progress.
    manage = Manager()
    queue = manage.Queue()
    pool = Pool(processes=int(proc), maxtasksperchild=10000)

    commandList = []
    listShort = []
    record_dict = SeqIO.to_dict(SeqIO.parse(exon_file_out, "fasta"))
    # For each recovered protein: write its FASTA, extract its genomic
    # locus with bedtools, and queue an exonerate job.
    for key in dictFastaProt:
        if key in record_dict:
            listShort.append(key)
            outputFilenameProt = wd + key + '.prot.fasta'
            SeqIO.write(dictFastaProt[key], outputFilenameProt, "fasta")
            listFields = record_dict[key].description.split(' ')
            for elem in listFields:
                outputFilename = wd + key + '.genome.fasta'
                bedFile = wd + key + '.genome.bed'
                # Description fields like "loc:<chr>:<start>-<end>|...|+"
                # carry the genomic locus of the model.
                if (elem.startswith('loc')
                        and elem.endswith('+')) or (elem.startswith('loc')
                                                    and elem.endswith('-')):
                    coordsList = elem.split('|', -2)
                    chrN = coordsList[0].split(':')
                    coord = coordsList[1].split('-')
                    locus = '\t'.join([chrN[1], coord[0], coord[1]])
                    locus = locus + '\n'
                    bedhandler = open(bedFile, 'w')
                    bedhandler.write(locus)
                    bedhandler.close()
                    com = BEDTOOLS_GET_FASTA % (ref, bedFile, outputFilename)
                    if verbose:
                        sys.stderr.write('Executing: %s\n\n' % com)
                    call = subprocess.Popen(
                        com, cwd=wd, shell=True
                    )  # , stdout= fasta_file_outfile , stderr=errorFilefile)
                    call.communicate()
                    combList = [
                        outputFilenameProt, outputFilename, verbose, queue
                    ]
            # NOTE(review): if no 'loc...' field matched, combList is stale
            # from the previous key (or unbound on the first) -- confirm
            # gffread descriptions always include the locus field.
            commandList.append(combList)

    # Run exonerate jobs asynchronously, polling the queue for progress.
    results = pool.map_async(runExonerate, commandList)
    with progressbar.ProgressBar(max_value=len(commandList)) as bar:
        while not results.ready():
            size = queue.qsize()
            bar.update(size)
            time.sleep(1)

    outputFilenameGff = wd + 'mRNA_complete_gene_Annotation.gff3'
    # NOTE(review): results._value is a private AsyncResult attribute;
    # results.get() would be the supported accessor.
    exonerate_files = results._value + [outputFilenameGff]

    # Models never recovered nor complete (symmetric difference) are
    # carried through unchanged together with the complete ones.
    listInGff = listComplete + listShort
    listAsbent = sorted(set(list(set(listTotal) ^ set(listInGff))))
    listCompleteAll = listAsbent + listComplete

    gff_out = gffwriter.GFFWriter(outputFilenameGff)
    db1 = gffutils.create_db(gff_file,
                             ':memory:',
                             merge_strategy='create_unique',
                             keep_order=True)
    for evm in listCompleteAll:
        for i in db1.children(evm, featuretype='CDS', order_by='start'):
            gff_out.write_rec(i)
        gff_out.write_rec(db1[evm])
        for i in db1.parents(evm, featuretype='gene', order_by='start'):
            gff_out.write_rec(i)
        for i in db1.children(evm, featuretype='exon', order_by='start'):
            gff_out.write_rec(i)
    gff_out.close()

    # Concatenate every per-model GFF plus the pass-through file.
    orintedFIleN = wd + '/oriented.oldname.gff3'
    with open(orintedFIleN, 'wb') as wfd:
        for f in exonerate_files:
            with open(f, 'rb') as fd:
                shutil.copyfileobj(fd, wfd, 1024 * 1024 * 10)

    return orintedFIleN
示例#11
0
def longest(gff_file, fasta, proc, wd, verbose):
    """
    Re-derive ORFs for transcript models via the TransDecoder pipeline.

    Pipeline: gt (GT_GFF3_INTRON) -> gt GFF3->GTF (GT_GFF3TOGTF) ->
    cufflinks cDNA FASTA + alignment GFF3 -> TransDecoder.LongOrfs /
    TransDecoder.Predict -> cdna_alignment_orf_to_genome_orf.pl.
    Models flagged with "Warning" lines in the captured stderr are
    copied through unchanged from the input database; the pass-through
    records and the alignment GFF3 are concatenated into
    ``wd + 'finalAnnotation.Transdecoder.gff3'``, whose path is returned.
    """
    outputFilenameLeft = tempfile.NamedTemporaryFile(delete=False, dir=wd, prefix="longest.")
    gff_out = gffwriter.GFFWriter(outputFilenameLeft.name)

    # Step 1: tidy the input and add introns.
    gt_com = GT_GFF3_INTRON % gff_file
    gff_file_outfile = tempfile.NamedTemporaryFile(delete=False, mode='w', dir=wd, prefix="longest.", suffix=".out") #open(gff_file_out, "w")
    errorFilefile = tempfile.NamedTemporaryFile(delete=False, mode='w', dir=wd, prefix="longest.", suffix=".err") #open(errorFile, "w")
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % gt_com)
    gt_call = subprocess.Popen(gt_com, stdout=gff_file_outfile, stderr=errorFilefile, shell=True)
    gt_call.communicate()

    # Step 2: convert the tidied GFF3 to GTF.
    gt_com = GT_GFF3TOGTF % gff_file_outfile.name
    gtf_file_outfile = tempfile.NamedTemporaryFile(delete=False, mode='w', dir=wd, prefix="longest.", suffix=".out") #open(gtf_file_out, "w")
    errorFilefile = tempfile.NamedTemporaryFile(delete=False, mode='w', dir=wd, prefix="longest.", suffix=".err") #open(errorFile, "w")
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % gt_com)
    gt_call = subprocess.Popen(gt_com, stdout=gtf_file_outfile, stderr=errorFilefile, shell=True)
    gt_call.communicate()


    # Database of the tidied input, used at the end to copy through the
    # models that TransDecoder warned about.
    db1 = gffutils.create_db(gff_file_outfile.name, ':memory:', merge_strategy='create_unique', keep_order=True)


    # Step 3: build the cDNA FASTA from the GTF.
    fasta_file_outfile = tempfile.NamedTemporaryFile(delete=False, mode='w', dir=wd, prefix="longest.", suffix=".out") #open(fasta_file_out, "w")
    errorFilefile = tempfile.NamedTemporaryFile(delete=False, mode='w', dir=wd, prefix="longest.", suffix=".err") #open(errorFile, "w")
    com = 'cufflinks_gtf_genome_to_cdna_fasta.pl %s %s' % (gtf_file_outfile.name, fasta)
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % com)
    call = subprocess.Popen(com, stdout=fasta_file_outfile, stderr=errorFilefile, shell=True)
    call.communicate()

    # Step 4: build the alignment GFF3 from the GTF.
    # NOTE(review): `gff_file_outfile` and `errorFilefile` are rebound here;
    # the stderr parsed at the bottom of this function is THIS errorFilefile,
    # and the final concatenation uses THIS gff_file_outfile -- confirm that
    # is intended rather than the later TransDecoder outputs.
    gff_file_outfile = tempfile.NamedTemporaryFile(delete=False, mode='w', dir=wd, prefix="longest.", suffix=".out") #open(gff_file_out_u, "w")
    errorFilefile = tempfile.NamedTemporaryFile(delete=False, mode='w', dir=wd, prefix="longest.", suffix=".err") #open(errorFile, "w")
    com = 'cufflinks_gtf_to_alignment_gff3.pl %s' % gtf_file_outfile.name
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % com)
    call = subprocess.Popen(com, stdout=gff_file_outfile, stderr=errorFilefile, shell=True)
    call.communicate()

    # Step 5: TransDecoder.LongOrfs on the cDNA FASTA.
    gff_file_outfile_1 = tempfile.NamedTemporaryFile(delete=False, mode='w', dir=wd, prefix="longest.", suffix=".out") #open(gff_file_out, "w")
    errorFilefile_1 = tempfile.NamedTemporaryFile(delete=False, mode='w', dir=wd, prefix="longest.", suffix=".err") #open(errorFile, "w")
    com = 'TransDecoder.LongOrfs -m 10 -t %s' % fasta_file_outfile.name
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % com)
    call = subprocess.Popen(com, stdout=gff_file_outfile_1, stderr=errorFilefile_1, cwd=wd, shell=True)
    call.communicate()

    # Step 6: TransDecoder.Predict (single best ORF per transcript).
    gff_file_outfile_2 = tempfile.NamedTemporaryFile(delete=False, mode='w', dir=wd, prefix="longest.", suffix=".out") #open(gff_file_out, "w")
    errorFilefile_2 = tempfile.NamedTemporaryFile(delete=False, mode='w', dir=wd, prefix="longest.", suffix=".err") #open(errorFile, "w")
    wd_fasta = fasta_file_outfile.name
    com = 'TransDecoder.Predict --single_best_orf --cpu %s --retain_long_orfs 10 -t %s' % (proc, wd_fasta)
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % com)
    call = subprocess.Popen(com, stdout=gff_file_outfile_2, stderr=errorFilefile_2, cwd=wd, shell=True)
    call.communicate()

    # Step 7: map the predicted ORFs back onto genome coordinates.
    gff_file_outfile_3 = tempfile.NamedTemporaryFile(delete=False, mode='w', dir=wd, prefix="longest.") #open(outputFilename, "w")
    errorFilefile_3 = tempfile.NamedTemporaryFile(delete=False, mode='w', dir=wd, prefix="longest.") #open(errorFile, "w")
    transdecoder = tempfile.NamedTemporaryFile(delete=False)  # NOTE(review): never used

    com = 'cdna_alignment_orf_to_genome_orf.pl %s %s %s' % (wd_fasta + '.transdecoder.gff3', gff_file_outfile.name, wd_fasta)

    if verbose:
        sys.stderr.write('Executing: %s\n\n' % com)
    call = subprocess.Popen(com, stdout=gff_file_outfile_3, stderr=errorFilefile_3, cwd=wd, shell=True)
    call.communicate()


    # Collect the mRNA IDs mentioned in "Warning" lines of the captured
    # stderr (see the rebinding note above for which file this reads).
    listErr = []
    err_file = open(errorFilefile.name, "r")
    for line in err_file:
        if line.startswith("Warning"):
            listErr.append(("mRNA" + line.split("::")[1]).split(".")[0])
    listErrUniq = list(set(listErr))

    # Copy the warned-about models through unchanged from the input db.
    for evm in listErrUniq:
        for i in db1.children(evm, featuretype='CDS', order_by='start'):
            gff_out.write_rec(i)
        gff_out.write_rec(db1[evm])
        for i in db1.parents(evm, featuretype='gene', order_by='start'):
            gff_out.write_rec(i)
        for i in db1.children(evm, featuretype='exon', order_by='start'):
            gff_out.write_rec(i)

    # NOTE(review): gff_out is never closed before concatenation -- its
    # buffered records may not be on disk yet; confirm GFFWriter flushes.
    gff_files = [outputFilenameLeft.name, gff_file_outfile.name]
    outputFilenameFinal = wd + 'finalAnnotation.Transdecoder.gff3'



    # Concatenate the pass-through records and the alignment GFF3.
    with open(outputFilenameFinal, 'wb') as wfd:
        for f in gff_files:
            with open(f, 'rb') as fd:
                shutil.copyfileobj(fd, wfd, 1024 * 1024 * 10)

    return outputFilenameFinal
示例#12
0
def genename_evm(gff_filename, verbose, wd):
    """
    Renumber gene/mRNA/exon/CDS IDs in EVM style, per chromosome.

    Genes become "evm.TU.<chrom>.<n>" and mRNAs "evm.model.<chrom>.<n>",
    with counters kept separately for each chromosome; exon and CDS IDs
    are derived from their mRNA's new ID.  The input is first tidied
    with GT_GFF3 and the renamed file is finally passed through
    GT_GFF3_R.

    Returns the path of the final GFF3 file.
    """
    gene_evm = "evm.TU."
    mRNA_evm = "evm.model."
    out = tempfile.NamedTemporaryFile(delete=False, mode="w", dir=wd)
    err = tempfile.NamedTemporaryFile(delete=False, mode="w")
    gt_com = GT_GFF3 % gff_filename
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % gt_com)
    gt_call = subprocess.Popen(gt_com, stdout=out, stderr=err, shell=True)
    gt_call.communicate()

    db1 = gffutils.create_db(out.name, ':memory:', merge_strategy='create_unique', keep_order=True)
    list_gene = [gene.attributes["ID"][0] for gene in db1.features_of_type('gene')]
    # Per-chromosome counters for gene and mRNA numbering
    # (loop variable renamed from `chr`, which shadowed the builtin).
    chr_count_gene = {}
    chr_count_mRNA = {}
    for seqid in {feat.chrom for feat in db1.features_of_type('gene')}:
        chr_count_gene[seqid] = 0
        chr_count_mRNA[seqid] = 0

    out_gff = tempfile.NamedTemporaryFile(delete=False, prefix="gffread_parse", suffix=".gff3", dir=wd)
    gff_out = gffwriter.GFFWriter(out_gff.name)
    for evm in list_gene:
        exon_count = 0
        cds_count = 0
        gene = db1[evm]
        gene_chr = gene.chrom
        count_gene = chr_count_gene[gene_chr] + 1
        chr_count_gene[gene_chr] = count_gene
        id_new_gene = gene_evm + gene_chr + "." + str(count_gene)
        gene.attributes["ID"][0] = id_new_gene
        gff_out.write_rec(gene)
        for i in db1.children(evm, featuretype='mRNA', order_by='start'):
            mRNA = i
            # Keep the old ID: the children below are still indexed by it.
            mRNA_old = i.attributes["ID"][0]
            count_mRNA = chr_count_mRNA[gene_chr] + 1
            chr_count_mRNA[gene_chr] = count_mRNA
            id_new_mRNA = mRNA_evm + gene_chr + "." + str(count_mRNA)
            mRNA.attributes["Parent"][0] = id_new_gene
            mRNA.attributes["ID"][0] = id_new_mRNA
            gff_out.write_rec(mRNA)
            for e in db1.children(mRNA_old, featuretype='exon', order_by='start'):
                exon_count += 1
                exon = e
                exon.attributes["Parent"][0] = mRNA.attributes["ID"][0]
                # Wrap in a list: GFF attribute values are list-valued,
                # matching the convention used elsewhere in this file;
                # the original assigned a bare string here.
                exon.attributes["ID"] = [id_new_mRNA + ".exon" + str(exon_count)]
                gff_out.write_rec(exon)
            for c in db1.children(mRNA_old, featuretype='CDS', order_by='start'):
                cds_count += 1
                cds = c
                cds.attributes["Parent"][0] = mRNA.attributes["ID"][0]
                cds.attributes["ID"] = ["cds." + str(cds_count) + "." + id_new_mRNA]
                gff_out.write_rec(cds)
    gff_out.close()

    out = tempfile.NamedTemporaryFile(delete=False, mode="w", prefix="new_name_update.", suffix= ".gff3", dir=wd)
    err = tempfile.NamedTemporaryFile(delete=False, mode="w", dir=wd)
    gt_com = GT_GFF3_R % out_gff.name
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % gt_com)
    gt_call = subprocess.Popen(gt_com, stdout=out, stderr=err, shell=True)
    gt_call.communicate()
    if verbose:
        print(out.name)
    return out.name
示例#13
0
def exonerate(ref, gff_file, proc, wd, verbose):
    """
    Rescue gene models whose predicted protein does not start with a
    methionine by re-aligning their longest ATG-initiated ORF to the genome
    with Exonerate.

    ref      -- genome FASTA file
    gff_file -- annotation GFF3 to check/repair
    proc     -- number of worker processes for the Exonerate pool
    wd       -- working directory for temporary/intermediate files
    verbose  -- when truthy, echo executed shell commands to stderr

    Returns the path of a temporary GFF3 file combining the untouched
    complete models with the Exonerate-refined ones.
    """

    ##THIS removes the warning. the check of the longest protein was giving a warining. if Biopython change, this could be a problem

    warnings.filterwarnings("ignore")


    # Extract transcript (exon) and protein FASTA from the GFF via gffread.
    exon_file_out = gff_file + ".exons.fasta"
    prot_file_out = gff_file + ".prot.fasta"
    errorFile = gff_file + ".gffread_err.log"
    logFile = exon_file_out + "gffread_log.log"
    com = GFFREAD_W % (ref, exon_file_out, prot_file_out, gff_file)
    fasta_file_outfile = open(logFile, "w")
    errorFilefile = open(errorFile, "w")
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % com)
    call = subprocess.Popen(com, stdout=fasta_file_outfile, cwd=wd, stderr=errorFilefile, shell=True)
    call.communicate()
    fasta_file_outfile.close()
    errorFilefile.close()
    list_complete = []
    dict_incomplete = {}
    dict_fasta_prot = {}
    longest_prot = []
    list_total = []
    list_single_exons = []
    list_incomplete = []
    # Classify proteins: a leading "M" marks a complete model; anything else
    # is queued (by record id) for ORF rescue below.
    for record in SeqIO.parse(prot_file_out, "fasta"):
        list_total.append(record.id)
        if record.seq.startswith("M"):
            list_complete.append(record.id)
        else:
            list_incomplete.append(record.id)
            dict_incomplete[record.id] = record.id
    # For each incomplete transcript, scan both strands for the longest
    # ATG-initiated ORF and remember the best coding stretch.
    for record in SeqIO.parse(exon_file_out, "fasta"):
        list_fields = record.description.split(' ')
        for elem in list_fields:
            if elem.startswith('exons'):
                exon_number = elem.split(",")
                # NOTE(review): len(exon_number) > 0 is always true after
                # split(); presumably a >1 multi-exon test was intended — confirm.
                if (len(exon_number)) > 0:
                    list_single_exons.append(record.id)
                    if record.id in dict_incomplete:
                        newrecord = record.reverse_complement()
                        input_seq = str(record.seq)
                        startP = re.compile('ATG')
                        nuc = input_seq.replace('\n', '')
                        # `longest` starts as a 1-tuple sentinel and becomes
                        # [length, coding_nucleotides] once an ORF is found.
                        longest = (0,)
                        for m in startP.finditer(nuc):
                            if len(Seq.Seq(nuc)[m.start():].translate(to_stop=True)) > longest[0]:
                                pro = Seq.Seq(nuc)[m.start():].translate(to_stop=True)
                                longest = [len(pro), nuc[m.start():m.start() + len(pro) * 3 + 3]]
                                if len(longest) == 2:
                                    record.seq = Seq.Seq(longest[1])
                                    dict_fasta_prot[record.id] = record
                                else:
                                    # NOTE(review): unreachable — `longest` was
                                    # just assigned a 2-element list above.
                                    dict_fasta_prot[record.id] = record
                        # Repeat the scan on the reverse complement and keep
                        # whichever strand yields the longer coding sequence.
                        input_seq = str(newrecord.seq)
                        startP = re.compile('ATG')
                        nuc = input_seq.replace('\n', '')
                        longest = (0,)
                        for m in startP.finditer(nuc):
                            if len(Seq.Seq(nuc)[m.start():].translate(to_stop=True)) > longest[0]:
                                pro = Seq.Seq(nuc)[m.start():].translate(to_stop=True)
                                longest = [len(pro), nuc[m.start():m.start() + len(pro) * 3 + 3]]
                                if len(longest) == 2:
                                    if record.id in dict_fasta_prot:
                                        if (len((dict_fasta_prot[record.id]).seq)) < (len(longest[1])):
                                            record.seq = Seq.Seq(longest[1])
                                            dict_fasta_prot[record.id] = record
                                    elif len(longest) == 2:
                                        record.seq = Seq.Seq(longest[1])
                                        dict_fasta_prot[record.id] = record
                                    else:
                                        # NOTE(review): unreachable given the
                                        # enclosing len(longest) == 2 check.
                                        dict_fasta_prot[record.id] = record

    for mod in dict_fasta_prot:
        longest_prot.append(dict_fasta_prot[mod])
    prot_file_out_mod = prot_file_out + ".mod.fasta"
    SeqIO.write(longest_prot, prot_file_out_mod, "fasta")

    # For each rescued transcript write its protein and genomic locus (as a
    # BED interval parsed from the gffread "loc:chr|start-end|strand" tag),
    # then fan the (get_fasta -> runExonerate) work out over the pool.
    pool = Pool(processes=int(proc))
    list_get_seq= []
    list_short = []
    record_dict = SeqIO.to_dict(SeqIO.parse(exon_file_out, "fasta"))
    for key in dict_fasta_prot:
        if key in record_dict:
            list_short.append(key)
            output_filename_prot = os.path.join(wd, key + '.prot.fasta')
            SeqIO.write(dict_fasta_prot[key], output_filename_prot, "fasta")
            list_fields = record_dict[key].description.split(' ')
            for elem in list_fields:
                # NOTE(review): plain string concat assumes `wd` ends with a
                # path separator (os.path.join is used above) — confirm.
                output_filename = wd + key + '.genome.fasta'
                bedFile = wd + key + '.genome.bed'
                if (elem.startswith('loc') and elem.endswith('+')) or (elem.startswith('loc') and elem.endswith('-')):
                    coordsList = elem.split('|', -2)
                    chrN = coordsList[0].split(':')
                    coord = coordsList[1].split('-')
                    locus = '\t'.join([chrN[1], coord[0], coord[1]])
                    locus = locus + '\n'
                    bedhandler = open(bedFile, 'w')
                    bedhandler.write(locus)
                    bedhandler.close()
                    data = [ref, bedFile, output_filename, output_filename_prot, verbose, wd]
                    list_get_seq.append(data)

    results_get = pool.map(get_fasta, list_get_seq, chunksize=1)
    results = pool.map(runExonerate, results_get, chunksize=1)
    output_filename_gff = wd + 'mRNA_complete_gene_Annotation.gff3'


    # Write out genes whose every mRNA was complete, untouched.
    gff_out = gffwriter.GFFWriter(output_filename_gff)
    db1 = gffutils.create_db(gff_file, ':memory:', merge_strategy='create_unique', keep_order=True)

    list_gene_complete = []
    list_gene_incomplete = []
    for mRNA in list_complete:
        for mRNA_ok in db1.parents(mRNA, featuretype='gene', order_by='start'):
            list_gene_complete.append(mRNA_ok.attributes["ID"][0])
    for mRNA in list_incomplete:
        for mRNA_ok in db1.parents(mRNA, featuretype='gene', order_by='start'):
            list_gene_incomplete.append(mRNA_ok.attributes["ID"][0])
    list_gene_complete = sorted(list(set(list_gene_complete)))
    list_gene_incomplete = sorted(list(set(list_gene_incomplete)))
    # Genes that are complete AND have no incomplete isoform.
    list_gene_ok_uniq = sorted(list(set(list_gene_complete) - set(list_gene_incomplete)))
    for evm in list_gene_ok_uniq:
        gff_out.write_rec(db1[evm])
        for i in db1.children(evm):
            gff_out.write_rec(i)

    # Concatenate the per-transcript Exonerate GFFs with the complete-gene
    # GFF, then normalize the merge with gffread.
    exonerate_files = results + [output_filename_gff]
    gff_out.close()
    orintedFIleN = wd + '/oriented.oldname.gff3'
    with open(orintedFIleN, 'wb') as wfd:
        for f in exonerate_files:
            with open(f, 'rb') as fd:
                shutil.copyfileobj(fd, wfd, 1024 * 1024 * 10)
    outfile_gff = tempfile.NamedTemporaryFile(delete=False, prefix="additional.2.", suffix=".gff3", dir=wd)
    log = tempfile.NamedTemporaryFile(delete=False, prefix="additional.", suffix=".log", dir=wd)
    err = tempfile.NamedTemporaryFile(delete=False, prefix="additional.", suffix=".err", dir=wd)
    cmd = GFFREAD_M_S % (outfile_gff.name, orintedFIleN)
    gffread = subprocess.Popen(cmd, cwd=wd, shell=True, stdout=log, stderr=err)
    gffread.communicate()
    db_gffread = gffutils.create_db(outfile_gff.name, ':memory:', merge_strategy='create_unique', keep_order=True,
                                    transform=transform_func)

    # Re-emit children before their gene record into the final temp file.
    # NOTE(review): this GFFWriter is never closed before return — verify the
    # file is flushed by the time callers read it.
    outfile_out = tempfile.NamedTemporaryFile(delete=False, prefix="additional.final.", suffix=".gff3", dir=wd)
    gff_out_s = gffwriter.GFFWriter(outfile_out.name)

    for gene in db_gffread.features_of_type("gene"):
        for i in db_gffread.children(gene, order_by='start'):
            gff_out_s.write_rec(i)
        gff_out_s.write_rec(db_gffread[gene])
    return outfile_out.name
示例#14
0
def gff_filter(final_evm, myFasta):
    """
    Filter a GFF3 so each multi-isoform gene keeps only non-redundant mRNAs.

    Rewrites `final_evm` replacing "locus" with "Parent"/"gene" (producing
    <final_evm>.mod.gff3), translates the CDS of every isoform of multi-mRNA
    genes using the recorded phase, deduplicates isoforms whose proteins are
    prefix-redundant, and writes the retained models to
    <final_evm>.final.gff3, whose path is returned.

    final_evm -- input GFF3 file path
    myFasta   -- genome FASTA used to extract CDS sequences
    """
    file_out = final_evm + ".mod.gff3"
    # Normalize attribute/feature names so gffutils sees a gene->mRNA hierarchy.
    with open(final_evm, "r") as fh, open(file_out, "w") as fhd:
        for line in fh:
            if not line.startswith("#"):
                elm = line.split("\t")
                elm[8] = elm[8].replace("locus", "Parent")
                elm[2] = elm[2].replace("locus", "gene")
                fhd.write("\t".join(elm))
    db = gffutils.create_db(file_out, ':memory:', merge_strategy="create_unique", keep_order=True)

    # Split genes into multi-mRNA (b) and single-mRNA (kept directly).
    b = []
    mrna_retain = []
    for t in db.features_of_type('gene', order_by='start'):
        c = 0
        for i in db.children(t, featuretype='mRNA', order_by='start'):
            c += 1
        if c > 1:
            b.append(t)
        else:
            # NOTE(review): `i` leaks from the loop above; if a gene has zero
            # mRNA children this reuses the previous gene's mRNA (or raises
            # NameError on the first gene) — confirm inputs always have >=1.
            mrna_retain.append(i.attributes["ID"][0])
    mrna_select = []
    for t in b:
        seq_multiple = []
        for a in db.children(t, featuretype='mRNA', order_by='start'):
            # Concatenate CDS segments in genomic order; `pphase` keeps the
            # phase of the first segment (start-ordered, i.e. 5' on + strand).
            seq_combined = ''
            j = 0
            for i in db.children(a, featuretype='CDS', order_by='start'):
                j += 1
                if j == 1:
                    pphase = i[7]
                seq = i.sequence(myFasta, use_strand=False)
                seq_combined += seq
            seq_combined = SeqRecord(Seq(seq_combined, generic_dna))
            # NOTE(review): `generic_dna` was removed in Biopython 1.78 —
            # this function requires an older Biopython.
            if t.strand == '-':
                # On minus strand use the phase of the last (5'-most) segment.
                pphase = i[7]
                seq_combined = seq_combined.reverse_complement()
            if pphase == "0" or pphase == ".":
                seq_transl = seq_combined.translate()
            elif pphase == "1":
                seq_transl = seq_combined[1:].translate()
            elif pphase == "2":
                seq_transl = seq_combined[2:].translate()
            seq_transl.id = a.attributes["ID"][0]
            if seq_multiple:
                # Compare the new protein against every kept one; if either is
                # a (stop-stripped) prefix of the other, keep the longer.
                seq_multiple_len = len(seq_multiple)
                c = 0
                while seq_multiple_len > c:
                    a1 = str(seq_multiple[c].seq)
                    a2 =      str(seq_transl.seq)
                    if a1.rstrip("*").startswith(a2.rstrip("*")) and len(a1) < len(a2):
                        seq_multiple[c] = seq_transl
                    elif  a2.rstrip("*").startswith(a1.rstrip("*")) and len(a1) < len(a2):
                        seq_multiple[c] = seq_transl
                    else:
                        # NOTE(review): appending inside the while loop can add
                        # the same isoform once per kept protein — looks like a
                        # duplicate-append bug; confirm intended behavior.
                        seq_multiple = seq_multiple + [seq_transl]
                    c+=1
            else:
                seq_multiple = [seq_transl]
        mrna_select = mrna_select + seq_multiple
    mrna_select_name = []
    for seq in mrna_select:
        mrna_select_name.append(seq.id)
    mrna_total = sorted(mrna_retain + mrna_select_name)
    # Emit gene, mRNA, CDS and exon records for every retained mRNA.
    # NOTE(review): this GFFWriter is never closed — verify flushing.
    gff_file = final_evm + ".final.gff3"
    gff_out = gffwriter.GFFWriter(gff_file)
    for key in mrna_total:
        for i in db.parents(key, featuretype='gene', order_by='start'):
            gff_out.write_rec(i)
        gff_out.write_rec(db[key])
        for i in db.children(key, featuretype='CDS', order_by='start'):
            gff_out.write_rec(i)
        for i in db.children(key, featuretype='exon', order_by='start'):
            gff_out.write_rec(i)
    return(gff_file)
示例#15
0
def strand(gff_file1, gff_file2, fasta, proc, gmap_wd, verbose):
    """
    Reconcile two annotations, preferring gff_file1 models and filling in
    with gff_file2 (GMAP-style) models.

    Both files are first passed through `gt gff3 -retainids` (GT_RETAINID) so
    intron features are present.  Intron-less (single-CDS) transcripts are
    collected from each file; gff_file2 models whose IDs map back to a
    gff_file1 gene decide which source wins.  Intron-containing gff_file2
    models and the gff_file2-only set are ORF-corrected via `longest()` and
    concatenated with the gff_file1 picks.

    Returns the path of <gmap_wd>finalAnnotation.Final.Comb.gff3.
    """
    outputFilename = tempfile.NamedTemporaryFile(delete=False,
                                                 prefix="grs",
                                                 dir=gmap_wd)
    gff_out = gffwriter.GFFWriter(outputFilename.name)
    outputFilenameGmap = tempfile.NamedTemporaryFile(delete=False,
                                                     prefix="grs",
                                                     dir=gmap_wd)
    gff_out_s = gffwriter.GFFWriter(outputFilenameGmap.name)

    # Normalize gff_file1 with genometools, keeping original IDs.
    gt_com = GT_RETAINID % gff_file1
    file1 = tempfile.NamedTemporaryFile(delete=False,
                                        mode="w",
                                        prefix="grs",
                                        dir=gmap_wd)
    err1 = tempfile.NamedTemporaryFile(delete=False,
                                       mode="w",
                                       prefix="grs",
                                       dir=gmap_wd)
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % gt_com)
        sys.stderr.write('Log file is: %s %s\n\n' % (file1.name, err1.name))
    gt_call = subprocess.Popen(gt_com, stdout=file1, stderr=err1, shell=True)
    gt_call.communicate()

    # Same normalization for gff_file2.
    file2 = tempfile.NamedTemporaryFile(delete=False, mode="w")
    err1 = tempfile.NamedTemporaryFile(delete=False, mode="w")
    gt_com = GT_RETAINID % gff_file2
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % gt_com)
    gt_call = subprocess.Popen(gt_com, stdout=file2, stderr=err1, shell=True)
    gt_call.communicate()

    db1 = gffutils.create_db(file1.name,
                             ':memory:',
                             merge_strategy='create_unique',
                             keep_order=True)
    db2 = gffutils.create_db(file2.name,
                             ':memory:',
                             merge_strategy='create_unique',
                             keep_order=True)
    # Transcript IDs with introns vs all transcripts (via CDS parents) in db1;
    # the symmetric difference leaves the intron-less (single-exon) ones.
    listgeneintrons = []
    listgenetotal = []
    for i in db1.features_of_type("intron"):
        g = ' '.join(i.attributes['Parent'])
        listgeneintrons.append(g)
    for i in db1.features_of_type("CDS"):
        g = ' '.join(i.attributes['Parent'])
        listgenetotal.append(g)
    listgene1 = sorted(set(list(set(listgenetotal) ^ set(listgeneintrons))))
    listgeneintrons = []
    listgenetotal = []

    # Same classification for db2; note listgeneintrons now refers to db2.
    for i in db2.features_of_type("intron"):
        g = ' '.join(i.attributes['Parent'])
        listgeneintrons.append(g)
    for i in db2.features_of_type("CDS"):
        g = ' '.join(i.attributes['Parent'])
        listgenetotal.append(g)
    listgene2 = sorted(set(list(set(listgenetotal) ^ set(listgeneintrons))))

    # Map each db2 ID ("<prefix>_<evm-id>.<n>") back to its originating db1
    # gene ID; gene_dict groups the db2 IDs per original gene.
    newlist = []
    gene_dict = {}
    for a in listgene2:
        b = a.split('_', 1)[1]
        bb = b.split('.')
        del bb[-1]
        evm = '.'.join(bb)
        newlist.append(evm)
        if evm in gene_dict:
            z = gene_dict[evm]
            gene_dict[evm] = z + [a]
        else:
            gene_dict[evm] = [a]
    commonlist = list(set(listgene1).intersection(newlist))
    uniqGmap = sorted(set(list(set(newlist) ^ set(commonlist))))

    # Shared genes with a single db2 mapping keep the db1 model; multi-mapped
    # or db2-only genes use the db2 models.
    evm_list = []
    gmap_list = []
    for a in commonlist:
        if gene_dict[a] and len(gene_dict[a]) < 2:
            evm_list.append(a)
        elif gene_dict[a] and len(gene_dict[a]) > 1:
            gmap_list = gmap_list + gene_dict[a]

    for a in uniqGmap:
        if gene_dict[a]:
            gmap_list = gmap_list + gene_dict[a]
    listgeneintrons_u = (set(listgeneintrons))

    # db1 winners -> main output.
    for evm in evm_list:
        for i in db1.children(evm, featuretype='CDS', order_by='start'):
            gff_out.write_rec(i)
        gff_out.write_rec(db1[evm])
        for i in db1.parents(evm, featuretype='gene', order_by='start'):
            gff_out.write_rec(i)
        for i in db1.children(evm, featuretype='exon', order_by='start'):
            gff_out.write_rec(i)

    # db2 winners -> the GMAP-side file, later ORF-corrected by longest().
    for evm in gmap_list:
        for i in db2.children(evm, featuretype='CDS', order_by='start'):
            gff_out_s.write_rec(i)
        gff_out_s.write_rec(db2[evm])
        for i in db2.parents(evm, featuretype='gene', order_by='start'):
            gff_out_s.write_rec(i)
        for i in db2.children(evm, featuretype='exon', order_by='start'):
            gff_out_s.write_rec(i)

    # Intron-containing db2 models go straight to the main output.
    for evm in listgeneintrons_u:
        for i in db2.children(evm, featuretype='CDS', order_by='start'):
            gff_out.write_rec(i)
        gff_out.write_rec(db2[evm])
        for i in db2.parents(evm, featuretype='gene', order_by='start'):
            gff_out.write_rec(i)
        for i in db2.children(evm, featuretype='exon', order_by='start'):
            gff_out.write_rec(i)
    gff_out.close()
    gff_out_s.close()

    gffOrf = longest(outputFilenameGmap.name, fasta, proc, gmap_wd, verbose)

    # Concatenate the ORF-corrected GMAP file with the db1 picks.
    output_filename_final = gmap_wd + 'finalAnnotation.Final.Comb.gff3'
    gff_files = [gffOrf, outputFilename.name]

    with open(output_filename_final, 'wb') as wfd:
        for f in gff_files:
            with open(f, 'rb') as fd:
                shutil.copyfileobj(fd, wfd, 1024 * 1024 * 10)

    return output_filename_final
示例#16
0
def removeDiscrepancy(gff, evmFile, verbose):
    """
    Remove models flagged as problematic by PASA GFF validation and add the
    EVM mRNAs that overlap the input annotation.

    The PASA validator (PASA_VAL) is run on *gff*; any mRNA named on an
    "ERROR" line is dropped.  The surviving mRNAs (plus their gene, CDS and
    exon records) are written to <gff>.noProblem.gff3, followed by every EVM
    mRNA from *evmFile* that bedtools reports as intersecting *gff*.

    gff     -- annotation GFF3 to validate
    evmFile -- EVM annotation GFF3 used to backfill overlapping models
    verbose -- when truthy, echo executed shell commands to stderr

    Returns the path of the filtered GFF3 file.
    """
    badName = []
    comm = PASA_VAL % gff
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % comm)
    gffVal_call = subprocess.Popen(comm, stdout=subprocess.PIPE, shell=True)
    # Validator error lines look like "ERROR ... <mRNA>.CDS ..."; field 2 of
    # the split is the mRNA name.
    for ln in gffVal_call.stdout.readlines():
        name = re.split(r' |\.CDS', ln.decode("utf-8"))
        if len(name) > 3 and "ERROR" in name[0]:
            badName.append(name[2])
    badNameUniq = list(set(badName))
    # Collect every mRNA ID present in the input GFF.
    listAllName = []
    with open(gff, 'r') as gff_handle:  # was left unclosed before
        for line in gff_handle:
            fields = line.strip().split('\t')
            if len(fields) > 3 and "mRNA" in fields[2]:
                for el in fields[8].split(';'):
                    if "ID" in el:
                        listAllName.append(el.split("=")[1])
    # All mRNAs minus the flagged ones (symmetric difference keeps the rest).
    listgene = sorted(set(list(set(listAllName) ^ set(badNameUniq))))
    outputFilename = gff + '.noProblem.gff3'
    gff_out = gffwriter.GFFWriter(outputFilename)
    db1 = gffutils.create_db(gff,
                             ':memory:',
                             merge_strategy='create_unique',
                             keep_order=True)
    for evm in listgene:
        for i in db1.children(evm, featuretype='CDS', order_by='start'):
            gff_out.write_rec(i)
        gff_out.write_rec(db1[evm])
        for i in db1.parents(evm, featuretype='gene', order_by='start'):
            gff_out.write_rec(i)
        for i in db1.children(evm, featuretype='exon', order_by='start'):
            gff_out.write_rec(i)
    # Find EVM mRNAs overlapping the input annotation and append them too.
    cmd = BEDTOOLS_INTERSECT % (evmFile, gff)
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % cmd)
    bedtools_call = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
    evm_mRNA = []
    for ln in bedtools_call.stdout.readlines():
        line = ln.decode("utf-8")
        if "mRNA" in line.split('\t')[2]:
            for el in line.split('\t')[8].split(';'):
                if "ID" in el:
                    evm_mRNA.append(el.split('=')[1])
    db1 = gffutils.create_db(evmFile,
                             ':memory:',
                             merge_strategy='create_unique',
                             keep_order=True)
    for evm in evm_mRNA:
        for i in db1.children(evm, featuretype='CDS', order_by='start'):
            gff_out.write_rec(i)
        gff_out.write_rec(db1[evm])
        for i in db1.parents(evm, featuretype='gene', order_by='start'):
            gff_out.write_rec(i)
        for i in db1.children(evm, featuretype='exon', order_by='start'):
            gff_out.write_rec(i)
    # Close the writer so all records are flushed before callers read the file
    # (the original returned without closing).
    gff_out.close()

    return outputFilename
示例#17
0
def removeOverlap(gff, verbose):
    """
    Resolve overlapping CDS loci, keeping one mRNA per merged locus.

    CDS lines from *gff* are sorted and merged with bedtools; for every
    merged interval covered by more than one mRNA, only the mRNA owning the
    longest single CDS segment is kept.  The retained mRNAs (with their
    gene, CDS and exon records) are written to <gff>.uniq.gff3.

    gff     -- input GFF3 file path
    verbose -- when truthy, echo executed shell commands to stderr

    Returns the path of the de-overlapped GFF3 file.
    """
    # Extract CDS features only; bedtools operates on these intervals.
    cds_tmp = tempfile.NamedTemporaryFile(delete=False, mode='w')
    with open(gff, 'r') as gff_handle:  # was left unclosed before
        for line in gff_handle:
            listLine = line.split('\t')
            if len(listLine) == 9 and "CDS" in listLine[2]:
                cds_tmp.write(line)
    cds_tmp.close()
    bedouffile = tempfile.NamedTemporaryFile()
    errorFilefile = tempfile.NamedTemporaryFile()
    bedsort = BEDTOOLS_SORT % cds_tmp.name
    bedmerge = BEDTOOLS_MERGE
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % bedsort)
        sys.stderr.write('Executing: %s\n\n' % bedmerge)
    # Pipe: sort CDS intervals, then merge overlapping ones.
    bedsort_call = subprocess.Popen(bedsort,
                                    stdout=subprocess.PIPE,
                                    stderr=errorFilefile,
                                    shell=True)
    bedmerge_call = subprocess.Popen(bedmerge,
                                     stdin=bedsort_call.stdout,
                                     stdout=bedouffile,
                                     stderr=errorFilefile,
                                     shell=True)
    bedmerge_call.communicate()
    errorFilefile.close()
    # Merged lines carry a count (field 4) and the joined Parent attributes
    # (field 5).  Count > 1 means several mRNAs share the locus.
    listMultiple = []
    listUniq = []
    count = 0
    dictRNA = {}
    with open(bedouffile.name, 'r') as merged:
        for a in merged:
            listLine = a.split('\t')
            nameRNA = re.split(r',|;', listLine[5])
            count += 1
            locus = "locus" + str(count)
            for elm in nameRNA:
                if "Parent" in elm and int(listLine[4]) > 1:
                    mRNAname = elm.split('=')[1]
                    listMultiple.append(mRNAname)
                    dictRNA.setdefault(locus, []).append(mRNAname)
                elif "Parent" in elm:
                    listUniq.append(elm.split('=')[1])
    bedouffile.close()
    # For each contested locus, keep the mRNA with the longest CDS segment.
    dictLength = {}
    with open(gff, 'r') as gff_handle:  # was left unclosed before
        for line in gff_handle:
            listLine = line.split('\t')
            if len(listLine) == 9 and "CDS" in listLine[2]:
                for key in dictRNA:
                    for el in dictRNA[key]:
                        nameID = "Parent=" + el + ';'
                        if nameID in line:
                            length = int(listLine[4]) - int(listLine[3])
                            # (fixes the old duplicated-branch logic and the
                            # "oldLenght" typo)
                            if key not in dictLength or int(dictLength[key][1]) < length:
                                dictLength[key] = [el, str(length)]
    for key in dictLength:
        listUniq.append(dictLength[key][0])
    # Deduplicate while preserving first-seen order.
    listUniqNew = []
    for mRNA in listUniq:
        mRNAnew = mRNA.strip('\n')
        if mRNAnew not in listUniqNew:
            listUniqNew.append(mRNAnew)
    outputFilename = gff + '.uniq.gff3'
    gff_out = gffwriter.GFFWriter(outputFilename)
    db1 = gffutils.create_db(gff,
                             ':memory:',
                             merge_strategy='create_unique',
                             keep_order=True)
    for evm in listUniqNew:
        for i in db1.children(evm, featuretype='CDS', order_by='start'):
            gff_out.write_rec(i)
        gff_out.write_rec(db1[evm])
        for i in db1.parents(evm, featuretype='gene', order_by='start'):
            gff_out.write_rec(i)
        for i in db1.children(evm, featuretype='exon', order_by='start'):
            gff_out.write_rec(i)
    # Close the writer so records are flushed before callers read the file
    # (the original returned without closing).
    gff_out.close()

    return outputFilename
def annotate_gff_with_genes(args):
    """
    Annotate GFF with genes table.

    Intersects the gene entries of args.gff_filename with the intervals in
    args.table_filename and rewrites the GFF in place, adding ensg_id,
    refseq_id and gsymbol attributes to every gene record.

    Raises Exception if either input file is missing.

    Note: the original used Python 2 `raise X, msg` and `print` statements,
    which are syntax errors under Python 3; this version is Python 3 syntax
    with identical behavior.
    """
    gff_fname = utils.pathify(args.gff_filename)
    if not os.path.isfile(gff_fname):
        raise Exception("Cannot find %s" % gff_fname)
    table_fname = utils.pathify(args.table_filename)
    if not os.path.isfile(table_fname):
        raise Exception("Cannot find %s" % table_fname)
    table_bed = get_table_as_bedtool(table_fname)
    # Get BedTool for events, containing only the gene entries
    all_events_bed = pybedtools.BedTool(gff_fname)
    event_genes = \
        all_events_bed.filter(lambda entry: entry.fields[2] == "gene")
    print("Determining overlap between events and genes...")
    # Intersect event genes with gene txStart/txEnd; f=1 requires full
    # containment of the event gene, s=True enforces same strand.
    intersected_bed = \
        event_genes.intersect(table_bed, wb=True, s=True, f=1)
    # Map event genes to their IDs
    #
    #  event_gene1 -> refseq  -> value
    #              -> ensgene -> value
    #  event_gene2 -> refseq  ->
    #  ...
    event_genes_to_info = \
        defaultdict(lambda: defaultdict(list))
    for entry in intersected_bed:
        event_gene_attrs = utils.parse_attributes(entry.fields[8])
        event_gene_str = event_gene_attrs["ID"]
        gene_info_field = entry.fields[-1]
        # Strip trailing semicolon of ID attributes
        if gene_info_field.endswith(";"):
            gene_info_field = gene_info_field[0:-1]
        # Convert attributes into dictionary
        gene_info = utils.parse_attributes(gene_info_field)
        ensgene_id = gene_info["ensg_id"]
        refseq_id = gene_info["refseq_id"]
        gene_symbol = gene_info["gsymbol"]
        # Skip null entries
        if not is_null_id(ensgene_id):
            event_genes_to_info[event_gene_str]["ensg_id"].append(ensgene_id)
        if not is_null_id(refseq_id):
            event_genes_to_info[event_gene_str]["refseq_id"].append(refseq_id)
        if not is_null_id(gene_symbol):
            event_genes_to_info[event_gene_str]["gsymbol"].append(gene_symbol)
    # Incorporate the gene information into the GFF and output it
    # it using gffutils
    print("Loading events into GFF database...")
    events_db = gffutils.create_db(gff_fname, ":memory:", verbose=False)
    output_fname = gff_fname
    events_out = gffwriter.GFFWriter(output_fname, in_place=True)
    print(" - Outputting annotated GFF to: %s" % output_fname)

    def new_recs():
        """Yield every record, gene records first, with merged ID attributes."""
        for gene_recs in list(events_db.iter_by_parent_childs()):
            gene_rec = gene_recs[0]
            event_id = gene_rec.id
            # Use existing IDs if present
            # NOTE(review): the attribute keys tested ("ensgene_id",
            # "gene_symbol") differ from the keys read ("ensg_id",
            # "gsymbol") — preserved as-is, but confirm this is intended.
            if "ensgene_id" in gene_rec.attributes:
                ensgene_id = gene_rec.attributes["ensg_id"][0]
            else:
                ensgene_id = "NA"
            if "refseq_id" in gene_rec.attributes:
                refseq_id = gene_rec.attributes["refseq_id"][0]
            else:
                refseq_id = "NA"
            if "gene_symbol" in gene_rec.attributes:
                gene_symbol = gene_rec.attributes["gsymbol"][0]
            else:
                gene_symbol = "NA"
            if event_id in event_genes_to_info:
                event_info = event_genes_to_info[event_id]
                ensgene_ids = \
                    utils.unique_list(event_info["ensg_id"])
                if len(ensgene_ids) > 0 and ensgene_ids[0] != "NA":
                    ensgene_id = ",".join(ensgene_ids)
                refseq_ids = \
                    utils.unique_list(event_info["refseq_id"])
                if len(refseq_ids) > 0 and refseq_ids[0] != "NA":
                    refseq_id = ",".join(refseq_ids)
                gene_symbols = \
                    utils.unique_list(event_info["gsymbol"])
                if len(gene_symbols) > 0 and gene_symbols[0] != "NA":
                    gene_symbol = ",".join(gene_symbols)
            gene_rec.attributes["ensg_id"] = [ensgene_id]
            gene_rec.attributes["refseq_id"] = [refseq_id]
            gene_rec.attributes["gsymbol"] = [gene_symbol]
            # Yield all the gene's records
            for g in gene_recs:
                yield g

    t1 = time.time()
    print("Creating annotated GFF database...")
    annotated_db = gffutils.create_db(new_recs(), ":memory:", verbose=False)
    t2 = time.time()
    print("Creation took %.2f secs" % (t2 - t1))
    # Write to file
    print("Writing annotated GFF to file...")
    for gene_rec in annotated_db.all_features(featuretype="gene"):
        events_out.write_gene_recs(annotated_db, gene_rec.id)
    events_out.close()