def fold_groups(seqs, struct, hold, numkept=None):
    '''Function for multithreading.
    Recomputes structure for a group.

    Folds seqs with bayesfold and stores structgroups[struct] in hold, keyed
    by the newly computed structure. When bayesfold returns an empty
    structure the entry is stored under the original struct key instead.

    seqs: sequences handed straight to bayesfold
    struct: key into the global structgroups mapping
    hold: mapping that receives the result; mutated in place
    numkept: accepted but not used by this function

    Returns None. Any exception is printed to stdout and swallowed
    (best-effort worker); hold is then left without an entry for this group.
    '''
    try:
        # only the structure is needed here; the alignment is discarded
        _aln, refolded = bayesfold(seqs)
        key = struct if refolded == "" else refolded
        hold[key] = structgroups[struct]
    except Exception as e:
        # "except ... as" and print(...) are valid on Python 2.6+ and 3
        print(str(e))
        stdout.flush()
示例#2
0
def fold_clusters(lock, cluster, seqs, otufolder):
    '''Function for multithreading.
    Computes structure for a cluster and writes it to file.

    lock: object with acquire()/release(); held while the file is appended to
    cluster: cluster name, written as the start of the fasta header
    seqs: sequences handed to bayesfold (called with the "-diags" param)
    otufolder: prefix for the output path; "cluster_structs.fasta" is
        concatenated directly, so it needs its own trailing separator

    Appends ">cluster shape" followed by the structure line to
    otufolder + "cluster_structs.fasta". Returns None. Exceptions propagate
    to the caller, but the lock is always released first.
    '''
    aln, struct = bayesfold(seqs, params={"-diags": True})
    gshape = get_shape(struct)
    # write structure out to file
    lock.acquire()
    try:
        with open(otufolder + "cluster_structs.fasta", 'a') as cfo:
            cfo.write(">" + cluster + " " + gshape + "\n" + struct + "\n")
    finally:
        # release even when open/write fails, otherwise every other worker
        # waiting on this lock would block forever
        lock.release()
示例#3
0
def fold_clusters(lock, cluster, seqs, otufile):
    '''Function for multithreading.
    Computes structure for a cluster and writes it to file.

    lock: object with acquire()/release(); held while the file is appended to
    cluster: cluster name, written as the fasta header
    seqs: sequences handed to bayesfold (called with the "-diags" param)
    otufile: path of the file to append to

    Appends ">cluster", the structure, and the fasta-formatted alignment to
    otufile. Returns None. Errors from bayesfold propagate; errors while
    formatting or writing the record are reported on stderr and otherwise
    ignored (best-effort, as before).
    '''
    import sys

    aln, struct = bayesfold(seqs, params={"-diags": True})
    try:
        # build the whole record before taking the lock so a formatting
        # failure cannot leave half a record in the shared file
        record = ">%s\n%s\n%s\n" % (cluster, struct, aln.toFasta())
        lock.acquire()
        try:
            with open(otufile, 'a') as cfo:
                cfo.write(record)
        finally:
            # only reached once the lock is actually held
            lock.release()
    except Exception as e:
        sys.stderr.write("fold_clusters: could not write %s: %s\n"
                         % (cluster, e))
示例#4
0
def run_fold_for_infernal(currgroup, groupfasta, basefolder, minseqs=1,
                          maxseqs=500):
    '''Function for multithreading. Creates the final BayesFold alignment and
    writes to files, then r2r struct.

    currgroup: group identifier; output goes to basefolder + "group_<id>"
    groupfasta: path of the group's fasta file; the second "_"-separated
        field of each header is read as that sequence's count
    basefolder: prefix for the output folder (concatenated directly)
    minseqs: minimum summed count needed for the group to be processed
    maxseqs: at most this many sequences (in file order) are aligned and
        folded; defaults to the previous hard limit of 500

    Returns "" when the group is skipped (output folder already exists, or
    fewer than minseqs sequences) and None otherwise. Any exception is
    printed to stdout and swallowed.
    '''
    try:
        # skip if already run and program just crashed or whatever
        currotufolder = basefolder + "group_" + str(currgroup)
        if exists(currotufolder):
            return ""
        seqs = []
        count = 0
        # the with-block closes the input handle once parsing is done
        with open(groupfasta, 'rU') as fastain:
            for header, seq in MinimalFastaParser(fastain):
                abundance = header.split("_")[1]
                seqs.append((header.split()[0] + "_" + abundance, seq))
                count += int(abundance)
        if count < minseqs:
            return ""
        out = "group " + str(currgroup) + ": "
        out += "\n" + str(count) + " sequences\n"
        stdout.flush()
        # cap the number of sequences to align and fold, for memory reasons
        seqs = seqs[:maxseqs]
        # run BayesFold on sequences in the group
        aln, struct = bayesfold(seqs, params={"-diags": True})
        # create output folder for group
        mkdir(currotufolder)
        out += str(aln.getNumSeqs()) + " unique sequences\n"
        out += "Structure: " + struct + "\n"
        with open(currotufolder + "/log.txt", 'w') as logout:
            logout.write(out)
        # write out alignment and structure in fasta and stockholm formats
        with open(currotufolder + "/bayesfold-aln.fasta", 'w') as alnout:
            alnout.write(aln.toFasta() + "\n>SS_struct\n" + struct + "\n")
        stofile = currotufolder + "/bayesfold-aln.sto"
        struct_dict = {'SS_cons': struct}
        with open(stofile, 'w') as alnout:
            alnout.write(
                stockholm_from_alignment(aln, GC_annotation=struct_dict))
        # make R2R secondary structure for alignment
        make_r2r(stofile, currotufolder, "group_" + str(currgroup))
    except Exception as e:
        # "except ... as" and print(...) are valid on Python 2.6+ and 3
        print(str(e))
        stdout.flush()
def fold_clusters(lock, cluster, seqs, otufolder):
    '''Function for multithreading.
    Computes structure for a cluster and writes it to file.

    lock: object with acquire()/release(); held while the file is appended to
    cluster: cluster name, written as the fasta header
    seqs: sequences handed to bayesfold
    otufolder: prefix for the output path; "cluster_structs.fasta" is
        concatenated directly, so it needs its own trailing separator

    Appends ">cluster" and the structure to
    otufolder + "cluster_structs.fasta". Returns None. Any exception is
    reported on stdout and swallowed; the lock is released exactly when it
    was acquired.
    '''
    # pre-bind so the error report works even when bayesfold itself fails
    struct = ""
    try:
        aln, struct = bayesfold(seqs)
        # write structure out to file
        lock.acquire()
        try:
            with open(otufolder + "cluster_structs.fasta", 'a') as cfo:
                cfo.write(">" + cluster + "\n" + struct + "\n")
        finally:
            # paired with the acquire above, so a failure before the
            # acquire never releases a lock this worker does not hold
            lock.release()
    except Exception as e:
        # the original line here was a bare tuple expression that printed
        # nothing; emit the intended report plus the error text
        print(" ".join([str(cluster), str(struct), "\nERROR!", str(e)]))
        stdout.flush()
def run_fold_for_infernal(currgroup, groupfasta, basefolder, minseqs=1):
    '''Function for multithreading
    creates the final BayesFold alignment and writes to files, then r2r struct

    currgroup: group identifier; output goes to basefolder + "group_<id>"
    groupfasta: path of the group's fasta file; the second "_"-separated
        field of each header is read as that sequence's count
    basefolder: prefix for the output folder (concatenated directly)
    minseqs: minimum summed count needed for the group to be processed

    Returns "" when the group is skipped (output folder already exists, or
    fewer than minseqs sequences) and None otherwise. Any exception is
    printed to stdout and swallowed.
    '''
    groupname = "group_" + str(currgroup)
    try:
        # skip if already run and program just crashed or whatever
        currotufolder = basefolder + groupname
        if exists(currotufolder):
            return ""
        seqs = []
        count = 0
        # the with-block closes the input handle once parsing is done
        with open(groupfasta, 'rU') as fastain:
            for header, seq in MinimalFastaParser(fastain):
                abundance = header.split("_")[1]
                seqs.append((header.split()[0] + "_" + abundance, seq))
                count += int(abundance)
        # make sure group has enough sequences before continuing
        if count < minseqs:
            return ""
        # run BayesFold on every sequence read (no cap is applied here)
        aln, struct = bayesfold(seqs)
        # create output folder for group
        mkdir(currotufolder)
        out = "".join([
            "group " + str(currgroup) + ": ",
            "\n" + str(count) + " sequences\n",
            str(aln.getNumSeqs()) + " unique sequences\n",
            "Structure: " + struct + "\n",
        ])
        with open(currotufolder + "/log.txt", 'w') as logout:
            logout.write(out)
        # write out alignment and structure in fasta and stockholm formats
        with open(currotufolder + "/bayesfold-aln.fasta", 'w') as alnout:
            alnout.write(aln.toFasta() + "\n>SS_struct\n" + struct + "\n")
        stofile = currotufolder + "/bayesfold-aln.sto"
        with open(stofile, 'w') as alnout:
            alnout.write(stockholm_from_alignment(
                aln, GC_annotation={'SS_cons': struct}))
        # make R2R secondary structure for alignment
        make_r2r(stofile, currotufolder, groupname)
    except Exception as e:
        # "except ... as" and print(...) are valid on Python 2.6+ and 3
        print(str(e))
        stdout.flush()
def fold_clusters(lock, cluster, seqs, otufolder):
    '''Function for multithreading.
    Computes structure for a cluster and writes it to file.

    lock: object with acquire()/release(); held while the file is appended to
    cluster: cluster name, written as the fasta header
    seqs: sequences handed to bayesfold unchanged
    otufolder: prefix for the output path; "cluster_structs.fasta" is
        concatenated directly, so it needs its own trailing separator

    Appends ">cluster" and the structure to
    otufolder + "cluster_structs.fasta". Returns None. Any exception is
    reported on stdout and swallowed; the lock is released exactly when it
    was acquired.
    '''
    # NOTE(review): earlier comments here spoke of folding only ~10
    # sequences for speed; no subsampling happens in this function, so the
    # caller presumably trims seqs beforehand - confirm.
    folded = ""
    try:
        _aln, folded = bayesfold(seqs)
        record = ">" + cluster + "\n" + folded + "\n"
        # write structure out to file
        lock.acquire()
        try:
            with open(otufolder + "cluster_structs.fasta", 'a') as cfo:
                cfo.write(record)
        finally:
            # release only what was acquired; the old handler released the
            # lock even when the failure happened before the acquire
            lock.release()
    except Exception as err:
        # the original line here was a bare tuple expression that printed
        # nothing and raised on the unbound structure when folding failed
        print(" ".join([str(cluster), str(folded), "\nERROR!", str(err)]))
        stdout.flush()
示例#8
0
def create_final_output(groupfasta, basefolder, minseqs=1, cpus=1):
    '''Function for multithreading. Produces every final artifact for one
    group: BayesFold structure, log, fasta and stockholm alignments, the R2R
    drawing and a calibrated infernal CM file.

    groupfasta: path to the group's aligned fasta; the file name up to its
        first "." becomes the group name
    basefolder: prefix for the output folder (concatenated directly)
    minseqs: groups whose count_seqs total is below this are skipped
    cpus: forwarded to calibrate_cmfile

    Returns None in every case.
    '''
    currgroup = groupfasta.split("/")[-1].split(".")[0]
    currotufolder = basefolder + currgroup
    # an existing folder is taken to mean an earlier run handled this group
    if exists(currotufolder):
        return

    # load seqs and bail out when the group is too small
    aln = LoadSeqs(groupfasta, moltype=RNA, aligned=True)
    count = count_seqs(aln.Names)
    if count < minseqs:
        return

    # flat list alternating sequence name and its weight (weight == count)
    weights = []
    maxweight = 0
    for label in aln.Names:
        seqweight = count_seqs(label)
        maxweight = max(maxweight, seqweight)
        weights.extend((label.split()[0], str(seqweight)))

    # structure prediction on the existing alignment (no realignment)
    aln, struct = bayesfold(aln, align=False)

    mkdir(currotufolder)
    logpath = currotufolder + "/log.txt"
    fastapath = currotufolder + "/bayesfold-aln.fasta"
    stofile = currotufolder + "/bayesfold-aln.sto"
    cmpath = currotufolder + "/cmfile.cm"

    # log information
    with open(logpath, 'w') as logout:
        logfields = [
            currgroup, ":\n",
            str(count), "sequences\n",
            str(aln.getNumSeqs()), "unique sequences\nStructure: ", struct,
            "\n"
        ]
        logout.write(' '.join(logfields))
    # alignment and structure in fasta format
    with open(fastapath, 'w') as alnout:
        alnout.write(">SS_cons\n%s\n%s" % (struct, aln.toFasta()))

    # keep only the first whitespace-separated token of each label before
    # building the stockholm text
    aln = LoadSeqs(data=aln, moltype=RNA,
                   label_to_name=lambda label: label.split()[0])
    sto = stockholm_from_alignment(aln, GC_annotation={'SS_cons': struct})
    del aln

    # per-sequence weight lines, each weight scaled by the largest one
    gs_lines = [
        '# =GS %s WT %s\n' % (seqname, str(float(wt) / maxweight))
        for seqname, wt in zip(weights[0::2], weights[1::2])
    ]
    infweights = ''.join(gs_lines)
    # weight map line for r2r, using the unscaled weights
    r2r_weights = "# =GF USE_THIS_WEIGHT_MAP " + ' '.join(weights)

    # overwrite the last line of the stockholm text with the weight lines
    # (presumably that line is the terminator - confirm against
    # stockholm_from_alignment), add the r2r line and re-terminate with //
    sto_lines = sto.split("\n")
    sto_lines[-1] = infweights.strip()
    sto_lines.append(r2r_weights)
    sto_lines.append("//\n")
    with open(stofile, 'w') as alnout:
        alnout.write('\n'.join(sto_lines))

    # R2R secondary structure drawing for the alignment
    make_r2r(stofile, currotufolder, currgroup)
    # build and calibrate the infernal CM from the same stockholm file
    cmbuild_from_file(stofile, cmpath, params={'--wgiven': True})
    calibrate_cmfile(cmpath, cpus=cpus)