Example #1
def gather_and_stitch(seq, tfile):

    # 'os', the project 'io' module (gzip/pickle helpers), 'combine_data'
    # and 'args' are expected to be available at module scope.
    total_data = []
    in_file = str(seq) + "_tophits.gzip"

    if os.path.isfile(in_file):
        print "Extracting data from non_refined file:", in_file
        tasks = io.readGzipPickle(in_file)
        for entry in tasks:
            total_data.append(entry)
    else:
        print "Something is wrong in the datafile: ", in_file

    refined_data = combine_data()
    if refined_data:
        for entry in refined_data:
            total_data.append(entry)

    from ranking.NoeStageRank import rank_assembly_with_clustering
    ranked_data = rank_assembly_with_clustering(total_data, args.numhits)
    io.dumpGzipPickle(str(tfile), ranked_data)

    # Clean up intermediate files. os.system() signals failure through a
    # nonzero exit status, not a NameError, so check the return value.
    print "Deleting old rtx_* files!"
    if os.system("rm rtx_*.gzip") != 0:
        print "No rtx_* files exist!"
    return True
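
A safer alternative to shelling out with rm is to expand the glob in Python and remove each match explicitly, which avoids depending on a Unix shell and makes the "no files" case explicit. A minimal sketch, assuming the same rtx_*.gzip naming used above (remove_intermediate_files is an illustrative helper, not part of the project):

import glob
import os

def remove_intermediate_files(pattern="rtx_*.gzip"):
    """Delete all files matching pattern; report when none exist."""
    matches = glob.glob(pattern)
    if not matches:
        print("No files matching %s exist!" % pattern)
        return 0
    for path in matches:
        os.remove(path)
    return len(matches)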
Example #2
def altSmotifSearch(job):

    # send_job = [tasks[t_job[0]], alt_sse_profile[t_job[1]], args.stage, task_index, lowest_noe_energy]
    all_log = []
    task = job[0][:]  # shallow copy of the task record
    refine_pair = task[8][1]
    index_array = job[3]
    print "task_index", index_array
    for pair in refine_pair:
        tdump_log = perform_alt_search(job, pair)
        if tdump_log:
            for t in tdump_log:
                all_log.append(t)

    # Dump data to the disk
    if all_log:
        io.dumpGzipPickle("rtx_" + str(index_array) + ".gzip", all_log)
        # Report success when results were actually written; previously
        # both branches returned False.
        return True
    else:
        return False
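
The accumulation loop above flattens one result list per refine pair into a single log, skipping pairs that returned nothing. The same pattern can be expressed with itertools.chain.from_iterable; a small sketch (flatten_logs is a hypothetical helper, and the inputs stand in for perform_alt_search results):

import itertools

def flatten_logs(per_pair_logs):
    """Merge per-pair result lists into one log, skipping empty results."""
    return list(itertools.chain.from_iterable(
        log for log in per_pair_logs if log))

# flatten_logs([[1, 2], False, [3]]) == [1, 2, 3]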
Example #3
def makeTopPickle2Old(previous_smotif_index, num_hits, stage):
    """
    Concatenate data from all of the threads, organize, remove redundancies, rank
     and extract top hits as defined
    :param previous_smotif_index:
    :param num_hits:
    :param stage:
    :return:
    """
    hits = []
    # regex = str(previous_smotif_index) + "_*_*.pickle"
    regex = str(previous_smotif_index) + "_*_*.gzip"
    file_list = glob.glob(regex)
    for f in file_list:
        t_hits = io.readGzipPickle(f)
        for t_hit in t_hits:
            hits.append(t_hit)
    """
    identifiers: smotif, smotif_def, seq_filter, contacts_filter, PCS_filter, qcp_rmsd, Evofilter
                 RDC_filter, NOE_filter
    """

    new_dict = collections.defaultdict(list)

    for hit in hits:
        # thread_data contains data from each search and filter thread.

        if hit[4][0] == 'NOE_filter':
            no_of_noes = hit[4][2]
            new_dict[no_of_noes].append(hit)

    keys = sorted(new_dict.keys(), reverse=True)
    # Rank based on NOE energy
    reduced_dump_log = []
    seqs = []
    count_hits = 0
    for i in range(len(keys)):
        entries = new_dict[keys[i]]
        if len(entries) == 1:
            # There is only one entry in this no_of_noes bin; just check
            # against existing sequences and move on.
            smotif_seq = entries[0][3][1]
            if smotif_seq not in seqs:
                seqs.append(smotif_seq)
                reduced_dump_log.append(entries[0])
                print "final sele", entries[0][0][1][0][0], keys[i]
                count_hits += 1
        else:
            t_log = collections.defaultdict(list)
            for hit in entries:  # filter on noe_energy
                if hit[4][0] == 'NOE_filter':
                    noe_energy = hit[4][3]
                    noe_energy = round(noe_energy, 2)
                    t_log[noe_energy].append(hit)
            noe_energy_bins = sorted(t_log.keys())

            for j in range(len(noe_energy_bins)):  # filter on RDC score
                t2_log = collections.defaultdict(list)
                hits = t_log[noe_energy_bins[j]]
                for hit in hits:
                    if hit[5][0] == 'RDC_filter':
                        rdc_tensors = hit[5][1]
                        rdc_score = 0
                        for tensor in rdc_tensors:
                            rdc_score = rdc_score + tensor[0]
                        t2_log[rdc_score].append(hit)
                rdc_score_bins = sorted(t2_log.keys())
                for k in range(len(rdc_score_bins)):
                    hits = t2_log[rdc_score_bins[k]]
                    for hit in hits:
                        smotif_seq = hit[3][1]
                        if smotif_seq not in seqs:
                            seqs.append(smotif_seq)
                            reduced_dump_log.append(hit)
                            print "final sele", hit[0][1][0][0], keys[
                                i], noe_energy_bins[j], rdc_score_bins[k]
                            count_hits += 1
                if count_hits >= num_hits:
                    break
        if count_hits >= num_hits:
            break
    if count_hits < num_hits:
        print "could only extract ", len(reduced_dump_log), count_hits

    # io.dumpPickle(str(previous_smotif_index) + "_tophits.pickle", dump_pickle)
    io.dumpGzipPickle(
        str(previous_smotif_index) + "_tophits.gzip", reduced_dump_log)
    print "actual number in top hits ", len(reduced_dump_log)
    return range(len(reduced_dump_log))
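
The nested binning above amounts to ordering hits on a composite key: NOE count descending, then rounded NOE energy ascending, then summed RDC score ascending, keeping the first hit per unique sequence. A sketch of that idea, assuming the same tuple layout used above (hit[4] holds the NOE record, hit[5] the RDC record, hit[3][1] the sequence); rank_hits is illustrative, not the project's ranking routine:

def rank_hits(hits, num_hits):
    """Order hits hierarchically and keep the first per unique sequence."""
    def sort_key(hit):
        no_of_noes = hit[4][2]                     # NOE count (higher is better)
        noe_energy = round(hit[4][3], 2)           # NOE energy (lower is better)
        rdc_score = sum(t[0] for t in hit[5][1])   # summed RDC tensor scores
        return (-no_of_noes, noe_energy, rdc_score)

    ranked, seen = [], set()
    for hit in sorted(hits, key=sort_key):
        seq = hit[3][1]
        if seq not in seen:
            seen.add(seq)
            ranked.append(hit)
        if len(ranked) >= num_hits:
            break
    return ranked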
Example #4
def makeTopPickle2(previous_smotif_index, num_hits, stage):
    """
    Concatenate data from all of the threads, organize, remove redundancies, rank
     and extract top hits as defined
    :param previous_smotif_index:
    :param num_hits:
    :param stage:
    :return:
    """
    hits = []
    # regex = str(previous_smotif_index) + "_*_*.pickle"
    regex = str(previous_smotif_index) + "_*_*.gzip"
    file_list = glob.glob(regex)
    for f in file_list:
        t_hits = io.readGzipPickle(f)
        for t_hit in t_hits:
            hits.append(t_hit)
    """
    identifiers: smotif, smotif_def, seq_filter, contacts_filter, PCS_filter, qcp_rmsd, Evofilter
                 RDC_filter, NOE_filter
    """

    new_dict = collections.defaultdict(list)

    for hit in hits:
        # thread_data contains data from each search and filter thread.
        noe_energy = hit[5][3]
        noe_energy = round(noe_energy, 4)
        new_dict[noe_energy].append(hit)
    keys = sorted(new_dict.keys())
    # Rank based on NOE energy
    reduced_dump_log = []
    seqs = []
    count_hits = 0
    for i in range(len(keys)):
        entries = new_dict[keys[i]]
        if count_hits >= num_hits:
            break
        if len(entries) == 1:
            smotif_seq = entries[0][4][1]
            if smotif_seq not in seqs:
                seqs.append(smotif_seq)
                reduced_dump_log.append(entries[0])
                print "final sele", entries[0][0][1][0][0], keys[i]
                count_hits += 1
                if count_hits >= num_hits:
                    break
        else:
            t2_log = collections.defaultdict(list)
            rdc_score_bins = []
            # Decide which filter's record sits at index 6 from the first
            # entry in this bin (previously this read a stale 'hit' left
            # over from the loop above).
            if entries[0][6][0] == 'RDC_filter':
                for hit in entries:
                    rdc_tensors = hit[6][1]
                    rdc_score = 0
                    for tensor in rdc_tensors:
                        rdc_score = rdc_score + tensor[0]
                    t2_log[rdc_score].append(hit)
                rdc_score_bins = sorted(t2_log.keys())
            elif entries[0][6][0] == 'PCS_filter':
                print "Working on PCS filter instead of RDC"
                for hit in entries:
                    pcs_tensors = hit[6][1]
                    pcs_score = 0
                    for tensor in pcs_tensors:
                        pcs_score = pcs_score + tensor[1]
                    t2_log[pcs_score].append(hit)
                rdc_score_bins = sorted(t2_log.keys())
            else:
                print "Something is wrong with your PCS logic"

            for k in range(len(rdc_score_bins)):
                hits = t2_log[rdc_score_bins[k]]
                for hit in hits:
                    smotif_seq = hit[4][1]
                    if smotif_seq not in seqs:
                        seqs.append(smotif_seq)
                        reduced_dump_log.append(hit)
                        count_hits += 1
                        print "final sele", hit[0][1][0][0], keys[
                            i], rdc_score_bins[k]
                    if count_hits >= num_hits:
                        break
                if count_hits >= num_hits:
                    break
            if count_hits >= num_hits:
                break
    if count_hits < num_hits:
        print "could only extract ", len(reduced_dump_log), count_hits

    io.dumpGzipPickle(
        str(previous_smotif_index) + "_tophits.gzip", reduced_dump_log)
    print "actual number in top hits ", len(reduced_dump_log)
    return range(len(reduced_dump_log))
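
The branch on the record at index 6 selects how the per-hit score is computed: RDC records sum tensor[0] across tensors, PCS records sum tensor[1]. A compact sketch of the same dispatch (tensor_score is a hypothetical helper; the tuple layout is assumed from the code above):

def tensor_score(hit):
    """Score a hit from the RDC or PCS tensor record stored at index 6."""
    tag, tensors = hit[6][0], hit[6][1]
    if tag == 'RDC_filter':
        return sum(t[0] for t in tensors)
    if tag == 'PCS_filter':
        return sum(t[1] for t in tensors)
    raise ValueError("unexpected filter tag: %r" % tag)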
Example #5
def makeTopPickle(previous_smotif_index, num_hits, stage):
    """
    Concatenate data from all of the threads, organize, remove redundancies, rank
     and extract top hits as defined
    :param previous_smotif_index:
    :param num_hits:
    :param stage:
    :return:
    """
    hits = []
    # regex = str(previous_smotif_index) + "_*_*.pickle"
    regex = str(previous_smotif_index) + "_*_*.gzip"
    file_list = glob.glob(regex)
    for f in file_list:
        t_hits = io.readGzipPickle(f)
        for t_hit in t_hits:
            hits.append(t_hit)
    """
    identifiers: smotif, smotif_def, seq_filter, contacts_filter, PCS_filter, qcp_rmsd, Evofilter
                 RDC_filter, NOE_filter
    """

    new_dict = collections.defaultdict(list)
    for hit in hits:
        # thread_data contains data from each search and filter thread.
        # Initialize the per-hit score records; rdc_constant is reset per
        # hit so the 1e-10 scaling below does not compound across hits.
        total_score = {}
        rdc_constant = 0.0
        for data_filter in hit:

            if data_filter[0] == 'PCS_filter':
                pcs_data = data_filter
                pcsscore = getNchiSum(pcs_data, stage)
                total_score['pcs_score'] = pcsscore

            if data_filter[0] == 'Ref_RMSD':
                total_score['rmsd_score'] = data_filter[1]

            if data_filter[0] == 'RDC_filter':
                rdc_data = data_filter
                #Nchi = rdcSumChi(rdc_data, stage)
                log_likelihood = data_filter[2]
                rdc_tensors = data_filter[1]
                for tensor in rdc_tensors:
                    rdc_constant = rdc_constant + tensor[0]
                rdc_constant = rdc_constant * 1e-10
                total_score['rdc_score'] = log_likelihood

            if data_filter[0] == 'NOE_filter':
                noe_probability = data_filter[1]
                log_likelihood = -1 * (math.log(noe_probability))
                total_score['noe_score'] = log_likelihood

        # Calculate the total score and append the hit.
        if total_score:
            # Rank on the NOE and RDC terms only; other recorded scores
            # are ignored here.
            keys = ['noe_score', 'rdc_score']
            #keys = ['rmsd_score']
            tscore = 0
            for key in keys:
                tscore = tscore + total_score[key]
            tscore = tscore + rdc_constant
            if tscore < 999.999:
                new_dict[tscore].append(hit)

    # ************************************************
    # Exclude the redundant entries and rank top hits
    # ************************************************

    keys = sorted(new_dict.keys())

    # Exclude the redundant data.

    # non_redundant = {}
    non_redundant = collections.defaultdict(list)
    seqs = []
    smotif_seq = ''
    count_hits = 0
    for i in range(0, len(keys)):
        entries = new_dict[keys[i]]
        for entry in entries:
            for ent in entry:
                if ent[0] == 'seq_filter':
                    seq_filter = ent
                    smotif_seq = seq_filter[1]
            if smotif_seq not in seqs:
                seqs.append(smotif_seq)
                non_redundant[keys[i]].append(entry)
                count_hits += 1
        if count_hits >= num_hits:
            break

    # Rank top hits and dump the data
    keys = sorted(non_redundant.keys())

    dump_pickle = []
    print "Dumping data to disk"
    count_top_hits = 0
    for key in keys:
        entries = non_redundant[key]
        for entry in entries:
            dump_pickle.append(entry)
            # print "final selected Smotif: ", entry[0][1][0][0], "with score: ", key
            print "final sele", entry[0][1][0][0], key
            count_top_hits += 1
        if count_top_hits >= num_hits:
            break
    if count_top_hits < num_hits:
        print "could only extract ", count_top_hits

    # io.dumpPickle(str(previous_smotif_index) + "_tophits.pickle", dump_pickle)
    io.dumpGzipPickle(
        str(previous_smotif_index) + "_tophits.gzip", dump_pickle)
    print "actual number in top hits ", len(dump_pickle)
    return range(count_top_hits)
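
The composite score above sums negative log-likelihood terms: the NOE term is -log(noe_probability), and the RDC term is the filter's reported log-likelihood plus a small scaled tensor constant. A worked sketch of the NOE conversion (noe_log_likelihood is illustrative):

import math

def noe_log_likelihood(noe_probability):
    """Convert an NOE probability into a negative log-likelihood score."""
    return -math.log(noe_probability)

# noe_log_likelihood(0.9) -> 0.10536051565782628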
Example #6
                tasks), "Smotifs, Elapsed", round((elapsed) / (60), 2), "mins"
        elif tag == tags.EXIT:
            closed_workers += 1

    # consolidate top_hits and dump files here
    if args.stage == 1:
        tasks, sse_index = srank.getRunSeq(
            args.numhits, args.stage)  # TODO change this for new alt_smotifs
        exit()
    print "Total number of hits  found are : ", len(total_data)
    # ranked_data = rank_assembly(total_data, args.numhits)
    ranked_data = rank_assembly_with_clustering_pcs2(total_data, args.numhits)
    print len(ranked_data)
    if args.stage == 1:  # unreachable: stage 1 already exited above
        sse_index = 0
    io.dumpGzipPickle(str(sse_index) + "_tophits.gzip", ranked_data)
    # Rename temporary files
    util.rename_pickle(sse_index)
    print "All Done, Master exiting"
    exit()

# On the worker processes
else:

    while True:  # initiate infinite loop
        comm.send(None, dest=0, tag=tags.READY)
        # Signal the master process that you are READY

        task = comm.recv(source=0, tag=MPI.ANY_TAG, status=status)  # MPI.ANY_TAG is the tag wildcard
        tag = status.Get_tag()
        if tag == tags.START:
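
The fragment above is the worker side of an mpi4py master/worker loop driven by message tags. A minimal, self-contained sketch of that protocol, assuming integer tag values and a trivial echo in place of the real search work (the tag values and task payloads here are placeholders, not the project's definitions):

from mpi4py import MPI

READY, START, DONE, EXIT = 0, 1, 2, 3  # assumed tag values

comm = MPI.COMM_WORLD
status = MPI.Status()

if comm.Get_rank() != 0:  # worker ranks
    while True:
        comm.send(None, dest=0, tag=READY)            # signal readiness
        task = comm.recv(source=0, tag=MPI.ANY_TAG, status=status)
        tag = status.Get_tag()
        if tag == START:
            comm.send(task, dest=0, tag=DONE)         # echo result back
        elif tag == EXIT:
            break
    comm.send(None, dest=0, tag=EXIT)                 # confirm shutdown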
Example #7
def S1SmotifSearch(task):
    """
    Main ()
    :param task:
    :return:
    """

    index_array = task[0]
    stage = task[1]
    s1_def, s2_def, sse_route = mutil.getSSdef(index_array)
    smotif_def = sm.getSmotif(s1_def, s2_def)
    exp_data = io.readPickle("exp_data.pickle")
    exp_data_types = exp_data.keys()  # ['ss_seq', 'pcs_data', 'aa_seq', 'contacts']

    smotif_data = sm.readSmotifDatabase(smotif_def,
                                        exp_data['database_cutoff'])

    if not smotif_data:
        # If the smotif library doesn't exist, terminate further execution.
        return False

    dump_log = []

    # ************************************************************************************************
    # Main
    # The 'for' loop below iterates over all of the Smotifs and applies various filters
    # This is the place to add new filters as you desire. For starters, look at Sequence filter.
    # ************************************************************************************************

    for i in range(0, len(smotif_data)):

        # ************************************************
        # Excluding the natives
        # ************************************************

        tpdbid = smotif_data[i][0][0]
        pdbid = tpdbid[0:4]

        if 'natives' in exp_data_types:
            # Look up the natives list only when it exists in exp_data.
            natives = exp_data['natives']
            if pdbid in natives:
                # Stop further execution, but, iterate.
                continue

        if 'homologs' in exp_data_types:  # Smotif assembly only from the specified pdb files
            homologs = exp_data['homologs']
            if pdbid not in homologs:
                # Stop further execution, but, iterate.
                continue

        # ************************************************
        # Applying different filters to Smotifs
        # Prepare temp log array to save data at the end
        # ************************************************

        tlog, pcs_tensor_fits, rdc_tensor_fits = [], [], []
        ref_rmsd, noe_probability = 0.0, 0.0

        tlog.append(['smotif', smotif_data[i], sse_route])
        tlog.append(['smotif_def', [s1_def, s2_def]])
        tlog.append(['qcp_rmsd'])
        tlog.append(['cathcodes', [smotif_data[i][0]], [sse_route]])

        # ************************************************
        # Sequence filter
        # Aligns the smotif seq to target seq and calculates
        # sequence identity and the alignment score
        # ************************************************

        smotif_seq, seq_identity = Sfilter.getS1SeqIdentity(
            s1_def, s2_def, smotif_data[i], exp_data)
        tlog.append(['seq_filter', smotif_seq, seq_identity])

        # ************************************************
        # Unambiguous NOE score filter
        # uses experimental ambiguous noe data to filter Smotifs
        # scoring based on f-measure?
        # ************************************************

        if 'ilva_noes' in exp_data_types:

            noe_probability, no_of_noes, noe_energy, noe_data, cluster_protons, cluster_sidechains = noepdf.s1ILVApdf(
                s1_def, s2_def, smotif_data[i], exp_data, stage)

            if noe_probability >= exp_data['expected_noe_prob'][stage - 1]:
                tlog.append([
                    'NOE_filter', noe_probability, no_of_noes, noe_energy,
                    noe_data, cluster_protons, cluster_sidechains
                ])
            else:
                continue

        # ************************************************
        # Residual dipolar coupling filter
        # uses experimental RDC data to filter Smotifs
        # scoring based on normalised chisqr.
        # ************************************************

        if 'rdc_data' in exp_data_types:
            rdc_tensor_fits, log_likelihood, rdc_energy = Rfilter.RDCAxRhFit(
                s1_def, s2_def, smotif_data[i], exp_data)
            if rdc_tensor_fits:
                tlog.append([
                    'RDC_filter', rdc_tensor_fits, log_likelihood, rdc_energy
                ])
            else:
                continue

        # ************************************************
        # Pseudocontact Shift filter
        # uses experimental PCS data to filter Smotifs
        # scoring based on normalised chisqr
        # ************************************************

        if 'pcs_data' in exp_data_types:
            pcs_tensor_fits = Pfilter.PCSAxRhFit(s1_def, s2_def,
                                                 smotif_data[i], exp_data)
            tlog.append(['PCS_filter', pcs_tensor_fits])

        # ************************************************
        # Calc RMSD of the reference structure.
        # Used to identify the lowest possible RMSD
        # structure for the target, from the Smotif library.
        # ************************************************

        if 'reference_ca' in exp_data_types:
            ref_rmsd = ref.calcRefRMSD(exp_data['reference_ca'],
                                       s1_def,
                                       s2_def,
                                       smotif_data[i],
                                       rmsd_cutoff=100.0)
            tlog.append(['Ref_RMSD', ref_rmsd, seq_identity])

        # Dump the data to the disk
        if pcs_tensor_fits or noe_probability:
            dump_log.append(tlog)

    # Save all of the hits in pickled arrays
    if dump_log:
        if 'rank_top_hits' in exp_data_types:
            rank_top_hits = exp_data['rank_top_hits']
            num_hits = rank_top_hits[stage - 1]
            dump_log = rank.rank_assembly(dump_log, num_hits)
            print "Reducing the amount of data to:", rank_top_hits[
                stage - 1], len(dump_log)
        print "num of hits", len(dump_log)
        io.dumpGzipPickle(
            '0_' + str(index_array[0]) + "_" + str(index_array[1]) + ".gzip",
            dump_log)
        return dump_log
    else:
        return False
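
Each hit assembled above is a list of tagged records (['seq_filter', ...], ['NOE_filter', ...], and so on), which downstream ranking code indexes by position. A small helper that looks records up by tag can make that access less brittle; a sketch, assuming every record carries its tag as the first element (get_record is illustrative):

def get_record(hit, tag):
    """Return the first record in a hit whose leading element equals tag."""
    for record in hit:
        if record and record[0] == tag:
            return record
    return None

# e.g. get_record(tlog, 'seq_filter')[1] instead of relying on tlog[4][1]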