Example No. 1
def predict_position(params, recalculate, mutation, use_neighbor):

    import pdb
    import wc
    import objects

    # mutation is (protein_name, position, wild_residue, mutant_residue)
    protein_name = mutation[0]
    pos = mutation[1]
    params.set_param('uniprot_id', protein_name)
    seq = wc.get_stuff(objects.dW, params, recalculate, False, False)
    msa = wc.get_stuff(objects.agW, params, recalculate, False, False)

    if use_neighbor:
    
        neighbors = wc.get_stuff(objects.neighbors_w, params, recalculate, False, False)
        try:
            msa = filter_msa_based_on_pos_neighbors_and_query(seq, pos, msa, neighbors[pos])
        except Exception:
            # debugging hook: drop into pdb if the neighbor-based filtering fails
            pdb.set_trace()
    weights = get_weight_of_msa_seqs(msa)

    res = mutation[3]

    # weight-normalized frequency of residue mutation[3] in this alignment column
    score = 0

    col = msa.get_column(pos)
    for i in range(len(msa)):
        if col[i] == res:
            score += weights[i]

    return score / sum(weights)
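
The returned value is just the weight-normalized frequency of the residue of interest in one alignment column: the summed weight of sequences whose residue matches, divided by the total weight. A minimal self-contained sketch of that computation on a made-up column (the toy column, weights, and residue below stand in for msa.get_column(pos), get_weight_of_msa_seqs(msa), and mutation[3]):

def weighted_residue_frequency(column, weights, res):
    # fraction of the total sequence weight whose residue in this column equals res
    hit_weight = sum(w for aa, w in zip(column, weights) if aa == res)
    return hit_weight / sum(weights)

# toy example: one column from four aligned sequences, with hand-picked weights
toy_column = ['A', 'A', 'G', 'A']
toy_weights = [0.5, 1.0, 1.0, 0.5]
print(weighted_residue_frequency(toy_column, toy_weights, 'A'))  # 2.0 / 3.0 ~= 0.667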
Example No. 2
def predict_position_energy(params, recalculate, mutation, use_neighbor, ignore_pos):

    import math
    import pdb
    import sys
    import wc
    import objects

    protein_name = mutation[0]
    pos = mutation[1]
    wild_res = mutation[2]
    mut_res = mutation[3]
    params.set_param('uniprot_id', protein_name)
    seq = wc.get_stuff(objects.dW, params, recalculate, False, False)
    msa = wc.get_stuff(objects.agW, params, recalculate, False, False)

    score = 0
    
    col = msa.get_column(pos)

    if not ignore_pos:
        try:
            # log-odds of mutant vs. wild-type counts in the column, with +1 pseudocounts
            score += -1.0 * math.log( float(col.count(mut_res)+1) / (col.count(wild_res)+1) )
        except Exception:
            # debugging hook
            pdb.set_trace()
        print >> sys.stderr, score
    if col.count(mut_res) == 0:
        return -1.0 * score
    
    
    if use_neighbor:
        constraints_a = [(pos,wild_res)]
        filter_a_msa = filter_msa_based_on_pos_constraint(msa, constraints_a)
        constraints_b = [(pos,mut_res)]
        filter_b_msa = filter_msa_based_on_pos_constraint(msa, constraints_b)
        all_neighbors = wc.get_stuff(objects.neighbors_w, params, recalculate, False, False)
        neighbors = all_neighbors[pos]
        # only the first neighbor is used here
        for neighbor in neighbors[0:1]:
            try:
                na_col = filter_a_msa.get_column(neighbor)
                nb_col = filter_b_msa.get_column(neighbor)
            except Exception:
                # debugging hook
                pdb.set_trace()
            na_col_no_skip = [x for x in na_col if x != '-']
            nb_col_no_skip = [x for x in nb_col if x != '-']
            score += get_KL_real(nb_col_no_skip, na_col_no_skip)
            if abs(score) < .001:
                # debugging hook for suspiciously small scores
                pdb.set_trace()

    return -1.0 * score
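
The position term in predict_position_energy is a log-odds of mutant versus wild-type counts in the alignment column, with +1 pseudocounts so the ratio is defined even when a residue never occurs. A small self-contained sketch of just that term on a made-up column (the neighbor/KL part and get_KL_real are not reproduced here):

import math

def position_log_odds(column, wild_res, mut_res):
    # -log of (mutant count + 1) / (wild-type count + 1); positive when the
    # wild-type residue is more common than the mutant at this position
    return -1.0 * math.log(float(column.count(mut_res) + 1) / (column.count(wild_res) + 1))

toy_column = ['L', 'L', 'L', 'V', 'L', '-']
print(position_log_odds(toy_column, 'L', 'V'))  # -log(2/5) ~= 0.916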
Example No. 3
def get(obj, p, gotten_stuff, used_ps, check = True):

    # relies on module-level imports (wc, sys, datetime) and globals
    # (past, which_job, total_jobs, whether_to_get_anything, whether_to_check_remote)
    global past
    used_ps.add(p.get_copy())
    print >> sys.stderr, 'starting: ', p.get_param('uniprot_id'), obj, which_job, total_jobs

    gotten_stuff.append([obj, p.get_copy()])

    global whether_to_get_anything
    if whether_to_get_anything and not wc.get_wrapper_instance(obj).has(p, False, whether_to_check_remote):
        ans = wc.get_stuff(obj, p, False, False, False)

        print >> sys.stderr, 'took: ', datetime.datetime.now() - past
        past = datetime.datetime.now()
            
        return ans


    print >> sys.stderr, 'already have: ', p.get_param('uniprot_id'), obj
Example No. 4
 def human_classify(self, record):
     import wc
     import param
     import pdb
     p = param.param({'pid':record.pid, 'rec_idx':record.idx})
     stored_qa = wc.get_stuff(side_effect_human_input_report_labels, p)
     import questions
     the_q = questions.urinary_incontinence
     try:
         ans = stored_qa[the_q]
     except KeyError:
         raise my_exceptions.NoFxnValueException
     else:
         if ans == 0:
             raise my_exceptions.NoFxnValueException
         else:
             if ans in [1,2]:
                 return 1
             elif ans in [3,4]:
                 return 0
             else:
                 # unexpected answer value; stop in the debugger, then fail loudly
                 pdb.set_trace()
                 raise ValueError('unexpected answer: %s' % ans)
Example No. 5
 def data_set_from_pid_list(cls, pid_list, params):
     import wc
     import objects
     from global_stuff import get_tumor_cls, get_tumor_w
     the_data = []
     i = 0
     for pid in pid_list:
         print i, pid
         i += 1
         params.set_param('pid', pid)
         try:
             a_tumor = wc.get_stuff(get_tumor_w(), params)
             #assert len(a_tumor.attributes) == get_tumor_cls().num_attributes
         except my_exceptions.WCFailException:
             print 'failed to get ', pid
         except AssertionError:
             print 'failed to get ', pid, ' number of attributes was incorrect'
         except Exception:
             print 'failed to get ', pid, ' not sure of error'
         else:
             the_data.append(a_tumor)
     return cls(the_data)
            edge_list = self.get_var_or_file(objects.iW, params, False, False, False)
            helper.write_mat(edge_list, the_folder + 'edge_list.csv')

            # build a symmetric adjacency matrix from the edge list
            num_nodes = len(node_features)
            adj_mat = [ [0 for i in range(num_nodes)] for j in range(num_nodes)]
            for i in range(len(edge_list)):
                n1 = edge_list[i][0]
                n2 = edge_list[i][1]
                adj_mat[n1][n2] = 1
                adj_mat[n2][n1] = 1
            helper.write_mat(adj_mat, the_folder + 'adj_mat.csv')

            info = [str(len(node_features)), str(len(edge_features)), str(2), str(len(node_features[0])), str(len(edge_features[0]))]
            helper.write_vect(info, the_folder + 'info.txt', the_sep = ' ')
            
            
        return None

#pdb_names = ['12as','2jcw','13pk','1a4i','1a4s','1ab8']
pdb_names = ['2jcw']
#chain_letters = ['A','A','A','A','A','A']
chain_letters = ['A']

from parameters import the_params

the_params.set_param('pdb_names', pdb_names)
the_params.set_param('chain_letters', chain_letters)

wc.get_stuff(generate_old_input_files, the_params, False, False, False, False)
Example No. 7
import pdb
import f as features
import new_new_objects as objects
import param
import wc
import global_stuff

# hardcode parameters for the experiment here for now.

#the_dict = {'pdb_name':'1asy', 'chain_letter':'A', 'edge_feature_list':[features.xW], 'node_feature_list':[features.vW, features.uW, features.wW], 'dist_cut_off':5}

#the_params = param.param(the_dict)


import helper

file_location = 'mf_nodewise_0'

folder_name, the_params = helper.read_param(file_location)

the_params.set_param('p', '1p3d')
the_params.set_param('c', 'A')
the_params.set_param('st', 322)
the_params.set_param('en', 473)


ans = wc.get_stuff(objects.ciW, the_params, False, False, False)

print ans
for line in f:
    name = line.strip()
    folder = global_stuff.base_folder + name + '/'
    files = os.listdir(folder)
    has_easy = False
    has_dist = False
    enough_rows = False
    for a_file in files:
        if 'easy' in a_file:
            has_easy = True
            subprocess.call(['cp', folder+a_file, folder+'msa'])
        if 'pairwise' in a_file:
            has_dist = True
            subprocess.call(['cp', folder+a_file, folder+'dists'])

            # copy to better file_name
    msa = wc.get_stuff(objects.agW, param.param({'uniprot_id':name, 'ev':evalue}), False, False, False)
    if len(msa) > 50:
        enough_rows = True

    if has_easy and has_dist and enough_rows:
        completed.append(name)

g = open(global_stuff.completed_list_file, 'w')
for name in completed:
    g.write(name + '\n')

f.close()
g.close()
import pdb
pdb.set_trace()
import f
import new_new_objects as objects

import wrapper
from wrapper_decorator import dec
import wc
import global_stuff

import sys
info_file = sys.argv[1]

import helper
asdf, the_params = helper.read_param(info_file)


the_params.set_param('which_wrapperq', objects.fW)
wc.get_stuff(objects.abW, the_params, True, False, False)
Example No. 10

import sys
import wc, objects
input_file = sys.argv[1]
output_file = sys.argv[2]
use_neighbor = sys.argv[3] == 'T'
ignore_pos = sys.argv[4] == 'T'
max_neighbor = int(sys.argv[5])
num_trials = int(sys.argv[6])
pseudo_total = float(sys.argv[7])

import global_stuff
params = global_stuff.get_param()

params.set_param('protein_list_file', input_file)
l = wc.get_stuff(objects.filtered_mutation_list_given_protein_list, params)




import helper
my_output = objects.get_output_obj(params, l, use_neighbor, ignore_pos, max_neighbor, num_trials, pseudo_total, helper.vanilla_similarity, helper.normalize_nothing, helper.mutation_to_class)





helper.write_mat(my_output, output_file)
Example No. 11
import my_data_types
import pickle
import get_info
import global_stuff
import plotters
import numpy
import my_exceptions
import aggregate_features as af
import wc
import objects
import helper
import f
from global_stuff import get_tumor_cls

import matplotlib.pyplot as plt


p = global_stuff.get_param()

A = set(wc.get_stuff(objects.PID_with_SS_info, p))
B = set(wc.get_stuff(objects.PID_with_shared_MRN, p))
C = set(wc.get_stuff(objects.PID_with_multiple_tumors, p))
PID_to_use = list(A - B - C)
test_PID_to_use = PID_to_use


the_data_set = helper.data_set.data_set_from_pid_list(test_PID_to_use, p)
treated_data_set = the_data_set.filter(lambda x: f.treatment_code_f().generate(x) in [1,2])
interval_boundaries = [0,0.5,1,2,5]
intervals = [my_data_types.ordered_interval(helper.my_timedelta(interval_boundaries[i]*365), helper.my_timedelta(interval_boundaries[i+1]*365)) for i in range(len(interval_boundaries)-1)]



side_effect_name = 'incontinence'
Example No. 12
    if i % total_jobs == which_job:


        gotten_stuff = []

        
        protein_name = line.strip()
        p.set_param('uniprot_id',protein_name)

        
        
        import wc
        import pdb

        if uniprot_or_pdb_chain == 'U':
            seq = wc.get_stuff(objects.dW,p)
        elif uniprot_or_pdb_chain == 'P':
            seq = wc.get_stuff(objects.pdb_chain_seq,p)

        print >> sys.stderr, "currently getting: ", protein_name, len(seq)
        
        if len(seq) < 1000000:




            if whether_to_temp:
                global_stuff.home = global_stuff.temp_home
                assert global_stuff.base_folder == global_stuff.real_base_folder

                try:
Example No. 13
import wc
import objects
import param
import pdb

p = param.param()
A = set(wc.get_stuff(objects.PID_with_SS_info, p))
B = set(wc.get_stuff(objects.PID_with_shared_MRN, p))
C = set(wc.get_stuff(objects.PID_with_several_tumors, p))

PID_to_use = A - B - C

PID_to_MRN = wc.get_stuff(objects.PID_to_MRN_dict,p)


i = 0

lengths = []

for PID in PID_to_use:

    
    p.set_param('pid',PID)
    texts = wc.get_stuff(objects.raw_medical_text,p)
    lengths.append(len(texts))
    print i, PID, len(texts)
    i += 1

pdb.set_trace()

Example No. 14
import global_stuff

import wc
import param
import objects
import sys

which_job = int(sys.argv[1])
total_jobs = int(sys.argv[2])

which_object = objects.pairwise_dist

f = open(global_stuff.protein_list_file, "r")

i = 0
for line in f:
    if i % total_jobs == which_job:
        protein_name = line.strip()
        wc.get_stuff(which_object, param.param({"uniprot_id": protein_name}), True, True, False)
    i += 1

f.close()
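
The loop above shards the protein list round-robin across array-job instances: item i is handled by the job whose index equals i modulo total_jobs (Example No. 12 uses the same pattern). A tiny self-contained illustration of that assignment with made-up job counts and items:

total_jobs = 3
items = ['P12345', 'Q67890', 'O11111', 'P22222', 'Q33333']

for which_job in range(total_jobs):
    # each job keeps only the items whose index falls in its residue class
    assigned = [item for i, item in enumerate(items) if i % total_jobs == which_job]
    print((which_job, assigned))  # job 0 -> items 0 and 3, job 1 -> 1 and 4, job 2 -> 2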
Example No. 15
import wc
import param
import objects
import global_stuff
import helper
import wrapper
import sys

name = sys.argv[1]
which_msa = int(sys.argv[2])
try:
    itera = int(sys.argv[3])
except (IndexError, ValueError):
    # assumed default when no iteration count is supplied on the command line
    itera = None

p = param.param({'pdb':'1JOS', 'chain':'A', 'which_dataset':'CBS', 'uniprot_id':name, 'co':7.0, 'which_blast':0, 'which_msa':which_msa, 'ev':.05, 'blmax':999999,'hhblits_iter':itera, 'which_neighbors':1, 'protein_list_file':'rascalled_completed', 'to_leon':0, 'to_cluster':1, 'to_rascal':0, 'to_normd':0, 'norm_co':9.0, 'psiblast_iter':itera})

wc.get_stuff(wrapper.my_msa_obj_wrapper, p)

p.set_param('to_rascal', 1)

wc.get_stuff(wrapper.my_msa_obj_wrapper, p)

p.set_param('to_normd', 1)

wc.get_stuff(wrapper.my_msa_obj_wrapper, p)

Example No. 16
import wc
import param
import objects
p = param.param({'ev':1e-10, 'protein_list_file':'hum_var_msa_dist_completed', 'uniprot_id':'P80075', 'avg_deg':20, 'n_cutoff':0, 'f_cutoff':15})
m = wc.get_stuff(objects.pairwise_dist, p, False, False, False)
from mpi4py import MPI
import pdb
import helper
import sys
import wc
import objects
import global_stuff

file_location = sys.argv[1]


wrappers = [objects.bhW, objects.cfW]


folder_name, the_params = helper.read_param(file_location)

the_params.set_param('tj',1)
the_params.set_param('wj',0)
hp_stash = wc.get_stuff(objects.caW, the_params, False, False, False)

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

f = open('bin/get_features_parallel_'+str(rank), 'w', 0)

data_list = wc.get_stuff(objects.ciW, the_params, global_stuff.recalculate, True, True)
works = []
for i in range(len(data_list)):
    if i % size == rank:
        pdb_name = data_list[i].pdb_name
        chain_letter = data_list[i].chain_letter
        start = data_list[i].start
        end = data_list[i].end
new_folders = []



for folder in pdb_folders:

    s = folder.strip().split('_')


    pdb_name = s[0]
    params = param.param({'p':pdb_name})

    while 1:
        try:
            g = wc.get_stuff(objects.fW, params, False, False, False)
        except Exception, err:
            print err
            import time
            time.sleep(20)
        else:
            break

    structure = Bio.PDB.PDBParser().get_structure(params.get_param('p'), g)
    if s[1] == '':
        letter = structure[0].child_dict.keys()[0]
    else:
        letter = s[1]
    params.set_param('c', letter)
    if s[2] == '':
        
Example No. 19
#info_file = constants.INFO_FOLDER + info_file

relative_folder, the_params = helper.read_param(info_file)



#num_outer_fold = 3
#which_outer_fold = 2
#num_pieces = 330
#which_piece = 96

the_params.set_param('tj', num_pieces)
the_params.set_param('wj', which_piece)


the_data = wc.get_stuff(objects.brW, the_params, False, True, False)
the_params.set_param('s', the_data)
the_params.set_param('m', num_outer_fold)
the_params.set_param('k', which_outer_fold)






the_fold = wc.get_stuff(objects.buW, the_params, False, False, False)

import cross_validation_pseudo as cv

the_params.set_param('f', the_fold)
asdf = wc.get_stuff(objects.cbW, the_params, False, False, False)
Example No. 20
def PID_to_MRN(pid):
    import wc, objects, param
    m = wc.get_stuff(objects.PID_to_MRN_dict, param.param())
    return m[pid]
Example No. 21
def predict_position_energy_weighted(params, mutation, use_neighbor, ignore_pos, max_neighbor, num_trials, pseudo_total, sim_f, to_neighbor_p_value):

    import sys
    import global_stuff
    import wc
    import objects

    protein_name = mutation[0]
    pos = mutation[1]
    wild_res = mutation[2]
    mut_res = mutation[3]
    params.set_param('uniprot_id', protein_name)
    seq = wc.get_stuff(objects.dW, params)

    assert seq[pos] == wild_res


    import wrapper

    msa = wc.get_stuff(wrapper.my_msa_obj_wrapper, params)



    score = 0
    


    #seq_weights = [1.0 for i in range(len(msa))]

    #seq_weights = 

    #params.set_param('which_msa', 0)
    
    node_msa = wc.get_stuff(wrapper.my_msa_obj_wrapper, params)

    column = node_msa.get_column(pos)
    node_seq_weights = wc.get_stuff(objects.general_seq_weights, params)
    
    #params.set_param('which_msa', 2)
    msa = wc.get_stuff(wrapper.my_msa_obj_wrapper, params)

    neighbor_seq_weights = wc.get_stuff(objects.general_seq_weights, params)



    if not ignore_pos:
        #mut_weight = sum([seq_weights[i] for i in range(len(msa)) if msa[pos,i] == mut_res])
        #wild_weight = sum([seq_weights[i] for i in range(len(msa)) if msa[pos,i] == wild_res])

        
        #mut_count = column.count(mut_res)
        #wild_count= column.count(wild_res)
        #if wild_similarity < 1:
        #    print wild_res, mut_res, column
        #    pdb.set_trace()
        
        mut_similarity = compute_similarity_score_to_residue(column, node_seq_weights, mut_res, sim_f)
        wild_similarity = compute_similarity_score_to_residue(column, node_seq_weights, wild_res, sim_f)
        #score += math.log((mut_similarity + 1) / wild_similarity)
        score += (mut_similarity + 1) / wild_similarity

        #assert abs(score - second_score) < .001
        #score += math.log((mut_weight + 1) / (wild_weight))
    
        #score = -1.0 * mutation[-3]
    
    neighbor_score = 0
    
    if use_neighbor:

        # get neighbors/weights

        all_neighbors = wc.get_stuff(objects.general_neighbors_w_weight_w, params)
        pos_neighbors = all_neighbors[pos]
        sorted_pos_neighbors = sorted(pos_neighbors, key = lambda elt: elt[1], reverse = True)

        neighbors = [x[0] for x in sorted_pos_neighbors[0:min(max_neighbor,len(sorted_pos_neighbors))]]
        #neighbor_weights = [x[1] for x in sorted_pos_neighbors[0:min(max_neighbor,len(sorted_pos_neighbors))]]
        neighbor_weights = [1.0 for i in range(len(neighbors))]

        # get pseudo_counts
        pseudo_count_dict = {}
        for key in range(global_stuff.q):
            pseudo_count_dict[key] = pseudo_total / global_stuff.q





        # none of weights have to be normalized
        def get_neighbor_score(msa, weight_a, weight_b, neighbors, neighbor_weights, pseudo_count_dict):

            num_neighbors = len(neighbors)
            assert(len(neighbors) == len(neighbor_weights))
            score = 0
            neighbor_weights = normalize(neighbor_weights)

            #neighbors = range(num_neighbors)
            

            for i in range(num_neighbors):
                choose_neighbor_probs = {}
                for j in range(global_stuff.q):
                    choose_neighbor_probs[j] = 0
                choose_neighbor_probs[global_stuff.aa_to_num[seq[neighbors[i]]]] = 1.0
                #score += neighbor_weights[i] * get_KL_weighted(msa.get_column(neighbors[i]), weight_a, weight_b, pseudo_count_dict, choose_neighbor_probs)



                #score += neighbor_weights[i] * get_KL_weighted(msa.get_column(neighbors[i]), weight_a, weight_b, pseudo_count_dict, choose_neighbor_probs) / get_entropy_weighted(msa.get_column(neighbors[i]), neighbor_seq_weights, pseudo_count_dict)
                actual_kl = get_KL_weighted(msa.get_column(neighbors[i]), weight_a, weight_b, pseudo_count_dict, choose_neighbor_probs)

                random_kls = get_random_KLs(msa.get_column(neighbors[i]), neighbor_seq_weights, sum(weight_a), sum(weight_b), pseudo_count_dict, num_trials)
                # p-value of the observed KL divergence against the randomized KLs
                score += p_value_z(random_kls, actual_kl) * neighbor_weights[i]
                
            return score

        
                    


        actual_weight_a = [neighbor_seq_weights[i] if msa[i,pos] == wild_res else 0.0 for i in range(len(msa))]
        actual_weight_b = [neighbor_seq_weights[i] if msa[i,pos] == mut_res else 0.0 for i in range(len(msa))]


        neighbor_cols = [ [msa[j,i] for j in range(len(msa))] for i in neighbors]


        actual_neighbor_score = get_neighbor_score(msa, actual_weight_a, actual_weight_b, neighbors, neighbor_weights, pseudo_count_dict)

        if actual_neighbor_score < 0:
            print actual_neighbor_score

        



        if to_neighbor_p_value:





            mut_weight = sum([neighbor_seq_weights[i] for i in range(len(msa)) if msa[i,pos] == mut_res])
            wild_weight = sum([neighbor_seq_weights[i] for i in range(len(msa)) if msa[i,pos] == wild_res])
            random_scores = []
            for i in range(num_trials):
                random_weight_a = get_random_weight(neighbor_seq_weights, wild_weight)
                random_weight_b = get_random_weight(neighbor_seq_weights, mut_weight)





                a_random_score = get_neighbor_score(msa, random_weight_a, random_weight_b, neighbors, neighbor_weights, pseudo_count_dict)

                random_scores.append(a_random_score)


            normalize_neighbor_by_z = True
        
            if normalize_neighbor_by_z:
                random_mean = mean(random_scores)
                random_sd = sd(random_scores)
        
                try:
                    neighbor_score = normalize_to_unit(actual_neighbor_score, random_mean, random_sd)
                except Exception:
                    # e.g. zero standard deviation among the random scores
                    neighbor_score = 0


            else:

                neighbor_score = rank(random_scores, actual_neighbor_score)



        else:
            neighbor_score = actual_neighbor_score

    print >> sys.stderr, score, neighbor_score, len(msa)


    return (score - neighbor_score) * -1.0
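
When to_neighbor_p_value is set, the observed neighbor score is compared against scores recomputed with randomized weights and turned into a z-score via normalize_to_unit. That helper (and mean and sd) is not shown in this example; the sketch below assumes a plain z-score against the mean and standard deviation of the random scores, with toy stand-ins for all three helpers:

def mean(xs):
    return sum(xs) / float(len(xs))

def sd(xs):
    m = mean(xs)
    return (sum((x - m) ** 2 for x in xs) / float(len(xs))) ** 0.5

def normalize_to_unit(observed, random_mean, random_sd):
    # z-score of the observed value against the null distribution;
    # fails with ZeroDivisionError when every random score is identical,
    # which is why the caller falls back to neighbor_score = 0
    return (observed - random_mean) / random_sd

toy_random_scores = [0.8, 1.1, 0.9, 1.2, 1.0]
print(normalize_to_unit(1.6, mean(toy_random_scores), sd(toy_random_scores)))  # ~4.24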
num_trials = int(sys.argv[5])
pseudo_total = float(sys.argv[6])
to_neighbor_p_value = sys.argv[7] == 'T'

import global_stuff
params = global_stuff.get_param()


import helper
helper.parse_p_input(params, sys.argv[8:])




#l = wc.get_stuff(objects.filtered_mutation_list_given_protein_list, params)
l = wc.get_stuff(objects.filtered_mutation_list, params)

i = 0
my_l = []
for m in l:
    if i % size == rank:
        my_l.append(m)
    i += 1

import objects

print rank, len(my_l)

which_dataset = params.get_param('which_dataset')

if which_dataset == 'cosmic' or which_dataset == 'their_cosmic':
import sys
from param import param
import objects
import helper
import wc

file_location = sys.argv[1]


wrappers = [objects.bhW, objects.cfW]
to_pickle = [True, True]

folder_name, the_params = helper.read_param(file_location)


the_params.set_param('tj',1)
the_params.set_param('wj',0)

hp_stash = wc.get_stuff(objects.caW, the_params, False, False, False)




f = open('bin/get_features_serial', 'w', 0)

data_list = wc.get_stuff(objects.ciW, the_params, False, True, True)
works = []

all_keys = [ ['nvjd'],  ['wjd', 'wpw']]


for i in range(len(data_list)):

    pdb_name = data_list[i].pdb_name
Example No. 24
import _test

import wc

import new_new_objects as objects



import pdb

_test.init_crf()

from param import param


#import run_small_search
import helper

import sys
info_file = sys.argv[1]

relative_folder, the_params = helper.read_param(info_file)

the_params.set_param('tj',1)

results = wc.get_stuff(objects.ceW, the_params, False, True, True)
pdb.set_trace()
print 3
Example No. 25
#pdb.set_trace()
#print m.get_fragment(a, 10)


#print m.get_match(a, ['asdf'])

#pdb.set_trace()

sosv = bf.single_ordinal_single_value_wrapper_feature


p = global_stuff.get_param()

#A = set(wc.get_stuff(objects.PID_with_SS_info, p))

A = set(wc.get_stuff(objects.prostate_PID,p))

B = set(wc.get_stuff(objects.PID_with_shared_MRN, p))
C = set(wc.get_stuff(objects.PID_with_multiple_tumors, p))
PID_to_use = list(A - B - C)[:3000]

#test_PID_to_use = PID_to_use[2100:2120]

#the_data_set = helper.data_set.data_set_from_pid_list(test_PID_to_use, p)






for pid in PID_to_use: