def G_Bn(sdii_obj, bootstrap_indexSet, t, varset, order):
    """Return the per-bootstrap average count of subsets whose ``T_l`` reaches ``t``.

    For each entry of ``bootstrap_indexSet`` a resampled data set is built
    with ``bootstrap_data`` and wrapped in a new ``sdii`` object. ``T_l`` is
    then evaluated on every ``order``-sized combination of ``varset``.
    Evaluations with ``T_l >= t`` are counted over all bootstrap samples and
    the total is divided by the number of samples.

    Args:
        sdii_obj: object whose ``data`` attribute is passed to
            ``bootstrap_data``.
        bootstrap_indexSet: indexable collection with one entry per
            bootstrap sample.
        t: threshold compared against ``T_l``.
        varset: iterable of variables from which subsets are drawn.
            e.g. varset = Set([2,4,6]), order = 2 gives
            set([(2, 6), (2, 4), (4, 6)]).
        order: size of each variable subset.

    Returns:
        float: ``(1.0 / B) * count`` where ``B`` is the number of bootstrap
        samples and ``count`` the number of ``T_l >= t`` hits.

    Raises:
        ZeroDivisionError: if ``bootstrap_indexSet`` is empty.
    """
    B = len(bootstrap_indexSet)  # number of bootstrap samples
    n_vars = len(varset)
    # The subsets depend only on varset/order, so enumerate them once
    # instead of once per bootstrap sample.
    subsets = set(itertools.combinations(varset, order))

    count = 0
    for i in range(B):
        data_1 = bootstrap_data(sdii_obj.data, bootstrap_indexSet[i], n_vars)
        sdii_bootstrap = sdii(data_1)  # new hashing object for new data
        for s in subsets:
            # A fresh list per call, in case T_l modifies its argument.
            if sdii_bootstrap.T_l(list(s)) >= t:
                count += 1

    ratio = (1.0 / B) * count
    print('G_Bn():: # of T >= t : %d, t: %f, count*(1/B): %f' % (count, t, ratio))
    return ratio
def forward_selection(data, alpha, varset, order, B):
    """Compute and return the threshold for ``order``-sized variable subsets.

    An ``sdii`` object is built from ``data`` and handed to
    ``threshold_t_B`` together with the remaining arguments; the resulting
    threshold is printed and returned.

    NOTE: despite the name, no variable selection is performed here. An
    earlier draft evaluated ``calc_sdii`` on every subset, wrote the values
    to a result file and collected the variables whose value reached the
    threshold; that step had been disabled and is no longer carried in the
    body.

    Args:
        data: input passed to the ``sdii`` constructor.
        alpha: forwarded to ``threshold_t_B``.
        varset: collection of variables; forwarded to ``threshold_t_B``.
        order: subset size; forwarded to ``threshold_t_B``.
        B: forwarded to ``threshold_t_B`` (presumably the number of
            bootstrap samples - confirm against ``threshold_t_B``).

    Returns:
        The value returned by ``threshold_t_B``.
    """
    print('forward_selection()::varset: %s, order: %d' % (repr(varset), order))
    sdii_core = sdii(data)
    th = threshold_t_B(sdii_core, alpha, varset, order, B)
    print('forward_selection()::threshold of order [%d]: %f' % (order, th))
    return th
def main():
    """Write order-2 and order-3 sdii values for a score file.

    Command line: ``python proc_sdii.py var_type score_file``.
    ``var_type`` selects the variable labels (``AA`` or ``NA``) and
    ``score_file`` is a comma-delimited matrix read with ``np.loadtxt``.
    One line ``<label-label[-label]> <value>`` is written to
    ``<score_file>.sdii`` for every pair and every triple of variables.

    The module-level ``alphabet`` is set to the selected label list.
    Prints a usage/diagnostic message and returns without doing any work
    when arguments are missing or ``var_type`` is not recognised.
    """
    global alphabet
    aa_alphabet = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
                   'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T']
    na_alphabet = [
        'AB', 'AE', 'CB', 'CE', 'DB', 'DE', 'EB', 'EE', 'FB', 'FE',
        'GB', 'GE', 'HB', 'HE', 'IB', 'IE', 'KB', 'KE', 'LB', 'LE',
        'MB', 'ME', 'NB', 'NE', 'PB', 'PE', 'QB', 'QE', 'RB', 'RE',
        'SB', 'SE', 'TB', 'TE', 'VB', 'VE', 'WB', 'WE', 'YB', 'YE',
    ]

    if len(sys.argv) < 3:
        print('Usage: python proc_sdii.py var_type score_file')
        return

    vartype = sys.argv[1]
    if vartype == 'AA':
        alphabet = aa_alphabet
        print('use AA varset : %s' % repr(alphabet))
    elif vartype == 'NA':
        alphabet = na_alphabet
        print('use NA varset : %s' % repr(alphabet))
    else:
        # Previously fell through with `alphabet` unset (or stale) and only
        # failed later, after the score file had been loaded.
        print('Unknown var_type %r: expected AA or NA' % vartype)
        print('Usage: python proc_sdii.py var_type score_file')
        return

    scorefile = sys.argv[2]
    print('score file: %s' % scorefile)
    outfile = '%s.sdii' % scorefile
    print('write to %s' % outfile)

    score = np.loadtxt(scorefile, delimiter=',')
    sdii_core = sdii(score)
    labels = alphabet

    def write_order(fout, order):
        # One output row per `order`-sized subset of variable indices.
        # set() is kept so rows come out in the same order as before.
        for s in set(itertools.combinations(range(len(labels)), order)):
            name = '-'.join([labels[i] for i in s])
            fout.write('%s %.15f\n' % (name, sdii_core.calc_sdii(list(s))))

    # The context manager guarantees the result file is flushed and closed,
    # even if calc_sdii raises part-way through.
    with open(outfile, 'w') as fout:
        print('calculating mutual information ...')
        t0 = time.time()
        write_order(fout, 2)
        t1 = time.time()
        print('MI time: %d seconds' % (t1 - t0))

        print('calculating DeltaK(3) ...')
        write_order(fout, 3)
        t2 = time.time()
        print('DeltaK(3) time: %d seconds' % (t2 - t1))
def forward_selection(data, alpha, varset, order, B):
    """Compute, print and return the threshold from ``threshold_t_B``.

    Builds an ``sdii`` object from ``data`` and passes it, together with
    ``alpha``, ``varset``, ``order`` and ``B``, to ``threshold_t_B``.
    No variables are selected here; only the threshold is produced.

    Args:
        data: input passed to the ``sdii`` constructor.
        alpha: forwarded unchanged to ``threshold_t_B``.
        varset: collection of variables, forwarded unchanged.
        order: subset size, forwarded unchanged.
        B: forwarded unchanged (presumably the bootstrap count - confirm
            against ``threshold_t_B``).

    Returns:
        The threshold returned by ``threshold_t_B``.
    """
    print('forward_selection()::varset: %s, order: %d' % (repr(varset), order))
    core = sdii(data)
    th = threshold_t_B(core, alpha, varset, order, B)
    print('forward_selection()::threshold of order [%d]: %f' % (order, th))
    return th
def main():
    """Manual check of msa sequence weighting.

    Loads ``test_msa.txt`` for target ``1k2p``, builds the score board and
    an ``sdii`` object from it, then prints ``w_entropy`` of data columns
    0 and 1 before and after the weights from ``test_msa.weight`` are
    applied, followed by the weight vector and its sum.
    """
    alignment = msa('test_msa.txt', '1k2p')
    score, _unused_varlist = alignment.msaboard(0.0, 0.5)
    print(score)

    core = sdii(score)
    column_pair = [0, 1]

    # w_entropy of the first two columns before any weight is set.
    print(core.w_entropy(core.data[:, column_pair].T))

    # Same quantity once the weights have been applied.
    core.setWeight(np.loadtxt('test_msa.weight', delimiter=','))
    print(core.w_entropy(core.data[:, column_pair].T))

    print(core.weight)
    print('sum(weight): %f' % sum(core.weight))
def init():
    """Parse the command line and prepare the sdii object and task blocks.

    Command line:
        ``python mproc_coevol_sdii.py msafile weightfile cutoff target_seq msapos order``

    ``weightfile`` may be ``NA`` to skip weighting; ``msapos`` is either a
    column number or ``all``.

    Returns:
        tuple: ``(sdii_core, tasklist, outfile)`` where ``tasklist`` is the
        list of tasks (each a list of variable indices) split into blocks.
        ``None`` is returned, after printing a message, when arguments are
        missing or the requested column is not in the reduced alignment.
    """
    # Six positional arguments are read below (sys.argv[1]..sys.argv[6]),
    # so at least seven entries are required.
    if len(sys.argv) < 7:
        print("Usage: python mproc_coevol_sdii.py msafile weightfile cutoff target_seq msapos order")
        print("Example 1: python mproc_coevol_sdii.py PF07714_full.fa.r50 PF07714_full.fa.r50.weight 0.6 BTK_HUMAN 3128 3")
        print("Example 2: python mproc_coevol_sdii.py PF07714_full.fa.s62 NA 0.6 BTK_HUMAN all 3")
        return

    msafile = sys.argv[1]
    weightfile = sys.argv[2]
    drop_cutoff = float(sys.argv[3])  # for reduce columns
    targetHeader = sys.argv[4]
    target = sys.argv[5].lower()
    order = int(sys.argv[6])

    print("msafile: [%s]" % msafile)
    print("weightfile: [%s]" % weightfile)
    print("drop_cutoff: [%f]" % drop_cutoff)
    print("target msa header: [%s]" % targetHeader)
    print("target var: [%s]" % target)
    print("order: [%d]" % order)

    outfile = "%s.%s_%d_sdii" % (msafile, target, order)
    print("write to [%s]" % outfile)

    # msa init
    m = msa(msafile)
    m.setTarget(targetHeader)
    print("original data dimension: (%d, %d)" % (m.seqNum, m.seqlen))

    # Sequence weighting is handled outside this script (in matlab), so only
    # the column drop cutoff is passed. msaboard returns a compact score.
    score, varlist = m.msaboard(drop_cutoff)
    print("reduced data dimension: %s" % repr(score.shape))

    if (target != "all") and (int(target) not in varlist):
        print("The alignment for var %s is not significant. exit." % target)
        return

    # sdii init
    sdii_core = sdii(score)

    print("Loading weight ...")
    if weightfile.upper() != "NA":
        pfam_weight = np.loadtxt(weightfile, delimiter=",")
        print("Weight vector: %s" % repr(pfam_weight.shape))
        print("Applying weight to sdii data ...")
        sdii_core.setWeight(pfam_weight)  # set sequence weight
    else:
        print("setting weight: %r" % sdii_core.isWeighted)

    print("Setting varlist to sdii ...")
    sdii_core.setVarlist(varlist)  # original column numbers of the kept variables
    print("Setting target variable ...")
    sdii_core.setTarget(target)
    print("Setting task order ...")
    sdii_core.setOrder(order)

    # tasklist init: every task is a list of indices into varlist.
    tasks = []
    var_indices = range(len(varlist))
    if target == "all":
        print("generating tasks for all ...")
        for s in set(itertools.combinations(var_indices, order)):
            tasks.append(list(s))
    else:
        print("generating tasks for variable %s" % target)
        # The target position does not change inside the loop; look it up once.
        target_idx = varlist.index(int(target))
        # Combine the target with every (order-1)-subset that does not
        # already contain it.
        for s in set(itertools.combinations(var_indices, order - 1)):
            if target_idx not in s:
                st = list(s)
                st.append(target_idx)
                tasks.append(st)
    print("In total %d/%d for order %d." % (len(tasks), binom(len(varlist), order), order))

    sdii_core.setTotalTask(len(tasks))

    # Split tasks into blocks of equal size n (the last one may be shorter).
    # Floor division keeps n an integer on every Python version.
    n = len(tasks) // 20 + 1
    tasklist = [tasks[i:i + n] for i in range(0, len(tasks), n)]
    print("spliting tasks into %d blocks" % len(tasklist))

    print("init done.")
    return (sdii_core, tasklist, outfile)
def main():
    """Compute sdii values for variable subsets of an MSA and write them out.

    Command line:
        ``python proc_coevol_sdii.py msafile weightfile cutoff target_seq msapos order``

    ``msapos`` is either a column number or ``all``. For every
    ``order``-sized subset of the reduced alignment columns (restricted to
    subsets containing the target column unless ``all`` is given) one line
    ``<col-col-...> <value>`` is written to
    ``<msafile>.<msapos>_<order>_sdii``.

    The module-level ``alphabet`` is set to the string form of the kept
    column numbers. Prints a message and returns early when arguments are
    missing or the target column is not in the reduced alignment.
    """
    global alphabet

    # Six positional arguments are read below (sys.argv[1]..sys.argv[6]),
    # so at least seven entries are required.
    if len(sys.argv) < 7:
        print('Usage: python proc_coevol_sdii.py msafile weightfile cutoff target_seq msapos order')
        print('Example: python proc_coevol_sdii.py PF07714_full.fa.r50 PF07714_full.fa.r50.weight 0.6 BTK_HUMAN 3128 3')
        return

    msafile = sys.argv[1]
    weightfile = sys.argv[2]
    drop_cutoff = float(sys.argv[3])  # for reduce columns
    targetHeader = sys.argv[4]
    target = sys.argv[5].lower()
    order = int(sys.argv[6])

    print('msafile: [%s]' % msafile)
    print('weightfile: [%s]' % weightfile)
    print('drop_cutoff: [%f]' % drop_cutoff)
    print('target msa header: [%s]' % targetHeader)
    print('target var: [%s]' % target)
    print('order: [%d]' % order)

    outfile = '%s.%s_%d_sdii' % (msafile, target, order)
    print('write to [%s]' % outfile)

    m = msa(msafile)
    m.setTarget(targetHeader)
    print('original data dimension: (%d, %d)' % (m.seqNum, m.seqlen))

    # Sequence weighting is handled outside this script (in matlab), so only
    # the column drop cutoff is passed. msaboard returns a compact score.
    score, varlist = m.msaboard(drop_cutoff)
    print('reduced data dimension: %s' % repr(score.shape))

    # Index mapping after column reduction:
    #   score: A..C..D.EF        index: 0123456789
    #   reduced score: ACDE      index: 0123  -> input in sdii calculation
    #   original index: 0368 = varlist = alphabet
    alphabet = [str(i) for i in varlist]

    if (target != 'all') and (int(target) not in varlist):
        print('The alignment for var %s is not significant. exit.' % target)
        return

    if target == 'all':
        target_idx = None
        pk = binom(len(varlist), order)
    else:
        # Position of the target column among the kept columns; the same
        # position alphabet.index(target) gives, looked up once.
        target_idx = varlist.index(int(target))
        # Subsets of size `order` that contain one fixed column:
        # choose the remaining order-1 members from the other n-1 columns.
        pk = binom(len(varlist) - 1, order - 1)
    print('total calculations: %d' % pk)

    print('Loading weight ...')
    pfam_weight = np.loadtxt(weightfile, delimiter=',')
    print('Weight vector: %s' % repr(pfam_weight.shape))

    sdii_core = sdii(score)
    print('Applying weight to sdii data ...')
    sdii_core.setWeight(pfam_weight)  # set sequence weight

    # The context manager closes the result file even if calc_sdii raises.
    with open(outfile, 'w') as fout:
        t0 = time.time()
        count = 0
        for s in set(itertools.combinations(range(len(alphabet)), order)):
            if target_idx is not None and target_idx not in s:
                continue
            count += 1
            label = '-'.join([alphabet[i] for i in s])
            print('%d/%d: %s ' % (count, pk, label))
            ret_sdii = sdii_core.calc_sdii(list(s))
            t1 = time.time()
            print('time used: %d seconds\n' % (t1 - t0))
            fout.write('%s %.15f\n' % (label, ret_sdii))
            t0 = t1  # time each calculation separately
def init():
    """Load precomputed score/index files and prepare sdii task blocks.

    Command line: ``python mp_ce_sdii_rcrr.py MSATitle targetVar order``.
    The files ``<MSATitle>.score``, ``<MSATitle>.row`` and
    ``<MSATitle>.col`` are read with ``np.loadtxt`` (comma-delimited);
    ``targetVar`` is a column number or ``all``.

    Returns:
        tuple: ``(sdii_core, tasklist, outfile)`` where ``tasklist`` is the
        list of tasks (each a list of variable indices) split into blocks.
        ``None`` is returned, after printing a message, when arguments are
        missing or the target column is not listed in the column index.
    """
    # Three positional arguments are read below (sys.argv[1]..sys.argv[3]),
    # so at least four entries are required.
    if len(sys.argv) < 4:
        print('Usage: python mp_ce_sdii_rcrr.py MSATitle targetVar order')
        print('Example 1: python mp_ce_sdii_rcrr.py PF07714_full.fa 3128 3')
        print('Example 1: python mp_ce_sdii_rcrr.py PF07714_full.fa all 3')
        return

    scoreFile = sys.argv[1] + '.score'
    rowIndexFile = sys.argv[1] + '.row'
    colIndexFile = sys.argv[1] + '.col'
    targetVar = sys.argv[2].lower()
    order = int(sys.argv[3])

    print('score file: [%s]' % scoreFile)
    print('row index file: [%s]' % rowIndexFile)
    print('column index file: [%s]' % colIndexFile)
    print('target var: [%s]' % targetVar)
    print('order: [%d]' % order)

    outfile = '%s.%s_%d_sdii' % (sys.argv[1], targetVar, order)
    print('write to [%s]' % outfile)

    # msa init
    score = np.loadtxt(scoreFile, delimiter=',')
    rowIndex = [int(i) for i in np.loadtxt(rowIndexFile, delimiter=',')]
    colIndex = [int(j) for j in np.loadtxt(colIndexFile, delimiter=',')]
    print('row index: %s' % repr(rowIndex))
    print('col index: %s' % repr(colIndex))
    print('reduced data dimension: %s, (%d, %d)' % (repr(score.shape), len(rowIndex), len(colIndex)))

    varlist = colIndex
    if (targetVar != 'all') and (int(targetVar) not in varlist):
        print('The alignment for var %s is not significant. exit.' % targetVar)
        return

    # sdii init
    sdii_core = sdii(score)
    print('Setting varlist to sdii ...')
    sdii_core.setVarlist(varlist)  # column numbers from the .col file
    print('Setting target variable ...')
    sdii_core.setTarget(targetVar)
    print('Setting task order ...')
    sdii_core.setOrder(order)
    print(repr(varlist))

    # tasklist init: every task is a list of indices into varlist.
    tasks = []
    var_indices = range(len(varlist))
    if targetVar == 'all':
        print('generating tasks for all ...')
        for s in set(itertools.combinations(var_indices, order)):
            tasks.append(list(s))
    else:
        print('generating tasks for variable %s' % targetVar)
        # The target position does not change inside the loop; look it up once.
        target_idx = varlist.index(int(targetVar))
        # Combine the target with every (order-1)-subset that does not
        # already contain it.
        for s in set(itertools.combinations(var_indices, order - 1)):
            if target_idx not in s:
                st = list(s)
                st.append(target_idx)
                tasks.append(st)
    print('In total %d/%d for order %d.' % (len(tasks), binom(len(varlist), order), order))

    sdii_core.setTotalTask(len(tasks))

    # Split tasks into blocks of equal size n (the last one may be shorter).
    # Floor division keeps n an integer on every Python version.
    n = len(tasks) // 20 + 1
    tasklist = [tasks[i:i + n] for i in range(0, len(tasks), n)]
    print('spliting tasks into %d blocks' % len(tasklist))

    print('init done.')
    return (sdii_core, tasklist, outfile)