def filter_multi_orths(arr_in, basesp, cutoff):
    """
    For every interaction without base-species evidence, remove other-species
    evidence for that interaction when the base-species side of the orthogroup
    is greater than 1.

    Parameters:
        arr_in: structured array with 'id1' and 'id2' fields; fields from
            index 3 onward are treated as feature columns whose first two
            name characters identify the species.
        basesp: species prefix compared against the first two characters of
            each feature column name.
        cutoff: a row is only filtered when its entry in the maxes computed
            by arr_collist_maxes over the base-species columns is below this
            value.

    Returns:
        The array produced by ut.arr_copy(arr_in) with the selected
        other-species columns set to 0 (arr_in itself is presumably left
        untouched -- depends on ut.arr_copy).

    Raises:
        AssertionError: if no feature column matches basesp.
    """
    # Single-argument print(...) behaves the same as the print statement on
    # Python 2 and also parses on Python 3.
    print("Filtering: require rows w/o %s > %s to have single orths"
          % (basesp, cutoff))
    arr = ut.arr_copy(arr_in)
    feature_names = arr.dtype.names[3:]
    basesp_cols = [n for n in feature_names if n[:2] == basesp]
    assert len(basesp_cols) > 0, 'No base species data.'
    maxes = arr_collist_maxes(arr, [basesp_cols])
    othersps = species_list(feature_names)
    othersps.remove(basesp)
    spcols = [(sp, [n for n in feature_names if n[:2] == sp])
              for sp in othersps]
    # Indexed below as ogs_all[species][id]; presumably the orthogroup size
    # on the base-species side -- confirm against orth.all_ogroup_sizes.
    ogs_all = orth.all_ogroup_sizes(basesp, othersps)
    cleared = 0
    for i in range(len(arr)):
        if not (maxes[i] < cutoff):
            # Row has enough base-species evidence; leave it alone.
            continue
        row = arr[i]
        pair_ids = (row['id1'], row['id2'])
        for sp, cols in spcols:
            ogsize_sp = ogs_all[sp]
            has_multi_orth = any(
                pid in ogsize_sp and ogsize_sp[pid] > 1 for pid in pair_ids)
            if not has_multi_orth:
                continue
            for col in cols:
                arr[i][col] = 0
            # Counted once per (row, species) section, matching the summary
            # message printed below.
            cleared += 1
    print("%s species-sections of rows cleared" % cleared)
    return arr
def arrfeats_set_gold(arrfeats, pdgold):
    """
    Rewrite the third field of every row as a 0/1 gold-standard flag.

    Works on the result of ut.arr_copy(arrfeats) (presumably a copy, so the
    caller's array is not modified). For each row, element 2 is set to 1 when
    pdgold.contains((row[0], row[1])) is truthy and to 0 otherwise.

    Returns the relabelled array.
    """
    labelled = ut.arr_copy(arrfeats)
    for record in labelled:
        pair = (record[0], record[1])
        record[2] = 1 if pdgold.contains(pair) else 0
    return labelled
def norm_columns(arr):
    """
    Replace every field of the array with its z-score.

    Operates on the result of ut.arr_copy(arr). Each field named in the
    array's dtype is passed through np.nan_to_num before
    scipy.stats.zscore is applied, and the result is written back into the
    same field.

    Returns the transformed array.
    """
    normalized = ut.arr_copy(arr)
    for field in normalized.dtype.names:
        cleaned = np.nan_to_num(normalized[field])
        normalized[field] = scipy.stats.zscore(cleaned)
    return normalized
def rank_columns(arr):
    """
    Replace every field of the array with the ranks of its values.

    Operates on the result of ut.arr_copy(arr). Each field named in the
    array's dtype is passed through np.nan_to_num before
    scipy.stats.rankdata is applied, and the ranks are written back into the
    same field.

    Returns the transformed array.
    """
    ranked = ut.arr_copy(arr)
    for field in ranked.dtype.names:
        cleaned = np.nan_to_num(ranked[field])
        ranked[field] = scipy.stats.rankdata(cleaned)
    return ranked
def rescale_columns(arr, scale_factors):
    """
    Multiply each field of the array by its matching scale factor.

    Operates on the result of ut.arr_copy(arr). The field at position i in
    the array's dtype names is multiplied by scale_factors[i] and written
    back into the same field.

    Returns the rescaled array.
    """
    scaled = ut.arr_copy(arr)
    for position, field in enumerate(scaled.dtype.names):
        factor = scale_factors[position]
        scaled[field] = scaled[field] * factor
    return scaled