def test_create_networkx_graph_duplicates(self):
    """Duplicate rows in the edge/node tables are collapsed and warned about."""
    import warnings

    networks = pd.DataFrame({'GUID': ['guid'], 'Name': ['net']})
    edges = pd.DataFrame({'Source': ['1', '1'], 'Target': ['2', '2']})
    nodes = pd.DataFrame({'Node': ['1', '1', '2']})
    with warnings.catch_warnings(record=True) as caught:
        graphs = nx.from_perseus(networks, {'guid': {
            'name': 'net',
            'guid': 'guid',
            'edge_table': edges,
            'node_table': nodes}})
        # Duplicates must be merged: 2 distinct nodes, 1 distinct edge.
        self.assertEqual(2, graphs[0].number_of_nodes())
        self.assertEqual(1, graphs[0].number_of_edges())
        # Ignore unrelated deprecation noise; only the duplicate warnings matter.
        messages = [str(item.message) for item in caught
                    if 'deprecated' not in str(item.message)]
        self.assertTrue('Duplicate edges' in messages[0], messages[0])
        self.assertTrue('Duplicate nodes' in messages[1], messages[1])
def test_write_bool_column_as_categorical(self):
    """Boolean columns serialize as categorical ('+'/'""') unless disabled."""
    df = pd.DataFrame(columns=pd.Index(['Significant'], name='Column Name'))
    df['Significant'] = [True, False, True, True]
    self.assertEqual(df.dtypes[0], np.dtype('bool'))
    # Default: booleans become a categorical column ('+' for True).
    as_category = to_string(df)
    self.assertEqual('Significant\n#!{Type}C\n+\n""\n+\n+\n', as_category, as_category)
    # Opting out keeps the literal True/False text.
    as_text = to_string(df, convert_bool_to_category=False)
    self.assertEqual('Significant\n#!{Type}C\nTrue\nFalse\nTrue\nTrue\n', as_text, as_text)
def test_inferring_and_setting_main_columns(self):
    """type_row infers the E/T/N column types; main_columns overrides the inference."""
    frame = pd.DataFrame({
        'a': [2, 3],
        'b': [1, 2],
        'c': ['a', 'b'],
        'd': [3, 4],
    })
    # Without hints the first numeric column is the main (E) column.
    self.assertEqual('#!{Type}E\tE\tT\tN', type_row(frame))
    # Explicit main_columns flips which numeric columns are E vs N.
    self.assertEqual('#!{Type}N\tE\tT\tE', type_row(frame, main_columns={'b', 'd'}))
def countAA(maxcluster):
    """Count amino-acid occurrences listed in maxcluster['DP AA'].

    Each entry of the 'DP AA' column is a ';'-separated list of labels
    (one-letter amino-acid codes plus 'nterm'/'cterm' markers).  Every row
    except row 0 contributes one count per label it mentions.

    Parameters
    ----------
    maxcluster : pandas.DataFrame
        Must contain a 'DP AA' column of ';'-separated label strings,
        indexed 0..len-1.

    Returns
    -------
    pandas.DataFrame
        Column 0 holds the known labels, 'Count' how many rows mentioned each.
    """
    labels = ['nterm', 'cterm', 'G', 'A', 'V', 'L', 'I', 'P', 'F', 'Y', 'W',
              'S', 'T', 'C', 'M', 'N', 'Q', 'K', 'R', 'H', 'D', 'E']
    dpAAs = pd.DataFrame(labels)
    dpAAs['Count'] = 0
    # Row 0 is skipped on purpose — presumably a non-data header row in the
    # MaxQuant export this is fed; TODO confirm against the input file.
    for x in range(1, len(maxcluster)):
        hits = dpAAs.index[dpAAs[0].isin(maxcluster['DP AA'][x].split(';'))]
        # Single .loc[rows, col] write instead of the original chained
        # dpAAs['Count'].loc[hits] = ..., which can hit pandas'
        # SettingWithCopy path and silently write to a copy.
        dpAAs.loc[hits, 'Count'] += 1
    return dpAAs
def pltcountperRaw():
    """Bar-plot dependent-peptide counts per raw file across several files.

    Interactively asks how many result files to overlay, reads each via
    openfile(), and plots per-raw-file row counts side by side.  Returns a
    DataFrame of per-raw-file counts of rows whose 'DP Modification' is
    'Unmodified'.
    """
    x = int(input('How many files do you want to plot together? '))
    a = openfile()
    # NOTE(review): despite the name, this keeps ONLY the 'Unmodified'
    # rows — confirm whether '==' or '!=' was intended.
    withoutunmodified = a[a['DP Modification'] == 'Unmodified']
    unmod = pd.DataFrame(withoutunmodified['Raw file'].value_counts())
    values = pd.DataFrame(a['Raw file'].value_counts())
    # =========================================================================
    # #just for MSFragger files
    # b = openfile()
    # b['Raw file'] = b['Spectrum'].apply(lambda df: df.split('.')[0])
    # b = b[b['Observed Modifications']!='Unknown']
    # values['MSFragger identifications'] = b['Raw file'].value_counts().values
    # =========================================================================
    # One file was already read above, so ask the user for x - 1 more.
    for y in range(x - 1):
        a = openfile()
        withoutunmodified = a[a['DP Modification'] == 'Unmodified']
        # NOTE(review): assigning .values assumes every file yields the same
        # raw files in the same value_counts order — TODO confirm.
        values['File ' + str(y)] = a['Raw file'].value_counts().values
        unmod['File ' + str(y)] = withoutunmodified['Raw file'].value_counts().values
    values = values.sort_index()
    unmod = unmod.sort_index()
    values = values.reset_index(drop=True)
    unmod = unmod.reset_index(drop=True)
    if x == 3:
        # Hard-coded legend labels for the common three-file comparison.
        values = values.rename(index=str, columns={'Raw file': 'No matching',
                                                   'File 0': '+/-1 fraction matching',
                                                   'File 1': 'Complete matching'})
        unmod = unmod.rename(index=str, columns={'Raw file': 'No matching',
                                                 'File 0': 'Matching restricted to one raw file',
                                                 'File 1': 'Matching between all raw files'})
    ax = values.plot.bar(rot=0, fontsize=20)
    #unmod.plot.bar(ax=ax,color=['cornflowerblue', 'orange', 'limegreen'],fontsize = 15,legend = False,rot=0)
    ax.legend(fontsize=20)
    # Replace raw-file names with simple 1..n tick labels.
    ax.set_xticklabels(list(range(1, len(values) + 1)))
    ax.set_xlabel('Raw file', fontsize=20)
    ax.set_ylabel('Count', fontsize=20)
    return unmod
def findallin(cache, position, thestr, char1, char2):
    """Recursively collect every substring of *thestr* enclosed between
    *char1* and *char2*.

    cache and position should be passed in as empty lists; they accumulate
    across the recursive calls.  The first recorded position is the index
    of char1 minus one; later positions are cumulative offsets into the
    original string.  Returns a two-column DataFrame: column 0 holds the
    extracted substrings, column 1 the recorded positions.
    """
    start = thestr.find(char1)
    stop = thestr.find(char2)
    if start > -1 and stop > -1:
        cache.append(thestr[start + 1:stop])
        if position:
            # Offsets accumulate because each recursion sees a shorter tail.
            position.append(position[-1] + start)
        else:
            position.append(start - 1)
        # Recurse on everything after the closing delimiter.
        findallin(cache, position, thestr[stop + 1:], char1, char2)
    result = pd.DataFrame(cache)
    result.insert(1, 1, position)
    return result
def countAA(maxcluster):
    """Count amino acids in maxcluster['DP AA'], attributing terminal hits.

    Column 0 of the returned frame lists the labels ('nterm', 'cterm' and
    the one-letter amino-acid codes).  'Count' totals every hit; the
    'nterm'/'middle'/'cterm' columns attempt to attribute hits to the
    peptide position based on whether the terminal markers matched.
    """
    # to do: include nterm, cterm here:
    dpAAs = pd.DataFrame(['nterm','cterm','G','A','V','L','I','P','F','Y','W','S','T',
                          'C','M','N','Q','K','R','H','D','E'])
    dpAAs['Count'] = 0
    dpAAs['nterm'] = 0
    dpAAs['middle'] = 0
    dpAAs['cterm'] = 0
    # Row 0 of maxcluster is skipped — presumably a non-data header row;
    # TODO confirm against the file this is fed.
    for x in range(len(maxcluster) - 1):
        x = x + 1
        # Index labels of the dpAAs rows whose label appears in this
        # entry's ';'-separated list.
        i = dpAAs['Count'][dpAAs[0].isin(list(maxcluster['DP AA'][x]
                                              .split(';')))].index
        # NOTE(review): chained assignment (series .loc writes) can hit
        # pandas' SettingWithCopy path — fragile with modern pandas.
        dpAAs['Count'].loc[i] = dpAAs['Count'].loc[i] + 1
        if i[0] == 0:
            # 'nterm' (label row 0) matched: credit the second matched row
            # as an N-terminal occurrence.  NOTE(review): assumes at least
            # two matches whenever 'nterm' is present — i[1] raises
            # IndexError otherwise; confirm the input guarantees this.
            dpAAs['nterm'].loc[i[1]] = dpAAs['nterm'].loc[i[1]] + 1
        elif i[0] == 1:
            # 'cterm' (label row 1) matched: credit as C-terminal.
            dpAAs['cterm'].loc[i[1]] = dpAAs['cterm'].loc[i[1]] + 1
        else:
            # No terminal marker matched: count all hits as 'middle'.
            dpAAs['middle'].loc[i] = dpAAs['middle'].loc[i] + 1
    return dpAAs
parameters = parse_parameters(paramfile)  # parse the parameters file
df = pd.read_perseus(infile)  # read the input matrix into a pandas.DataFrame

# UMAP hyper-parameters taken from the Perseus parameter file.
n_neighbor = intParam(parameters, "Number of neighbors")
n_component = intParam(parameters, "Number of components")
seed = intParam(parameters, "Random state")
m_dist = doubleParam(parameters, "Minimum distance")
metric = singleChoiceParam(parameters, "Metric")

annotations = read_annotations(infile)
if len(annotations) < 2:
    sys.exit("The data needs to be grouped")

# Embed the transposed main matrix (samples as rows) with UMAP.
newDF1 = main_df(infile, df).T
embedding = umap.UMAP(n_neighbors=n_neighbor,
                      n_components=n_component,
                      metric=metric,
                      random_state=seed,
                      min_dist=m_dist).fit_transform(newDF1)

# Keep only the categorical ("C:") annotation rows, stripping the prefix.
# (Removed the unused check_c/c_num locals from the original.)
new_annotations = {key.replace("C:", ""): value
                   for key, value in annotations.items() if "C:" in key}
annotation_df = pd.DataFrame.from_dict(new_annotations)

# One output column per embedding dimension: "Component 1", "Component 2", ...
col_names = ["Component " + str(i + 1) for i in range(n_component)]
newDF2 = pd.DataFrame(data=embedding, columns=col_names)
newDF2 = pd.concat([newDF2.reset_index(drop=True), annotation_df], axis=1)
newDF2.to_perseus(outfile)  # write pandas.DataFrame in Perseus txt format
def test_writing_empty_table_should_have_all_columns(self):
    """Serializing a zero-row frame must still emit the header and type row."""
    empty = pd.DataFrame(columns=pd.Index(['Node'], name='Column Name'))
    self.assertEqual(1, len(empty.columns))
    self.assertEqual('Column Name', empty.columns.name)
    serialized = to_string(empty)
    self.assertEqual('Node\n#!{Type}T\n', serialized, serialized)
def testMQ():
    """Compare two MaxQuant dependent-peptide files (no-match vs matching).

    Interactively asks for a 'nomatch' and a 'matching' .deppep file, plots
    per-raw-file counts and, for peptide sequences shared by both files, how
    often a sequence's raw files differ between the two.  Returns one
    intensity pivot table (cluster x raw file) per input file.
    """
    root = Tk()
    root.withdraw()  # hide the empty Tk main window; only the dialogs show
    a = filedialog.askopenfilename(initialdir="D:\Thomas\PhytonSCripts\MQOutput",
                                   title="Choose a MQ nomatch.deppep file to plot",
                                   filetypes=(("deppep files","*.deppep"),("all files","*.*")))
    a = pd.read_table(a, low_memory=False)
    # Row 0 is dropped — presumably a secondary header row; TODO confirm.
    a = a.drop(0).reset_index(drop=True)
    b = filedialog.askopenfilename(initialdir="D:\Thomas\PhytonSCripts\MQOutput",
                                   title="Choose a MQ matching.deppep file to plot",
                                   filetypes=(("deppep files","*.deppep"),("all files","*.*")))
    b = pd.read_table(b, low_memory=False)
    b = b.drop(0).reset_index(drop=True)
    # Restrict both tables to the columns needed for the comparison.
    arg = a[['Raw file', 'DP Base Raw File', 'DP Proteins', 'DP Base Sequence']]
    argb = b[['Raw file', 'DP Base Raw File', 'DP Proteins', 'DP Base Sequence']]
    uniq = arg['DP Base Sequence'].unique()
    uniqb = argb['DP Base Sequence'].unique()
    # Peptide sequences present in BOTH files.
    out = uniq[np.in1d(uniq, uniqb)]
    df = pd.DataFrame({'Raw Files': arg['Raw file'].unique()})
    df['counta'] = 0
    df['countb'] = 0
    count = 0
    countb = 0
    for x in range(len(out)):
        # Rows of each file carrying this shared peptide sequence.
        coin = arg[arg['DP Base Sequence'] == out[x]]
        coinb = argb[argb['DP Base Sequence'] == out[x]]
        # For each raw file with the peptide in one file: is it also seen
        # in the other file's raw files?
        coincount = np.in1d(coin['Raw file'].unique(), coinb['Raw file'].unique())
        coinbcount = np.in1d(coinb['Raw file'].unique(), coin['Raw file'].unique())
        count = count + coincount[coincount == False].size
        countb = countb + coinbcount[coinbcount == False].size
        # Tally the misses per raw file.  NOTE(review): chained assignment
        # below relies on pandas writing through the intermediate series —
        # fragile with modern pandas.
        for y in (coin['Raw file'].unique()[coincount == False]):
            df['counta'][df['Raw Files'] == y] = df['counta'][df['Raw Files'] == y] + 1
        for z in (coinb['Raw file'].unique()[coinbcount == False]):
            df['countb'][df['Raw Files'] == z] = df['countb'][df['Raw Files'] == z] + 1
    df.plot.bar()
    print('Amount of peptide sequences found in the Raw Files of file 1 but not'
          ' found in the Raw Files of file 2 is ', count, '. For file 2: ', countb)
    print('!!!Both peptide sequences need to be present in both files!!!')
    print('Amount of dp. Peptides in file 1: ', len(a), ' in file 2: ', len(b))
    values = pd.DataFrame(a['Raw file'].value_counts())
    # NOTE(review): .values assumes both files yield the same raw files in
    # the same value_counts order — TODO confirm.
    values['Raw'] = b['Raw file'].value_counts().values
    values.columns = ['No match', 'Matching']
    values = values.sort_index()
    values.plot.bar()
    a['Intensity'] = a['Intensity'].astype(float)
    b['Intensity'] = b['Intensity'].astype(float)
    # Intensity matrices: one row per dependent-peptide cluster, one column
    # per raw file.
    raw_tablea = pd.pivot_table(a, values='Intensity',
                                index=['DP Cluster Index', 'DP AA', 'DP Base Sequence', 'DP Probabilities'],
                                columns='Raw file')
    raw_tableb = pd.pivot_table(b, values='Intensity',
                                index=['DP Cluster Index', 'DP AA', 'DP Base Sequence', 'DP Probabilities'],
                                columns='Raw file')
    raw_tablea = raw_tablea.reset_index()
    raw_tableb = raw_tableb.reset_index()
    return raw_tablea, raw_tableb
def plotsMQ(mq_dep, range_int, zerovar, minbin, binsize, binrange):
    """Plot MaxQuant dependent-peptide modification and mass-bin summaries.

    Draws a pie chart of modification frequencies (rare ones collapsed into
    'Others'), a log-scale histogram of binned 'DP Cluster Mass' values, and
    a pie chart of the populated mass bins.  Returns the histogram DataFrame.
    """
    # Pie Plot of MQ found Modifications
    print('MaxQuant Plot Parameters: min. modification: ' + str(range_int) +
          '; removed ' + str(zerovar) + 'bins around zero' + '; min. binsize : '
          + str(minbin) + '; binsize: ' + str(binsize))
    plt.figure()
    dfMQ = mq_dep.groupby('DP Modification')\
        .size().reset_index(name='No: of Modifications')
    #make range of intrest relative to datasize?
    range_intrest = range_int
    df2 = dfMQ[dfMQ['No: of Modifications'] > range_intrest]
    df3 = dfMQ[dfMQ['No: of Modifications'] < range_intrest]
    # NOTE(review): counts exactly equal to range_intrest fall into neither
    # df2 nor df3 and are silently dropped — confirm intended.
    others = 0
    df2 = df2.reset_index(drop=True)
    df3 = df3.reset_index(drop=True)
    # Collapse the rare modifications into a single 'Others' slice.
    for x in range(len(df3)):
        others = others + df3['No: of Modifications'][x]
    df2.loc[len(df2)] = ['Others', others]
    df2.set_index('DP Modification', inplace=True)
    #df2.drop('Unknown', inplace=True) #dont kick them out
    df2 = df2['No: of Modifications'].sort_values()
    df2.plot.pie(y='No: of Modifications', legend=False).set_ylabel('')
    # Bar PLot for MQ Mass Bins
    plt.figure()
    stepsize = binsize
    #adjust binrange here
    bins = np.arange(-binrange, binrange, stepsize)
    var = round(len(bins) / 2)  # index of the bin containing mass 0
    # NOTE(review): 'sorted' shadows the builtin, and round() on a whole
    # DataFrame rounds every numeric column — verify this works on the
    # installed pandas version.
    sorted = round(mq_dep.sort_values('DP Cluster Mass'))
    sorted['DP Cluster Mass'] = round(sorted['DP Cluster Mass'].astype(float))
    sorted = sorted['DP Cluster Mass'].values
    hist, bin_edges = np.histogram(sorted, bins=bins)
    # remove unknown modifications around 0 adjusted Delta mass --> prob. no modification
    # NOTE(review): only the single central bin is zeroed, although the
    # banner above reports zerovar bins around zero — confirm.
    hist[var] = 0
    #hist[hist>0] = np.log2(hist[hist>0])
    plt.xlabel('Mass Bins', fontsize=15)
    plt.ylabel('log-scale quantity', fontsize=13)
    plt.bar(bin_edges[:-1], hist, width=1, log=True)
    # plt.ylim([0,np.])
    plt.show()
    # Pie Plot for Bins
    hist = np.append(hist, 0)  # pad so hist aligns with the bins array length
    plt.figure()
    histMQ = {'Histodata': hist, 'Bins': bins}
    dMQ = pd.DataFrame(histMQ)
    d2 = dMQ[dMQ['Histodata'] > minbin]
    round_bin = np.around(d2['Bins'], decimals=2)
    round_bin = round_bin.reset_index(drop=True)
    # Human-readable "a to b bin" labels for the pie slices.
    for x in range(len(round_bin)):
        round_bin[x] = str(round_bin[x])\
            + ' to ' + str(round((round_bin[x] + stepsize), 2)) + ' bin'
    d2.set_index(round_bin, inplace=True)
    d2 = d2['Histodata'].sort_values()
    d2.plot.pie(y='Histodata', legend=False).set_ylabel('')
    return dMQ
def plotsMS(ms_psmopen, range_int, zerovar, minbin, binsize, binrange, nomi):
    """Plot MSFragger open-search modification and mass-bin summaries.

    Mirrors plotsMQ but works on the PSM table's 'Observed Modifications'
    and 'Adjusted Delta Mass' columns.  *nomi* — presumably "nominal" —
    rounds the delta masses before binning when True (confirm).  Returns
    the histogram DataFrame.
    """
    # Pie plot
    print('MSFragger Plot Parameters: min. modification: ' + str(range_int) +
          '; removed ' + str(zerovar) + 'bins around zero' + '; min. binsize: '
          + str(minbin) + '; binsize: ' + str(binsize))
    plt.figure()
    dfMS = ms_psmopen.groupby('Observed Modifications')\
        .size().reset_index(name='No: of Modifications')
    range_intrest = range_int
    df2 = dfMS[dfMS['No: of Modifications'] > range_intrest]
    df3 = dfMS[dfMS['No: of Modifications'] < range_intrest]
    # NOTE(review): counts exactly equal to range_intrest are dropped.
    others = 0
    df2 = df2.reset_index(drop=True)
    df3 = df3.reset_index(drop=True)
    # Collapse the rare modifications into a single 'Others' slice.
    for x in range(len(df3)):
        others = others + df3['No: of Modifications'][x]
    df2.loc[len(df2)] = ['Others', others]
    df2.set_index('Observed Modifications', inplace=True)
    # Unlike plotsMQ, unknown modifications are excluded from this pie.
    df2.drop('Unknown', inplace=True)
    df2 = df2['No: of Modifications'].sort_values()
    df2.plot.pie(y='No: of Modifications', legend=False).set_ylabel('')
    # Create new Bar Plot for Mass Bins ??not accurate == WHY??
    plt.figure()
    stepsize = binsize
    # set x bin range here
    bins = np.arange(-binrange, binrange, stepsize)
    var = round(len(bins) / 2)  # index of the bin containing delta-mass 0
    if (nomi == True):
        # NOTE(review): 'sorted' shadows the builtin; round() on the whole
        # DataFrame rounds every numeric column.
        sorted = round(ms_psmopen.sort_values('Adjusted Delta Mass'))
    else:
        sorted = ms_psmopen.sort_values('Adjusted Delta Mass')
    hist, bin_edges = np.histogram(sorted['Adjusted Delta Mass'].values, bins=bins)
    # remove unknown modifications around 0 adjusted Delta mass --> prob. no modification
    # NOTE(review): only the single central bin is zeroed, although the
    # banner above reports zerovar bins around zero — confirm.
    hist[(var)] = 0
    #hist[hist>0] = np.log2(hist[hist>0])
    plt.xlabel('Mass Bins', fontsize=15)
    plt.ylabel('log-scale quantity', fontsize=13)
    plt.bar(bin_edges[:-1], hist, log=True, width=1)
    # plt.ylim([0,np.log(10000000000000)])
    plt.show()
    # Pie PLot of Mass Bins
    # do only once per program execution
    hist = np.append(hist, 0)  # pad so hist aligns with the bins array length
    plt.figure()
    histMS = {'Histodata': hist, 'Bins': bins}
    dMS = pd.DataFrame(histMS)
    d2 = dMS[dMS['Histodata'] > minbin]
    round_bin = np.around(d2['Bins'], decimals=2)
    round_bin = round_bin.reset_index(drop=True)
    # Human-readable "a to b bin" labels for the pie slices.
    for x in range(len(round_bin)):
        round_bin[x] = str(round_bin[x])\
            + ' to ' + str(round((round_bin[x] + stepsize), 2)) + ' bin'
    d2.set_index(round_bin, inplace=True)
    d2 = d2['Histodata'].sort_values()
    d2.plot.pie(y='Histodata', legend=False).set_ylabel('')
    return dMS