Example #1
 def test_create_networkx_graph_duplicates(self):
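     # duplicate rows in the edge and node tables should be collapsed into a
     # single edge/node in the resulting graph and reported through warnings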
     networks_table = pd.DataFrame({'GUID': ['guid'], 'Name': ['net']})
     edge_table = pd.DataFrame({'Source': ['1', '1'], 'Target': ['2', '2']})
     node_table = pd.DataFrame({'Node': ['1', '1', '2']})
     import warnings
     with warnings.catch_warnings(record=True) as w:
         graphs = nx.from_perseus(networks_table, {'guid': {'name': 'net', 'guid': 'guid', 'edge_table': edge_table, 'node_table': node_table}})
         self.assertEqual(2, graphs[0].number_of_nodes())
         self.assertEqual(1, graphs[0].number_of_edges())
         warning_messages = [str(x.message) for x in w if 'deprecated' not in str(x.message)]
         self.assertTrue('Duplicate edges' in warning_messages[0], warning_messages[0])
         self.assertTrue('Duplicate nodes' in warning_messages[1], warning_messages[1])
Example #2
 def test_write_bool_column_as_categorical(self):
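     # boolean columns are written as Perseus categorical columns by default
     # ('+' for True, an empty value for False); convert_bool_to_category=False
     # keeps the literal True/False strings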
     df = pd.DataFrame(columns=pd.Index(['Significant'], name='Column Name'))
     df['Significant'] = [True, False, True, True]
     self.assertEqual(df.dtypes[0], np.dtype('bool'))
     df_str = to_string(df)
     self.assertEqual('Significant\n#!{Type}C\n+\n""\n+\n+\n', df_str, df_str)
     df_str = to_string(df, convert_bool_to_category=False)
     self.assertEqual('Significant\n#!{Type}C\nTrue\nFalse\nTrue\nTrue\n', df_str, df_str)
Example #3
 def test_inferring_and_setting_main_columns(self):
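     # type_row infers which columns become the main ('E') columns; passing
     # main_columns overrides the inference and demotes the remaining numeric
     # columns to 'N', while text columns stay 'T'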
     df = pd.DataFrame({
         'a': [2, 3],
         'b': [1, 2],
         'c': ['a', 'b'],
         'd': [3, 4]
     })
     self.assertEqual('#!{Type}E\tE\tT\tN', type_row(df))
     self.assertEqual('#!{Type}N\tE\tT\tE',
                      type_row(df, main_columns={'b', 'd'}))
Example #4
def countAA(maxcluster):
    # count how often each amino acid (plus the n-/c-terminal markers) occurs in
    # the semicolon-separated 'DP AA' column; row 0 of maxcluster is skipped
    dpAAs = pd.DataFrame(['nterm', 'cterm', 'G', 'A', 'V', 'L', 'I', 'P', 'F', 'Y',
                          'W', 'S', 'T', 'C', 'M', 'N', 'Q', 'K', 'R', 'H', 'D', 'E'])
    dpAAs['Count'] = 0
    for x in range(1, len(maxcluster)):
        i = dpAAs[dpAAs[0].isin(maxcluster['DP AA'][x].split(';'))].index
        dpAAs.loc[i, 'Count'] += 1
    return dpAAs
Example #5
def pltcountperRaw():
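    # compare peptide counts per raw file across several interactively chosen
    # .deppep result files; counts of later files are assigned positionally
    # (.values), so every file is assumed to list the same raw files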
    x = int(input('How many files do you want to plot together? '))

    a = openfile()
    withoutunmodified = a[a['DP Modification'] == 'Unmodified']
    unmod = pd.DataFrame(withoutunmodified['Raw file'].value_counts())
    values = pd.DataFrame(a['Raw file'].value_counts())

    # =============================================================================
    #     #just for MSFragger files
    #     b = openfile()
    #     b['Raw file'] = b['Spectrum'].apply(lambda df: df.split('.')[0])
    #     b = b[b['Observed Modifications']!='Unknown']
    #     values['MSFragger identifications'] = b['Raw file'].value_counts().values
    # =============================================================================
    for y in range(x - 1):
        a = openfile()
        withoutunmodified = a[a['DP Modification'] == 'Unmodified']
        values['File ' + str(y)] = a['Raw file'].value_counts().values
        unmod['File ' + str(y)] = withoutunmodified['Raw file'].value_counts().values

    values = values.sort_index()
    unmod = unmod.sort_index()
    values = values.reset_index(drop=True)
    unmod = unmod.reset_index(drop=True)
    if x == 3:
        values = values.rename(index=str, columns={'Raw file': 'No matching',
                                                   'File 0': '+/-1 fraction matching',
                                                   'File 1': 'Complete matching'})
        unmod = unmod.rename(index=str, columns={'Raw file': 'No matching',
                                                 'File 0': 'Matching restricted to one raw file',
                                                 'File 1': 'Matching between all raw files'})

    ax = values.plot.bar(rot=0, fontsize=20)
    #unmod.plot.bar(ax=ax,color=['cornflowerblue', 'orange', 'limegreen'],fontsize = 15,legend = False,rot=0)
    ax.legend(fontsize=20)
    ax.set_xticklabels(list(range(1, len(values) + 1)))
    ax.set_xlabel('Raw file', fontsize=20)
    ax.set_ylabel('Count', fontsize=20)
    return unmod
Example #6
def findallin(cache, position, thestr, char1, char2):
    # recursively collect every substring of thestr that is enclosed between
    # char1 and char2, together with an offset for each match;
    # call with cache=[] and position=[] initially
    if thestr.find(char1) > -1 and thestr.find(char2) > -1:
        substr = thestr[thestr.find(char1) + 1:thestr.find(char2)]
        subsuperstr = thestr[thestr.find(char2) + 1:]
        cache.append(substr)

        if not position:
            position.append(thestr.find(char1) - 1)
        else:
            idex = position[-1] + thestr.find(char1)
            position.append(idex)
        findallin(cache, position, subsuperstr, char1, char2)

    df = pd.DataFrame(cache)
    df.insert(1, 1, position)
    return df
Example #7
def countAA(maxcluster):
    # to do: include nterm, cterm here:
    # counts amino-acid occurrences in the semicolon-separated 'DP AA' column and
    # records whether a matched residue occurred n-terminally, c-terminally or in
    # the middle of the peptide; row 0 of maxcluster is skipped
    dpAAs = pd.DataFrame(['nterm', 'cterm', 'G', 'A', 'V', 'L', 'I', 'P', 'F', 'Y',
                          'W', 'S', 'T', 'C', 'M', 'N', 'Q', 'K', 'R', 'H', 'D', 'E'])
    dpAAs['Count'] = 0
    dpAAs['nterm'] = 0
    dpAAs['middle'] = 0
    dpAAs['cterm'] = 0
    for x in range(1, len(maxcluster)):
        i = dpAAs[dpAAs[0].isin(maxcluster['DP AA'][x].split(';'))].index
        dpAAs.loc[i, 'Count'] += 1
        if i[0] == 0:    # first match is 'nterm'
            dpAAs.loc[i[1], 'nterm'] += 1
        elif i[0] == 1:  # first match is 'cterm'
            dpAAs.loc[i[1], 'cterm'] += 1
        else:            # no terminus involved
            dpAAs.loc[i, 'middle'] += 1

    return dpAAs
Example #8
parameters = parse_parameters(paramfile)  # parse the parameters file
df = pd.read_perseus(infile)  # read the input matrix into a pandas.DataFrame
n_neighbor = intParam(parameters, "Number of neighbors")
n_component = intParam(parameters, "Number of components")
seed = intParam(parameters, "Random state")
m_dist = doubleParam(parameters, "Minimum distance")
metric = singleChoiceParam(parameters, "Metric")
annotations = read_annotations(infile)
if len(annotations) < 2:
    sys.exit("The data needs to be grouped")
newDF1 = main_df(infile, df)
newDF1 = newDF1.T
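# run UMAP on the transposed matrix: every main column of the input matrix
# becomes one embedded data point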
embedding = umap.UMAP(n_neighbors=n_neighbor,
                      n_components=n_component,
                      metric=metric,
                      random_state=seed,
                      min_dist=m_dist).fit_transform(newDF1)
new_annotations = {}
# keep only the categorical annotation rows (names prefixed with "C:") and
# strip the prefix so they can be used as plain column names
for k, v in annotations.items():
    if "C:" in k:
        col_n = k.replace("C:", "")
        new_annotations[col_n] = v
annotation_df = pd.DataFrame.from_dict(new_annotations)
col_names = []
for i in range(0, n_component):
    col_names.append("Component " + str(i + 1))
newDF2 = pd.DataFrame(data=embedding, columns=col_names)
newDF2 = pd.concat([newDF2.reset_index(drop=True), annotation_df], axis=1)
newDF2.to_perseus(outfile)  # write pandas.DataFrame in Perseus txt format
Example #9
 def test_writing_empty_table_should_have_all_columns(self):
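     # an empty table should still serialize its column header and the Perseus
     # type annotation row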
     df = pd.DataFrame(columns=pd.Index(['Node'], name='Column Name'))
     self.assertEqual(1, len(df.columns))
     self.assertEqual('Column Name', df.columns.name)
     df_str = to_string(df)
     self.assertEqual('Node\n#!{Type}T\n', df_str, df_str)
Example #10
def testMQ():
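    # compare two MaxQuant dependent-peptide (.deppep) result files, one without
    # and one with matching, chosen via file dialogs; plots per-raw-file counts
    # and returns intensity pivot tables for both files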
    root = Tk()
    root.withdraw()
    a = filedialog.askopenfilename(
        initialdir=r"D:\Thomas\PhytonSCripts\MQOutput",  # raw string keeps the backslashes literal
        title="Choose a MQ nomatch.deppep file to plot",
        filetypes=(("deppep files", "*.deppep"), ("all files", "*.*")))
    a = pd.read_table(a, low_memory=False)
    a = a.drop(0).reset_index(drop=True)  # drop the first (non-data) row

    b = filedialog.askopenfilename(
        initialdir=r"D:\Thomas\PhytonSCripts\MQOutput",
        title="Choose a MQ matching.deppep file to plot",
        filetypes=(("deppep files", "*.deppep"), ("all files", "*.*")))
    b = pd.read_table(b, low_memory=False)
    b = b.drop(0).reset_index(drop=True)

    arg = a[[
        'Raw file', 'DP Base Raw File', 'DP Proteins', 'DP Base Sequence'
    ]]
    argb = b[[
        'Raw file', 'DP Base Raw File', 'DP Proteins', 'DP Base Sequence'
    ]]

    uniq = arg['DP Base Sequence'].unique()
    uniqb = argb['DP Base Sequence'].unique()
    out = uniq[np.in1d(uniq, uniqb)]
    df = pd.DataFrame({'Raw Files': arg['Raw file'].unique()})
    df['counta'] = 0
    df['countb'] = 0
    count = 0
    countb = 0

    for x in range(len(out)):
        coin = arg[arg['DP Base Sequence'] == out[x]]
        coinb = argb[argb['DP Base Sequence'] == out[x]]
        coincount = np.in1d(coin['Raw file'].unique(),
                            coinb['Raw file'].unique())
        coinbcount = np.in1d(coinb['Raw file'].unique(),
                             coin['Raw file'].unique())
        # count, for each shared sequence, the raw files in which it was found
        # in only one of the two runs
        count = count + (~coincount).sum()
        countb = countb + (~coinbcount).sum()

        for y in coin['Raw file'].unique()[~coincount]:
            # .loc avoids chained-assignment writes on a slice
            df.loc[df['Raw Files'] == y, 'counta'] += 1
        for z in coinb['Raw file'].unique()[~coinbcount]:
            df.loc[df['Raw Files'] == z, 'countb'] += 1
    df.plot.bar()

    print('Amount of peptide sequences found in the Raw Files of file 1 but not'\
          ' found in the Raw Files of file 2 is ',count, '. For file 2: ', countb)
    print('!!!Both peptide sequences need to be present in both files!!!')
    print('Amount of dp. Peptides in file 1: ', len(a), ' in file 2: ', len(b))

    values = pd.DataFrame(a['Raw file'].value_counts())
    values['Raw'] = b['Raw file'].value_counts().values
    values.columns = ['No match', 'Matching']
    values = values.sort_index()
    values.plot.bar()

    a['Intensity'] = a['Intensity'].astype(float)
    b['Intensity'] = b['Intensity'].astype(float)

    raw_tablea = pd.pivot_table(
        a, values='Intensity', columns='Raw file',
        index=['DP Cluster Index', 'DP AA', 'DP Base Sequence', 'DP Probabilities'])
    raw_tableb = pd.pivot_table(
        b, values='Intensity', columns='Raw file',
        index=['DP Cluster Index', 'DP AA', 'DP Base Sequence', 'DP Probabilities'])

    raw_tablea = raw_tablea.reset_index()
    raw_tableb = raw_tableb.reset_index()

    return raw_tablea, raw_tableb
Example #11
def plotsMQ(mq_dep, range_int, zerovar, minbin, binsize, binrange):
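    # pie and histogram overview of the MaxQuant dependent-peptide ('DP')
    # modifications; returns the binned mass-shift table dMQ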
    # Pie Plot of MQ found Modifications
    print('MaxQuant Plot Parameters: min. modification: ' + str(range_int) +
          '; removed ' + str(zerovar) + ' bins around zero; min. binsize: ' +
          str(minbin) + '; binsize: ' + str(binsize))
    plt.figure()
    dfMQ = mq_dep.groupby('DP Modification').size().reset_index(name='No: of Modifications')

    # make range of interest relative to data size?
    range_interest = range_int
    df2 = dfMQ[dfMQ['No: of Modifications'] > range_interest]
    df3 = dfMQ[dfMQ['No: of Modifications'] <= range_interest]  # counts at or below the threshold go to 'Others'

    others = 0
    df2 = df2.reset_index(drop=True)
    df3 = df3.reset_index(drop=True)
    for x in range(len(df3)):
        others = others + df3['No: of Modifications'][x]

    df2.loc[len(df2)] = ['Others', others]
    df2.set_index('DP Modification', inplace=True)
    # df2.drop('Unknown', inplace=True)  # keep the 'Unknown' category
    df2 = df2['No: of Modifications'].sort_values()
    df2.plot.pie(y='No: of Modifications', legend=False).set_ylabel('')

    # Bar PLot for MQ Mass Bins
    plt.figure()
    stepsize = binsize
    # adjust binrange here
    bins = np.arange(-binrange, binrange, stepsize)
    var = round(len(bins) / 2)  # index of the bin holding mass shifts around 0
    # histogram of the rounded cluster masses (ordering is irrelevant for np.histogram)
    masses = mq_dep['DP Cluster Mass'].astype(float).round().values
    hist, bin_edges = np.histogram(masses, bins=bins)
    # remove unknown modifications around 0: an adjusted delta mass of ~0
    # most likely means no modification
    hist[var] = 0
    #hist[hist>0] = np.log2(hist[hist>0])
    plt.xlabel('Mass Bins', fontsize=15)
    plt.ylabel('log-scale quantity', fontsize=13)
    plt.bar(bin_edges[:-1], hist, width=1, log=True)
    #    plt.ylim([0,np.])

    plt.show()

    # Pie Plot for Bins
    hist = np.append(hist, 0)

    plt.figure()
    histMQ = {'Histodata': hist, 'Bins': bins}
    dMQ = pd.DataFrame(histMQ)
    d2 = dMQ[dMQ['Histodata'] > minbin]

    # label each remaining bin with its mass range
    round_bin = np.around(d2['Bins'], decimals=2).reset_index(drop=True)
    labels = [str(b) + ' to ' + str(round(b + stepsize, 2)) + ' bin' for b in round_bin]

    d2 = d2.set_index(pd.Index(labels))
    d2 = d2['Histodata'].sort_values()
    d2.plot.pie(y='Histodata', legend=False).set_ylabel('')

    return dMQ
Example #12
def plotsMS(ms_psmopen, range_int, zerovar, minbin, binsize, binrange, nomi):
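    # pie and histogram overview of the modifications reported by MSFragger;
    # mirrors plotsMQ above for the MaxQuant results and returns the binned
    # mass-shift table dMS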
    # Pie plot
    print('MSFragger Plot Parameters: min. modification: ' + str(range_int) +
          '; removed ' + str(zerovar) + ' bins around zero; min. binsize: ' +
          str(minbin) + '; binsize: ' + str(binsize))
    plt.figure()
    dfMS = ms_psmopen.groupby('Observed Modifications').size().reset_index(name='No: of Modifications')

    range_interest = range_int
    df2 = dfMS[dfMS['No: of Modifications'] > range_interest]
    df3 = dfMS[dfMS['No: of Modifications'] <= range_interest]  # counts at or below the threshold go to 'Others'

    others = 0

    df2 = df2.reset_index(drop=True)
    df3 = df3.reset_index(drop=True)
    for x in range(len(df3)):
        others = others + df3['No: of Modifications'][x]

    df2.loc[len(df2)] = ['Others', others]
    df2.set_index('Observed Modifications', inplace=True)
    df2.drop('Unknown', inplace=True)
    df2 = df2['No: of Modifications'].sort_values()
    df2.plot.pie(y='No: of Modifications', legend=False).set_ylabel('')

    # Create new Bar Plot for Mass Bins ??not accurate == WHY??
    plt.figure()
    stepsize = binsize
    # set x bin range here
    bins = np.arange(-binrange, binrange, stepsize)
    var = round(len(bins) / 2)  # index of the bin holding mass shifts around 0

    # nomi: bin the rounded (nominal) delta masses instead of the exact values;
    # ordering is irrelevant for np.histogram
    if nomi:
        delta_mass = ms_psmopen['Adjusted Delta Mass'].round()
    else:
        delta_mass = ms_psmopen['Adjusted Delta Mass']

    hist, bin_edges = np.histogram(delta_mass.values, bins=bins)
    # remove unknown modifications around 0: an adjusted delta mass of ~0
    # most likely means no modification
    hist[var] = 0
    #hist[hist>0] = np.log2(hist[hist>0])
    plt.xlabel('Mass Bins', fontsize=15)
    plt.ylabel('log-scale quantity', fontsize=13)

    plt.bar(bin_edges[:-1], hist, log=True, width=1)
    #    plt.ylim([0,np.log(10000000000000)])
    plt.show()

    # Pie PLot of Mass Bins

    # do only once per program execution
    hist = np.append(hist, 0)

    plt.figure()
    histMS = {'Histodata': hist, 'Bins': bins}
    dMS = pd.DataFrame(histMS)
    d2 = dMS[dMS['Histodata'] > minbin]

    # label each remaining bin with its mass range
    round_bin = np.around(d2['Bins'], decimals=2).reset_index(drop=True)
    labels = [str(b) + ' to ' + str(round(b + stepsize, 2)) + ' bin' for b in round_bin]

    d2 = d2.set_index(pd.Index(labels))
    d2 = d2['Histodata'].sort_values()
    d2.plot.pie(y='Histodata', legend=False).set_ylabel('')
    return dMS