def df2groupedCorpus(df, groupingKey='tape', call='note', sep=r'\s'):
    """Build one joined string per group of a dataframe.

    The dataframe is split with ``daT.dictOfGroupedDataFrames`` on
    ``groupingKey``; for every group the values of column ``call`` are
    joined with ``sep`` into a single string.

    Parameters
    ----------
    df: pandas dataframe
    groupingKey: str
        column used to split ``df`` into groups
    call: str
        column whose values are joined; the values are handed to
        ``str.join``, so they have to be strings
    sep: str
        separator placed between consecutive values. The default is the
        literal two-character string made of a backslash and the letter
        "s"; ``str.join`` inserts it verbatim, it is not treated as a
        regular expression.
        NOTE(review): a whitespace separator may have been intended --
        confirm against the tokenizer used downstream before changing it.

    Returns
    --------
    X_str: list of strings (n_instances)
        feature matrix to use as input of CountVectorizer
    y: list, (n_instances)
        labels of the instances from the grouping key
    """
    # group dataframe; the result is used as a mapping: group label -> rows
    df_dict = daT.dictOfGroupedDataFrames(df, groupingKey=groupingKey)
    X_str = []  # feature list
    y = []  # group label
    for label, group_df in df_dict.items():
        y.append(label)
        X_str.append(sep.join(group_df[call].values))
    return X_str, y
import pylotwhale.utils.dataTools as daT
import pylotwhale.NLP.myStatistics_beta as mysts

# # Load the csv with the sequences

# In[9]:

# the data file is looked up relative to the directory of this script
pDir = os.path.dirname(os.path.abspath(__file__))
cfile = os.path.join(pDir, 'data/sequenceFiles_df.txt')
#cfile = '/home/florencia/profesjonell/bioacoustics/Kurt/mice/data/sequenceFiles_df.txt'
df0 = pd.read_csv(cfile)

# In[13]:

# split the table on its 'name' column
name_df = daT.dictOfGroupedDataFrames(df0, groupingKey='name')
print('TEST', len(name_df))


def test_data():
    """Check the loaded table and its per-name grouping against known counts."""
    # number of rows read from the csv
    assert len(df0) == 25282
    # number of groups obtained from the 'name' column
    assert len(name_df) == 67
    # the 'genecode' column holds exactly the values 1, 2 and 3
    assert set(df0['genecode']) == {1, 2, 3}


# In[14]:

len(name_df)

# # Call from all mice
df.loc[i, "segment"] = "{}_{}".format(t, j) break elif col["Dtag"] > 1: # elements missing --> new segment j += 1 df.loc[i, "segment"] = "{}_{}".format(t, j) continue else: print("else! ", i, col["Dtag"]) break # ### Separate data frames by segment and drop nans # The nans, come from all the non labelled items in the previous step (segment assignation) and they correspond to missing calls and new tapes. # In[11]: tape_df0 = daT.dictOfGroupedDataFrames(df, groupingKey="segment") # filter df with only one element and segments nan segment tape_df = {k: v for k, v in tape_df0.items() if (len(v) > 1 and k != np.nan)} # In[12]: def test_segment_tape_df0(): assert len(tape_df0) == 232 assert len(tape_df) == 175 test_segment_tape_df0() # ## Distrubution of N-grams as a function of $\tau$
import pylotwhale.NLP.myStatistics_beta as mysts
import pylotwhale.NLP.tempoTools as tT
import pylotwhale.utils.netTools as nT

# # Load df

# In[3]:

# the annotations file is looked up relative to the directory of this script
pDir = os.path.dirname(os.path.abspath(__file__))
df_file = os.path.join(pDir, 'data/groupB_annotations_df.csv')
#df_file = '/home/florencia/profesjonell/bioacoustics/noriega2018sequences/data/groupB_annotations_df.csv'

# load
df = pd.read_csv(df_file) #; df= df0

# tape separation (grouping key left at the helper's default)
tapedf = daT.dictOfGroupedDataFrames(df)


def test_data():
    """Check the annotations table and its grouping against known values."""
    expected_tapes = {111, 113, 114, 115}
    # N_calls
    assert len(df) == 425
    # N_call types
    assert len(set(df['call'])) == 22
    # tapes set: in the raw column and as keys of the grouping
    assert set(df['tape'].values) == expected_tapes
    assert set(tapedf.keys()) == expected_tapes


# # Bigrams and randomisations test

# # Define the **sequences**