# Put the columns of the processed record into a fixed order before saving.
newprocessed_reorder = newprocessed[[
    'file_id',
    'filename',
    'source',
    'visit',
    'subject',
    'SV1wk20',
    'SV2wk20',
    'SV1mo20',
    'SV6mo20',
    'SV1yr20',
    'SV3yr20',
    'SV1wk100',
    'SV2wk100',
    'SV1mo100',
    'SV6mo100',
    'SV1yr100',
    'SV3yr100',
    'raw_processed_date',
]]

# check for dups: keep=False flags every row whose (subject, visit) pair
# occurs more than once, not only the later occurrences
dups = newprocessed.loc[
    newprocessed_reorder.duplicated(subset=['subject', 'visit'], keep=False)
]
newprocessed_reorder.to_csv(processed_file, index=False)

# this is in the behavioral data/snapshots/ePrimeDD/raw_allfiles_in_box/
# folder...is not the curated BDAS file nor is it officially a snapshot -
# just keeping a record of the raw unprocessed download
# box.upload_file(processed_file,82670454492)  # first run had to upload
# file - subsequent runs just update
box.update_file(495494179106, processed_file)

shutil.rmtree(box.cache)


def folderlistcontents(folderslabels, folderslist):
    """Gather the file and folder listings of several Box folders.

    For every entry of ``folderslist`` a progress line is printed using the
    label at the same position in ``folderslabels``, then ``foldercontents``
    is called on the entry. ``foldercontents`` generates two dfs: one with
    the names and ids of files and one with the names and ids of folders.
    The per-folder frames are stacked into ``bdasfilelist`` and
    ``bdasfolderlist``.

    The frames are concatenated once at the end with ``pd.concat`` instead
    of being grown with ``DataFrame.append``, which was removed in pandas 2.0.

    NOTE(review): no ``return`` statement is visible in this excerpt; the
    function presumably continues and hands back the two frames - confirm.

    Args:
        folderslabels: Sequence of printable (str) labels, indexed in step
            with ``folderslist``.
        folderslist: Sequence of folder identifiers accepted by
            ``foldercontents``.

    Raises:
        IndexError: If ``folderslabels`` is shorter than ``folderslist``.
    """
    filetables = []
    foldertables = []
    for i, folderid in enumerate(folderslist):
        print('getting file and folder contents of box folder ' + folderslabels[i])
        subfiles, subfolders = foldercontents(folderid)
        filetables.append(subfiles)
        foldertables.append(subfolders)

    # pd.concat raises on an empty list, so fall back to the empty frame the
    # append-based version produced when there were no folders to scan.
    bdasfilelist = pd.concat(filetables) if filetables else pd.DataFrame()
    bdasfolderlist = pd.concat(foldertables) if foldertables else pd.DataFrame()
# The processed file was originally initialized with
#   sub4.to_csv(processed_file, index=False)
# Every later run concatenates the newly found files onto that record.
newprocessed = pd.concat(objs=[processed, files4process], axis=0, sort=True)
newprocessed.to_csv(path_or_buf=processed_filename, index=False)

# The record lives in the behavioral data/snapshots/Q/raw_allfiles_in_box/
# folder. It is not the curated BDAS file - it only keeps a record of the
# raw unprocessed download.
# The first run had to upload the file:
#   box.upload_file(processed_file, 76432368853)
# Subsequent runs just update the existing Box file.
box.update_file(processfile_id, processed_filename)

# In[78]:

shutil.rmtree(box.cache)

# In[ ]:

# In[99]:

cleaned = pd.read_excel(io=box.readFile(cleanestdata))

# In[212]:

combined = pd.concat(
    objs=[cleaned, files4process],
    ignore_index=True,
    sort=False,
)