import datetime
import shutil
import subprocess
import sys
from io import BytesIO

import numpy as np
import pandas as pd
import pycurl
from openpyxl import load_workbook
from scipy import stats

from download.box import LifespanBox

redcapconfigfile = "/home/petra/UbWinSharedSpace1/ccf-nda-behavioral/PycharmToolbox/.boxApp/redcapconfig.csv"

box_temp = '/home/petra/UbWinSharedSpace1/boxtemp'  # location of local copy of curated data
box = LifespanBox(cache=box_temp)
verbose = True
snapshotdate = datetime.datetime.today().strftime('%m_%d_%Y')

# catfromdate = max of last run -- '2019-06-17'
# ped file will have all the NDA vars from IntraDB, plus randomization status
extrainfo = 'UnrelatedHCAHCD_w_STG_Image_and_pseudo_GUID09_27_2019.csv'
dev_peds_path = '/home/petra/UbWinSharedSpace1/redcap2nda_Lifespan2019/Dev_pedigrees/'
hcplist = dev_peds_path + extrainfo
pathout = "/home/petra/UbWinSharedSpace1/redcap2nda_Lifespan2019/HCD_crosswalk_docs/prepped_structures"

fieldlist = ['racial', 'ethnic', 'hand1', 'hand2', 'hand3', 'hand4', 'hand5', 'hand6', 'hand7', 'hand8',
             'hand_total', 'iihandwr', 'iihandth', 'iihandsc', 'iihandto', 'iihandkn', 'iihandsp',
             'iihandbr', 'iihandma', 'iihandbo', 'iihandfk', 'iihandey']
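# A minimal sketch, not this repo's code, of how the fieldlist above could be
# pulled through the standard REDCap export API. The redcapconfig.csv column
# names ('token', 'api_url') are assumptions for illustration.
import io

import requests


def pull_fieldlist(configrow, fields):
    payload = {'token': configrow['token'],  # hypothetical column name
               'content': 'record',
               'format': 'csv',
               'type': 'flat'}
    # REDCap expects repeated fields[N] keys to subset the export
    for i, name in enumerate(fields):
        payload['fields[%d]' % i] = name
    response = requests.post(configrow['api_url'], data=payload)  # hypothetical column name
    response.raise_for_status()
    return pd.read_csv(io.StringIO(response.text))

# e.g.: configs = pd.read_csv(redcapconfigfile)
#       handedness = pull_fieldlist(configs.iloc[0], fieldlist)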
import argparse
import datetime
import os
import subprocess

import pandas as pd
import yaml

from config import config
from download.box import LifespanBox
from download.pennCNP import PennCNP
from download.redcap import Redcap

# verbose = False
verbose = True
snapshotdate = datetime.datetime.today().strftime('%m_%d_%Y')

ksads_cache_path = config['dirs']['cache']['ksads']

# connect to Box
box = LifespanBox(cache=ksads_cache_path, config_file=config['box'])

site_file = config['PennCNP']['snapshot']


def loadYaml(filename):
    if not os.path.exists(filename):
        return None
    with open(filename, 'r') as fd:
        return yaml.load(fd, Loader=yaml.SafeLoader)


def main():
    parser = argparse.ArgumentParser(
        description="Downloads the data from PennCNP")
    user_group = parser.add_mutually_exclusive_group()
    # (a sketch of how this group might be populated follows below)
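# A minimal sketch, not this repo's actual CLI, of how the mutually exclusive
# group above might be populated so credentials come either from the command
# line or from a stored YAML file; the flag names are illustrative assumptions.
def sketch_parser():
    parser = argparse.ArgumentParser(
        description="Downloads the data from PennCNP")
    user_group = parser.add_mutually_exclusive_group()
    user_group.add_argument('--username', help='PennCNP username; password prompted at runtime')
    user_group.add_argument('--credentials', help='path to a YAML file readable by loadYaml()')
    return parser

# argparse enforces the exclusivity at parse time:
# sketch_parser().parse_args(['--username', 'x', '--credentials', 'y'])  # -> exits with an error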
import os
import shutil
import sys

import pandas

from download.box import LifespanBox

"""Combine the per-site Toolbox exports on Box into a single snapshot CSV."""

verbose = True

root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
cache_space = os.path.join(root_dir, 'cache', 'toolbox')
combined_path = os.path.join(cache_space, 'Toolbox_Combined.csv')
toolbox_folder_id = 42902161768

box = LifespanBox(cache=cache_space)

# label_errors = []
# instrument_errors = []

# hca_path = os.path.join(root_dir, 'store', 'toolbox-hca-instruments.txt')
# with open(hca_path) as f:
#     hca_instruments = f.read().splitlines()

# hcd_path = os.path.join(root_dir, 'store', 'toolbox-hcd-instruments.txt')
# with open(hcd_path) as f:
#     hcd_instruments = f.read().splitlines()

# par_path = os.path.join(root_dir, 'store', 'toolbox-parent-instruments.txt')
# with open(par_path) as f:
#     par_instruments = f.read().splitlines()
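# A minimal sketch, under assumptions, of how per-site Toolbox exports could be
# stacked into Toolbox_Combined.csv; the 'site' provenance column and the
# paths_by_site mapping are illustrative, not this script's actual flow.
def combine_site_csvs(paths_by_site, out_path=combined_path):
    frames = []
    for site, path in paths_by_site.items():
        df = pandas.read_csv(path, low_memory=False)
        df['site'] = site  # hypothetical provenance column
        frames.append(df)
    combined = pandas.concat(frames, axis=0, sort=True)
    combined.to_csv(out_path, index=False)
    return combined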
import datetime
import os
import sys

import pandas as pd

from download.box import LifespanBox

verbose = True
# verbose = False
snapshotdate = datetime.datetime.today().strftime('%m_%d_%Y')

# Two types of files to curate: the so-called raw data from which scores are
# generated, and the scores themselves.

# connect to Box (to get latest, greatest curated data)
box_temp = '/home/petra/UbWinSharedSpace1/boxtemp'  # location of local copy of curated data
box = LifespanBox(cache=box_temp)
redcapconfigfile = "/home/petra/UbWinSharedSpace1/ccf-nda-behavioral/PycharmToolbox/.boxApp/redcapconfig.csv"

# removelist = pd.read_csv(os.path.join(box_temp, 'RemoveFromCurated_perTrello19May2020.csv'))
removelist = pd.read_csv(
    os.path.join(box_temp, 'RemoveFromCurated_perTrello27May2020.csv'))

# validpair(pin='HCD0007014_V1')

# get list of filenames (foldercontents is this repo's Box-listing helper; see the sketch below)
##########################################################################
WashuD = 84801037257
curated = 82804015457
wudfiles, wudfolders = foldercontents(WashuD)
# wudfiles2, wudfolders2 = folderlistcontents(wudfolders.foldername, wudfolders.folder_id)
# wudfiles = pd.concat([wudfiles, wudfiles2], axis=0, sort=True)
data4process = wudfiles.loc[(wudfiles.filename.str.contains('aw_') == True) |
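# foldercontents() comes from elsewhere in this repo and isn't shown in this
# excerpt. A minimal sketch of the behavior it appears to have (a files DataFrame
# and a folders DataFrame), written against the official boxsdk client rather
# than LifespanBox, might look like this:
def foldercontents_sketch(client, folder_id):
    files, folders = [], []
    for item in client.folder(folder_id).get_items():
        if item.type == 'file':
            files.append({'file_id': item.id, 'filename': item.name})
        elif item.type == 'folder':
            folders.append({'folder_id': item.id, 'foldername': item.name})
    return pd.DataFrame(files), pd.DataFrame(folders)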
##########################################################################
# initiate data that is required for both scores and raw data types
import datetime
import os

import pandas as pd

from download import redcap
from download.box import LifespanBox

verbose = True
snapshotdate = datetime.datetime.today().strftime('%m_%d_%Y')

root_cache = '/data/intradb/tmp/box2nda_cache/'
# don't delete cache at the end of this program until endpoint machine is
# back up and running
cache_space = os.path.join(root_cache, 'endpointmachine/lifespan')
box = LifespanBox(cache=cache_space)

root_store = '/home/shared/HCP/hcpinternal/ccf-nda-behavioral/store/'
# this will be the place to save any snapshots on the nrg servers
store_space = os.path.join(root_store, 'toolbox')
try:
    os.mkdir(store_space)  # look for store space before creating it here
except BaseException:
    print("store already exists")

# prep basic redcap data ##################
# need this so sites and studies can be assigned to curated data rows (if missing);
# data can't be split by site unless this info is included somehow.
moredata = redcap.getfullredcapdata()
# set these vars to missing because they belong to the child of the parent and could conflict
# (different QC programs look for this type of inconsistency between REDCap databases)
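# A minimal sketch, with assumed column names ('subject', 'site', 'study'), of how
# the REDCap pull above could backfill site/study onto curated rows later in this
# script; the real keys and frames aren't shown in this excerpt.
def backfill_site_study(curated_df, redcap_df):
    lookup = redcap_df[['subject', 'site', 'study']].drop_duplicates('subject')
    merged = curated_df.merge(lookup, on='subject', how='left', suffixes=('', '_redcap'))
    for col in ('site', 'study'):
        extra = col + '_redcap'
        if extra in merged.columns:
            merged[col] = merged[col].fillna(merged[extra])
            merged = merged.drop(columns=extra)
    return merged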
def __init__(self, box=None):
    if box is None:
        box = LifespanBox()
    self.box = box
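# Usage note: the None default lets the owning class build its own Box client,
# while callers and tests can inject a preconfigured one. SomeClass is a stand-in
# name; the class this __init__ belongs to isn't shown in this excerpt.
#   obj = SomeClass()                                   # constructs a LifespanBox internally
#   obj = SomeClass(box=LifespanBox(cache='/tmp/demo')) # reuses an existing client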
root_store = '/home/shared/HCP/hcpinternal/ccf-nda-behavioral/store/'
# this will be the place to save any snapshots on the nrg servers
store_space = os.path.join(root_store, 'eprime')
try:
    os.mkdir(store_space)  # look for store space before creating it here
except BaseException:
    print("store already exists")

processed_file = os.path.join(
    store_space, 'ProcessedBoxFiles_AllRawData_Eprime.csv')
available_box_files = os.path.join(cache_space, 'AllBoxFiles_Eprime.csv')

# generate the box object, which contains the client config necessary to
# talk to Box, and set up the cache space
box = LifespanBox(cache=cache_space)

# this section generates a list of all files in the Q directories and identifies
# those that don't follow the expected naming pattern (see the sketch after this
# block). Each of the site folders contains individual subject folders.
sitefolderslabels = ["WUHCD", "UCLAHCD"]  # ,"UMNHCASUB"]
sitefolderlist = [41361544018, 61956482658]

# get folder contents for all the sites, including the known subfolders of individuals' folders.
# folderlistcontents generates two dfs: a df with names and ids of files,
# and a df with names and ids of folders
superfilelist, superfolderlist = folderlistcontents(
    sitefolderslabels, sitefolderlist)  # 2378 files and 1 folder as of 5/22/2019

if (superfilelist.shape[0] == 0):
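# A minimal sketch of the "doesn't follow pattern" check this section leads into.
# Subject IDs elsewhere in this repo look like HCD0007014_V1, so a conservative
# regex flags names that don't embed that shape; the production pattern is an
# assumption here.
def flag_bad_names(filelist_df):
    ok = filelist_df.filename.str.contains(r'HC[AD]\d{7}_V\d', regex=True)
    return filelist_df.loc[~ok]  # rows that need manual review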
import datetime

import numpy as np
import pandas as pd

from config import config
from download.box import LifespanBox
from download.redcap import Redcap

verbose = True
# verbose = False
snapshotdate = datetime.datetime.today().strftime('%m_%d_%Y')

ksads_cache_path = config['dirs']['cache']['ksads']
# this will be the place to save any snapshots on the nrg servers
store_space = config['dirs']['store']['ksads']

# connect to Box
box = LifespanBox(cache=ksads_cache_path, config_file=config['box'])
redcap = Redcap(config['redcap']['config'])
assessments = config['Assessments']
sites = config['Sites']

# snapshot folder (used to be the combined folder)
ksads_snapshotfolderid = config['ksads_snapshotfolderid']
snapshotQCfolder = config['snapshotQCfolder']

# download one of the identical key files, which contain the labels for
# all the numbered questions in KSADS
cachekeyfile = box.downloadFile(config['cachekeyfile'])


def main():
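# A minimal sketch, with assumed key-file columns ('variable', 'label'), of how the
# downloaded key file might map the numbered KSADS questions onto readable labels;
# the actual key-file layout isn't shown in this excerpt.
def relabel_questions(data_df, keyfile_path=cachekeyfile):
    key = pd.read_csv(keyfile_path)
    mapping = dict(zip(key['variable'], key['label']))  # hypothetical column names
    return data_df.rename(columns=mapping)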
# In[3]:

verbose = True
snapshotdate = datetime.datetime.today().strftime('%Y-%m-%d')

cache_space = config['dirs']['cache']['qint']
store_space = config['dirs']['store']['qint']

processed_filename = os.path.join(
    store_space, 'ProcessedBoxFiles_AllRawData_Qinteractive.csv')
combined_filename = os.path.join(
    store_space, 'HCA-HCD_Allsites_QandRAVLT_%s.xlsx' % snapshotdate)
available_box_files = os.path.join(cache_space, 'AllBoxFiles_Qinteractive.csv')


# In[4]:

box = LifespanBox(cache=cache_space)


# In[5]:

sites = {
    18446355408: 'WUHCD',
    18446433567: 'WUHCA',
    18446318727: 'UMNHCD',
    18446298983: 'UMNHCA',
    18446352116: 'UCLAHCD',
    18446404271: 'UCLAHCA',
    18446321439: 'HARVHCD',
    18446404071: 'MGHHCA',
    # 47239506949: 'UMNHCASUB'
}

bdas_folders = {75755393630: 'BDAS_HCD', 75755777913: 'BDAS_HCA'}
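# A minimal sketch of using the sites map above to tag file listings with their
# site label; foldercontents() is the Box-listing helper used by the other
# scripts in this repo, and pd is assumed to be pandas imported in an earlier cell.
def files_with_site_labels(sites_map):
    pieces = []
    for folder_id, label in sites_map.items():
        files, _ = foldercontents(folder_id)
        files['site'] = label
        pieces.append(files)
    return pd.concat(pieces, axis=0, sort=True)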
import datetime
import os

import pandas as pd

from download.box import LifespanBox

verbose = True
# verbose = False
snapshotdate = datetime.datetime.today().strftime('%m_%d_%Y')

# Two types of files to curate: the so-called raw data from which scores are
# generated, and the scores themselves.

# connect to Box (to get latest, greatest curated data)
box_temp = '/home/petra/UbWinSharedSpace1/boxtemp'  # location of local copy of curated data
box = LifespanBox(cache=box_temp)
redcapconfigfile = "/home/petra/UbWinSharedSpace1/ccf-nda-behavioral/PycharmToolbox/.boxApp/redcapconfig.csv"

# start with data that is the result of the extensive QC effort from sites:
# keep track of expected and observed IDs;
# curate a list of TBX issues;
# pull in data (by ID) that is not on the list of issues.

# get list of filenames
##########################
Harvard = 84800505740
harvardfiles, harvardfolders = foldercontents(Harvard)
harvardfiles2, harvardfolders2 = folderlistcontents(
    harvardfolders.foldername, harvardfolders.folder_id)
harvardfiles = pd.concat([harvardfiles, harvardfiles2], axis=0, sort=True)

data4process = harvardfiles.loc[harvardfiles.filename.str.contains('Data') == True]
scores4process = harvardfiles.loc[harvardfiles.filename.str.contains('Score') == True]
box.download_files(data4process.file_id)
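# A minimal sketch of a consistency check between the two sets above: every 'Data'
# file should have a matching 'Score' file. Deriving the join key by stripping the
# Data/Score token is an assumption about the naming convention.
def unpaired_data_files(data_df, scores_df):
    d = data_df.assign(key=data_df.filename.str.replace('Data', '', regex=False))
    s = scores_df.assign(key=scores_df.filename.str.replace('Score', '', regex=False))
    merged = d.merge(s[['key']], on='key', how='left', indicator=True)
    return merged.loc[merged._merge == 'left_only']  # Data files with no Score mate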
import datetime

import pandas as pd

from config import config
from download.box import LifespanBox
from download.redcap import Redcap

redcap = Redcap()
verbose = True
snapshotdate = datetime.datetime.today().strftime('%m_%d_%Y')

columnnames = config['QIntColumns']
cache_space = config['dirs']['cache']['qint']
store_space = config['dirs']['store']['qint']

# connect to Box
box = LifespanBox(cache=cache_space)

# snapshot folder (used to be the combined folder)
q_snapshotfolderid = 48203213208
snapshotQCfolder = 76434619813
cleanestdata = 465568117756

# %%
baseclean = pd.read_excel(box.readFile(cleanestdata))
basecleanexcluded = baseclean.loc[baseclean.source == 'perm-missing']
baseclean = baseclean.loc[baseclean.select_4clean == 1]
baseclean.row = baseclean.row.str.replace('-', '')

asslist = baseclean.groupby('assessment').count()
asslist.reset_index(inplace=True)
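# A minimal sketch of how the per-assessment counts above might drive the rest of
# the cleaning pass; process_assessment() is a hypothetical stand-in for the
# handlers defined later in this script.
for assessment in asslist.assessment:
    subset = baseclean.loc[baseclean.assessment == assessment]
    print(assessment, subset.shape[0])
    # process_assessment(subset)  # hypothetical per-assessment handler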
cache_space = os.path.join(root_cache, 'eprime')
try:
    os.mkdir(cache_space)
except BaseException:
    print("cache already exists")

root_store = '/home/shared/HCP/hcpinternal/ccf-nda-behavioral/store/'
# this will be the place to save any snapshots on the nrg servers
store_space = os.path.join(root_store, 'eprime')
try:
    os.mkdir(store_space)  # look for store space before creating it here
except BaseException:
    print("store already exists")

# connect to Box
box = LifespanBox(cache=cache_space)
redcap = Redcap('../tmp/.boxApp/redcapconfig.csv')

# snapshot folder (used to be the combined folder)
e_snapshotfolderid = 82670538107
snapshotQCfolder = 76434619813
slimfolder = 82670800769  # (for data dictionaries)
cleanestdata = 495490047901

# The coordinator's monthly update process is to run eprime_getraw.py to 'download' all of the
# individual records from the Box UCLA and WU upload folders for individual subjects: the program
# converts the text files in those folders into rows of data for a given subject. The coordinator's
# role is to check for new rows. The eprime_getraw program appends new data to the
# ProcessedBoxFiles_AllRawData_Eprime.csv file under snapshots/ePrimeDD/raw_allfiles_in_box.
# Note: this Box file is also synced with
# /home/shared/HCP/hcpinternal/ccf-nda-behavioral/store/eprime/ProcessedBoxFiles_AllRawData_Eprime.csv
# File ids in the store are getting rounded and converted when saved to Box, so if you need file ids,
# grab them from the store (see the defensive read below).
# After running eprime_getraw.py, open the current (and cumulatively cleaned) 'database' under BDAS/
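# Because Box round-trips can mangle numeric file ids (see the note above), a
# defensive pattern when reading the store copy is to pin the id column to str;
# the 'file_id' column name matches the listings produced elsewhere in this repo,
# and pd is assumed to be pandas as in the other scripts.
processed = pd.read_csv(
    os.path.join(store_space, 'ProcessedBoxFiles_AllRawData_Eprime.csv'),
    dtype={'file_id': str},  # keep ids exact; don't let pandas cast them to float
    low_memory=False)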