# --- Feature-extraction job setup (script fragment) ---
# Reconstructed from a whitespace-mangled one-liner: the original newlines were
# lost, which turned everything after the first '#' into dead comment text.

feature_schema_file = sys.argv[11]  # full path of the file listing columns and types
                                    # for each feature file
cleanup_flag = sys.argv[12]         # indicates if cleanup should be applied (flag != 0)
argument_path = sys.argv[13]        # full path of the folder containing json files with
                                    # additional arguments for extracting features of each
                                    # table; file names are assumed to be
                                    # xxxx-feature-additional.json where xxxx is the feature name

# job_path, feature_path, participant_file and table_file are read from
# sys.argv earlier in this script (outside this excerpt).
utils.createFolder(job_path, 0o744)
utils.createFolder(feature_path, 0o744)

# Create one result sub-folder per participant (pidNNN) to store that
# participant's extracted features.
participants = utils.readList(participant_file)
for id in participants:  # NOTE(review): 'id' shadows the builtin; kept for compatibility
    result_dir = '{result_directory}/pid{pid}'.format(
        result_directory=feature_path, pid=id)
    utils.createFolder(result_dir, 0o744)

# Create one job sub-folder per table.
tables = utils.readList(table_file)
for table in tables:
    job_dir = '{job_directory}/{tbl}'.format(job_directory=job_path, tbl=table)
    utils.createFolder(job_dir, 0o744)

for id in participants:
    for table in tables:
        # create the job for features associated with each device
        # NOTE(review): the loop body is truncated in this excerpt; '...' is a
        # syntactic placeholder only — restore the original body from the full file.
        ...
# --- Count-aggregation job generator (script fragment) ---
# Reconstructed from a whitespace-mangled one-liner: the original newlines were
# lost, which turned everything after the first '#' into dead comment text.
# NOTE(review): the first assignment was cut mid-statement in this excerpt
# ("...2]"); 'job_path = sys.argv[2]' is inferred from its trailing comment and
# from the later uses of job_path — confirm against the full file.
job_path = sys.argv[2]          # full path of the folder where *.job files are stored
python_cmd = sys.argv[3]        # python command to run the count aggregation script
code_path = sys.argv[4]         # full static path to where the count aggregation code is
result_path = sys.argv[5]       # full path of the result root to store aggregate counts
                                # for each table
table_path = sys.argv[6]        # full path of table data root containing sub-folders for
                                # each pid and *.txt files for counts per participant per
                                # table (fine if relative to code base)
participant_file = sys.argv[7]  # full path of the csv file listing 3-digit zero-padded
                                # participant id's
start_date = sys.argv[8]        # lower end of the date range to aggregate counts
end_date = sys.argv[9]          # upper end of the date range to aggregate counts

utils.createFolder(job_path, 0o744)
utils.createFolder(result_path, 0o744)

# One job per table: each job aggregates counts across participants for that
# table. table_file is defined earlier in this script (outside this excerpt).
tables = utils.readList(table_file)
for table in tables:
    job_content = jobContent(code_path, python_cmd, table_path, table,
                             participant_file, start_date, end_date, result_path)
    job_file = '{job_directory}/{tbl}.job'.format(job_directory=job_path, tbl=table)
    utils.createJob(job_file, job_content)
# --- Aggregate per-participant daily counts into one wide table ---
# Reconstructed from a whitespace-mangled one-liner: the original newlines were
# lost, which turned everything after the first '#' into dead comment text.
table_path = sys.argv[1]        # full path of the folder containing *.txt count data
                                # per participant
table_name = sys.argv[2]        # name of the table
participant_file = sys.argv[3]  # full path of the file listing zero-padded 3-digit
                                # participant id's
start_date = sys.argv[4]        # starting date to aggregate counts
end_date = sys.argv[5]          # end date to aggregate counts
result_path = sys.argv[6]       # full path of the folder to store counts aggregated
                                # across participants for the given table

# Spine of every calendar day in [start_date, end_date]; each participant's
# counts are left-joined onto it, so days with no record become NaN.
dates = pd.DataFrame(data=pd.date_range(start=start_date, end=end_date),
                     columns=['date'])
participants = utils.readList(participant_file)
for id in participants:  # NOTE(review): 'id' shadows the builtin; kept for compatibility
    table_file = '{0}/pid{1}/{2}.txt'.format(table_path, id, table_name)
    data = pd.read_csv(table_file,
                       header=0,
                       dtype={'date': str, 'record_num': int},
                       parse_dates=['date'],
                       sep='\t',
                       lineterminator='\n',
                       encoding="ISO-8859-1")
    dates = pd.merge(dates, data, on='date', how='left')
    # Rename the freshly joined count column to the participant id, so each
    # participant contributes one column to the wide result.
    dates.rename(columns={'record_num': id}, inplace=True)