def buildProjects(config):
    #### Read the user's settings.yml file, which will be used to get API tokens and URLs.
    api_settings = ApiSettings()

    ## Build a list of "projects" - dicts which store data and settings for each project.
    projects = config["projects"]

    ## Verify the settings for each project.
    for project in projects:
        code_name = project["code_name"]

        # Get the args to pass to exportRecords. If the key does not exist, or is not set
        # to a value, set it to an empty dict.
        if (not "exportRecords_args" in project) or (project["exportRecords_args"] is None):
            project["exportRecords_args"] = {}

        # If use_getIPSSIDs is True, get the list of record IDs to export.
        if project["options"]["use_getIPSSIDs"]:
            # If use_getIPSSIDs is True, but no options were provided, raise a warning.
            if (not "getIPSSIDs_args" in project) or (project["getIPSSIDs_args"] is None):
                print "Warning: in project '" + code_name + "', 'use_getIPSSIDs' is True, but 'getIPSSIDs_args' was not provided for the project. Exporting all record IDs from the project."
                record_id_list = None
            else:
                getIPSSIDs_args = project["getIPSSIDs_args"]
                record_id_list = getIPSSIDs(**getIPSSIDs_args)

            # If exportRecords_args has an entry for record_id_list, but use_getIPSSIDs is True, raise a warning.
            if ("record_id_list" in project["exportRecords_args"]):
                print "Warning: in project '" + code_name + "', the specified 'record_id_list' will be ignored, since 'use_getIPSSIDs' is True."

            # Overwrite the record_id_list argument in exportRecords_args.
            project["exportRecords_args"]["record_id_list"] = record_id_list

        exportRecords_args = project["exportRecords_args"]  # guaranteed to have a value (possibly {}) by this point.

        # Convert exportRecords_args arguments to strings as needed.
        convert_to_strings = ["fields", "forms", "events", "record_id_list"]
        for arg in convert_to_strings:
            if arg in exportRecords_args.keys():
                if (exportRecords_args[arg] is None) or (exportRecords_args[arg] == 'None'):  # these arguments could be lists or None
                    # Convert the string 'None' to Python None; leave None as-is (iterating over it below would fail).
                    exportRecords_args[arg] = None
                else:
                    # Convert the list to a list of strings; as read from the config, it might contain integers etc.
                    new_list = [str(val) for val in exportRecords_args[arg]]
                    exportRecords_args[arg] = new_list

        ## Get API credentials for the current project.
        api_url, api_key, code_name = api_settings.getApiCredentials(code_name=code_name)
        project["api_url"] = api_url
        project["api_key"] = api_key

        ## Export the requested data for the current project.
        data_csv = exportRecords(api_url, api_key, format="csv", **exportRecords_args)
        data_csv_file = StringIO(data_csv)
        data_df = pandas.read_csv(data_csv_file, dtype=unicode, encoding='utf-8').fillna('')
        project["chunks"] = [data_df]  # this list of dataframes will be broken into pieces, each piece containing data to be placed in a different tab.
        ## Retrieve project settings and add them to the dict for the current project.
        pycap_project = redcap.Project(api_url, api_key)
        def_field = pycap_project.def_field
        project_info = exportProjectInfo(api_url, api_key)
        longitudinal = bool(project_info["is_longitudinal"])
        repeating = bool(project_info["has_repeating_instruments_or_events"])
        events = getEvents(api_url, api_key, quiet=True)
        metadata_raw = pycap_project.export_metadata()
        form_event_mapping = exportFormEventMapping(pycap_project, longitudinal)
        repeating_forms_events = exportRepeatingFormsEvents(api_url, api_key, repeating)
        forms = exportFormsOrdered(api_url, api_key)
        form_repetition_map = createFormRepetitionMap(longitudinal, repeating, form_event_mapping, repeating_forms_events, forms)
        metadata = parseMetadata(pycap_project.def_field, project_info, longitudinal, repeating, events, metadata_raw, form_event_mapping, repeating_forms_events, forms, form_repetition_map, write_branching_logic_function=False)

        project["pycap_project"] = pycap_project
        project["def_field"] = def_field
        project["project_info"] = project_info
        project["longitudinal"] = longitudinal
        project["repeating"] = repeating
        project["events"] = events
        project["form_event_mapping"] = form_event_mapping
        project["repeating_forms_events"] = repeating_forms_events
        project["forms"] = forms
        project["form_repetition_map"] = form_repetition_map
        project["metadata"] = metadata

        # Create a dict which maps each form to the list of events containing that form.
        if longitudinal:
            form_to_events_dict = {}
            for form_event_entry in form_event_mapping:
                form = form_event_entry['form']
                event = form_event_entry['unique_event_name']
                if (not form in form_to_events_dict):
                    form_to_events_dict[form] = [event]
                else:
                    form_to_events_dict[form].append(event)
        else:
            form_to_events_dict = None
        project["form_to_events_dict"] = form_to_events_dict

        ## Build lists of variables which appear in the export data.
        # Columns which uniquely identify a row.
        primary_key = [def_field]
        if project["longitudinal"]:
            primary_key.append("redcap_event_name")
        if project["repeating"]:
            primary_key.append("redcap_repeat_instrument")
            primary_key.append("redcap_repeat_instance")
        project["primary_key"] = primary_key

        # Copy the primary key (a plain assignment would alias the list, and appending the
        # DAG column below would modify project["primary_key"] as well).
        primary_key_and_dag = list(primary_key)
        if ("redcap_data_access_group" in data_df.columns):
            primary_key_and_dag.append("redcap_data_access_group")
        project["primary_key_and_dag"] = primary_key_and_dag

        # form_complete fields
        form_complete_fields = [field for field in data_df.columns if ((field.endswith("_complete")) and (not field in metadata) and (not field in primary_key) and (not field == "redcap_data_access_group"))]
        project["form_complete_fields"] = form_complete_fields

        # data fields
        data_fields = [field for field in data_df.columns if ((not field in primary_key + form_complete_fields) and (not field == "redcap_data_access_group"))]
        project["data_fields"] = data_fields

    return projects
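# Usage sketch for buildProjects (a hypothetical example, not part of the original pipeline).
# The config structure mirrors the keys read above ("code_name", "options", "getIPSSIDs_args",
# "exportRecords_args"); the specific code name and argument values are assumptions.
def exampleBuildProjects():
    config = {"projects": [{"code_name": "ipss_v4",
                            "options": {"use_getIPSSIDs": True},
                            "getIPSSIDs_args": {"ex_registry_only": True},
                            "exportRecords_args": {"events": ["acute_arm_1"]}}]}
    projects = buildProjects(config)
    for project in projects:
        print project["code_name"], "-", len(project["chunks"][0].index), "rows exported"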
repeating_forms_events = exportRepeatingFormsEvents(api_url_ipss, api_key_ipss, project_repeating)
forms = exportFormsOrdered(api_url_ipss, api_key_ipss)
form_repetition_map = createFormRepetitionMap(project_longitudinal, project_repeating, form_event_mapping, repeating_forms_events, forms)
metadata = parseMetadata(def_field, project_info, project_longitudinal, project_repeating, events, metadata_raw, form_event_mapping, repeating_forms_events, forms, form_repetition_map)

# Export records for non-registry-only patients.
#records_arch = exportRecords(url_arch, key_arch, record_id_list=record_ids_arch, label=True)
records = exportRecords(api_url_ipss, api_key_ipss, record_id_list=record_ids, label_overwrite=False, label=True)
#records_post_2014 = exportRecords(api_url_ipss, api_key_ipss, record_id_list=record_ids_post_2014, label_overwrite=False, label=True)
records_post_2014 = [row for row in records if (row["ipssid"] in record_ids_post_2014)]
#records_non_sk = exportRecords(api_url_ipss, api_key_ipss, record_id_list=record_ids_non_sk, label_overwrite=False, label=True)
records_non_sk = [row for row in records if (row["ipssid"] in record_ids_non_sk)]
records_psom = exportRecords(api_url_psom, api_key_psom, record_id_list=record_ids_psom, label_overwrite=False, label=True)
def getPatientInfo(url_arch, url_ipss, key_arch, key_ipss):
    ## Get the list of record IDs for each project. Exclude registry-only patients and patients with unknown stroke type.
#    record_ids_arch = getRecordIDList(url_arch, key_arch)
#    registry_arch = exportRecords(url_arch, key_arch, record_id_list=record_ids_arch, fields=["registry"], events=["acute_arm_1"])
#    for row in registry_arch:
#        if (row["registry"] == "1"):
#            record_ids_arch.remove(row["pk_patient_id"])
#    record_ids_ipss = getRecordIDList(url_ipss, key_ipss)
#    registry_ipss = exportRecords(url_ipss, key_ipss, record_id_list=record_ids_ipss, fields=["substud"], events=["acute_arm_1"])
#    for row in registry_ipss:
#        if (row["substud___8"] == "1"):
#            record_ids_ipss.remove(row["ipssid"])
#    record_ids_ipss_only = [record_id for record_id in record_ids_ipss if not record_id in record_ids_arch]
#    for record_id in record_ids_arch:
#        if (not record_id in record_ids_ipss):
#            print "Record with ID", record_id, "in Archive, but not in IPSS"

    # Create one list of record IDs which are non-registry and have a known stroke type.
    record_ids = getIPSSIDs(inc_registry_only=False, inc_unknown_stroke_type=False)

    ## Create a dict with patient information: {record_id: {dag: "...", enroll_date: "...", ...}}
    patient_info = {}
    for record_id in record_ids:  # add an item (another dict) for each patient in the Archive
        patient_info[record_id] = {}
#        patient_info[record_id]["in_arch"] = True  # boolean describing presence of record in Archive
#        if (record_id in record_ids_ipss):
#            patient_info[record_id]["in_ipss"] = True  # boolean describing presence of record in IPSS
#        else:
#            patient_info[record_id]["in_ipss"] = False
#    for record_id in record_ids_ipss_only:  # add an item (another dict) for each patient in the IPSS that has not yet been added.
#        patient_info[record_id] = {}
#        patient_info[record_id]["in_arch"] = False
#        patient_info[record_id]["in_ipss"] = True

    ## Get the enrolment date for each record.
    # Archive - Use 'dateofentry', then 'visit_date'.
# print "Project : Archive" dateofentry_arch = exportRecords(url_arch, key_arch, record_id_list=record_ids, fields=["dateofentry"], events=["acute_arm_1"], validate=False) # if (len(record_ids_arch) != len(dateofentry_arch)): # look for record id missing from exported data # for record_id in record_ids_arch: # id_in_data = False # for row in dateofentry_arch: # if (row["pk_patient_id"] == record_id): # id_in_data = True # break # if (not id_in_data): # print "Record with ID "+str(record_id)+" not found in exported data" # num_missing = 0 for row in dateofentry_arch: if (row["dateofentry"] == ""): # num_missing += 1 pass else: if ("enroll_date" in patient_info[row["pk_patient_id"]]): print "This record was counted twice: " + str( row["pk_patient_id"]) continue patient_info[row["pk_patient_id"]]["enroll_date"] = int( row["dateofentry"][:4]) num_missing = len( [id for id in record_ids if (not "enroll_date" in patient_info[id])]) # print "Field used : dateofentry" # print "Number missing : ", num_missing record_ids_leftover = [ id for id in record_ids if (not "enroll_date" in patient_info[id]) ] visit_date_leftover = exportRecords(url_arch, key_arch, record_id_list=record_ids_leftover, fields=["visit_date"], events=["acute_arm_1"], validate=False) # num_missing = 0 for row in visit_date_leftover: if (row["visit_date"] == ""): # num_missing += 1 pass else: if ("enroll_date" in patient_info[row["pk_patient_id"]]): print "This record was counted twice: " + str( row["pk_patient_id"]) continue patient_info[row["pk_patient_id"]]["enroll_date"] = int( row["visit_date"][:4]) num_missing = len( [id for id in record_ids if (not "enroll_date" in patient_info[id])]) # print "Field used : visit_date" # print "Number missing : ", num_missing # IPSS - use 'dateentered' (works for all but 6 patients). 
# print # print "Project : IPSS" record_ids_leftover = [ id for id in record_ids if (not "enroll_date" in patient_info[id]) ] dateentered_ipss = exportRecords(url_ipss, key_ipss, record_id_list=record_ids_leftover, fields=["dateentered"], events=["acute_arm_1"], validate=False) # if (len(record_ids_ipss_only) != len(dateentered_ipss)): # look for record id missing from exported data # for record_id in record_ids_ipss_only: # id_in_data = False # for row in dateentered_ipss: # if (row["ipssid"] == record_id): # id_in_data = True # break # if (not id_in_data): # print "Record with ID "+str(record_id)+" not found in exported data" # num_missing = 0 for row in dateentered_ipss: if (row["dateentered"] == ""): # num_missing += 1 pass else: if ("enroll_date" in patient_info[row["ipssid"]]): print "This record was counted twice: " + str(row["ipssid"]) continue patient_info[row["ipssid"]]["enroll_date"] = int( row["dateentered"][:4]) num_missing = len( [id for id in record_ids if (not "enroll_date" in patient_info[id])]) # print "Field used : dateentered" # print "Number missing : ", num_missing enroll_dates = set() for id, info in patient_info.iteritems(): if ('enroll_date' in info): enroll_dates.add(info['enroll_date']) if (not info['enroll_date'] in range(2003, 2020)): print "Record enroll date outside [2003, 2019]:", id else: print "Record with no enrolment date:", id # print "enroll_dates:", sorted(list(enroll_dates)) ## Get DAG for each record: dags_arch = exportRecords(url_arch, key_arch, record_id_list=record_ids, fields=["pk_patient_id"], validate=False) dags_ipss = exportRecords(url_ipss, key_ipss, record_id_list=record_ids, fields=["ipssid"], validate=False) for row in dags_arch: record_id = row["pk_patient_id"] dag = row["redcap_data_access_group"] patient_info[record_id]["dag"] = dag for row in dags_ipss: record_id = row["ipssid"] dag = row["redcap_data_access_group"] if (not "dag" in patient_info[record_id]) or ( patient_info[record_id]["dag"] == ""): # add DAG from IPSS if not added already patient_info[record_id][ "dag"] = dag # overwriting DAG for records in Archive should not be a problem. # for id in patient_info: # if (not "dag" in patient_info[id]) or (patient_info[id]["dag"] == ""): # print "Record with ID", id, "does not have a DAG assigned" ## Get stroke type for each patient. # Need to decide how we want to break this down further. 
#    stroke_type_arch = exportRecords(url_arch, key_arch, record_id_list=record_ids_arch, fields=["ais", "csvt", "pperi", "preart", "other_stroke", "age_at_event"], events=["acute_arm_1"])
    stroke_type_ipss = exportRecords(url_ipss, key_ipss, record_id_list=record_ids, fields=["chais", "chcsvt", "neoais", "neocsvt", "ppis", "ppcsvt", "pvi", "preart", "othcond"], events=["acute_arm_1"])

    # Initialize all stroke types to "2" (unknown); values: 0 - no, 1 - yes, 2 - unknown.
    for record_id in patient_info:
        patient_info[record_id]["stroke_type"] = {}
        patient_info[record_id]["stroke_type"]["neo_ais"] = "2"
        patient_info[record_id]["stroke_type"]["neo_csvt"] = "2"
        patient_info[record_id]["stroke_type"]["child_ais"] = "2"
        patient_info[record_id]["stroke_type"]["child_csvt"] = "2"
        patient_info[record_id]["stroke_type"]["pp_ais"] = "2"
        patient_info[record_id]["stroke_type"]["pp_csvt"] = "2"
        patient_info[record_id]["stroke_type"]["pp_vi"] = "2"
        patient_info[record_id]["stroke_type"]["art"] = "2"
        patient_info[record_id]["stroke_type"]["other"] = "2"

    for row in stroke_type_ipss:
        record_id = row["ipssid"]
        # neonatal AIS
        patient_info[record_id]["stroke_type"]["neo_ais"] = row["neoais___1"]
        # neonatal CSVT
        patient_info[record_id]["stroke_type"]["neo_csvt"] = row["neocsvt___1"]
        # child AIS
        patient_info[record_id]["stroke_type"]["child_ais"] = row["chais___1"]
        # child CSVT
        patient_info[record_id]["stroke_type"]["child_csvt"] = row["chcsvt___1"]
        # presumed perinatal AIS
        patient_info[record_id]["stroke_type"]["pp_ais"] = row["ppis___1"]
        # presumed perinatal CSVT
        patient_info[record_id]["stroke_type"]["pp_csvt"] = row["ppcsvt___1"]
        # presumed perinatal VI
        patient_info[record_id]["stroke_type"]["pp_vi"] = row["pvi___1"]
        # arteriopathy
        patient_info[record_id]["stroke_type"]["art"] = row["preart___1"]
        # other
        patient_info[record_id]["stroke_type"]["other"] = row["othcond___1"]

    # Look for patients without an identified stroke type.
    record_ids_with_unidentified_stroke_type = []
    for id, record in patient_info.iteritems():
        identified_type = False
        for stroke_type, value in record["stroke_type"].iteritems():
            if (value == "1"):
                identified_type = True
                break
        if (not identified_type):
#            print "Record with ID", id, "has an unidentified stroke type."
            record_ids_with_unidentified_stroke_type.append(id)

    # Check whether the stroke type can be identified in the Archive instead.
#    stroke_type_arch_leftover = exportRecords(url_arch, key_arch, record_id_list=record_ids_with_unidentified_stroke_type, fields=["ais", "csvt", "pperi", "preart", "other_stroke", "age_at_event"], events=["acute_arm_1"])
#    for row in stroke_type_arch_leftover:
#        print row["pk_patient_id"], row["ais"], row["csvt"], row["pperi"], row["preart"], row["other_stroke"]#, row["age_at_event"]
#        stroke_type_found = False
#        if (row["ais"] == "1") and (row["age_at_event"] == "0"):
#            patient_info[row["pk_patient_id"]]["stroke_type"]["neo_ais"] = "1"
#            stroke_type_found = True
#        if (row["csvt"] == "1") and (row["age_at_event"] == "0"):
#            patient_info[row["pk_patient_id"]]["stroke_type"]["neo_csvt"] = "1"
#            stroke_type_found = True
#        if (row["ais"] == "1") and (row["age_at_event"] == "1"):
#            patient_info[row["pk_patient_id"]]["stroke_type"]["child_ais"] = "1"
#            stroke_type_found = True
#        if (row["csvt"] == "1") and (row["age_at_event"] == "1"):
#            patient_info[row["pk_patient_id"]]["stroke_type"]["child_csvt"] = "1"
#            stroke_type_found = True
#        if (row["preart"] == "1"):
#            patient_info[row["pk_patient_id"]]["stroke_type"]["art"] = "1"
#            stroke_type_found = True
#        if (row["other_stroke"] == "1"):
#            patient_info[row["pk_patient_id"]]["stroke_type"]["other"] = "1"
#            stroke_type_found = True
#        if stroke_type_found:
#            record_ids_with_unidentified_stroke_type.remove(row["pk_patient_id"])

    # Print some stats on the acquired patient information.
    num_no_year = 0
    num_no_dag = 0
    for record_id, record in patient_info.iteritems():
        if (record["dag"] == ""):
            num_no_dag += 1
        if (not "enroll_date" in record):
            num_no_year += 1
    print "Number of duplicated record IDs:", len(record_ids) - len(set(record_ids))
    print "Number of unique record IDs:", len(set(record_ids))
    print "Number of record IDs in patient_info:", len(patient_info)
    print "Number of records with no DAG:", num_no_dag
    print "Number of records with no enrolment date:", num_no_year
    print "Number of records with unidentified stroke type:", len(record_ids_with_unidentified_stroke_type)

    return patient_info
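# Usage sketch (hypothetical): pull the API credentials from the user's settings file via
# ApiSettings, then collect the patient info. The code names "ipss_arch" and "ipss_v4" are
# the ones used elsewhere in these scripts; treat them as assumptions about the local setup.
def exampleGetPatientInfo():
    api_settings = ApiSettings()
    url_arch, key_arch, _ = api_settings.getApiCredentials(code_name="ipss_arch")
    url_ipss, key_ipss, _ = api_settings.getApiCredentials(code_name="ipss_v4")
    patient_info = getPatientInfo(url_arch, url_ipss, key_arch, key_ipss)
    print "Collected info for", len(patient_info), "records"
    return patient_info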
def reportPatientInfo(patient_info, out_dir, path_dag_info, url_ipss, key_ipss):
    ## Miscellaneous items used in all of the enrolment reports.
    min_year = 2003
    max_year = 2020  #2019
    year_list = range(min_year, max_year + 1)
    records_ipss = exportRecords(url_ipss, key_ipss, fields=["ipssid"])
    dags = getDAGs(records_ipss)[1]

    # Put "Unassigned" (the empty DAG name, which sorts first) at the end of the list.
    dags_old = dags
    dags = sorted(dags_old)[1:]
    dags.extend(sorted(dags_old)[:1])

    # Check that all records belong to one of the DAGs in the list just created.
    for record_id, record in patient_info.iteritems():
        if (not record["dag"] in dags):
            print "Record with ID", record_id, "in DAG", record["dag"], "is part of unidentified DAG."

    ## Enrolment by site per year
    report_path = os.path.join(out_dir, "enrolment_dag.csv")
    # Row/column headings.
    columns = year_list
    index = [dag if (dag != "") else "Unassigned" for dag in dags]
    # Create a pandas DataFrame to store the report.
    report_df = pandas.DataFrame(columns=columns, index=index)
    # Add a row for each DAG.
    for dag in dags:
        if (dag != ""):
            dag_name = dag
        else:
            dag_name = "Unassigned"
        for year in year_list:
            num_enrolled_dag_year = 0
            for record_id, record in patient_info.iteritems():
                if ("enroll_date" in record) and (type(record["enroll_date"]) != type(year)):
                    print "WARNING: comparison of different types in 'enroll_date'."
                if (record["dag"] == dag) and ("enroll_date" in record) and (record["enroll_date"] == year):
                    num_enrolled_dag_year += 1
            report_df[year][dag_name] = num_enrolled_dag_year

    # Add a column/row to store the column/row totals.
    report_df["Total"] = report_df.sum(axis=1).astype(int)  # Total column
    report_df = report_df.append(report_df.sum(axis=0).astype(int).rename("Total"))  # Total row

    # Add institution name and country columns to the dataframe.
    report_df = addDAGInfo(report_df, path_dag_info)
    report_df.to_csv(report_path)
    print report_df

    ## Enrolment by stroke type per year
    report_path = os.path.join(out_dir, "enrolment_stroke_type.csv")
    # Row/column headings.
    columns = year_list
    index = ["Neonatal AIS", "Neonatal CSVT", "Neonatal AIS & CSVT",
             "Childhood AIS", "Childhood CSVT", "Childhood AIS & CSVT",
             "Presumed perinatal AIS", "Presumed perinatal CSVT", "Presumed perinatal AIS & CSVT",
             "Presumed perinatal VI", "Arteriopathy", "Other"]
    report_df = pandas.DataFrame(0, columns=columns, index=index)
    # Add each patient with a known stroke type to the report.
    for id, record in patient_info.iteritems():
        if ("enroll_date" in record) and (record["enroll_date"] in columns):  # if the enrolment date is known and included in the report
year = record["enroll_date"] if (record["stroke_type"]["neo_ais"] == "1") and (record["stroke_type"]["neo_csvt"] == "1"): report_df[year]["Neonatal AIS & CSVT"] += 1 elif (record["stroke_type"]["neo_ais"] == "1"): report_df[year]["Neonatal AIS"] += 1 elif (record["stroke_type"]["neo_csvt"] == "1"): report_df[year]["Neonatal CSVT"] += 1 elif (record["stroke_type"]["child_ais"] == "1") and (record["stroke_type"]["child_csvt"] == "1"): report_df[year]["Childhood AIS & CSVT"] += 1 elif (record["stroke_type"]["child_ais"] == "1"): report_df[year]["Childhood AIS"] += 1 elif (record["stroke_type"]["child_csvt"] == "1"): report_df[year]["Childhood CSVT"] += 1 elif (record["stroke_type"]["pp_ais"] == "1") and (record["stroke_type"]["pp_csvt"] == "1"): report_df[year]["Presumed perinatal AIS & CSVT"] += 1 elif (record["stroke_type"]["pp_ais"] == "1"): report_df[year]["Presumed perinatal AIS"] += 1 elif (record["stroke_type"]["pp_csvt"] == "1"): report_df[year]["Presumed perinatal CSVT"] += 1 elif (record["stroke_type"]["pp_vi"] == "1"): report_df[year]["Presumed perinatal VI"] += 1 elif (record["stroke_type"]["art"] == "1"): report_df[year]["Arteriopathy"] += 1 elif (record["stroke_type"]["other"] == "1"): report_df[year]["Other"] += 1 report_df["Total"] = report_df.sum(axis=1).astype(int) # Total column report_df = report_df.append( report_df.sum(axis=0).astype(int).rename("Total")) # Total row report_df.to_csv(report_path) print report_df return
def transferPSOMToIPSS(url_psom, key_psom, url_ipss, key_ipss, import_non_ipss_ids=False, out_path=None, manual_import=False):
    """
    Transfer Summary of Impressions data from PSOM V2 to IPSS V4.
    Parameters:
        url_psom: str, API URL for PSOM V2
        key_psom: str, API key for PSOM V2
        url_ipss: str, API URL for IPSS V4
        key_ipss: str, API key for IPSS V4
        import_non_ipss_ids: bool, whether to import IDs that do not exist in IPSS V4
        out_path: str, path to save the data to be imported to IPSS
        manual_import: bool, do not import data to IPSS; the data saved to CSV can be manually imported by the user
    Returns:
        None
    """
    ## Define a function which modifies the PSOM data prior to import to IPSS.
    def modifyRecords(from_psom, url_ipss, key_ipss, import_non_ipss_ids=False):
        """
        Take data exported from PSOM V2, and modify it so that it can be imported to IPSS V4.
        Parameters:
            from_psom: list of dicts; all records in the PSOM V2 database.
        Returns:
            to_ipss: list of dicts; all PSOM data to be imported into IPSS V4, after changing variable names, event names etc.
        """
        # In PSOM V2:
        # - The Summary of Impressions assessment is included only in the 'summary_of_impressions' instrument, which is part of the following events:
        #   - 'acute_hospitalizat_arm_1' (repeating event) - all data collected during the initial hospitalization
        #   - 'initial_psom_arm_1' (non-repeating) - data collected during the first "initial PSOM" which occurred outside of the initial hospitalization
        #   - 'follow_up_psom_arm_1' (repeating event) - all subsequently collected data.
        # In IPSS V4:
        # - The Summary of Impressions assessment is found in the instrument:
        #   - 'summary_of_impressions' (non-repeating in event 'acute_arm_1'; repeating instrument in event 'followup_arm_1')
        #
        # Link PSOM to IPSS as follows:
        # - Map the instance of the PSOM V2 'summary_of_impressions' form which has the latest 'fuionset_soi' date in the 'acute_hospitalizat_arm_1' event to the IPSS V4 'summary_of_impressions' form in event 'acute_arm_1'.
        #   - SOIs in the 'acute_hospitalizat_arm_1' event with earlier 'fuionset_soi' dates will not be mapped to IPSS V4.
        # - Map the PSOM V2 'summary_of_impressions' forms in the 'initial_psom_arm_1' and 'follow_up_psom_arm_1' events to the IPSS V4 'summary_of_impressions' form in event 'followup_arm_1'.
        #   - Order the instances in IPSS V4 according to ascending 'fuionset_soi' dates in PSOM V2.
        # - All PSOM V2 Summary of Impressions rows (from any event) which have a blank 'fuionset_soi' date will be excluded from IPSS V4.

        #### Perform record-specific modifications to a few records. Ideally, there will be nothing in this section.

        #### Remove data from PSOM which will not be imported into IPSS V4.
        ## Remove all rows (any event) which do not have a PSOM assessment date.
        from_psom_after_exclusions = []
        for row in from_psom:
            soi_date = row['fuionset_soi']
            if (soi_date.strip() != ''):  # if the Summary of Impressions date is not blank
                from_psom_after_exclusions.append(row)
        from_psom = from_psom_after_exclusions

        ## Remove all rows for records which do not exist in IPSS V4.
        if (not import_non_ipss_ids):
            from_psom_after_exclusions = []
            excluded_ids = set()  # set of IPSSIDs which exist in PSOM, but not in IPSS, and will not be imported.
            record_ids_ipss = getRecordIDList(url_ipss, key_ipss)  # list of all records in IPSS.
            for row in from_psom:
                id = row['ipssid']
                if (id in record_ids_ipss):
                    from_psom_after_exclusions.append(row)
                elif (not id in excluded_ids):  # if this excluded ID has not already been identified and a warning printed
#                    warnings.warn("IPSSID not found in IPSS, not importing this patient's data: " + id)
                    excluded_ids.add(id)
            from_psom = from_psom_after_exclusions

        #### Create dictionaries which map IPSSIDs to row numbers in PSOM for (a) the 'acute_hospitalizat_arm_1' event; and (b) the 'initial_psom_arm_1' and 'follow_up_psom_arm_1' events. These dictionaries determine which rows in PSOM V2 will be mapped to which (instances of which) events in IPSS V4.

        ## Create the dictionary for the 'acute_hospitalizat_arm_1' event. Take only the acute hospitalization row with the latest non-blank 'fuionset_soi' date.
        acute_dict = {}
        for row_index in range(len(from_psom)):
            row = from_psom[row_index]
            id = row['ipssid']
            if (row['redcap_event_name'] == 'acute_hospitalizat_arm_1'):  # if the row corresponds to the 'acute_hospitalizat_arm_1' event
                psom_instance = row['redcap_repeat_instance']
                psom_date = int(row['fuionset_soi'].replace('-', ''))  # Convert the string '2003-04-05' to the integer 20030405 for comparison later.
                if (not id in acute_dict.keys()):  # if there is not yet an entry for this ID in acute_dict
                    acute_dict[id] = (row_index, psom_instance, psom_date)
                elif (psom_date > acute_dict[id][2]):  # if the ID is already in acute_dict and the row corresponds to a more recent acute hospitalization instance
                    acute_dict[id] = (row_index, psom_instance, psom_date)

        ## Create the dictionary for the 'initial_psom_arm_1' and 'follow_up_psom_arm_1' events.
        followup_dict = {}
        for row_index in range(len(from_psom)):
            row = from_psom[row_index]
            id = row['ipssid']
            psom_date = int(row['fuionset_soi'].replace('-', ''))
            if (row['redcap_event_name'] == 'initial_psom_arm_1'):
                followup_dict[id] = [(row_index, 0, psom_date)]  # Assign a fake instance number of 0 to the initial_psom_arm_1 event (this was used in the old method of ordering based on instance number; rows are now ordered based on 'fuionset_soi').
        for row_index in range(len(from_psom)):
            row = from_psom[row_index]
            id = row['ipssid']
            psom_instance = row['redcap_repeat_instance']
            psom_date = int(row['fuionset_soi'].replace('-', ''))
            if (row['redcap_event_name'] == 'follow_up_psom_arm_1'):
                if (not id in followup_dict.keys()):
                    followup_dict[id] = [(row_index, psom_instance, psom_date)]
                else:
                    followup_dict[id].append((row_index, psom_instance, psom_date))

        ## Reorder the lists of (row index, PSOM instance, PSOM date) tuples in order of ascending 'fuionset_soi' date, so that the correct order will be retained in IPSS.
        for id, row_tuple_list_psom in followup_dict.iteritems():
            row_tuple_list_psom.sort(key=lambda list_element: list_element[2])  # Sort the list of tuples using the 'fuionset_soi' values.

        ## Check that the follow-up rows are arranged in order of ascending 'fuionset_soi'. Raise an AssertionError if this is not true. This section has no effect on the data; it just checks for errors.
        for id, row_tuple_list_psom in followup_dict.iteritems():
            last_date = 0  # fake date to compare the first date to.
            for row_tuple_psom in row_tuple_list_psom:
                current_date = row_tuple_psom[2]
                assert (current_date >= last_date)
                last_date = current_date

        #### Create functions and dictionaries for the field mappings.
        ## Create a dictionary for Summary of Impressions (PSOM) -> 'summary_of_impressions' (IPSS V4). The dictionary is of the form {field_name_in_PSOM: field_name_in_IPSS}. It only includes fields which are directly mapped to a corresponding IPSS V4 field; fields which are modified prior to transfer are dealt with separately.
        psom_to_ipss_soi = {
            'fuionset_soi': 'psomdate',
            'fpsomr': 'psomr',
            'fpsoml': 'psoml',
            'fpsomlae': 'psomlae',
            'fpsomlar': 'psomlar',
            'fpsomcb': 'psomcb',
            'psomsen___1': 'psomsens___3',
            'psomsen___2': 'psomsens___4',
            'psomsen___3': 'psomsens___5',
            'psomsen___4': 'psomsens___6',
            'psomsen___5': 'psomsens___7',
            'psomsen___6': 'psomsens___8',
            'psomsen___7': 'psomsens___9',
            'psomsen___8': 'psomsens___10',
            'psomsen___9': 'psomsens___11',
            'psomsen___10': 'psomsens___12',
            'psomsen___11': 'psomsens___13',
            'psomsen___12': 'psomsens___14',
            'othsens': 'senssp',
            'fpsomco___1': 'psomcog___1',
            'fpsomco___2': 'psomcog___2',
            'totpsom': 'psomscr',
            'summary_of_impressions_complete': 'summary_of_impressions_complete'
        }

        ## Create functions which perform the many-to-one mappings.
        def combineComments(row_psom):
            """
            Combine multiple comment fields in PSOM into a single string to be imported into a single IPSS field.
            Parameters:
                row_psom: dict, a single row of PSOM data.
            Returns:
                combined_comments: str, value to be imported into a single IPSS field.
            """
            ## Initialize the combined comments field.
            combined_comments = ''

            ## Add text from the PSOM 'lang_pro_det' (text) field if it is nonempty.
            if (row_psom['lang_pro_det'].strip() != ''):
                combined_comments += 'Language production deficits: ' + row_psom['lang_pro_det'] + '. '

            ## Add text from the PSOM 'lang_comp_det' (text) field if it is nonempty.
            if (row_psom['lang_comp_det'].strip() != ''):
                combined_comments += 'Language comprehension deficits: ' + row_psom['lang_comp_det'] + '. '

            ## Add text from the PSOM 'cog_beh_det' (checkbox) field if any options are checked.
            # Create a dictionary mapping checkbox number to checkbox option text.
            cog_beh_det_dict = {
                1: 'Remembering what he/she learned',
                2: 'Staying focused',
                3: 'Sad or low moods',
                4: 'Excessive worries',
                5: 'Getting along with other children',
                6: 'Other'
            }
            # Keep note of whether any of the checkboxes are checked for this row.
            any_checked = False
            # Loop over the checkbox option numbers.
            for box_number in range(1, 6 + 1):
                box_var = 'cog_beh_det___' + str(box_number)  # field name for the current checkbox option
                if (row_psom[box_var] == '1'):  # if the checkbox is checked
                    if (not any_checked):  # if this is the first checked option found for the current row
                        combined_comments += 'Cognitive/behavioural deficits: '
                        any_checked = True
                    combined_comments += cog_beh_det_dict[box_number] + ', '
            # Replace the trailing comma with a trailing period.
            if (combined_comments[-2:] == ', '):
                combined_comments = combined_comments[:-2] + '. '

            ## Add text from the PSOM 'cbcomm' (text) field if it is nonempty.
            if (row_psom['cbcomm'].strip() != ''):
                combined_comments += 'Other cognitive/behavioural deficits: ' + row_psom['cbcomm'] + '. '

            ## Add text from the PSOM 'stroke_cause_y_n' (yesno) field if it is nonempty.
            if (row_psom['stroke_cause_y_n'] != ''):
                combined_comments += 'Are all neurologic deficits attributable to stroke?: '
                if (row_psom['stroke_cause_y_n'] == '1'):
                    combined_comments += 'Yes. '
                else:
                    combined_comments += 'No. '

            ## Add text from the PSOM 'cause_det' (notes) field if it is nonempty.
            if (row_psom['cause_det'].strip() != ''):
                combined_comments += 'Specify which deficits are not attributable to stroke, and state responsible diagnosis: ' + row_psom['cause_det']

            ## Strip any trailing separator (comma or period).
            if (combined_comments[-2:] in [', ', '. ']):
                combined_comments = combined_comments[:-2]

            return combined_comments

        #### Build the import data for IPSS V4 using the IPSSID-to-row-number mappings.
        ## Initialize the data to be imported into IPSS V4.
        to_ipss = []

        ## Map data to the IPSS 'acute_arm_1' event.
        for id, row_tuple_psom in acute_dict.iteritems():  # Loop over IPSSIDs which have at least one 'acute_hospitalizat_arm_1' event in PSOM V2.
            row_index_psom = row_tuple_psom[0]
            row_psom = from_psom[row_index_psom]  # PSOM row to be imported into IPSS.
            assert (row_psom['redcap_event_name'] == 'acute_hospitalizat_arm_1')  # Check that the PSOM row corresponds to the appropriate PSOM event.

            # Initialize the row to be imported into IPSS.
            row_ipss = {'ipssid': id, 'redcap_event_name': 'acute_arm_1', 'redcap_repeat_instrument': '', 'redcap_repeat_instance': ''}

            # Add the variables with a one-to-one mapping.
            for field_name_psom, field_name_ipss in psom_to_ipss_soi.iteritems():
                value = row_psom[field_name_psom]
                row_ipss[field_name_ipss] = value

            # Add the variables with a many-to-one mapping.
            sdcom = combineComments(row_psom)
            row_ipss['sdcom'] = sdcom

            # Append the row to the IPSS data.
            to_ipss.append(row_ipss)

        ## Map data to the IPSS 'followup_arm_1' event. Note that the follow-up rows have already been ordered based on 'fuionset_soi' at this point.
        for id, row_tuple_list_psom in followup_dict.iteritems():
            instance_ipss = 1  # instance number for the current row in IPSS
            for row_tuple_psom in row_tuple_list_psom:
                row_index_psom = row_tuple_psom[0]
                row_psom = from_psom[row_index_psom]
                assert (row_psom['redcap_event_name'] != 'acute_hospitalizat_arm_1')  # Check that the PSOM row corresponds to the appropriate PSOM events.

                # Initialize the row to be imported into IPSS.
                row_ipss = {'ipssid': id, 'redcap_event_name': 'followup_arm_1', 'redcap_repeat_instrument': 'summary_of_impressions', 'redcap_repeat_instance': str(instance_ipss)}

                # Add the variables with a one-to-one mapping.
                for field_name_psom, field_name_ipss in psom_to_ipss_soi.iteritems():
                    value = row_psom[field_name_psom]
                    row_ipss[field_name_ipss] = value

                # Add the variables with a many-to-one mapping.
                sdcom = combineComments(row_psom)
                row_ipss['sdcom'] = sdcom

                ## Append the row to the IPSS data.
                to_ipss.append(row_ipss)

                ## Increment the IPSS instance number.
                instance_ipss += 1

        return to_ipss

    ## Export the Summary of Impressions data from PSOM.
    from_psom = exportRecords(url_psom, key_psom, fields=None, forms=None, quiet=True, export_form_completion=True)

    ## Map the PSOM data to IPSS fields.
    to_ipss = modifyRecords(from_psom, url_ipss, key_ipss, import_non_ipss_ids=import_non_ipss_ids)

    ## Save the data to be imported to a CSV file.
    if out_path:
        saveToCsv(to_ipss, out_path)

    ## Import the data to IPSS.
    if manual_import:
        print "Skipping automatic import of data. Data to be imported into IPSS V4 was saved to '" + out_path + "'. This file should be imported with the setting \"Allow blank values to overwrite existing saved values?\" set to \"Yes\"."
    else:
        importRecords(url_ipss, key_ipss, to_ipss, overwrite='overwrite', quick=True, return_content='count')
    return
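# Usage sketch (hypothetical): do a dry run that writes the mapped PSOM data to CSV for
# manual review instead of importing it automatically. Credentials come from the user's
# settings file via ApiSettings; "psom_v2" and "ipss_v4" are the code names used elsewhere
# in these scripts, and the output file name is an assumption.
def exampleTransferPSOMToIPSS():
    api_settings = ApiSettings()
    url_psom, key_psom, _ = api_settings.getApiCredentials(code_name="psom_v2")
    url_ipss, key_ipss, _ = api_settings.getApiCredentials(code_name="ipss_v4")
    transferPSOMToIPSS(url_psom, key_psom, url_ipss, key_ipss, out_path="psom_to_ipss.csv", manual_import=True)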
    forms_list[project_index] = exportFormsOrdered(api_url_list[project_index], api_key_list[project_index])

    # Generate a dictionary with form names as keys; each entry is a dict specifying in which
    # events the form is non-repeating, independently repeating, or dependently repeating.
    form_repetition_map_list[project_index] = createFormRepetitionMap(project_longitudinal_list[project_index], project_repeating_list[project_index], form_event_mapping_list[project_index], repeating_forms_events_list[project_index], forms_list[project_index])

    # Gather data about each variable.
    metadata_list[project_index] = parseMetadata(def_field_list[project_index], project_info_list[project_index], project_longitudinal_list[project_index], project_repeating_list[project_index], events_list[project_index], metadata_list[project_index], form_event_mapping_list[project_index], repeating_forms_events_list[project_index], forms_list[project_index], form_repetition_map_list[project_index])

    # Load all records.
#    if (project_index == 0):  # USED BEFORE REVERSING ORDER OF PROJECT DATA RETRIEVAL
    if (project_index == 1):  # USED AFTER REVERSING ORDER OF PROJECT DATA RETRIEVAL
        records_list[project_index] = exportRecords(api_url_list[project_index], api_key_list[project_index])
    else:
#        # Only pull record IDs from the second project that exist in the first project.
#        records_list[project_index] = exportRecords(api_url_list[project_index], api_key_list[project_index], record_id_list=[record_id for record_id in record_id_map_list[0]])  # USED BEFORE REVERSING ORDER OF PROJECT DATA RETRIEVAL
        # Only pull record IDs from the first project that exist in the second project.
        records_list[project_index] = exportRecords(api_url_list[project_index], api_key_list[project_index], record_id_list=[record_id for record_id in record_id_map_list[1]])  # USED AFTER REVERSING ORDER OF PROJECT DATA RETRIEVAL

    # Check for high-level issues in project settings, metadata, records.
    project_compatible_list[project_index] = isProjectCompatible(metadata_list[project_index], records_list[project_index], def_field_list[project_index])
    if (not project_compatible_list[project_index]):
        sys.exit()

    # Generate a non-redundant list of record IDs.
    record_id_map_list[project_index] = createRecordIDMap(def_field_list[project_index], records_list[project_index])
def mainIntraProject(config_path):
    config = readConfig(config_path)
    print "Performing checks with configuration:"
    pprint(config)
    print

    #### Read the user's settings.yml file, which will be used to get API tokens and URLs.
    api_settings = ApiSettings()  # Create an instance of the ApiSettings class. Use this to find the file containing API keys and URLs.

    # Determine the API URL and API token based on the user's input and the api_keys.yml file.
    code_name = config["code_name"]
    api_url, api_key, code_name = api_settings.getApiCredentials(code_name=code_name)

    # Create the output directory if it does not exist.
    out_dir = config["out_dir"]
    if (not os.path.isdir(out_dir)):
        os.mkdir(out_dir)
        print "Created directory:", out_dir

    # Check that a module exists for each requested check (the Check objects are defined in Check.py).
    check_name_list = config["checks"]
    check_paths_exist = True
    for check_name in check_name_list:
        scriptdir = os.path.dirname(os.path.realpath(__file__))
        check_path = os.path.join(scriptdir, check_name + ".py")
        if not os.path.exists(check_path):
            raise Exception("Path does not exist:", check_path)

    # Load the REDCap project (a PyCap object).
    project = redcap.Project(api_url, api_key)

    # Get the field name of the unique identifying field (e.g. "ipssid").
    def_field = project.def_field

    # Load high-level project information.
    project_info = exportProjectInfo(api_url, api_key)
    project_longitudinal = bool(project_info["is_longitudinal"])
    project_repeating = bool(project_info["has_repeating_instruments_or_events"])

    # Load the list of events.
    events = getEvents(api_url, api_key)  #project, project_info, project_longitudinal)
    if (not events == None):
        print "Review the event_ids below. These are required for generating links to problematic data in reports. If these are incorrect, or unset, you can set them in the event_ids.yml file specified in your settings.yml file. You can find the event_id associated with an event by accessing data from that event online, and looking at the value of 'event_id' in the address bar."
        for event in events:
            event_id = events[event]["event_id"]
            if (not event_id == None):
                print Color.green + event + " " + event_id + Color.end
            else:
                print Color.red + event + " " + 'None' + Color.end
        print

    # Load the raw data dictionary.
    metadata_raw = project.export_metadata()

    # Load the instrument-event mapping.
    form_event_mapping = exportFormEventMapping(project, project_longitudinal)

    # Load information specifying which forms are repeating.
    repeating_forms_events = exportRepeatingFormsEvents(api_url, api_key, project_repeating)

    # Generate the list of forms - a list of dicts with two keys: 'instrument_label' and 'instrument_name'.
    forms = exportFormsOrdered(api_url, api_key)

    # Generate a dictionary with form names as keys; each entry is a dict specifying in which
    # events the form is non-repeating, independently repeating, or dependently repeating.
    form_repetition_map = createFormRepetitionMap(project_longitudinal, project_repeating, form_event_mapping, repeating_forms_events, forms)

    # Gather data about each variable.
    metadata = parseMetadata(def_field, project_info, project_longitudinal, project_repeating, events, metadata_raw, form_event_mapping, repeating_forms_events, forms, form_repetition_map)

    ## Load all records.
if config["use_getIPSSIDs"]: getIPSSIDs_args = config["getIPSSIDs_args"] record_id_list = getIPSSIDs(**getIPSSIDs_args) elif config["use_custom_record_id_list"]: record_id_list = config["record_id_list"] else: record_id_list = None records = exportRecords(api_url, api_key, record_id_list) # Check for high-level issues in project settings, metadata, records. # 2020-05-11 - This script appears to check for bugged output of exportRecords.py, which has now been handled in exportRecords.py. # project_compatible = isProjectCompatible(metadata, records, def_field) # if (not project_compatible): # raise Exception("Error found in records or metadata. Review output above.") # Generate a dictionary with record IDs as keys and a list of row numbers corresponding to that record as values. record_id_map = createRecordIDMap(def_field, records) # Generate a list of data access groups if they exist. dags_used, dags = getDAGs(records) # Generate a dictionary containing information about each dag (e.g. number of records they contain). dag_record_map = createDAGRecordMap(def_field, records, record_id_map, dags_used, dags) # Generate list of checks to perform (default & user-defined). checklist = createChecklist(check_name_list) # Perform checks on data and report issues. check_results = checkDriver(checklist, out_dir, def_field, forms, project_info, project_longitudinal, project_repeating, events, metadata, form_event_mapping, repeating_forms_events, form_repetition_map, records, record_id_map, dags_used, dags, dag_record_map) # # Save data exported from REDCap and generated in this script. The check results are saved by checkDriver() above. # saveData(out_dir, project, forms, project_info, metadata, record_id_map, dags_used, dags, check_results) return
def getIPSSIDs(from_code_name="ipss_v4", ex_registry_only=False, ex_unknown_stroke_type=False, ex_pre_2003=False, ex_pre_2014=False, ex_post_20191001=False, ex_sk_patients=False, ex_neonatal_stroke=False, ex_placeholders=False, ex_adult_stroke=False, ex_melas=False, ex_non_ipss=False, ex_non_sips=False, ex_non_sips2=False, ex_non_sips2_cohort1=False, ex_non_sips2_cohort2=False, ex_sips_exclusions=False, ex_sips_exclusions_2=False, ex_patient_info_incomp=False, ex_core_incomplete=False, ex_non_vips_enrolled=False, ex_vips_screen_nonenroll=False):
    '''
    Parameters:
        from_code_name: str - code_name of the database to get IDs from. Allowed values are the code names defined in the user's api_keys.yml file.
        ex_registry_only: bool - whether to exclude IDs of SickKids registry-only patients based on IPSS V4 data
        ex_unknown_stroke_type: bool - whether to exclude IDs of records with unknown stroke type based on IPSS V4 data
        ex_pre_2003: bool - whether to exclude IDs of patients enrolled before 2003 based on the IPSS V4 field 'dateentered'
        ex_pre_2014: bool - whether to exclude IDs of patients enrolled before 2014 based on the IPSS V4 field 'originalipss'
        ex_post_20191001: bool - whether to exclude IDs added on or after 2019-10-01 based on the IPSS V4 field 'dateentered'
        ex_sk_patients: bool - whether to exclude IDs of patients with the 'hsc' data access group in IPSS V4
        ex_neonatal_stroke: bool - whether to exclude IDs of patients who suffered a neonatal stroke based on IPSS V4 data
        ex_placeholders: bool - whether to exclude IDs of patients with almost no real data, whose records likely exist as placeholders, based on IPSS V4 data
        ex_adult_stroke: bool - whether to exclude IDs of patients who did not suffer a stroke as a child based on IPSS V4 data
        ex_melas: bool - whether to exclude IDs of patients with MELAS based on IPSS V4 data
        ex_non_ipss: bool - whether to exclude IDs that do not exist in IPSS V4
        ex_non_sips: bool - whether to exclude IDs that are not in SIPS I or SIPS II based on IPSS V4 data
        ex_non_sips2: bool - whether to exclude IDs that are not in SIPS II based on IPSS V4 data
        ex_non_sips2_cohort1: bool - whether to exclude IDs that are not in SIPS II cohort I based on IPSS V4 data
        ex_non_sips2_cohort2: bool - whether to exclude IDs that are not in SIPS II cohort II based on IPSS V4 data
        ex_sips_exclusions: bool - whether to exclude SIPS patients that were excluded or merely screened, based on the SIPS II V2 field 'screened'
        ex_sips_exclusions_2: bool - whether to exclude SIPS patients that were excluded, based on the SIPS II V2 fields 'screened' and 'actcohortsp'
        ex_patient_info_incomp: bool - whether to exclude IDs of patients for whom patient_information_complete != 2 in IPSS V4, *but still include all SLCH patients*
        ex_core_incomplete: bool - whether to exclude IDs of patients for whom any of the IPSS V4 'core' forms are not marked as complete (core forms, here, include 'patient_information', 'cardiac_arteriopathy_risk_factors', 'other_child_and_neonatal_risk_factors', and 'clinical_presentation')
        ex_non_vips_enrolled: bool - whether to exclude patients who are not enrolled in VIPS, based on the condition on the VIPS II field vscreen_sfoutc == 4
        ex_vips_screen_nonenroll: bool - whether to exclude patients who are "VIPS screened, not enrolled" based on the IPSS V4 field 'vips_screened'
    Returns:
        record_ids: list - record IDs in the specified database after the specified exclusions
    '''
    ## Get the list of exclusions requested (i.e. get a list of all 'ex_' args set to True).
    exclusion_args = []
    for key, val in locals().iteritems():
        if (val is True) and (key[:3] == 'ex_'):
            exclusion_args.append(key)

    ## Get the API tokens and URLs for all projects used in the filters.
    api_settings = ApiSettings()  # Create an instance of the ApiSettings class. Use this to find the file containing API keys and URLs.
    token_dict = {}  # keys are code names, values are a tuple with the (url, token) for that project
    for code_name in ["ipss_arch", "ipss_v4", "sips2_v2", "vips2", "psom_v2"]:
        url, key, code_name = api_settings.getApiCredentials(code_name=code_name)
        token_dict[code_name] = (url, key)

    ## Get the list of all record IDs in the database specified with the 'from_code_name' option.
    from_url, from_key, from_code_name = api_settings.getApiCredentials(code_name=from_code_name)
    record_ids_all = getRecordIDList(from_url, from_key)

    ## Generate the lists of fields & events which must be exported from each project in order to filter the record IDs. It would be fine to export all data, but that would take a long time.
    required_data_dict = {
        "ex_registry_only": {
            "ipss_v4": {
                "fields": ["substud"],
                "events": ["acute_arm_1"]
            }
        },
        "ex_unknown_stroke_type": {
            "ipss_v4": {
                "fields": ["stroke_type"],
                "events": ["acute_arm_1"]
            }
        },
        "ex_pre_2003": {
            "ipss_v4": {
                "fields": ["dateentered"],
                "events": ["acute_arm_1"]
            }
        },
        "ex_pre_2014": {
            "ipss_v4": {
                "fields": ["originalipss"],
                "events": ["acute_arm_1"]
            }
        },
        "ex_post_20191001": {
            "ipss_v4": {
                "fields": ["dateentered"],
                "events": ["acute_arm_1"]
            }
        },
        "ex_sk_patients": {
            "ipss_v4": {
                "fields": ["ipssid"],
                "events": ["acute_arm_1", "followup_arm_1"]  # Need both in case a record only has data in followup_arm_1
            }
        },
        "ex_neonatal_stroke": {
            "ipss_v4": {
                "fields": ["strage"],
                "events": ["acute_arm_1"]
            }
        },
        "ex_placeholders": {
            "ipss_v4": {
                "forms": ["clinical_presentation", "cardiac_and_arteriopathy_risk_factors"],  # note that here we need all fields in the form.
                "events": ["acute_arm_1"]
            }
        },
        "ex_adult_stroke": {
            "ipss_v4": {
                "fields": ["birmont", "biryear", "doe", "daent", "strage", "substud"],
                "events": ["acute_arm_1"]
            }
        },
        "ex_melas": {
            "ipss_v4": {
                "fields": ["genetsy", "genetsys"],
                "events": ["acute_arm_1"]
            }
        },
        "ex_non_ipss": {
            "ipss_v4": {
                "fields": ["ipssid"],
                "events": ["acute_arm_1", "followup_arm_1"]  # Need both in case a record only has data in followup_arm_1
            }
        },
        "ex_non_sips": {
            "ipss_v4": {
                "fields": ["substud"],
                "events": ["acute_arm_1"]
            }
        },
        "ex_non_sips2": {
            "ipss_v4": {
                "fields": ["substud"],
                "events": ["acute_arm_1"]
            }
        },
        "ex_non_sips2_cohort1": {
            "ipss_v4": {
                "fields": ["substud", "sip_cohort"],
                "events": ["acute_arm_1"]
            }
        },
        "ex_non_sips2_cohort2": {
            "ipss_v4": {
                "fields": ["substud", "sip_cohort"],
                "events": ["acute_arm_1"]
            }
        },
        "ex_sips_exclusions": {
            "sips2_v2": {
                "fields": ["screened"],
                "events": ["confirmation_and_t_arm_1"]
            }
        },
        "ex_sips_exclusions_2": {
            "sips2_v2": {
                "fields": ["screened", "actcohortsp"],
                "events": ["confirmation_and_t_arm_1", "acute_arm_1"]
            }
        },
        "ex_patient_info_incomp": {
            "ipss_v4": {
                "fields": ["patient_information_complete"],
                "events": ["acute_arm_1"]
            }
        },
        "ex_core_incomplete": {
            "ipss_v4": {
                "fields": ["patient_information_complete", "cardiac_and_arteriopathy_risk_factors_complete", "other_child_and_neonatal_risk_factors_complete", "clinical_presentation_complete", "status_at_discharge_complete"],
                "events": ["acute_arm_1"]
            }
        },
        "ex_non_vips_enrolled": {
            "vips2": {
                "fields": ["vscreen_sfoutc"],
                "events": ["confirmation_and_t_arm_1", "confirmation_and_t_arm_2"]
            }
        },
        "ex_vips_screen_nonenroll": {
            "ipss_v4": {
                "fields": ["vips_screened"],
                "events": ["acute_arm_1"]
            }
        }
    }

    ## Build the dicts of arguments to be passed to exportRecords.
    exportRecords_args = {}  # keys are the code names of projects from which data will be exported; values are the args to be passed to exportRecords for those projects.
    for arg in exclusion_args:
        for code_name, project_data in required_data_dict[arg].iteritems():
            if (not code_name in exportRecords_args.keys()):
                exportRecords_args[code_name] = {"fields": None, "forms": None, "events": None}
            for key, val in project_data.iteritems():
                if (exportRecords_args[code_name][key] is None):
                    exportRecords_args[code_name][key] = val
                else:
                    exportRecords_args[code_name][key].extend(val)

    # Remove duplicates from the lists of args (not necessary, but do it for visual clarity).
    for code_name, args in exportRecords_args.iteritems():
        for arg, val in args.iteritems():
            if (not val is None):
                args[arg] = list(set(val))

    ## Export all data required for the specified filters. Don't export data from unneeded projects.
    filter_data = {}  # keys are project code names; values are the actual data sets exported from the projects.
    for code_name, args in exportRecords_args.iteritems():
        api_url = token_dict[code_name][0]
        api_key = token_dict[code_name][1]
        filter_data[code_name] = exportRecords(api_url, api_key, **args)

    #### Generate lists of IDs to exclude based on the specified exclusions.
    ## Keep track of which records are excluded by each exclusion argument.
    excluded_ids = {}  # key is the exclusion arg; value is the set of IDs excluded by that condition.

    ## SickKids registry-only patients
    if ex_registry_only:
        # Registry-only records based on IPSS data
        # * All patients labelled registry-only in the Archive are labelled likewise in IPSS.
        # * All '9203-' patients are labelled registry-only in IPSS.
excluded_ids["ex_registry_only"] = set() for row in filter_data['ipss_v4']: id = row['ipssid'] if (row["substud___8"] == "1") or ('9203-' in id): excluded_ids["ex_registry_only"].add(id) ## Unknown stroke type if ex_unknown_stroke_type: record_ids_known_stroke_type_ipss = set() for row in filter_data['ipss_v4']: if (row["stroke_type___1"] == "1") or (row["stroke_type___2"] == "1") or (row["stroke_type___3"] == "1") or (row["stroke_type___4"] == "1") or (row["stroke_type___5"] == "1") or (row["stroke_type___6"] == "1") or (row["stroke_type___7"] == "1") or (row["stroke_type___8"] == "1") or (row['stroke_type___9'] == "1"): id = row["ipssid"] record_ids_known_stroke_type_ipss.add(id) excluded_ids["ex_unknown_stroke_type"] = set([id for id in record_ids_all if (not id in record_ids_known_stroke_type_ipss)]) ## Patients that were entered before 2003 if ex_pre_2003: excluded_ids["ex_pre_2003"] = set() for row in filter_data['ipss_v4']: id = row["ipssid"] date_entered = row["dateentered"].replace('-','') # convert 2019-01-23 to 20190123 try: if (int(date_entered) < 20030101): excluded_ids["ex_pre_2003"].add(id) else: pass except ValueError: # occurs when value stored in 'dateentered' is blank (or possible another nonsense format) if (id[:2] == '7-'): # all '7-' patients are assume to be added after 2003. continue elif (id[:5] == '9203-'): # all '9203-' patients are known to be entered before 2003. excluded_ids["ex_pre_2003"].add(id) else: # If record has not dateentered, and is not a 7- or 9203- patient (who are known to be added before 20191001) print "Warning: Assuming record '"+id+"' was added after 2003" ## Patients that have been entered since the launch of IPSS in REDCap from 2014 to present. if ex_pre_2014: excluded_ids["ex_pre_2014"] = set() for row in filter_data["ipss_v4"]: if (row["originalipss___1"] == '1'): id = row["ipssid"] excluded_ids["ex_pre_2014"].add(id) ## Patients that were entered after 2019-10-01 if ex_post_20191001: excluded_ids["ex_post_20191001"] = set() for row in filter_data["ipss_v4"]: id = row["ipssid"] date_entered = row["dateentered"].replace('-','') # convert 2019-01-23 to 20190123 try: if (int(date_entered) >= 20191001): excluded_ids["ex_post_20191001"].add(id) else: pass except ValueError: # occurs when value stored in 'dateentered' is blank (or possible another nonsense format) if (id[:2] == '7-') or (id[:5] == '9203-'): continue else: # If record has not dateentered, and is not a 7- or 9203- patient (who are known to be added before 20191001) excluded_ids["ex_post_20191001"].add(id) print "Warning: Assuming record '"+id+"' was added after 20191001" ## SickKids patients (based on DAG) if ex_sk_patients: excluded_ids["ex_sk_patients"] = set() for row in filter_data["ipss_v4"]: if (row["redcap_data_access_group"] == "hsc"): id = row['ipssid'] excluded_ids["ex_sk_patients"].add(id) ## Neonatal stroke based on IPSS data if ex_neonatal_stroke: excluded_ids["ex_neonatal_stroke"] = set() for row in filter_data["ipss_v4"]: if (row['strage'] == '0'): id = row['ipssid'] excluded_ids["ex_neonatal_stroke"].add(id) ## Records deemed to be "placeholders" based on condition that they have no data in either the clinical presentation or cardiac risk factors forms. 
    ## Records deemed to be "placeholders" because they have no data in either the clinical presentation or cardiac risk factors forms.
    if ex_placeholders:
        record_ids_nonplaceholder_ipss = set()
        for row in filter_data["ipss_v4"]:
            id = row['ipssid']
            if (id in record_ids_nonplaceholder_ipss):
                continue
            row_has_data = False
            for field, value in row.iteritems():
                if (field in ['ipssid', 'redcap_data_access_group', 'redcap_event_name', 'redcap_repeat_instrument', 'redcap_repeat_instance', 'clinical_presentation_complete', 'cardiac_and_arteriopathy_risk_factors_complete']):
                    continue
                elif ('___' in field): # checkbox field
                    if (value == '1'): # checkbox is checked
                        row_has_data = True
                        break
                else:
                    if (value != ''): # field has data
                        row_has_data = True
                        break
            if row_has_data:
                record_ids_nonplaceholder_ipss.add(id)
        excluded_ids["ex_placeholders"] = set([id for id in record_ids_all if (not id in record_ids_nonplaceholder_ipss)])

    ## Non-pediatric stroke (>= 19 years of age at date of stroke)
    if ex_adult_stroke:
        excluded_ids["ex_adult_stroke"] = set()
        for row in filter_data["ipss_v4"]:
            id = row['ipssid']
            if (row['birmont'] != '') and (row['biryear'] != ''):
                # Use 'doe' if available; otherwise fall back to 'daent'.
                date_str = row['doe'] if (row['doe'] != '') else row['daent']
                if (date_str != ''):
                    # Age at stroke in years: (event year + event month/12) - (birth year + birth month/12).
                    stroke_age = float(date_str[:4]) + float(date_str[5:7])/12 - float(row['biryear']) - float(row['birmont'])/12
                    if (stroke_age >= 19.0 + 1.0/12.0): # only year/month of birth is known, so pad the cutoff by 1 month.
                        excluded_ids["ex_adult_stroke"].add(id)

    ## MELAS patients
    if ex_melas:
        excluded_ids["ex_melas"] = set()
        for row in filter_data["ipss_v4"]:
            id = row['ipssid']
            desc = row['genetsys'].lower()
            if (row['genetsy'] == '1') and (('melas' in desc) or ('mitochondrial encephalopathy' in desc) or ('lactic acidosis' in desc)):
                excluded_ids["ex_melas"].add(id)

    ## Patients not in IPSS database
    if ex_non_ipss:
        ipss_ids = set(row["ipssid"] for row in filter_data["ipss_v4"])
        excluded_ids["ex_non_ipss"] = set([id for id in record_ids_all if (not id in ipss_ids)])

    ## Patients not enrolled in SIPS I or SIPS II
    if ex_non_sips:
        record_ids_sips_ipss = set()
        for row in filter_data["ipss_v4"]:
            if (row["substud___4"] == "1") or (row["substud___6"] == "1"):
                record_ids_sips_ipss.add(row['ipssid'])
        excluded_ids["ex_non_sips"] = set([id for id in record_ids_all if (not id in record_ids_sips_ipss)])

    ## Patients not enrolled in SIPS II
    if ex_non_sips2:
        record_ids_sips2_ipss = set()
        for row in filter_data["ipss_v4"]:
            if (row["substud___4"] == "1"):
                record_ids_sips2_ipss.add(row['ipssid'])
        excluded_ids["ex_non_sips2"] = set([id for id in record_ids_all if (not id in record_ids_sips2_ipss)])

    ## Patients not in SIPS II cohort I
    if ex_non_sips2_cohort1:
        record_ids_sips2_cohort1_ipss = set()
        for row in filter_data["ipss_v4"]:
            if (row["substud___4"] == "1") and (row['sip_cohort'] == '1'):
                record_ids_sips2_cohort1_ipss.add(row['ipssid'])
        excluded_ids["ex_non_sips2_cohort1"] = set([id for id in record_ids_all if (not id in record_ids_sips2_cohort1_ipss)])

    ## Patients not in SIPS II cohort II
    if ex_non_sips2_cohort2:
        record_ids_sips2_cohort2_ipss = set()
        for row in filter_data["ipss_v4"]:
            if (row["substud___4"] == "1") and (row['sip_cohort'] == '2'):
                record_ids_sips2_cohort2_ipss.add(row['ipssid'])
        excluded_ids["ex_non_sips2_cohort2"] = set([id for id in record_ids_all if (not id in record_ids_sips2_cohort2_ipss)])

    ## Patients excluded from SIPS studies based on SIPS II variable 'screened'
    if ex_sips_exclusions:
        excluded_ids["ex_sips_exclusions"] = set()
        for row in filter_data["sips2_v2"]:
            if (row["screened"] in ['1', '3']):
                excluded_ids["ex_sips_exclusions"].add(row['ipssid'])

    ## Patients excluded from SIPS studies based on SIPS II variables 'screened' and 'actcohortsp'
    if ex_sips_exclusions_2:
        excluded_ids["ex_sips_exclusions_2"] = set()
        actcohort2_ids = set()
        # 'screened' and 'actcohortsp' may appear on different rows of the same record,
        # so collect the 'actcohortsp' IDs first and apply the 'screened' == '3' condition in a second pass.
        for row in filter_data["sips2_v2"]:
            id = row['ipssid']
            if (row["screened"] in ['1']):
                excluded_ids["ex_sips_exclusions_2"].add(id)
            if (row['actcohortsp'] == '2'):
                actcohort2_ids.add(id)
        for row in filter_data["sips2_v2"]:
            id = row['ipssid']
            if (id in actcohort2_ids) and (row['screened'] == '3'):
                excluded_ids["ex_sips_exclusions_2"].add(id)

    ## Patients for whom patient_information_complete != 2.
    if ex_patient_info_incomp:
        record_ids_patient_information_complete_ipss = set()
        for row in filter_data["ipss_v4"]:
            if (row['patient_information_complete'] == '2') or (row['redcap_data_access_group'] == 'slch'):
                record_ids_patient_information_complete_ipss.add(row['ipssid'])
        excluded_ids["ex_patient_info_incomp"] = set([id for id in record_ids_all if (not id in record_ids_patient_information_complete_ipss)])

    ## Patients for whom any of the "core" forms are not marked as complete
    if ex_core_incomplete:
        excluded_ids["ex_core_incomplete"] = set()
        for row in filter_data["ipss_v4"]:
            if (row['patient_information_complete'] != '2') or (row['cardiac_and_arteriopathy_risk_factors_complete'] != '2') or (row['other_child_and_neonatal_risk_factors_complete'] != '2') or (row['clinical_presentation_complete'] != '2'):
                excluded_ids["ex_core_incomplete"].add(row['ipssid'])

    ## Patients who are not enrolled in VIPS.
    if ex_non_vips_enrolled:
        record_ids_vips_enrolled = set()
        for row in filter_data["vips2"]:
            if (row['vscreen_sfoutc'] == '4'):
                record_ids_vips_enrolled.add(row['ipssid'])
        excluded_ids["ex_non_vips_enrolled"] = set([id for id in record_ids_all if (not id in record_ids_vips_enrolled)])

    ## Patients who are "VIPS screened not enrolled" based on the IPSS field 'vips_screened'.
    if ex_vips_screen_nonenroll:
        excluded_ids["ex_vips_screen_nonenroll"] = set()
        for row in filter_data["ipss_v4"]:
            if (row['vips_screened'] == '1'):
                excluded_ids["ex_vips_screen_nonenroll"].add(row['ipssid'])

    ## Remove all excluded IDs, and return the filtered list.
    record_ids = record_ids_all
    for exclusion, excluded_id_set in excluded_ids.iteritems():
        record_ids = [id for id in record_ids if (not id in excluded_id_set)]
    return record_ids
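# A minimal usage sketch for getIPSSIDs. This helper is hypothetical and not part of the
# original module; it assumes the ex_* flags used in the function body above are keyword
# parameters of getIPSSIDs (only ex_registry_only and ex_unknown_stroke_type are confirmed
# by calls elsewhere in this file). Each enabled flag contributes a set of record IDs to
# drop, and the returned list is record_ids_all minus the union of all exclusion sets.
def exampleGetIPSSIDsUsage():
    # Drop registry-only records, records with unknown stroke type, and pre-2003 entries.
    ids = getIPSSIDs(ex_registry_only=True, ex_unknown_stroke_type=True, ex_pre_2003=True)
    print "Number of records remaining after exclusions:", len(ids)
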
def getPatientInfo(url_arch, url_ipss, key_arch, key_ipss, enroll_date_min=2003, enroll_date_max=2020):
    # Create one list of record IDs which are non-registry and have known stroke type.
    record_ids = getIPSSIDs(ex_registry_only=True, ex_unknown_stroke_type=True)

    ## Create dict with patient information: {record_id: {dag: "...", enroll_date: "...", ...}}
    patient_info = {}
    for record_id in record_ids: # add an item (another dict) for each patient
        patient_info[record_id] = {}

    ## Get enrolment date for each record.
    # Archive - use 'dateofentry', then 'visit_date'.
    dateofentry_arch = exportRecords(url_arch, key_arch, record_id_list=record_ids, fields=["dateofentry"], events=["acute_arm_1"], validate=False)
    for row in dateofentry_arch:
        if (row["dateofentry"] == ""):
            continue
        if ("enroll_date" in patient_info[row["pk_patient_id"]]):
            print "This record was counted twice: "+str(row["pk_patient_id"])
            continue
        patient_info[row["pk_patient_id"]]["enroll_date"] = int(row["dateofentry"][:4])

    record_ids_leftover = [id for id in record_ids if (not "enroll_date" in patient_info[id])]
    visit_date_leftover = exportRecords(url_arch, key_arch, record_id_list=record_ids_leftover, fields=["visit_date"], events=["acute_arm_1"], validate=False)
    for row in visit_date_leftover:
        if (row["visit_date"] == ""):
            continue
        if ("enroll_date" in patient_info[row["pk_patient_id"]]):
            print "This record was counted twice: "+str(row["pk_patient_id"])
            continue
        patient_info[row["pk_patient_id"]]["enroll_date"] = int(row["visit_date"][:4])

    # IPSS - use 'dateentered' (works for all but 6 patients).
    record_ids_leftover = [id for id in record_ids if (not "enroll_date" in patient_info[id])]
    dateentered_ipss = exportRecords(url_ipss, key_ipss, record_id_list=record_ids_leftover, fields=["dateentered"], events=["acute_arm_1"], validate=False)
    for row in dateentered_ipss:
        if (row["dateentered"] == ""):
            continue
        if ("enroll_date" in patient_info[row["ipssid"]]):
            print "This record was counted twice: "+str(row["ipssid"])
            continue
        patient_info[row["ipssid"]]["enroll_date"] = int(row["dateentered"][:4])

    # Report records whose enrolment year is missing or outside [enroll_date_min, enroll_date_max].
    enroll_dates = set()
    for id, info in patient_info.iteritems():
        if ('enroll_date' in info):
            enroll_dates.add(info['enroll_date'])
            if (not info['enroll_date'] in range(enroll_date_min, enroll_date_max+1)):
                print "Record enroll date outside ["+str(enroll_date_min)+", "+str(enroll_date_max)+"]:", id
        else:
            print "Record with no enrolment date:", id

    ## Get DAG for each record.
    dags_arch = exportRecords(url_arch, key_arch, record_id_list=record_ids, fields=["pk_patient_id"], validate=False)
    dags_ipss = exportRecords(url_ipss, key_ipss, record_id_list=record_ids, fields=["ipssid"], validate=False)
    for row in dags_arch:
        patient_info[row["pk_patient_id"]]["dag"] = row["redcap_data_access_group"]
    for row in dags_ipss:
        record_id = row["ipssid"]
        dag = row["redcap_data_access_group"]
        if (not "dag" in patient_info[record_id]) or (patient_info[record_id]["dag"] == ""):
            # Add the DAG from IPSS if it was not already set from the Archive (or was blank).
            patient_info[record_id]["dag"] = dag

    ## Get stroke type for each patient.
    # Need to decide how we want to break this down further.
    stroke_type_ipss = exportRecords(url_ipss, key_ipss, record_id_list=record_ids, fields=["stroke_type"], events=["acute_arm_1"])
    # Map each stroke type key to its IPSS checkbox. For reference, the corresponding
    # Archive fields are: 'chais___1' -> 'stroke_type___1', 'chcsvt___1' -> 'stroke_type___2',
    # 'neoais___1' -> 'stroke_type___3', 'neocsvt___1' -> 'stroke_type___4',
    # 'ppis___1' -> 'stroke_type___5', 'ppcsvt___1' -> 'stroke_type___6',
    # 'pvi___1' -> 'stroke_type___7', 'preart___1' -> 'stroke_type___8',
    # 'othcond___1' -> 'stroke_type___9'.
    stroke_type_fields = {
        "child_ais": "stroke_type___1",  # childhood AIS
        "child_csvt": "stroke_type___2", # childhood CSVT
        "neo_ais": "stroke_type___3",    # neonatal AIS
        "neo_csvt": "stroke_type___4",   # neonatal CSVT
        "pp_ais": "stroke_type___5",     # presumed perinatal AIS
        "pp_csvt": "stroke_type___6",    # presumed perinatal CSVT
        "pp_vi": "stroke_type___7",      # presumed perinatal VI
        "art": "stroke_type___8",        # arteriopathy
        "other": "stroke_type___9",      # other
    }

    # Set stroke types to unknown initially.
    for record_id in patient_info:
        patient_info[record_id]["stroke_type"] = {key: "2" for key in stroke_type_fields}

    # Fill in stroke types from the export (0 - no, 1 - yes, 2 - unknown).
    for row in stroke_type_ipss:
        record_id = row["ipssid"]
        for key, field in stroke_type_fields.iteritems():
            patient_info[record_id]["stroke_type"][key] = row[field]

    # Look for patients without an identified stroke type.
    record_ids_with_unidentified_stroke_type = []
    for id, record in patient_info.iteritems():
        identified_type = any(value == "1" for value in record["stroke_type"].itervalues())
        if (not identified_type):
            record_ids_with_unidentified_stroke_type.append(id)

    # Print some stats on the acquired patient information.
    num_no_year = 0
    num_no_dag = 0
    for record_id, record in patient_info.iteritems():
        if (record["dag"] == ""):
            num_no_dag += 1
        if (not "enroll_date" in record):
            num_no_year += 1
    print "Number of duplicated record IDs:", len(record_ids) - len(set(record_ids))
    print "Number of unique record IDs:", len(set(record_ids))
    print "Number of record IDs in patient_info:", len(patient_info)
    print "Number of records with no DAG:", num_no_dag
    print "Number of records with no enrolment date:", num_no_year
    print "Number of records with unidentified stroke type:", len(record_ids_with_unidentified_stroke_type)
    return patient_info
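# A minimal usage sketch for getPatientInfo. This helper is hypothetical and not part of
# the original module; it assumes credentials are looked up with ApiSettings.getApiCredentials
# as elsewhere in this file, and the code names "archive" and "ipss_v4" are stand-ins for
# the real project code names in settings.yml.
def exampleGetPatientInfoUsage():
    api_settings = ApiSettings()
    url_arch, key_arch, _ = api_settings.getApiCredentials(code_name="archive")
    url_ipss, key_ipss, _ = api_settings.getApiCredentials(code_name="ipss_v4")
    patient_info = getPatientInfo(url_arch, url_ipss, key_arch, key_ipss)
    # Tally enrolment years as a quick sanity check of the date logic above.
    year_counts = {}
    for record_id, info in patient_info.iteritems():
        year = info.get("enroll_date", "unknown")
        year_counts[year] = year_counts.get(year, 0) + 1
    for year in sorted(year_counts):
        print year, ":", year_counts[year]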