Example #1
def buildProjects(config):
    #### Read user's settings.yml file, which will be used to get API tokens and URLs.
    api_settings = ApiSettings()

    ## Build a list of "projects" - dicts which store data and settings for the project.
    projects = config["projects"]

    ## Verify the settings for each project.
    for project in projects:
        code_name = project["code_name"]

        # Get args to pass to exportRecords.
        if (not "exportRecords_args"
                in project) or (project["exportRecords_args"] is None):
            project["exportRecords_args"] = {}

        # If use_getIPSSIDs is True, get list of record IDs to export.
        if project["options"]["use_getIPSSIDs"]:
            # If use_getIPSSIDs is True, but no options provided, raise warning.
            if (not "getIPSSIDs_args"
                    in project) or (project["getIPSSIDs_args"] is None):
                print "Warning: in project '" + code_name + "', 'use_getIPSSIDs' is True, but 'getIPSSIDs_args' not provided for project. Exporting all record IDs from project."
                record_id_list = None
            else:
                getIPSSIDs_args = project["getIPSSIDs_args"]
                record_id_list = getIPSSIDs(**getIPSSIDs_args)

            # If exportRecords_args already has an entry for record_id_list, warn that it will be ignored.
            if "record_id_list" in project["exportRecords_args"]:
                print "Warning: in project '" + code_name + "', the specified 'record_id_list' will be ignored, since 'use_getIPSSIDs' is True."

            # Overwrite the record_id_list argument in exportRecords_args
            project["exportRecords_args"]["record_id_list"] = record_id_list

        ## Get args to pass to exportRecords. At this point 'exportRecords_args' is guaranteed to have a value (possibly {}).
        exportRecords_args = project["exportRecords_args"]

        # Convert exportRecords_args arguments to strings as needed.
        convert_to_strings = ["fields", "forms", "events", "record_id_list"]
        for arg in convert_to_strings:
            if arg in exportRecords_args:
                if exportRecords_args[arg] == 'None':  # these arguments could be lists or None
                    # Convert string 'None' to Python None.
                    exportRecords_args[arg] = None
                else:
                    # Convert list to list of strings. Currently, list might contain integers etc.
                    new_list = [str(val) for val in exportRecords_args[arg]]
                    exportRecords_args[arg] = new_list

        ## Get API credentials for current project.
        api_url, api_key, code_name = api_settings.getApiCredentials(
            code_name=code_name)
        project["api_url"] = api_url
        project["api_key"] = api_key

        ## Export requested data for current project
        data_csv = exportRecords(api_url,
                                 api_key,
                                 format="csv",
                                 **exportRecords_args)
        data_csv_file = StringIO(data_csv)
        data_df = pandas.read_csv(data_csv_file,
                                  dtype=unicode,
                                  encoding='utf-8').fillna('')

        project["chunks"] = [
            data_df
        ]  # this list of dataframes will be broken into pieces, each piece containing data to be placed in a different tab.

        ## Retrieve project settings and add them to the dict for the current project
        pycap_project = redcap.Project(api_url, api_key)
        def_field = pycap_project.def_field
        project_info = exportProjectInfo(api_url, api_key)
        longitudinal = bool(project_info["is_longitudinal"])
        repeating = bool(project_info["has_repeating_instruments_or_events"])
        events = getEvents(api_url, api_key, quiet=True)
        metadata_raw = pycap_project.export_metadata()
        form_event_mapping = exportFormEventMapping(pycap_project,
                                                    longitudinal)
        repeating_forms_events = exportRepeatingFormsEvents(
            api_url, api_key, repeating)
        forms = exportFormsOrdered(api_url, api_key)
        form_repetition_map = createFormRepetitionMap(longitudinal, repeating,
                                                      form_event_mapping,
                                                      repeating_forms_events,
                                                      forms)
        metadata = parseMetadata(pycap_project.def_field,
                                 project_info,
                                 longitudinal,
                                 repeating,
                                 events,
                                 metadata_raw,
                                 form_event_mapping,
                                 repeating_forms_events,
                                 forms,
                                 form_repetition_map,
                                 write_branching_logic_function=False)

        project["pycap_project"] = pycap_project
        project["def_field"] = def_field
        project["project_info"] = project_info
        project["longitudinal"] = longitudinal
        project["repeating"] = repeating
        project["events"] = events
        project["form_event_mapping"] = form_event_mapping
        project["repeating_forms_events"] = repeating_forms_events
        project["forms"] = forms
        project["form_repetition_map"] = form_repetition_map
        project["metadata"] = metadata

        # Create dict which maps each form to a list of events containing that form.
        if longitudinal:
            form_to_events_dict = {}
            for form_event_entry in form_event_mapping:
                form = form_event_entry['form']
                event = form_event_entry['unique_event_name']
                if (not form in form_to_events_dict):
                    form_to_events_dict[form] = [event]
                else:
                    form_to_events_dict[form].append(event)
        else:
            form_to_events_dict = None
        project["form_to_events_dict"] = form_to_events_dict

        ## Build lists of variables which appear in the export data.
        # columns which uniquely identify a row
        primary_key = [def_field]
        if project["longitudinal"]:
            primary_key.append("redcap_event_name")
        if project["repeating"]:
            primary_key.append("redcap_repeat_instrument")
            primary_key.append("redcap_repeat_instance")
        project["primary_key"] = primary_key

        # Copy primary_key so that appending the DAG column does not also modify primary_key.
        primary_key_and_dag = list(primary_key)
        if ("redcap_data_access_group" in data_df.columns):
            primary_key_and_dag.append("redcap_data_access_group")
        project["primary_key_and_dag"] = primary_key_and_dag

        # form_complete fields
        form_complete_fields = [
            field for field in data_df.columns
            if field.endswith("_complete") and field not in metadata
            and field not in primary_key
            and field != "redcap_data_access_group"
        ]
        project["form_complete_fields"] = form_complete_fields

        # data fields
        data_fields = [
            field for field in data_df.columns
            if field not in primary_key + form_complete_fields
            and field != "redcap_data_access_group"
        ]
        project["data_fields"] = data_fields

    return projects
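A hedged sketch of how buildProjects might be called. The config structure mirrors the keys the function reads ('projects', 'code_name', 'options', 'exportRecords_args'); the code name must exist in the user's api_keys.yml, and the form name is only illustrative.

config = {
    "projects": [{
        "code_name": "ipss_v4",  # hypothetical; must be defined in api_keys.yml
        "options": {"use_getIPSSIDs": False},
        "exportRecords_args": {"forms": ["patient_information"]}
    }]
}
projects = buildProjects(config)
for project in projects:
    print project["code_name"], len(project["data_fields"])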
Example #2
repeating_forms_events = exportRepeatingFormsEvents(api_url_ipss, api_key_ipss,
                                                    project_repeating)
forms = exportFormsOrdered(api_url_ipss, api_key_ipss)
form_repetition_map = createFormRepetitionMap(project_longitudinal,
                                              project_repeating,
                                              form_event_mapping,
                                              repeating_forms_events, forms)
metadata = parseMetadata(def_field, project_info, project_longitudinal,
                         project_repeating, events, metadata_raw,
                         form_event_mapping, repeating_forms_events, forms,
                         form_repetition_map)

# Export records for non-registry-only patients.
#records_arch = exportRecords(url_arch, key_arch, record_id_list=record_ids_arch, label=True)
records = exportRecords(api_url_ipss,
                        api_key_ipss,
                        record_id_list=record_ids,
                        label_overwrite=False,
                        label=True)
#records_post_2014 = exportRecords(api_url_ipss, api_key_ipss, record_id_list=record_ids_post_2014, label_overwrite=False, label=True)
records_post_2014 = [
    row for row in records if (row["ipssid"] in record_ids_post_2014)
]
#records_non_sk = exportRecords(api_url_ipss, api_key_ipss, record_id_list=record_ids_non_sk, label_overwrite=False, label=True)
records_non_sk = [
    row for row in records if (row["ipssid"] in record_ids_non_sk)
]
records_psom = exportRecords(api_url_psom,
                             api_key_psom,
                             record_id_list=record_ids_psom,
                             label_overwrite=False,
                             label=True)
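Since all IPSS rows were exported once above, the post-2014 and non-SK subsets are carved out in memory instead of re-calling exportRecords. A hedged variation on the same pattern: converting the ID list to a set first makes the membership tests O(1), which matters for large exports.

record_ids_post_2014_set = set(record_ids_post_2014)
records_post_2014 = [
    row for row in records if (row["ipssid"] in record_ids_post_2014_set)
]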
Example #3
def getPatientInfo(url_arch, url_ipss, key_arch, key_ipss):
    ## Get list of record IDs for each project. Exclude registry-only patients. Exclude patients with unknown stroke type.
    #    record_ids_arch = getRecordIDList(url_arch, key_arch)
    #    registry_arch = exportRecords(url_arch, key_arch, record_id_list=record_ids_arch, fields=["registry"], events=["acute_arm_1"])
    #    for row in registry_arch:
    #        if (row["registry"] == "1"):
    #            record_ids_arch.remove(row["pk_patient_id"])
    #    record_ids_ipss = getRecordIDList(url_ipss, key_ipss)
    #    registry_ipss = exportRecords(url_ipss, key_ipss, record_id_list=record_ids_ipss, fields=["substud"], events=["acute_arm_1"])
    #    for row in registry_ipss:
    #        if (row["substud___8"] == "1"):
    #            record_ids_ipss.remove(row["ipssid"])
    #    record_ids_ipss_only = [record_id for record_id in record_ids_ipss if not record_id in record_ids_arch]
    #    for record_id in record_ids_arch:
    #        if (not record_id in record_ids_ipss):
    #            print "Record with ID", record_id, "in Archive, but not in IPSS"

    # Create one list of record IDs which are non-registry and have known stroke type.
    record_ids = getIPSSIDs(inc_registry_only=False,
                            inc_unknown_stroke_type=False)

    ## Create dict with patient information: {record_id: {dag:"...", enroll_date:"...", ...} }
    patient_info = {}
    for record_id in record_ids:  # add item (another dict) for each patient in the Archive
        patient_info[record_id] = {}
#        patient_info[record_id]["in_arch"] = True
#        if (record_id in record_ids_ipss):
#            patient_info[record_id]["in_ipss"] = True # boolean describing presence of record in Archive
#        else:
#            patient_info[record_id]["in_ipss"] = False # boolean describing presence of record in IPSS
#    for record_id in record_ids_ipss_only: # add item (another dict) for each patient in the IPSS that has not yet been added.
#        patient_info[record_id] = {}
#        patient_info[record_id]["in_arch"] = False
#        patient_info[record_id]["in_ipss"] = True

    ## Get enrolment date for each record.
    # Archive - Use 'dateofentry', then 'visit_date'.
    #    print "Project        : Archive"
    dateofentry_arch = exportRecords(url_arch,
                                     key_arch,
                                     record_id_list=record_ids,
                                     fields=["dateofentry"],
                                     events=["acute_arm_1"],
                                     validate=False)
    #    if (len(record_ids_arch) != len(dateofentry_arch)): # look for record id missing from exported data
    #        for record_id in record_ids_arch:
    #            id_in_data = False
    #            for row in dateofentry_arch:
    #                if (row["pk_patient_id"] == record_id):
    #                    id_in_data = True
    #                    break
    #            if (not id_in_data):
    #                print "Record with ID "+str(record_id)+" not found in exported data"
    #    num_missing = 0
    for row in dateofentry_arch:
        if (row["dateofentry"] == ""):
            #            num_missing += 1
            pass
        else:
            if ("enroll_date" in patient_info[row["pk_patient_id"]]):
                print "This record was counted twice: " + str(
                    row["pk_patient_id"])
                continue
            patient_info[row["pk_patient_id"]]["enroll_date"] = int(
                row["dateofentry"][:4])

    num_missing = len(
        [id for id in record_ids if (not "enroll_date" in patient_info[id])])

    #    print "Field used     : dateofentry"
    #    print "Number missing : ", num_missing

    record_ids_leftover = [
        id for id in record_ids if (not "enroll_date" in patient_info[id])
    ]
    visit_date_leftover = exportRecords(url_arch,
                                        key_arch,
                                        record_id_list=record_ids_leftover,
                                        fields=["visit_date"],
                                        events=["acute_arm_1"],
                                        validate=False)
    #    num_missing = 0
    for row in visit_date_leftover:
        if (row["visit_date"] == ""):
            #            num_missing += 1
            pass
        else:
            if ("enroll_date" in patient_info[row["pk_patient_id"]]):
                print "This record was counted twice: " + str(
                    row["pk_patient_id"])
                continue
            patient_info[row["pk_patient_id"]]["enroll_date"] = int(
                row["visit_date"][:4])
    num_missing = len(
        [id for id in record_ids if (not "enroll_date" in patient_info[id])])

    #    print "Field used     : visit_date"
    #    print "Number missing : ", num_missing

    # IPSS - use 'dateentered' (works for all but 6 patients).
    #    print
    #    print "Project        : IPSS"
    record_ids_leftover = [
        id for id in record_ids if (not "enroll_date" in patient_info[id])
    ]
    dateentered_ipss = exportRecords(url_ipss,
                                     key_ipss,
                                     record_id_list=record_ids_leftover,
                                     fields=["dateentered"],
                                     events=["acute_arm_1"],
                                     validate=False)
    #    if (len(record_ids_ipss_only) != len(dateentered_ipss)): # look for record id missing from exported data
    #        for record_id in record_ids_ipss_only:
    #            id_in_data = False
    #            for row in dateentered_ipss:
    #                if (row["ipssid"] == record_id):
    #                    id_in_data = True
    #                    break
    #            if (not id_in_data):
    #                print "Record with ID "+str(record_id)+" not found in exported data"
    #    num_missing = 0
    for row in dateentered_ipss:
        if (row["dateentered"] == ""):
            #            num_missing += 1
            pass
        else:
            if ("enroll_date" in patient_info[row["ipssid"]]):
                print "This record was counted twice: " + str(row["ipssid"])
                continue
            patient_info[row["ipssid"]]["enroll_date"] = int(
                row["dateentered"][:4])
    num_missing = len(
        [id for id in record_ids if (not "enroll_date" in patient_info[id])])
    #    print "Field used     : dateentered"
    #    print "Number missing : ", num_missing

    enroll_dates = set()
    for id, info in patient_info.iteritems():
        if ('enroll_date' in info):
            enroll_dates.add(info['enroll_date'])
            if (not info['enroll_date'] in range(2003, 2020)):
                print "Record enroll date outside [2003, 2019]:", id
        else:
            print "Record with no enrolment date:", id
#    print "enroll_dates:", sorted(list(enroll_dates))

## Get DAG for each record:
    dags_arch = exportRecords(url_arch,
                              key_arch,
                              record_id_list=record_ids,
                              fields=["pk_patient_id"],
                              validate=False)
    dags_ipss = exportRecords(url_ipss,
                              key_ipss,
                              record_id_list=record_ids,
                              fields=["ipssid"],
                              validate=False)
    for row in dags_arch:
        record_id = row["pk_patient_id"]
        dag = row["redcap_data_access_group"]
        patient_info[record_id]["dag"] = dag
    for row in dags_ipss:
        record_id = row["ipssid"]
        dag = row["redcap_data_access_group"]
        if (not "dag" in patient_info[record_id]) or (
                patient_info[record_id]["dag"]
                == ""):  # add DAG from IPSS if not added already
            patient_info[record_id][
                "dag"] = dag  # overwriting DAG for records in Archive should not be a problem.

#    for id in patient_info:
#        if (not "dag" in patient_info[id]) or (patient_info[id]["dag"] == ""):
#            print "Record with ID", id, "does not have a DAG assigned"

    ## Get stroke type for each patient. Need to decide how we want to break this down further.
#    stroke_type_arch = exportRecords(url_arch, key_arch, record_id_list=record_ids_arch, fields=["ais", "csvt", "pperi", "preart", "other_stroke", "age_at_event"], events=["acute_arm_1"])
    stroke_type_ipss = exportRecords(url_ipss,
                                     key_ipss,
                                     record_id_list=record_ids,
                                     fields=[
                                         "chais", "chcsvt", "neoais",
                                         "neocsvt", "ppis", "ppcsvt", "pvi",
                                         "preart", "othcond"
                                     ],
                                     events=["acute_arm_1"])

    for record_id in patient_info:
        # Initialize all stroke types to "2" (unknown).
        patient_info[record_id]["stroke_type"] = {
            "neo_ais": "2",
            "neo_csvt": "2",
            "child_ais": "2",
            "child_csvt": "2",
            "pp_ais": "2",
            "pp_csvt": "2",
            "pp_vi": "2",
            "art": "2",
            "other": "2"
        }

    for row in stroke_type_ipss:  # 0 - no, 1 - yes, 2 - unknown
        record_id = row["ipssid"]
        # neonatal AIS
        patient_info[record_id]["stroke_type"]["neo_ais"] = row["neoais___1"]
        # neonatal CSVT
        patient_info[record_id]["stroke_type"]["neo_csvt"] = row["neocsvt___1"]
        # child AIS
        patient_info[record_id]["stroke_type"]["child_ais"] = row["chais___1"]
        # child CSVT
        patient_info[record_id]["stroke_type"]["child_csvt"] = row[
            "chcsvt___1"]
        # presumed perinatal AIS
        patient_info[record_id]["stroke_type"]["pp_ais"] = row["ppis___1"]
        # presumed perinatal CSVT
        patient_info[record_id]["stroke_type"]["pp_csvt"] = row["ppcsvt___1"]
        # presumed perinatal VI
        patient_info[record_id]["stroke_type"]["pp_vi"] = row["pvi___1"]
        # arteriopathy
        patient_info[record_id]["stroke_type"]["art"] = row["preart___1"]
        # other
        patient_info[record_id]["stroke_type"]["other"] = row["othcond___1"]

    # Look for patients without an identified stroke type.
    record_ids_with_unidentified_stroke_type = []
    for id, record in patient_info.iteritems():
        identified_type = False
        for stroke_type, value in record["stroke_type"].iteritems():
            if (value == "1"):
                identified_type = True
                break
        if (not identified_type):
            #            print "Record with ID", id, "has an unidentified stroke type."
            record_ids_with_unidentified_stroke_type.append(id)

    # Check if stroke type can be identified in Archive instead.
#    stroke_type_arch_leftover = exportRecords(url_arch, key_arch, record_id_list=record_ids_with_unidentified_stroke_type, fields=["ais", "csvt", "pperi", "preart", "other_stroke", "age_at_event"], events=["acute_arm_1"])
#    for row in stroke_type_arch_leftover:
#        print row["pk_patient_id"], row["ais"], row["csvt"], row["pperi"], row["preart"], row["other_stroke"]#, row["age_at_event"]
#        stroke_type_found = False
#        if (row["ais"] == "1") and (row["age_at_event"] == "0"):
#            patient_info[row["pk_patient_id"]]["stroke_type"]["neo_ais"] = "1"
#            stroke_type_found = True
#        if (row["csvt"] == "1") and (row["age_at_event"] == "0"):
#            patient_info[row["pk_patient_id"]]["stroke_type"]["neo_csvt"] = "1"
#            stroke_type_found = True
#        if (row["ais"] == "1") and (row["age_at_event"] == "1"):
#            patient_info[row["pk_patient_id"]]["stroke_type"]["child_ais"] = "1"
#            stroke_type_found = True
#        if (row["csvt"] == "1") and (row["age_at_event"] == "1"):
#            patient_info[row["pk_patient_id"]]["stroke_type"]["child_csvt"] = "1"
#            stroke_type_found = True
#        if (row["preart"] == "1"):
#            patient_info[row["pk_patient_id"]]["stroke_type"]["art"] = "1"
#            stroke_type_found = True
#        if (row["other_stroke"] == "1"):
#            patient_info[row["pk_patient_id"]]["stroke_type"]["other"] = "1"
#            stroke_type_found = True
#        if stroke_type_found:
#            record_ids_with_unidentified_stroke_type.remove(row["pk_patient_id"])

    # Print some stats on the acquired patient information.
    num_no_year = 0
    num_no_dag = 0
    for record_id, record in patient_info.iteritems():
        if (record["dag"] == ""):
            num_no_dag += 1
        if (not "enroll_date" in record):
            num_no_year += 1
    print "Number of duplicated record IDs:", len(record_ids) - len(
        set(record_ids))
    print "Number of unique record IDs:", len(set(record_ids))
    print "Number of record IDs in patient_info:", len(patient_info)
    print "Number of records with no DAG:", num_no_dag
    print "Number of records with no enrolment date:", num_no_year
    print "Number of records with unidentified stroke type:", len(
        record_ids_with_unidentified_stroke_type)
    return patient_info
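A hedged usage sketch; the URLs and keys are assumed to come from the same ApiSettings lookup used elsewhere in these examples, with the code names taken from the list in getIPSSIDs below.

api_settings = ApiSettings()
url_arch, key_arch, _ = api_settings.getApiCredentials(code_name="ipss_arch")
url_ipss, key_ipss, _ = api_settings.getApiCredentials(code_name="ipss_v4")
patient_info = getPatientInfo(url_arch, url_ipss, key_arch, key_ipss)
for record_id, info in patient_info.iteritems():
    if "enroll_date" not in info:
        print "No enrolment year for record:", record_id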
Example #4
def reportPatientInfo(patient_info, out_dir, path_dag_info):
    ## Miscellaneous items used in all of the enrolment reports
    min_year = 2003
    max_year = 2020  #2019
    year_list = range(min_year, max_year + 1)

    # Note: url_ipss and key_ipss are assumed to be module-level globals here; they are not parameters of this function.
    records_ipss = exportRecords(url_ipss, key_ipss, fields=["ipssid"])
    dags = getDAGs(records_ipss)[1]
    # Put "Unassigned" at end of list.
    dags_old = dags
    dags = sorted(dags_old)[1:]
    dags.extend(sorted(dags_old)[:1])

    # Check if all records belong to one of the DAGs in the list just created.
    for record_id, record in patient_info.iteritems():
        if (not record["dag"] in dags):
            print "Record with ID", record_id, "in DAG", record[
                dag], "is part of unidentified DAG."

    # Enrolment by site per year
    report_path = os.path.join(out_dir, "enrolment_dag.csv")

    # Write row/column headings
    columns = year_list
    index = [dag if (dag != "") else "Unassigned" for dag in dags]

    # Create pandas DataFrame to store report.
    report_df = pandas.DataFrame(columns=columns, index=index)

    # Add row for each DAG.
    for dag in dags:
        if (dag != ""):
            dag_name = dag
        else:
            dag_name = "Unassigned"
        for year in year_list:
            num_enrolled_dag_year = 0
            for record_id, record in patient_info.iteritems():
                if ("enroll_date" in record) and (type(record["enroll_date"])
                                                  != type(year)):
                    print "WARNING: comparison of different types in 'enroll_date'."
                if (record["dag"] == dag) and ("enroll_date" in record) and (
                        record["enroll_date"] == year):
                    num_enrolled_dag_year += 1
            report_df[year][dag_name] = num_enrolled_dag_year

    # Add columns/rows to store column/row totals.
    report_df["Total"] = report_df.sum(axis=1).astype(int)  # Total column
    report_df = report_df.append(
        report_df.sum(axis=0).astype(int).rename("Total"))  # Total row

    # Add institution name and country columns to dataframe.
    report_df = addDAGInfo(report_df, path_dag_info)

    report_df.to_csv(report_path)
    print report_df

    ## Enrolment by stroke type per year
    report_path = os.path.join(out_dir, "enrolment_stroke_type.csv")

    # Write row/column headings
    columns = year_list
    index = [
        "Neonatal AIS", "Neonatal CSVT", "Neonatal AIS & CSVT",
        "Childhood AIS", "Childhood CSVT", "Childhood AIS & CSVT",
        "Presumed perinatal AIS", "Presumed perinatal CSVT",
        "Presumed perinatal AIS & CSVT", "Presumed perinatal VI",
        "Arteriopathy", "Other"
    ]

    report_df = pandas.DataFrame(0, columns=columns, index=index)

    # Add each patient with known stroke type to report.
    for id, record in patient_info.iteritems():
        if ("enroll_date" in record) and (
                record["enroll_date"] in columns
        ):  # If enrolment date is known and included in the report.
            year = record["enroll_date"]
            if (record["stroke_type"]["neo_ais"]
                    == "1") and (record["stroke_type"]["neo_csvt"] == "1"):
                report_df[year]["Neonatal AIS & CSVT"] += 1
            elif (record["stroke_type"]["neo_ais"] == "1"):
                report_df[year]["Neonatal AIS"] += 1
            elif (record["stroke_type"]["neo_csvt"] == "1"):
                report_df[year]["Neonatal CSVT"] += 1
            elif (record["stroke_type"]["child_ais"]
                  == "1") and (record["stroke_type"]["child_csvt"] == "1"):
                report_df[year]["Childhood AIS & CSVT"] += 1
            elif (record["stroke_type"]["child_ais"] == "1"):
                report_df[year]["Childhood AIS"] += 1
            elif (record["stroke_type"]["child_csvt"] == "1"):
                report_df[year]["Childhood CSVT"] += 1
            elif (record["stroke_type"]["pp_ais"]
                  == "1") and (record["stroke_type"]["pp_csvt"] == "1"):
                report_df[year]["Presumed perinatal AIS & CSVT"] += 1
            elif (record["stroke_type"]["pp_ais"] == "1"):
                report_df[year]["Presumed perinatal AIS"] += 1
            elif (record["stroke_type"]["pp_csvt"] == "1"):
                report_df[year]["Presumed perinatal CSVT"] += 1
            elif (record["stroke_type"]["pp_vi"] == "1"):
                report_df[year]["Presumed perinatal VI"] += 1
            elif (record["stroke_type"]["art"] == "1"):
                report_df[year]["Arteriopathy"] += 1
            elif (record["stroke_type"]["other"] == "1"):
                report_df[year]["Other"] += 1

    report_df["Total"] = report_df.sum(axis=1).astype(int)  # Total column
    report_df = report_df.append(
        report_df.sum(axis=0).astype(int).rename("Total"))  # Total row
    report_df.to_csv(report_path)
    print report_df

    return
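A hedged sketch chaining the two functions above; the output directory and DAG-info path are hypothetical (the directory is assumed to exist), and note that reportPatientInfo also expects url_ipss and key_ipss to be available as module-level globals.

patient_info = getPatientInfo(url_arch, url_ipss, key_arch, key_ipss)
reportPatientInfo(patient_info, out_dir="reports", path_dag_info="dag_info.csv")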
Example #5
def transferPSOMToIPSS(url_psom,
                       key_psom,
                       url_ipss,
                       key_ipss,
                       import_non_ipss_ids=False,
                       out_path=None,
                       manual_import=False):
    """
    Transfer Summary of Impressions data from PSOM V2 to IPSS V4.
    Parameters:
        url_psom: str, API URL for PSOM V2
        key_psom: str, API key for PSOM V2
        url_ipss: str, API URL for IPSS V4
        key_ipss: str, API key for IPSS V4
        import_non_ipss_ids: bool, whether to import IDs that do not exist in IPSS V4
        out_path: str, path of the CSV file to which the data to be imported into IPSS is saved
        manual_import: bool, if True, do not import data into IPSS automatically; the CSV saved at out_path can be imported manually by the user
    Returns:
        None
    """

    ## Define function which modifies PSOM data prior to import to IPSS.
    def modifyRecords(from_psom,
                      url_ipss,
                      key_ipss,
                      import_non_ipss_ids=False):
        """
        Take data exported from PSOM V2, and modify it such that it can be imported to IPSS V4.
        Parameters:
            from_psom: list of dicts; all records in the PSOM V2 database.
        Returns:
            to_ipss: list of dicts; all PSOM data to be imported into IPSS V4, after changing variable names, event names, etc.
        """

        # In PSOM V2:
        # - The Summary of Impressions assessment is included only in the 'summary_of_impressions' instrument, which is part of the following events:
        #     - 'acute_hospitalizat_arm_1' (repeating event) - all data collected during initial hospitalization
        #     - 'initial_psom_arm_1' (non repeating) - data collected during first "initial PSOM" which occurred outside of the initial hospitalization
        #     - 'follow_up_psom_arm_1' (repeating event) - all subsequently collected data.
        # In IPSS V4:
        # - The Summary of Impressions assessment is found in the instrument:
        #     - 'summary_of_impressions' (non-repeating in event 'acute_arm_1'; repeating instrument in event 'followup_arm_1')
        #
        # Link PSOM to IPSS as follows:
        # - Map the instance of the PSOM V2 'summary_of_impressions' form which has the latest 'fuionset_soi' date in the 'acute_hospitalizat_arm_1' event to the IPSS V4 'summary_of_impressions' form in event 'acute_arm_1'.
        #   - SOIs in the 'acute_hospitalizat_arm_1' event with earlier 'fuionset_soi' dates will not be mapped to IPSS V4.
        # - Map the PSOM V2 'summary_of_impressions' form in the 'initial_psom_arm_1' and 'follow_up_psom_arm_1' events to the IPSS V4 'summary_of_impressions' form in event 'followup_arm_1'.
        #   - Order the instances in IPSS V4 according to ascending 'fuionset_soi' dates in PSOM V2.
        # - All PSOM V2 summaries of impressions (from any event) which have a blank 'fuionset_soi' date will be excluded from IPSS V4.
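        #
        # For illustration (hypothetical dates): a record with two acute SOIs
        # dated 2003-04-05 and 2003-04-20 maps only the 2003-04-20 instance to
        # 'acute_arm_1'. An initial PSOM dated 2003-06-01 plus follow-up PSOMs
        # dated 2004-01-15 and 2003-12-01 become 'followup_arm_1' instances
        # 1, 2, and 3, ordered by date: 2003-06-01, 2003-12-01, 2004-01-15.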

        #### Perform record-specific modifications to a few records. Ideally, there will be nothing in this section.

        #### Remove data from PSOM which will not be imported into IPSS V4.
        ## Remove all rows (any event) which do not have a PSOM assessment date.
        from_psom_after_exclusions = []
        for row in from_psom:
            soi_date = row['fuionset_soi']
            if soi_date.strip() != '':  # if the summary of impressions date is not blank
                from_psom_after_exclusions.append(row)
        from_psom = from_psom_after_exclusions

        ## Remove all rows for records which do not exist in IPSS V4.
        if (not import_non_ipss_ids):
            from_psom_after_exclusions = []
            # Set of IPSSIDs which exist in PSOM, but not in IPSS, and will not be imported.
            excluded_ids = set()
            # List of all record IDs in IPSS.
            record_ids_ipss = getRecordIDList(url_ipss, key_ipss)
            for row in from_psom:
                id = row['ipssid']
                if (id in record_ids_ipss):
                    from_psom_after_exclusions.append(row)
                elif id not in excluded_ids:  # if this excluded ID has not already been identified and a warning printed
                    #warnings.warn("IPSSID not found in IPSS, not importing this patient's data: " + id)
                    excluded_ids.add(id)
            from_psom = from_psom_after_exclusions

        #### Create dictionaries which map IPSSIDs to row numbers in PSOM for (a) the 'acute_hospitalizat_arm_1' event; and (b) the 'initial_psom_arm_1' and 'follow_up_psom_arm_1' events. These dictionaries are used to determine which rows in PSOM V2 will be mapped to which (instances of which) events in IPSS V4.
        ## Create dictionary for the 'acute_hospitalizat_arm_1' event. Take only the acute_hospitalization row with the latest non-blank 'fuionset_soi' date.
        acute_dict = {}
        for row_index in range(len(from_psom)):
            row = from_psom[row_index]
            id = row['ipssid']
            if row['redcap_event_name'] == 'acute_hospitalizat_arm_1':
                psom_instance = row['redcap_repeat_instance']
                # Convert the string '2003-04-05' to the integer 20030405 for comparison later.
                psom_date = int(row['fuionset_soi'].replace('-', ''))
                if id not in acute_dict:
                    # There is not yet an entry for this ID in acute_dict.
                    acute_dict[id] = (row_index, psom_instance, psom_date)
                elif psom_date > acute_dict[id][2]:
                    # ID already in acute_dict, and this row corresponds to a more recent acute hospitalization instance.
                    acute_dict[id] = (row_index, psom_instance, psom_date)

        ## Create dictionary for the 'initial_psom_arm_1' and 'follow_up_psom_arm_1' events.
        followup_dict = {}
        for row_index in range(len(from_psom)):
            row = from_psom[row_index]
            id = row['ipssid']
            psom_date = int(row['fuionset_soi'].replace('-', ''))
            if (row['redcap_event_name'] == 'initial_psom_arm_1'):
                # Assign a fake instance number of 0 to the initial_psom_arm_1 event (instance numbers were used in the old ordering method; ordering is now based on 'fuionset_soi').
                followup_dict[id] = [(row_index, 0, psom_date)]
        for row_index in range(len(from_psom)):
            row = from_psom[row_index]
            id = row['ipssid']
            psom_instance = row['redcap_repeat_instance']
            psom_date = int(row['fuionset_soi'].replace('-', ''))
            if (row['redcap_event_name'] == 'follow_up_psom_arm_1'):
                if id not in followup_dict:
                    followup_dict[id] = [(row_index, psom_instance, psom_date)]
                else:
                    followup_dict[id].append(
                        (row_index, psom_instance, psom_date))

        ## Reorder the lists of (row index, PSOM instance, PSOM date) tuples in order of ascending 'fuionset_soi' date, so that the correct order will be retained in IPSS.
        for id, row_tuple_list_psom in followup_dict.iteritems():
            row_tuple_list_psom.sort(key=lambda row_tuple: row_tuple[2])  # Sort using the 'fuionset_soi' values.

        ## Check that follow-up rows are arranged in order of ascending 'fuionset_soi'. Raise AssertionError if this is not true. This section has no effect on the data; it just checks for errors.
        for id, row_tuple_list_psom in followup_dict.iteritems():
            last_date = 0  # fake date to compare the first date to.
            for row_tuple_psom in row_tuple_list_psom:
                current_date = row_tuple_psom[2]
                assert (current_date >= last_date)
                last_date = current_date

        #### Create functions and dictionaries for field mappings.
        ## Create a dictionary for the Summary of Impressions mapping (PSOM -> 'summary_of_impressions' in IPSS V4). The dictionary is of the form {field_name_in_PSOM: field_name_in_IPSS}. It only includes fields which are directly mapped to a corresponding IPSS V4 field; fields which are modified prior to transfer are dealt with separately.
        psom_to_ipss_soi = {
            'fuionset_soi': 'psomdate',
            'fpsomr': 'psomr',
            'fpsoml': 'psoml',
            'fpsomlae': 'psomlae',
            'fpsomlar': 'psomlar',
            'fpsomcb': 'psomcb',
            'psomsen___1': 'psomsens___3',
            'psomsen___2': 'psomsens___4',
            'psomsen___3': 'psomsens___5',
            'psomsen___4': 'psomsens___6',
            'psomsen___5': 'psomsens___7',
            'psomsen___6': 'psomsens___8',
            'psomsen___7': 'psomsens___9',
            'psomsen___8': 'psomsens___10',
            'psomsen___9': 'psomsens___11',
            'psomsen___10': 'psomsens___12',
            'psomsen___11': 'psomsens___13',
            'psomsen___12': 'psomsens___14',
            'othsens': 'senssp',
            'fpsomco___1': 'psomcog___1',
            'fpsomco___2': 'psomcog___2',
            'totpsom': 'psomscr',
            'summary_of_impressions_complete': 'summary_of_impressions_complete'
        }

        ## Create functions which perform the many-to-one mappings.
        def combineComments(row_psom):
            """
            Combine multiple comment fields in PSOM into a single string to be imported into a single IPSS field.
            Parameters:
                row_psom: dict, a single row of data exported from PSOM.
            Returns:
                combined_comments: str, value to be imported into a single IPSS field.
            """

            ## Initialize the combined comments field.
            combined_comments = ''

            ## Add text from the PSOM 'lang_pro_det' (text) field if it is nonempty.
            if (row_psom['lang_pro_det'].strip() != ''):
                combined_comments += 'Language production deficits: ' + row_psom['lang_pro_det'] + '. '

            ## Add text from the PSOM 'lang_comp_det' (text) field if it is nonempty.
            if (row_psom['lang_comp_det'].strip() != ''):
                combined_comments += 'Language comprehension deficits: ' + row_psom['lang_comp_det'] + '. '

            ## Add text from the PSOM 'cog_beh_det' (checkbox) field if any options are checked.
            # Create dictionary mapping checkbox number to checkbox option text.
            cog_beh_det_dict = {
                1: 'Remembering what he/she learned',
                2: 'Staying focused',
                3: 'Sad or low moods',
                4: 'Excessive worries',
                5: 'Getting along with other children',
                6: 'Other'
            }

            # Keep note of whether any of the checkboxes are checked for this row.
            any_checked = False

            # Loop over the checkbox option numbers.
            for box_number in range(1, 6 + 1):
                box_var = 'cog_beh_det___' + str(box_number)  # field name for current checkbox option
                if (row_psom[box_var] == '1'):  # if checkbox is checked
                    if not any_checked:  # if this is the first checked option found for the current row
                        combined_comments += 'Cognitive/behavioural deficits: '
                        any_checked = True
                    combined_comments += cog_beh_det_dict[box_number] + ', '

            # Replace trailing comma with trailing period
            if (combined_comments[-2:] == ', '):
                combined_comments = combined_comments[:-2] + '. '

            ## Add text from the PSOM 'cbcomm' (text) field if it is nonempty.
            if (row_psom['cbcomm'].strip() != ''):
                combined_comments += 'Other cognitive/behavioural deficits: ' + row_psom['cbcomm'] + '. '

            ## Add text from the PSOM 'stroke_cause_y_n' (yesno) field if it is nonempty.
            if (row_psom['stroke_cause_y_n'] != ''):
                combined_comments += 'Are all neurologic deficits attributable to stroke?: '
                if (row_psom['stroke_cause_y_n'] == '1'):
                    combined_comments += 'Yes. '
                else:
                    combined_comments += 'No. '

            ## Add text from the PSOM 'cause_det' (notes) field if it is nonempty.
            if (row_psom['cause_det'].strip() != ''):
                combined_comments += 'Specify which deficits are not attributable to stroke, and state responsible diagnosis: ' + row_psom['cause_det']

            ## Strip trailing comma.
            if (combined_comments[-2:] in [', ', '. ']):
                combined_comments = combined_comments[:-2]

            return combined_comments

        #### Build the import data for IPSS V4 using the IPSSID-to-row-number mappings.
        ## Initialize data to be imported into IPSS V4.
        to_ipss = []

        ## Map data to IPSS 'acute_arm_1' event.
        ## Map data to IPSS 'acute_arm_1' event. Loop over IPSSIDs which have at least one 'acute_hospitalizat_arm_1' event in PSOM V2.
        for id, row_tuple_psom in acute_dict.iteritems():
            row_index_psom = row_tuple_psom[0]
            row_psom = from_psom[row_index_psom]  # PSOM row to be imported into IPSS.
            # Check that the PSOM row corresponds to the appropriate PSOM event.
            assert (row_psom['redcap_event_name'] == 'acute_hospitalizat_arm_1')

            # Initialize the row to be imported into IPSS.
            row_ipss = {
                'ipssid': id,
                'redcap_event_name': 'acute_arm_1',
                'redcap_repeat_instrument': '',
                'redcap_repeat_instance': ''
            }

            # Add the variables with a one-to-one mapping.
            for field_name_psom, field_name_ipss in psom_to_ipss_soi.iteritems():
                value = row_psom[field_name_psom]
                row_ipss[field_name_ipss] = value

            # Add the variables with a many-to-one mapping
            sdcom = combineComments(row_psom)
            row_ipss['sdcom'] = sdcom

            # Append row to IPSS data.
            to_ipss.append(row_ipss)

        ## Map data to IPSS 'followup_arm_1' event. Note that the follow-up rows have already been ordered based on 'fuionset_soi' at this point.
        for id, row_tuple_list_psom in followup_dict.iteritems():
            instance_ipss = 1  # instance number for current row in IPSS
            for row_tuple_psom in row_tuple_list_psom:
                row_index_psom = row_tuple_psom[0]
                row_psom = from_psom[row_index_psom]
                # Check that the PSOM row corresponds to the appropriate PSOM events.
                assert (row_psom['redcap_event_name'] != 'acute_hospitalizat_arm_1')

                # Initialize the row to be imported into IPSS.
                row_ipss = {
                    'ipssid': id,
                    'redcap_event_name': 'followup_arm_1',
                    'redcap_repeat_instrument': 'summary_of_impressions',
                    'redcap_repeat_instance': str(instance_ipss)
                }

                # Add the variables with a one-to-one mapping.
                for field_name_psom, field_name_ipss in psom_to_ipss_soi.iteritems():
                    value = row_psom[field_name_psom]
                    row_ipss[field_name_ipss] = value

                # Add the variables with a many-to-one mapping
                sdcom = combineComments(row_psom)
                row_ipss['sdcom'] = sdcom

                ## Append row to IPSS data.
                to_ipss.append(row_ipss)

                ## Increment the IPSS instance number
                instance_ipss += 1

        return to_ipss

    ## Export Summary of Impressions data from PSOM.
    from_psom = exportRecords(url_psom,
                              key_psom,
                              fields=None,
                              forms=None,
                              quiet=True,
                              export_form_completion=True)

    ## Map the PSOM data to IPSS fields.
    to_ipss = modifyRecords(from_psom,
                            url_ipss,
                            key_ipss,
                            import_non_ipss_ids=import_non_ipss_ids)

    ## Save data to be imported to a CSV file.
    if out_path:
        saveToCsv(to_ipss, out_path)

    ## Import data to IPSS.
    if manual_import:
        print "Skipping automatic import of data. Data to be imported into IPSS V4 was saved to '" + out_path + "'. This file should be imported with the setting \"Allow blank values to overwrite existing saved values?\" set to \"Yes\"."
    else:
        importRecords(url_ipss,
                      key_ipss,
                      to_ipss,
                      overwrite='overwrite',
                      quick=True,
                      return_content='count')

    return
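A hedged usage sketch; the code names follow those used in getIPSSIDs below, and the output path is hypothetical. Running with manual_import=True writes the CSV without touching IPSS.

api_settings = ApiSettings()
url_psom, key_psom, _ = api_settings.getApiCredentials(code_name="psom_v2")
url_ipss, key_ipss, _ = api_settings.getApiCredentials(code_name="ipss_v4")
transferPSOMToIPSS(url_psom, key_psom, url_ipss, key_ipss,
                   out_path="psom_to_ipss.csv", manual_import=True)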
Example #6
    forms_list[project_index] = exportFormsOrdered(api_url_list[project_index], api_key_list[project_index])
    

    # Generate a dictionary with form_names as keys; each entry is a dict specifying in which 
    # events the form is non-repeating, independently repeating, or dependently repeating.
    form_repetition_map_list[project_index] = createFormRepetitionMap(project_longitudinal_list[project_index], project_repeating_list[project_index], form_event_mapping_list[project_index], repeating_forms_events_list[project_index], forms_list[project_index])
    
    
    # Gather data about each variable.
    metadata_list[project_index] = parseMetadata(def_field_list[project_index], project_info_list[project_index], project_longitudinal_list[project_index], project_repeating_list[project_index], events_list[project_index], metadata_list[project_index], form_event_mapping_list[project_index], repeating_forms_events_list[project_index], forms_list[project_index], form_repetition_map_list[project_index])
    
    
    # Load all records.
#    if (project_index == 0): # USED BEFORE REVERSING ORDER OF PROJECT DATA RETRIEVAL
    if (project_index == 1):  # USED AFTER REVERSING ORDER OF PROJECT DATA RETRIEVAL
        records_list[project_index] = exportRecords(api_url_list[project_index], api_key_list[project_index])
    else: 
        # Only pull record IDs from second project that exist in first project.
#        records_list[project_index] = exportRecords(api_url_list[project_index], api_key_list[project_index], record_id_list=[record_id for record_id in record_id_map_list[0]]) # USED BEFORE REVERSING ORDER OF PROJECT DATA RETRIEVAL
        # Only pull record IDs from first project that exist in second project.
        records_list[project_index] = exportRecords(api_url_list[project_index], api_key_list[project_index], record_id_list=[record_id for record_id in record_id_map_list[1]]) # USED AFTER REVERSING ORDER OF PROJECT DATA RETRIEVAL
    
    # Check for high-level issues in project settings, metadata, records.
    project_compatible_list[project_index] = isProjectCompatible(metadata_list[project_index], records_list[project_index], def_field_list[project_index])
    if (not project_compatible_list[project_index]):
        sys.exit()
    
    
    # Generate a non redundant list of record IDs. 
    record_id_map_list[project_index] = createRecordIDMap(def_field_list[project_index], records_list[project_index])
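This fragment appears to be the body of a loop over project_index. A hedged sketch of the enclosing setup it seems to assume; the project count, code names, and loop order are guesses based on the comments about the reversed retrieval order.

num_projects = 2
api_settings = ApiSettings()
api_url_list = [None] * num_projects
api_key_list = [None] * num_projects
for index, name in enumerate(["ipss_v4", "ipss_arch"]):  # hypothetical code names
    api_url_list[index], api_key_list[index], _ = api_settings.getApiCredentials(code_name=name)
forms_list = [None] * num_projects
form_repetition_map_list = [None] * num_projects
metadata_list = [None] * num_projects
records_list = [None] * num_projects
project_compatible_list = [None] * num_projects
record_id_map_list = [None] * num_projects
for project_index in [1, 0]:  # reversed order of project data retrieval
    # ... loop body shown in the fragment above ...
    pass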
    
Example #7
def mainIntraProject(config_path):
    config = readConfig(config_path)
    print "Performing checks with configuration:"
    pprint(config)
    print

    #### Read user's settings.yml file, which will be used to get API tokens and URLs.
    api_settings = ApiSettings()  # Create an instance of the ApiSettings class; use it to find the file containing API keys and URLs.

    # Determine the API URL and API token based on the users input and api_keys.yml file.
    code_name = config["code_name"]
    api_url, api_key, code_name = api_settings.getApiCredentials(
        code_name=code_name)

    # Create output directory if it does not exist.
    out_dir = config["out_dir"]
    if (not os.path.isdir(out_dir)):
        os.mkdir(out_dir)
        print "Created directory:", out_dir

    # Define a list containing the lists of Check objects (defined in Check.py).
    check_name_list = config["checks"]

    # Verify that a script exists for each requested check.
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    for check_name in check_name_list:
        check_path = os.path.join(scriptdir, check_name + ".py")
        if not os.path.exists(check_path):
            raise Exception("Path does not exist: " + check_path)

    # Load REDCap project (a PyCap object).
    project = redcap.Project(api_url, api_key)

    # Get the field name of the unique identifying field (e.g. "ipssid").
    def_field = project.def_field

    # Load high-level project information.
    project_info = exportProjectInfo(api_url, api_key)
    project_longitudinal = bool(project_info["is_longitudinal"])
    project_repeating = bool(
        project_info["has_repeating_instruments_or_events"])

    # Load list of events
    events = getEvents(api_url, api_key)
    if events is not None:
        print "Review the event_ids below. These are required for generating links to problematic data in reports. If these are incorrect, or unset, you can set them in the event_ids.yml file specified in your settings.yml file. You can find the event_id associated with an event by accessing data from that event online, and looking at the value of 'event_id' in the address bar."
        for event in events:
            event_id = events[event]["event_id"]
            if event_id is not None:
                print Color.green + event + " " + event_id + Color.end
            else:
                print Color.red + event + " " + 'None' + Color.end
    print

    # Load raw data dictionary.
    metadata_raw = project.export_metadata()

    # Load instrument-event mapping
    form_event_mapping = exportFormEventMapping(project, project_longitudinal)

    # Load information specifying which forms are repeating.
    repeating_forms_events = exportRepeatingFormsEvents(
        api_url, api_key, project_repeating)

    # Generate list of forms - list of dicts with two keys: 'instrument_label' and 'instrument_name'
    forms = exportFormsOrdered(api_url, api_key)

    # Generate a dictionary with form_names as keys; each entry is a dict specifying in which
    # events the form is non-repeating, independently repeating, or dependently repeating.
    form_repetition_map = createFormRepetitionMap(project_longitudinal,
                                                  project_repeating,
                                                  form_event_mapping,
                                                  repeating_forms_events,
                                                  forms)

    # Gather data about each variable.
    metadata = parseMetadata(def_field, project_info, project_longitudinal,
                             project_repeating, events, metadata_raw,
                             form_event_mapping, repeating_forms_events, forms,
                             form_repetition_map)

    ## Load all records.
    if config["use_getIPSSIDs"]:
        getIPSSIDs_args = config["getIPSSIDs_args"]
        record_id_list = getIPSSIDs(**getIPSSIDs_args)
    elif config["use_custom_record_id_list"]:
        record_id_list = config["record_id_list"]
    else:
        record_id_list = None
    records = exportRecords(api_url, api_key, record_id_list)

    # Check for high-level issues in project settings, metadata, records.
    # 2020-05-11 - This script appears to check for bugged output of exportRecords.py, which has now been handled in exportRecords.py.
    #    project_compatible = isProjectCompatible(metadata, records, def_field)
    #    if (not project_compatible):
    #        raise Exception("Error found in records or metadata. Review output above.")

    # Generate a dictionary with record IDs as keys and a list of row numbers corresponding to that record as values.
    record_id_map = createRecordIDMap(def_field, records)

    # Generate a list of data access groups if they exist.
    dags_used, dags = getDAGs(records)

    # Generate a dictionary containing information about each dag (e.g. number of records they contain).
    dag_record_map = createDAGRecordMap(def_field, records, record_id_map,
                                        dags_used, dags)

    # Generate list of checks to perform (default & user-defined).
    checklist = createChecklist(check_name_list)

    # Perform checks on data and report issues.
    check_results = checkDriver(checklist, out_dir, def_field, forms,
                                project_info, project_longitudinal,
                                project_repeating, events, metadata,
                                form_event_mapping, repeating_forms_events,
                                form_repetition_map, records, record_id_map,
                                dags_used, dags, dag_record_map)

    #    # Save data exported from REDCap and generated in this script. The check results are saved by checkDriver() above.
    #    saveData(out_dir, project, forms, project_info, metadata, record_id_map, dags_used, dags, check_results)
    return
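A hedged sketch of a configuration file that would satisfy the keys mainIntraProject reads (code_name, out_dir, checks, use_getIPSSIDs, use_custom_record_id_list); the check name is hypothetical and must exist as a .py module beside this script.

# config.yml (hypothetical contents):
#   code_name: ipss_v4
#   out_dir: ./check_output
#   checks: [missing_data]
#   use_getIPSSIDs: False
#   use_custom_record_id_list: False
mainIntraProject("config.yml")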
Example #8
def getIPSSIDs(from_code_name="ipss_v4", ex_registry_only=False,
               ex_unknown_stroke_type=False, ex_pre_2003=False,
               ex_pre_2014=False, ex_post_20191001=False,
               ex_sk_patients=False, ex_neonatal_stroke=False,
               ex_placeholders=False, ex_adult_stroke=False, ex_melas=False,
               ex_non_ipss=False, ex_non_sips=False, ex_non_sips2=False,
               ex_non_sips2_cohort1=False, ex_non_sips2_cohort2=False,
               ex_sips_exclusions=False, ex_sips_exclusions_2=False,
               ex_patient_info_incomp=False, ex_core_incomplete=False,
               ex_non_vips_enrolled=False, ex_vips_screen_nonenroll=False):
    '''
    Parameters:
        from_code_name:            str - code_name of database to get IDs from. Allowed values are the code names defined in user's api_keys.yml file.
        ex_registry_only:          bool - whether to exclude IDs of SickKids registry-only patients based on IPSS V4 data
        ex_unknown_stroke_type:    bool - whether to exclude IDs of records with unknown stroke type based on IPSS V4 data
        ex_pre_2003:               bool - whether to exclude IDs of patients enrolled before 2003 based on IPSS V4 field dateentered
        ex_pre_2014:               bool - whether to exclude IDs of patients enrolled before 2014 based on IPSS V4 field originalipss
        ex_post_20191001:          bool - whether to exclude IDs added on or after 2019-10-01 based on the IPSS V4 field dateentered
        ex_sk_patients:            bool - whether to exclude IDs of patients with 'hsc' data access group in IPSS V4
        ex_neonatal_stroke:        bool - whether to exclude IDs of patients who suffered a neonatal stroke based on IPSS V4 data
        ex_placeholders:           bool - whether to exclude IDs of patients with almost no real data, whose records likely exist as placeholders based on IPSS V4 data
        ex_adult_stroke:           bool - whether to exclude IDs of patients who did not suffer a stroke as a child based on IPSS V4 data
        ex_melas:                  bool - whether to exclude IDs of patients with MELAS based on IPSS V4 data
        ex_non_ipss:               bool - whether to exclude IDs that do not exist in IPSS V4
        ex_non_sips:               bool - whether to exclude IDs that are not in SIPS I or SIPS II based on IPSS V4 data
        ex_non_sips2:              bool - whether to exclude IDs that are not in SIPS II based on IPSS V4 data
        ex_non_sips2_cohort1:      bool - whether to exclude IDs that are not in SIPS II cohort I based on IPSS V4 data
        ex_non_sips2_cohort2:      bool - whether to exclude IDs that are not in SIPS II cohort II based on IPSS V4 data
        ex_sips_exclusions:        bool - whether to exclude SIPS patients that were excluded or merely screened, based on the SIPS II V2 field 'screened'
        ex_sips_exclusions_2:      bool - whether to exclude SIPS patients that were excluded, based on the SIPS II V2 fields 'screened' and 'actcohortsp'
        ex_patient_info_incomp     bool - whether to exclude IDs of patients for whom patient_information_complete != 2 in IPSS V2, *but still include all SLCH patients*.
        ex_core_incomplete         bool - whether to exclude IDs of patients for whom any of the IPSS V4 'core' forms are not marked as complete (core forms, here, include 'patient_information', 'cardiac_arteriopathy_risk_factors', 'other_child_and_neonatal_risk_factors', and 'clinical_presentation')
        ex_non_vips_enrolled:      bool - whether to exclude patients who are not enrolled in VIPS, based on the condition of the VIPS II field vscreen_sfoutc=4.
        ex_vips_screen_nonenroll:  bool - whether to exclude patients who are "VIPS screened, not enrolled" based on the IPSS V4 field 'vips_screened'

    Returns:
        record_ids:               list - record IDs in the specified database after specified exclusions
    '''
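
    # Example usage (hypothetical; assumes the referenced code names exist in the user's
    # api_keys.yml):
    #     record_ids = getIPSSIDs(from_code_name="ipss_v4", ex_registry_only=True, ex_pre_2003=True)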

    ## Get list of exclusions requested (i.e. a list of all 'ex_' args set to True).
    exclusion_args = []
    for key, val in locals().iteritems():
        if (val is True) and (key[:3] == 'ex_'):
            exclusion_args.append(key)

    ## Get API tokens and URLs for all projects used in the filters
    api_settings = ApiSettings() # Create instance of ApiSettings class. Use this to find file containing API keys and URLs.

    token_dict = {} # keys are code names, values are a tuple with (url, token) for that project
    for code_name in ["ipss_arch", "ipss_v4", "sips2_v2", "vips2", "psom_v2"]:
        url, key, code_name = api_settings.getApiCredentials(code_name=code_name)
        token_dict[code_name] = (url, key)

    ## Get list of all record IDs in the database specified with the 'from_code_name' option
    from_url, from_key, from_code_name = api_settings.getApiCredentials(code_name=from_code_name)
    record_ids_all = getRecordIDList(from_url, from_key)

    ## Generate lists of fields & events which must be exported from each project in order to filter the record IDs. It would be fine to export all data, but this would take a long time.
    required_data_dict = {
        "ex_registry_only":{
            "ipss_v4":{
                "fields":["substud"],
                "events":["acute_arm_1"]
            }
        },
        "ex_unknown_stroke_type":{
            "ipss_v4":{
                "fields":["stroke_type"],
                "events":["acute_arm_1"]
            }
        },
        "ex_pre_2003":{
            "ipss_v4":{
                "fields":["dateentered"],
                "events":["acute_arm_1"]
            }
        },
        "ex_pre_2014":{
            "ipss_v4":{
                "fields":["originalipss"],
                "events":["acute_arm_1"]
            }
        },
        "ex_post_20191001":{
            "ipss_v4":{
                "fields":["dateentered"],
                "events":["acute_arm_1"]
            }
        },
        "ex_sk_patients":{
            "ipss_v4":{
                "fields":["ipssid"],
                "events":["acute_arm_1", "followup_arm_1"] # Need both in case record only has data in followup_arm_1
            }
        },
        "ex_neonatal_stroke":{
            "ipss_v4":{
                "fields":["strage"],
                "events":["acute_arm_1"]
            }
        },
        "ex_placeholders":{
            "ipss_v4":{
                "forms":["clinical_presentation", "cardiac_and_arteriopathy_risk_factors"], # note that here we need all fields in the form.
                "events":["acute_arm_1"]
            }
        },
        "ex_adult_stroke":{
            "ipss_v4":{
                "fields":["birmont", "biryear", "doe", "daent", "strage", "substud"],
                "events":["acute_arm_1"]
            }
        },
        "ex_melas":{
            "ipss_v4":{
                "fields":["genetsy", "genetsys"],
                "events":["acute_arm_1"]
            }
        },
        "ex_non_ipss":{
            "ipss_v4":{
                "fields":["ipssid"],
                "events":["acute_arm_1", "followup_arm_1"] # Need both in case record only has data in followup_arm_1
                }
        },
        "ex_non_sips":{
            "ipss_v4":{
                "fields":["substud"],
                "events":["acute_arm_1"]
            }
        },
        "ex_non_sips2":{
            "ipss_v4":{
                "fields":["substud"],
                "events":["acute_arm_1"]
            }
        },
        "ex_non_sips2_cohort1":{
            "ipss_v4":{
                "fields":["substud", "sip_cohort"],
                "events":["acute_arm_1"]
            }
        },
        "ex_non_sips2_cohort2":{
            "ipss_v4":{
                "fields":["substud", "sip_cohort"],
                "events":["acute_arm_1"]
            }
        },
        "ex_sips_exclusions":{
            "sips2_v2":{
                "fields":["screened"],
                "events":["confirmation_and_t_arm_1"]
            }
        },
        "ex_sips_exclusions_2":{
            "sips2_v2":{
                "fields":["screened", "actcohortsp"],
                "events":["confirmation_and_t_arm_1", "acute_arm_1"]
            }
        },
        "ex_patient_info_incomp":{
            "ipss_v4":{
                "fields":["patient_information_complete"],
                "events":["acute_arm_1"]
            }
        },
        "ex_core_incomplete":{
            "ipss_v4":{
                "fields":["patient_information_complete", "cardiac_and_arteriopathy_risk_factors_complete", "other_child_and_neonatal_risk_factors_complete", "clinical_presentation_complete", "status_at_discharge_complete"],
                "events":["acute_arm_1"]
            }
        },
        "ex_non_vips_enrolled":{
            "vips2":{
                "fields":["vscreen_sfoutc"],
                "events":["confirmation_and_t_arm_1", "confirmation_and_t_arm_2"]
            }
        },
        "ex_vips_screen_nonenroll":{
            "ipss_v4":{
                "fields":["vips_screened"],
                "events":["acute_arm_1"]
            }
        }
    }

    ## Build dicts for arguments to be passed to exportRecords.
    exportRecords_args = {} # keys are the code names of projects from which data will be exported; values are the args passed to exportRecords for those projects.
    for arg in exclusion_args:
        for code_name, project_data in required_data_dict[arg].iteritems():
            if (not code_name in exportRecords_args.keys()):
                exportRecords_args[code_name] = {"fields":None, "forms":None, "events":None}
            for key, val in project_data.iteritems():
                if (exportRecords_args[code_name][key] is None):
                    exportRecords_args[code_name][key] = val
                else:
                    exportRecords_args[code_name][key].extend(val)

    # Remove duplicates from the lists of args (not necessary, but do it for visual clarity).
    for code_name, args in exportRecords_args.iteritems():
        for arg, val in args.iteritems():
            if (not val is None):
                args[arg] = list(set(val))
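
    # For example (a hypothetical combination): requesting ex_registry_only and ex_melas
    # would merge their requirements into
    #     {"ipss_v4": {"fields": ["substud", "genetsy", "genetsys"], "forms": None,
    #                  "events": ["acute_arm_1"]}}
    # (list order may vary, since duplicates are removed via set()).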

    ## Export all data required for the specified filters. Don't export data from unneeded projects.
    filter_data = {} # keys are project code names; values are the actual data sets exported from the projects.
    for code_name, args in exportRecords_args.iteritems():
        api_url = token_dict[code_name][0]
        api_key = token_dict[code_name][1]
        filter_data[code_name] = exportRecords(api_url, api_key, **args)


    #### Generate lists of IDs to exclude based on specified exclusions.
    ## Keep track of which records are excluded by each exclusion argument.
    excluded_ids = {} # key is exclusion arg; value is the set (or list) of IDs excluded by that condition.

    ## SickKids Registry-only patients
    if ex_registry_only:
        # Registry-only records based on IPSS data
        # * All patients labelled registry-only in Archive are labelled likewise in IPSS.
        # * All '9203-' patients are labelled registry only in IPSS.
        excluded_ids["ex_registry_only"] = set()

        for row in filter_data['ipss_v4']:
            id = row['ipssid']
            if (row["substud___8"] == "1") or ('9203-' in id):
                excluded_ids["ex_registry_only"].add(id)


    ## Unknown stroke type
    if ex_unknown_stroke_type:
        record_ids_known_stroke_type_ipss = set()

        for row in filter_data['ipss_v4']:
            if (row["stroke_type___1"] == "1") or (row["stroke_type___2"] == "1") or (row["stroke_type___3"] == "1") or (row["stroke_type___4"] == "1") or (row["stroke_type___5"] == "1") or (row["stroke_type___6"] == "1") or (row["stroke_type___7"] == "1") or (row["stroke_type___8"] == "1") or (row['stroke_type___9'] == "1"):
                id = row["ipssid"]
                record_ids_known_stroke_type_ipss.add(id)
        excluded_ids["ex_unknown_stroke_type"] = set([id for id in record_ids_all if (not id in record_ids_known_stroke_type_ipss)])


    ## Patients that were entered before 2003
    if ex_pre_2003:
        excluded_ids["ex_pre_2003"] = set()

        for row in filter_data['ipss_v4']:
            id = row["ipssid"]
            date_entered = row["dateentered"].replace('-','') # convert 2019-01-23 to 20190123
            try:
                if (int(date_entered) < 20030101):
                    excluded_ids["ex_pre_2003"].add(id)
                else:
                    pass
            except ValueError: # occurs when the value stored in 'dateentered' is blank (or possibly another invalid format)
                if (id[:2] == '7-'): # all '7-' patients are assumed to have been added after 2003.
                    continue
                elif (id[:5] == '9203-'): # all '9203-' patients are known to have been entered before 2003.
                    excluded_ids["ex_pre_2003"].add(id)
                else: # the record has no 'dateentered' and is not a '7-' or '9203-' patient; assume it was added after 2003.
                    print "Warning: Assuming record '"+id+"' was added after 2003"


    ## Patients entered before the launch of IPSS in REDCap in 2014, identified via the 'originalipss' field.
    if ex_pre_2014:
        excluded_ids["ex_pre_2014"] = set()

        for row in filter_data["ipss_v4"]:
            if (row["originalipss___1"] == '1'):
                id = row["ipssid"]
                excluded_ids["ex_pre_2014"].add(id)


    ## Patients that were entered after 2019-10-01
    if ex_post_20191001:
        excluded_ids["ex_post_20191001"] = set()
        for row in filter_data["ipss_v4"]:
            id = row["ipssid"]
            date_entered = row["dateentered"].replace('-','') # convert 2019-01-23 to 20190123
            try:
                if (int(date_entered) >= 20191001):
                    excluded_ids["ex_post_20191001"].add(id)
                else:
                    pass
            except ValueError: # occurs when the value stored in 'dateentered' is blank (or possibly another invalid format)
                if (id[:2] == '7-') or (id[:5] == '9203-'): # '7-' and '9203-' patients are known to have been added before 20191001.
                    continue
                else: # the record has no 'dateentered'; assume it was added on or after 20191001.
                    excluded_ids["ex_post_20191001"].add(id)
                    print "Warning: Assuming record '"+id+"' was added after 20191001"


    ## SickKids patients (based on DAG)
    if ex_sk_patients:
        excluded_ids["ex_sk_patients"] = set()

        for row in filter_data["ipss_v4"]:
            if (row["redcap_data_access_group"] == "hsc"):
                id = row['ipssid']
                excluded_ids["ex_sk_patients"].add(id)


    ## Neonatal stroke based on IPSS data
    if ex_neonatal_stroke:
        excluded_ids["ex_neonatal_stroke"] = set()

        for row in filter_data["ipss_v4"]:
            if (row['strage'] == '0'):
                id = row['ipssid']
                excluded_ids["ex_neonatal_stroke"].add(id)


    ## Records deemed to be "placeholders" based on condition that they have no data in either the clinical presentation or cardiac risk factors forms.
    if ex_placeholders:
        record_ids_nonplaceholder_ipss = set()

        for row in filter_data["ipss_v4"]:
            id = row['ipssid']
            if (id in record_ids_nonplaceholder_ipss):
                continue
            row_has_data = False
            for field, value in row.iteritems():
                if (field in ['ipssid', 'redcap_data_access_group', 'redcap_event_name', 'redcap_repeat_instrument', 'redcap_repeat_instance', 'clinical_presentation_complete', 'cardiac_and_arteriopathy_risk_factors_complete']):
                    continue
                elif ('___' in field): # if it is a checkbox field
                    if (value == '1'): # if checkbox is checked
                        row_has_data = True
                        break
                else:
                    if (value != ''): # if field has data
                        row_has_data = True
                        break
            if row_has_data:
                record_ids_nonplaceholder_ipss.add(id)
        excluded_ids["ex_placeholders"] = [id for id in record_ids_all if (not id in record_ids_nonplaceholder_ipss)]


    ## Non-pediatric stroke (>= 19 years of age at date of stroke)
    if ex_adult_stroke:
        excluded_ids["ex_adult_stroke"] = set()

        for row in filter_data["ipss_v4"]:
            id = row['ipssid']
            if (row['birmont'] != '') and (row['biryear'] != '') and (row['doe'] != ''):
                stroke_age = float(row['doe'][:4]) + float(row['doe'][5:7])/12 - float(row['biryear']) - float(row['birmont'])/12
                if (stroke_age >= 19.0 + 1.0/12.0): # only year/month of birth is known, so pad cutoff by 1 month.
                    excluded_ids["ex_adult_stroke"].add(id)

            elif (row['birmont'] != '') and (row['biryear'] != '') and (row['daent'] != ''):
                stroke_age = float(row['daent'][:4]) + float(row['daent'][5:7])/12 - float(row['biryear']) - float(row['birmont'])/12
                if (stroke_age >= 19.0 + 1.0/12.0): # only year/month of birth is known, so pad cutoff by 1 month.
                    excluded_ids["ex_adult_stroke"].add(id)


    ## MELAS patients
    if ex_melas:
        excluded_ids["ex_melas"] = set()

        for row in filter_data["ipss_v4"]:
            id = row['ipssid']
            desc = row['genetsys'].lower()
            if (row['genetsy'] == '1') and (('melas' in desc) or ('mitochondrial encephalopathy' in desc) or ('lactic acidosis' in desc)):
                excluded_ids["ex_melas"].add(id)


    ## Patients not in IPSS database
    if ex_non_ipss:
        ipss_ids = set()
        for row in filter_data["ipss_v4"]:
            id = row["ipssid"]
            ipss_ids.add(id)
        excluded_ids["ex_non_ipss"] = [id for id in record_ids_all if (not id in ipss_ids)]


    ## Patients not enrolled in SIPS I or SIPS II
    if ex_non_sips:
        record_ids_sips_ipss = set()
        for row in filter_data["ipss_v4"]:
            id = row['ipssid']
            if (row["substud___4"] == "1") or (row["substud___6"] == "1"):
                record_ids_sips_ipss.add(id)
        excluded_ids["ex_non_sips"] = [id for id in record_ids_all if (not id in record_ids_sips_ipss)]


    ## Patients not enrolled in SIPS II
    if ex_non_sips2:
        record_ids_sips2_ipss = set()
        for row in filter_data["ipss_v4"]:
            id = row['ipssid']
            if (row["substud___4"] == "1"):
                record_ids_sips2_ipss.add(id)
        excluded_ids["ex_non_sips2"] = [id for id in record_ids_all if (not id in record_ids_sips2_ipss)]


    ## SIPS II cohort I patients
    if ex_non_sips2_cohort1:
        record_ids_sips2_cohort1_ipss = set()
        for row in filter_data["ipss_v4"]:
            id = row['ipssid']
            if (row["substud___4"] == "1") and (row['sip_cohort'] == '1'):
                record_ids_sips2_cohort1_ipss.add(id)
        excluded_ids["ex_non_sips2_cohort1"] = [id for id in record_ids_all if (not id in record_ids_sips2_cohort1_ipss)]


    ## SIPS II cohort II patients
    if ex_non_sips2_cohort2:
        record_ids_sips2_cohort2_ipss = set()
        for row in filter_data["ipss_v4"]:
            id = row['ipssid']
            if (row["substud___4"] == "1") and (row['sip_cohort'] == '2'):
                record_ids_sips2_cohort2_ipss.add(id)
        excluded_ids["ex_non_sips2_cohort2"] = [id for id in record_ids_all if (not id in record_ids_sips2_cohort2_ipss)]


    ## Patients excluded from SIPS studies based on SIPS II variable 'screened'
    if ex_sips_exclusions:
        excluded_ids["ex_sips_exclusions"] = set()

        for row in filter_data["sips2_v2"]:
            id = row['ipssid']
            if (row["screened"] in ['1', '3']):
                excluded_ids["ex_sips_exclusions"].add(id)


    ## Patients excluded from SIPS studies based on SIPS II variables 'screened' and 'actcohortsp'
    if ex_sips_exclusions_2:
        excluded_ids["ex_sips_exclusions_2"] = set()

        actcohort2_ids = set()
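        # Two passes over the SIPS II data: the first excludes screened == '1' records and
        # collects IDs with actcohortsp == '2'; the second excludes screened == '3' records
        # only for IDs in that set (presumably because 'screened' and 'actcohortsp' can
        # appear in different event rows for the same record).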
        for row in filter_data["sips2_v2"]:
            id = row['ipssid']
            if (row["screened"] in ['1']):
                excluded_ids["ex_sips_exclusions_2"].add(id)
            if (row['actcohortsp'] == '2'):
                actcohort2_ids.add(id)
        for row in filter_data["sips2_v2"]:
            id = row['ipssid']
            if (id in actcohort2_ids) and (row['screened'] == '3'):
                excluded_ids["ex_sips_exclusions_2"].add(id)


    ## Patients for whom patient_information_complete != 2.
    if ex_patient_info_incomp:
        record_ids_patient_information_complete_ipss = set()
        for row in filter_data["ipss_v4"]:
            id = row['ipssid']
            if (row['patient_information_complete'] == '2') or (row['redcap_data_access_group'] == 'slch'):
                record_ids_patient_information_complete_ipss.add(id)
        excluded_ids["ex_patient_info_incomp"] = [id for id in record_ids_all if (not id in record_ids_patient_information_complete_ipss)]


    ## Patients for whom any of the "core" forms are not marked as complete
    if ex_core_incomplete:
        excluded_ids["ex_core_incomplete"] = set()
        # Note: 'status_at_discharge_complete' is exported above but not checked here.
        core_complete_fields = ['patient_information_complete', 'cardiac_and_arteriopathy_risk_factors_complete', 'other_child_and_neonatal_risk_factors_complete', 'clinical_presentation_complete']
        for row in filter_data["ipss_v4"]:
            id = row['ipssid']
            if any(row[field] != '2' for field in core_complete_fields):
                excluded_ids["ex_core_incomplete"].add(id)


    # Patients who are not enrolled in VIPS.
    if ex_non_vips_enrolled:
        record_ids_vips_enrolled = set()
        for row in filter_data["vips2"]:
            id = row['ipssid']
            if (row['vscreen_sfoutc'] == '4'):
                record_ids_vips_enrolled.add(id)
        excluded_ids["ex_non_vips_enrolled"] = [id for id in record_ids_all if (not id in record_ids_vips_enrolled)]


    # Patients who are "VIPS screened not enrolled" based on the IPSS field 'vips_screened'.
    if ex_vips_screen_nonenroll:
        excluded_ids["ex_vips_screen_nonenroll"] = set()
        for row in filter_data["ipss_v4"]:
            id = row['ipssid']
            if (row['vips_screened'] == '1'):
                excluded_ids["ex_vips_screen_nonenroll"].add(id)


    ## Remove all excluded IDs, and return the filtered list.
    record_ids = record_ids_all
    for exclusion, excluded_id_set in excluded_ids.iteritems():
        record_ids = [id for id in record_ids if (not id in excluded_id_set)]
    return record_ids
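
# A minimal driver sketch for getIPSSIDs (hypothetical; assumes the 'ipss_v4' code name is
# defined in the user's api_keys.yml; the exclusion flags and field names are illustrative):
#     api_settings = ApiSettings()
#     api_url, api_key, _ = api_settings.getApiCredentials(code_name="ipss_v4")
#     ids = getIPSSIDs(from_code_name="ipss_v4", ex_registry_only=True, ex_unknown_stroke_type=True)
#     rows = exportRecords(api_url, api_key, record_id_list=ids, fields=["dateentered"], events=["acute_arm_1"])
#     print "Exported", len(rows), "rows for", len(ids), "records"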
Example #9
def getPatientInfo(url_arch, url_ipss, key_arch, key_ipss, enroll_date_min=2003, enroll_date_max=2020):
    # Create one list of record IDs which are non-registry and have known stroke type.
    #record_ids = getIPSSIDs(ex_registry_only=True, ex_unknown_stroke_type=True, from_code_name="ipss_v3")
    #print "DEBUG: CHANGE getIPSSIDs arguments back to IPSS V4."
    record_ids = getIPSSIDs(ex_registry_only=True, ex_unknown_stroke_type=True)


    ## Create dict with patient information: {record_id: {dag:"...", enroll_date:"...", ...} }
    patient_info = {}
    for record_id in record_ids: # add an item (another dict) for each filtered record ID
        patient_info[record_id] = {}

    ## Get enrolment date for each record.
    # Archive - Use 'dateofentry', then 'visit_date'.
    dateofentry_arch = exportRecords(url_arch, key_arch, record_id_list=record_ids, fields=["dateofentry"], events=["acute_arm_1"], validate=False)
    for row in dateofentry_arch:
        if (row["dateofentry"] == ""):
            pass
        else:
            if ("enroll_date" in patient_info[row["pk_patient_id"]]):
                print "This record was counted twice: "+str(row["pk_patient_id"])
                continue
            patient_info[row["pk_patient_id"]]["enroll_date"] = int(row["dateofentry"][:4])

    num_missing = len([id for id in record_ids if (not "enroll_date" in patient_info[id])]) # records still lacking an enrolment date (kept for inspection; recomputed below)
   
    record_ids_leftover = [id for id in record_ids if (not "enroll_date" in patient_info[id])]
    visit_date_leftover = exportRecords(url_arch, key_arch, record_id_list=record_ids_leftover, fields=["visit_date"], events=["acute_arm_1"], validate=False)
    for row in visit_date_leftover:
        if (row["visit_date"] == ""):
            pass
        else:
            if ("enroll_date" in patient_info[row["pk_patient_id"]]):
                print "This record was counted twice: "+str(row["pk_patient_id"])
                continue
            patient_info[row["pk_patient_id"]]["enroll_date"] = int(row["visit_date"][:4]) 
    num_missing = len([id for id in record_ids if (not "enroll_date" in patient_info[id])])
   
    # IPSS - use 'dateentered' (works for all but 6 patients).
    record_ids_leftover = [id for id in record_ids if (not "enroll_date" in patient_info[id])]
    dateentered_ipss = exportRecords(url_ipss, key_ipss, record_id_list=record_ids_leftover, fields=["dateentered"], events=["acute_arm_1"], validate=False)
    for row in dateentered_ipss:
        if (row["dateentered"] == ""):
            pass
        else:
            if ("enroll_date" in patient_info[row["ipssid"]]):
                print "This record was counted twice: "+str(row["ipssid"])
                continue
            patient_info[row["ipssid"]]["enroll_date"] = int(row["dateentered"][:4])
    num_missing = len([id for id in record_ids if (not "enroll_date" in patient_info[id])])

    enroll_dates = set()
    for id, info in patient_info.iteritems():
        if ('enroll_date' in info):
            enroll_dates.add(info['enroll_date'])
            if (not info['enroll_date'] in range(enroll_date_min, enroll_date_max+1)):
                print "Record enroll date outside ["+str(enroll_date_min)+", "+str(enroll_date_max)+"]:", id
        else:
            print "Record with no enrolment date:", id
    
    ## Get DAG for each record:
    dags_arch = exportRecords(url_arch, key_arch, record_id_list=record_ids, fields=["pk_patient_id"], validate=False)
    dags_ipss = exportRecords(url_ipss, key_ipss, record_id_list=record_ids, fields=["ipssid"], validate=False)
    for row in dags_arch:
        record_id = row["pk_patient_id"]
        dag = row["redcap_data_access_group"]
        patient_info[record_id]["dag"] = dag
    for row in dags_ipss:
        record_id = row["ipssid"]
        dag = row["redcap_data_access_group"]
        if (not "dag" in patient_info[record_id]) or (patient_info[record_id]["dag"] == ""): # add DAG from IPSS if not added already
            patient_info[record_id]["dag"] = dag # overwriting DAG for records in Archive should not be a problem.
    
    ## Get stroke type for each patient. # Need to decide how we want to break this down further.
    #stroke_type_ipss = exportRecords(url_ipss, key_ipss, record_id_list=record_ids, fields=["chais", "chcsvt", "neoais", "neocsvt", "ppis", "ppcsvt", "pvi", "preart", "othcond"], events=["acute_arm_1"])
    stroke_type_ipss = exportRecords(url_ipss, key_ipss, record_id_list=record_ids, fields=["stroke_type"], events=["acute_arm_1"])

    # Set stroke types to unknown initially ('2' codes 'unknown'; see the value map below).
    stroke_type_keys = ["neo_ais", "neo_csvt", "child_ais", "child_csvt", "pp_ais", "pp_csvt", "pp_vi", "art", "other"]
    for record_id in patient_info:
        patient_info[record_id]["stroke_type"] = dict((key, "2") for key in stroke_type_keys)

        #'chais___1':'stroke_type___1',
        #'chcsvt___1':'stroke_type___2',
        #'neoais___1':'stroke_type___3',
        #'neocsvt___1':'stroke_type___4',
        #'ppis___1':'stroke_type___5',
        #'ppcsvt___1':'stroke_type___6',
        #'pvi___1':'stroke_type___7',
        #'preart___1':'stroke_type___8',
        #'othcond___1':'stroke_type___9'
        
    # Map the IPSS 'stroke_type' checkboxes onto the stroke_type dict (0 - no, 1 - yes, 2 - unknown).
    checkbox_to_stroke_type = {
        "stroke_type___1": "child_ais",  # child AIS
        "stroke_type___2": "child_csvt", # child CSVT
        "stroke_type___3": "neo_ais",    # neonatal AIS
        "stroke_type___4": "neo_csvt",   # neonatal CSVT
        "stroke_type___5": "pp_ais",     # presumed perinatal AIS
        "stroke_type___6": "pp_csvt",    # presumed perinatal CSVT
        "stroke_type___7": "pp_vi",      # presumed perinatal VI
        "stroke_type___8": "art",        # arteriopathy
        "stroke_type___9": "other"       # other
    }
    for row in stroke_type_ipss:
        record_id = row["ipssid"]
        for checkbox, stroke_type in checkbox_to_stroke_type.iteritems():
            patient_info[record_id]["stroke_type"][stroke_type] = row[checkbox]

    # Look for patients without an identified stroke type.
    record_ids_with_unidentified_stroke_type = []
    for id, record in patient_info.iteritems():
        identified_type = False
        for stroke_type, value in record["stroke_type"].iteritems():
            if (value == "1"):
                identified_type = True
                break
        if (not identified_type):
            record_ids_with_unidentified_stroke_type.append(id)
    
    # Print some stats on the acquired patient information.
    num_no_year = 0
    num_no_dag = 0
    for record_id, record in patient_info.iteritems():
        if (record["dag"] == ""):
            num_no_dag += 1 
        if (not "enroll_date" in record):
            num_no_year += 1
    print "Number of duplicated record IDs:", len(record_ids) - len(set(record_ids))
    print "Number of unique record IDs:", len(set(record_ids))
    print "Number of record IDs in patient_info:", len(patient_info)
    print "Number of records with no DAG:", num_no_dag
    print "Number of records with no enrolment date:", num_no_year
    print "Number of records with unidentified stroke type:", len(record_ids_with_unidentified_stroke_type)    
    return patient_info
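
# A hedged usage sketch for getPatientInfo (hypothetical; assumes the 'ipss_arch' and
# 'ipss_v4' code names exist in the user's api_keys.yml, as in the examples above):
#     api_settings = ApiSettings()
#     url_arch, key_arch, _ = api_settings.getApiCredentials(code_name="ipss_arch")
#     url_ipss, key_ipss, _ = api_settings.getApiCredentials(code_name="ipss_v4")
#     patient_info = getPatientInfo(url_arch, url_ipss, key_arch, key_ipss)
#     print "Records with patient info:", len(patient_info)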