示例#1
0
def load_taneja_data(data_dir):
    """Load and clean patient records for the Taneja RA / normal-relatives study.

    Returns a study-data dict (see init_study_data) populated with the study
    metadata, the csv column names, and one record per usable sample keyed
    by its tornado sample key.
    """
    study_key = 'Taneja_RA_Normal_Relatives'
    data = init_study_data()
    data['metadata'] = STUDY_METADATA[study_key]
    csv_fn = data_dir + PATIENT_DATA_URLS[study_key]['dest_filename']
    data['attributes'] = file_utils.csv_file_column_names(csv_fn)

    sample_mappings = load_mappings(data_dir)
    raw_patients_data = file_utils.csv_file_to_nested_dict(csv_fn, 'SampleID')

    for raw_key, raw_row in raw_patients_data.items():
        # The spreadsheet contains SampleIDs starting with MWR and HLTY that
        # have no entry in sample_mappings and therefore(?) no OTU data in
        # the biom file to associate with. Only rows whose SampleID starts
        # with 'V' are usable.
        if raw_key[0] != 'V':
            continue

        study_sample_key = 's' + str(raw_key).lower()
        tornado_sample_key = sample_mappings[study_key][study_sample_key]

        patient_record = {
            'study_sample_key': study_sample_key,
            'tornado_sample_key': tornado_sample_key,
        }

        # A row whose 'Relation' column points back at its own SampleID is
        # the patient; any other row is a family member.
        is_patient = raw_row['Relation'] == raw_key
        patient_record['is_patient'] = is_patient
        patient_record['is_family'] = not is_patient

        data['patients'][tornado_sample_key] = patient_record
    return data
示例#2
0
def load_crc_adenoma_data(data_dir):
    """Load and clean patient records for the CRC/adenoma study.

    Returns a study-data dict (see init_study_data) with metadata, csv
    column names, and validated per-sample records keyed by tornado sample
    key. Raises Exception on any unexpected Diagnosis or POLYP.1=Y value.
    """
    study_key = 'CRC_adenoma'
    data = init_study_data()
    data['metadata'] = STUDY_METADATA[study_key]
    csv_fn = data_dir + PATIENT_DATA_URLS[study_key]['dest_filename']
    data['attributes'] = file_utils.csv_file_column_names(csv_fn)

    sample_mappings = load_mappings(data_dir)
    raw_patients_data = file_utils.csv_file_to_nested_dict(csv_fn, 'SampleID')

    for raw_key, raw_row in raw_patients_data.items():
        study_sample_key = 's' + str(raw_key).lower()
        tornado_sample_key = sample_mappings[study_key][study_sample_key]

        patient_record = {
            'study_sample_key': study_sample_key,
            'tornado_sample_key': tornado_sample_key,
        }

        # guard clause: only the three known diagnoses are allowed through
        diagnosis = raw_row['Diagnosis'].strip()
        if diagnosis not in ('cancer', 'adenoma', 'normal'):
            raise Exception("Unexpected value in CRC Diagnosis Column: " + str(diagnosis))
        patient_record['Diagnosis'] = diagnosis

        # guard clause: polyp indicator must be a '0'/'1' flag
        polyps = raw_row['POLYP.1=Y']
        if polyps not in ('1', '0'):
            raise Exception("Unexpected value in CRC POLYP.1=Y Column: " + str(polyps))
        patient_record['POLYP.1=Y'] = polyps

        data['patients'][tornado_sample_key] = patient_record
    return data
示例#3
0
def load_cdiff_data(data_dir):
    """Load and clean patient records for the C. difficile study.

    Normalizes the free-text 'Description' column into a 'cdiff_level' of
    'normal', 'positive', or 'negative'. Raises Exception on any unknown
    description value.
    """
    # spreadsheet spellings of the description -> normalized cdiff_level
    _DESCRIPTION_TO_LEVEL = {
        'Normal': 'normal',
        'C-diff +': 'positive',
        'C-diff+': 'positive',
        'C-diff -': 'negative',
        'C-diff-': 'negative',
    }

    study_key = 'Cdiff'
    data = init_study_data()
    data['metadata'] = STUDY_METADATA[study_key]
    csv_fn = data_dir + PATIENT_DATA_URLS[study_key]['dest_filename']
    data['attributes'] = file_utils.csv_file_column_names(csv_fn)

    sample_mappings = load_mappings(data_dir)
    raw_patients_data = file_utils.csv_file_to_nested_dict(csv_fn, 'SampleID')

    for raw_key, raw_row in raw_patients_data.items():
        study_sample_key = 'ssx' + str(raw_key).lower()
        tornado_sample_key = sample_mappings[study_key][study_sample_key]

        patient_record = {
            'study_sample_key': study_sample_key,
            'tornado_sample_key': tornado_sample_key,
        }

        cdiff_slug = raw_row['Description'].strip()
        if cdiff_slug not in _DESCRIPTION_TO_LEVEL:
            raise Exception("Unknown cdiff Description: " + str(cdiff_slug))
        patient_record['cdiff_level'] = _DESCRIPTION_TO_LEVEL[cdiff_slug]

        data['patients'][tornado_sample_key] = patient_record
    return data
示例#4
0
def load_mappings(data_dir):
    """
    nested dict of {study_key: {study_sample_key: tornado_sample_key}}
    """
    fn = data_dir + MAPPINGS_FILE_DEST
    raw_mappings = file_utils.csv_file_to_nested_dict(fn, 'SampleID')

    clean_mappings = dict()
    for raw_key, raw_atts in raw_mappings.items():
        study_key = raw_atts['Source_mapping_file']
        # The tornado key is currently the raw SampleID itself, not the
        # FileName column (the commented-out alternative below).
        #tornado_sample_key = raw_atts['FileName']
        per_study = clean_mappings.setdefault(study_key, dict())
        per_study[raw_key.lower()] = raw_key

    return clean_mappings
示例#5
0
def construct_otu_metadata(taxa_filename, otu_ids):
    """Build biom-style OTU taxonomy metadata for the given otu_ids.

    Parameters:
        taxa_filename: csv of taxonomy assignments, keyed by its 'label'
            column; each row must have a column per level in
            otus.TAXONOMY_LEVELS (species is filled in here).
        otu_ids: iterable of OTU ids (values of the 'label' column) in the
            order the metadata list should be produced.

    Returns a list (parallel to otu_ids) of {'taxonomy': [...]} dicts.
    """
    id_field = 'label'
    taxonomies = file_utils.csv_file_to_nested_dict(taxa_filename, id_field)
    # the input csv carries no species-level assignment; mark every row
    # unclassified at the species level
    for label in taxonomies:
        taxonomies[label]['species'] = 'unclassified'

    #the needed format for the biom format is like 'k__Bacteria',
    #except unclassified is just 'unclassified' regardless of level
    #see biom_etl.extract_taxonomy_of_otu
    metadata = list()
    for otu_id in otu_ids:
        formatted_tax = list()
        for taxa_level in otus.TAXONOMY_LEVELS:
            taxa_name = taxonomies[otu_id][taxa_level]
            # BUG FIX: this comparison previously used the misspelled
            # 'unclassifed', which never matched the 'unclassified' values
            # set above, so unclassified levels were wrongly emitted as
            # e.g. 's__unclassified' instead of bare 'unclassified'.
            if taxa_name == 'unclassified':
                taxa_formatted = taxa_name
            else:
                taxa_formatted = taxa_level[0] + '__' + taxa_name
            formatted_tax.append(taxa_formatted)
        otu_md = dict()
        otu_md['taxonomy'] = formatted_tax
        metadata.append(otu_md)
    return metadata
示例#6
0
def load_lambert_vaccine_data(data_dir):
    """Load and clean patient records for the Lambert vaccine study.

    Each record carries the raw 'Day' column (stripped) as
    'days_since_vaccine'. Returns a study-data dict keyed by tornado
    sample key.
    """
    study_key = 'Lambert_Nathaniel'
    data = init_study_data()
    data['metadata'] = STUDY_METADATA[study_key]
    csv_fn = data_dir + PATIENT_DATA_URLS[study_key]['dest_filename']
    data['attributes'] = file_utils.csv_file_column_names(csv_fn)

    sample_mappings = load_mappings(data_dir)
    raw_patients_data = file_utils.csv_file_to_nested_dict(csv_fn, 'SampleID')

    for raw_key, raw_row in raw_patients_data.items():
        study_sample_key = 's' + str(raw_key).lower()
        tornado_sample_key = sample_mappings[study_key][study_sample_key]

        patient_record = {
            'study_sample_key': study_sample_key,
            'tornado_sample_key': tornado_sample_key,
            'days_since_vaccine': raw_row['Day'].strip(),
        }
        data['patients'][tornado_sample_key] = patient_record
    return data
示例#7
0
def write_and_read_a_csv_file(jcx):
    """Demo: build a tiny table, write it to csv, and read it back.

    In python csv handling a table is a list of rows, each row a dict of
    column-name -> cell-value. Column order is unspecified until the table
    is written out with an explicit column ordering. The table built here
    looks like:

        id,color
        0,red
        1,blue
        2,green
    """
    #build three rows, each starting with only an 'id' column
    list_of_rows = [{'id': row_idx} for row_idx in range(0, 3)]

    #then attach a second 'column' called 'color' to each row
    for row, color in zip(list_of_rows, ['red', 'blue', 'green']):
        row['color'] = color

    #a filename for our csv
    tmp_relative_fn = 'color_table.csv'

    #wix exposes two data directories through the JobContext:
    # - get_run_local_data_dir: accessible only to this run of the job;
    #     intended for results files.
    # - get_job_global_data_dir: shared across runs of the same job type;
    #     intended for raw data processed on multiple runs and for results
    #     cached between runs (so they aren't recomputed each run).
    data_dir = jcx.get_run_local_data_dir()

    #place the file in the working directory wix gave us
    tmp_absolute_fn = os.path.join(data_dir, tmp_relative_fn)

    #the desired column order for the csv file
    column_names = ['id', 'color']

    #write the csv through file_utils; unfortunately file_utils must be
    #used here or things can go wrong involving docker and file permissions
    jcx_user = jcx.get_container_user()
    file_utils.dump_csv(tmp_absolute_fn,
                        column_names,
                        list_of_rows,
                        file_owner=jcx_user,
                        ensure_dir=True)

    #read it back in to complete the roundtrip. this returns a dict keyed
    #by the 'id' column, where each value should be identical to the row
    #that was written out
    roundtrip_rows = file_utils.csv_file_to_nested_dict(tmp_absolute_fn, 'id')

    #everything comes back as strings on read, which is why the 'id' read
    #back in is not an int
    jcx.log("The color of row '1' before and after roundtrip: " +
            list_of_rows[1]['color'] + ", " + roundtrip_rows['1']['color'])

    return
示例#8
0
def load_fst_results_from_csv(comparison_key, data_dir):
    """Read previously written FST results for a comparison back from csv.

    Returns a nested dict keyed by the 'index' column, which corresponds to
    the tornado observation key (just the number x for OTU-x).
    """
    results_fn = filename_of_fst_results(comparison_key, data_dir)
    return file_utils.csv_file_to_nested_dict(results_fn, 'index')