예제 #1
0
def load_rnaseq_data(disease_code, with_clinical=False, wait_time=30):
    """Load normalized gene-level RNA-seq quantification files.

    The archive returned by ``prefetch_rnaseq_data`` is unpacked into
    ``<archive dir>/<disease_code>/gene_expression``.  Every file named in
    ``FILE_SAMPLE_MAP.txt`` whose name contains
    ``genes.normalized_results`` is read, tagged with its ``TCGA_ID`` and a
    ``gene_name`` column, and concatenated into one dataframe.

    Parameters
    ----------
    disease_code : str
        Passed to ``prefetch_rnaseq_data`` and used as the name of the
        extraction sub-directory.
    with_clinical : bool, optional
        If True, outer-merge the result of ``load_clinical_data`` on
        ``TCGA_ID`` == ``bcr_patient_barcode``.
    wait_time : int, optional
        Currently unused by this function.

    Returns
    -------
    rna_df : Pandas dataframe
        One row per (gene, sample file), joined with the sample map; merged
        with the clinical data when ``with_clinical`` is True.
    """
    # Fetch RNA data
    archive_path = prefetch_rnaseq_data(disease_code)

    result_dir = os.path.join(os.path.dirname(archive_path), disease_code,
                              'gene_expression')
    sample_map_path = os.path.join(result_dir, 'FILE_SAMPLE_MAP.txt')

    # The sample map is extracted last, so its presence marks a finished
    # extraction.  Checking for it (rather than for the directory) lets a
    # directory left behind by an interrupted run be repaired.
    if not os.path.exists(sample_map_path):
        if not os.path.isdir(result_dir):
            os.makedirs(result_dir)

        # Context manager guarantees the archive handle is closed.
        # NOTE(review): extraction trusts the member paths stored in the
        # archive; consider tarfile extraction filters if the archive source
        # is not fully trusted.
        with tarfile.open(archive_path) as archive:
            gene_quantification_files = [
                member for member in archive
                if 'genes.normalized_results' in member.name
            ]
            if gene_quantification_files:
                archive.extractall(members=gene_quantification_files,
                                   path=result_dir)
                archive.extract('FILE_SAMPLE_MAP.txt', path=result_dir)

    # Load map from samples to RNA files
    rna_file_sample_map = pd.read_csv(sample_map_path, sep='\t')

    # Split each barcode from the right into at most five '-' separated
    # parts; whatever precedes the last four parts becomes TCGA_ID.
    barcode_parts = rna_file_sample_map['barcode(s)'].str.rsplit(
        '-', n=4, expand=True)
    barcode_parts.columns = [
        'TCGA_ID', 'SampleID', 'PortionID', 'PlateID', 'CenterID'
    ]
    rna_file_sample_map = rna_file_sample_map.join(barcode_parts)

    # Plain substring match (not a regex) so the selection agrees with the
    # test used when choosing which archive members to extract.
    gene_filter = rna_file_sample_map['filename'].str.contains(
        'genes.normalized_results', regex=False)
    gene_rna_file_sample_map = rna_file_sample_map[gene_filter]

    rna_dfs = []
    for f, sample in zip(gene_rna_file_sample_map['filename'],
                         gene_rna_file_sample_map['TCGA_ID']):
        sample_rna_df = pd.read_csv(os.path.join(result_dir, f), sep='\t')
        sample_rna_df['TCGA_ID'] = sample

        # gene_id is '|' separated; keep the part before the first '|'.
        sample_rna_df['gene_name'] = sample_rna_df.gene_id.str.split(
            '|').str.get(0)
        rna_dfs.append(sample_rna_df)

    # merge() without ``on`` joins on the columns common to both frames
    # (TCGA_ID at least).
    # NOTE(review): a TCGA_ID that maps to several files would be
    # cross-joined here -- confirm whether the join should be per file.
    rna_df = pd.concat(rna_dfs, copy=False).merge(rna_file_sample_map)

    if with_clinical:
        patient_data_df = load_clinical_data(disease_code)
        merged = rna_df.merge(patient_data_df,
                              how='outer',
                              left_on='TCGA_ID',
                              right_on='bcr_patient_barcode')

        return merged
    else:
        return rna_df
예제 #2
0
파일: tcga_rna.py 프로젝트: arahuja/pytcga
def load_rnaseq_data(disease_code,
                     with_clinical=False,
                     wait_time=30):
    """Load normalized gene-level RNA-seq quantification files.

    The archive returned by ``prefetch_rnaseq_data`` is unpacked into
    ``<archive dir>/<disease_code>/gene_expression``.  Every file named in
    ``FILE_SAMPLE_MAP.txt`` whose name contains
    ``genes.normalized_results`` is read, tagged with its ``TCGA_ID`` and a
    ``gene_name`` column, and concatenated into one dataframe.

    Parameters
    ----------
    disease_code : str
        Passed to ``prefetch_rnaseq_data`` and used as the name of the
        extraction sub-directory.
    with_clinical : bool, optional
        If True, outer-merge the result of ``load_clinical_data`` on
        ``TCGA_ID`` == ``bcr_patient_barcode``.
    wait_time : int, optional
        Currently unused by this function.

    Returns
    -------
    rna_df : Pandas dataframe
        One row per (gene, sample file), joined with the sample map; merged
        with the clinical data when ``with_clinical`` is True.
    """
    # Fetch RNA data
    archive_path = prefetch_rnaseq_data(disease_code)

    result_dir = os.path.join(os.path.dirname(archive_path), disease_code, 'gene_expression')
    sample_map_path = os.path.join(result_dir, 'FILE_SAMPLE_MAP.txt')

    # The sample map is extracted last, so its presence marks a finished
    # extraction.  Checking for it (rather than for the directory) lets a
    # directory left behind by an interrupted run be repaired.
    if not os.path.exists(sample_map_path):
        if not os.path.isdir(result_dir):
            os.makedirs(result_dir)

        # Context manager guarantees the archive handle is closed.
        # NOTE(review): extraction trusts the member paths stored in the
        # archive; consider tarfile extraction filters if the archive source
        # is not fully trusted.
        with tarfile.open(archive_path) as archive:
            gene_quantification_files = [
                member for member in archive
                if 'genes.normalized_results' in member.name
            ]
            if gene_quantification_files:
                archive.extractall(members=gene_quantification_files, path=result_dir)
                archive.extract('FILE_SAMPLE_MAP.txt', path=result_dir)

    # Load map from samples to RNA files
    rna_file_sample_map = pd.read_csv(sample_map_path, sep='\t')

    # Split each barcode from the right into at most five '-' separated
    # parts; whatever precedes the last four parts becomes TCGA_ID.
    barcode_parts = rna_file_sample_map['barcode(s)'].str.rsplit('-', n=4, expand=True)
    barcode_parts.columns = ['TCGA_ID', 'SampleID', 'PortionID', 'PlateID', 'CenterID']
    rna_file_sample_map = rna_file_sample_map.join(barcode_parts)

    # Plain substring match (not a regex) so the selection agrees with the
    # test used when choosing which archive members to extract.
    gene_filter = rna_file_sample_map['filename'].str.contains(
        'genes.normalized_results', regex=False)
    gene_rna_file_sample_map = rna_file_sample_map[gene_filter]

    rna_dfs = []
    for f, sample in zip(gene_rna_file_sample_map['filename'],
                         gene_rna_file_sample_map['TCGA_ID']):
        sample_rna_df = pd.read_csv(os.path.join(result_dir, f), sep='\t')
        sample_rna_df['TCGA_ID'] = sample

        # gene_id is '|' separated; keep the part before the first '|'.
        sample_rna_df['gene_name'] = sample_rna_df.gene_id.str.split('|').str.get(0)
        rna_dfs.append(sample_rna_df)

    # merge() without ``on`` joins on the columns common to both frames
    # (TCGA_ID at least).
    # NOTE(review): a TCGA_ID that maps to several files would be
    # cross-joined here -- confirm whether the join should be per file.
    rna_df = pd.concat(rna_dfs, copy=False).merge(rna_file_sample_map)

    if with_clinical:
        patient_data_df = load_clinical_data(disease_code)
        merged = rna_df.merge(patient_data_df,
                              how='outer',
                              left_on='TCGA_ID',
                              right_on='bcr_patient_barcode')

        return merged
    else:
        return rna_df
예제 #3
0
def load_mutation_data(disease_code,
                       with_clinical=False,
                       variant_type='all',
                       wait_time=30):
    """Load variants from TCGA

    The archive returned by ``prefetch_mutation_data`` is extracted into
    ``<archive dir>/<disease_code>/mutations`` and every ``.maf`` file found
    directly in that directory is read and concatenated.

    Parameters
    ----------
    disease_code : str
        Passed to ``prefetch_mutation_data`` and used as the name of the
        extraction sub-directory.
    with_clinical : bool, optional
        If True, attach the clinical information
    variant_type : str, optional
        'all' keeps every row.  'indel' keeps rows whose ``Variant_Type`` is
        'INS' or 'DEL'.  Both keywords are matched case-insensitively.  Any
        other value is compared verbatim against ``Variant_Type``
        (e.g. 'SNP').
    wait_time : int, optional
        Time to wait for response from TCGA; forwarded to
        ``prefetch_mutation_data``.

    Returns
    -------
    mutations : Pandas dataframe
        A dataframe of mutations

    Raises
    ------
    ValueError
        If the extraction directory contains no ``.maf`` file.
    """
    archive_path = prefetch_mutation_data(disease_code,
                                          wait_time=wait_time,
                                          cache=True)

    result_dir = os.path.join(os.path.dirname(archive_path), disease_code,
                              'mutations')

    # Unpack tar file; the context manager closes the archive handle.
    # NOTE(review): extractall trusts the member paths stored in the
    # archive; consider tarfile extraction filters if the archive source is
    # not fully trusted.
    with tarfile.open(archive_path) as archive:
        if not os.path.exists(result_dir):
            os.makedirs(result_dir)
        archive.extractall(path=result_dir)

    # Filter to MAF files
    maf_files = [f for f in os.listdir(result_dir) if f.endswith('.maf')]
    if not maf_files:
        raise ValueError(
            "No .maf files found in {}".format(result_dir))

    # ignore_index gives the combined frame unique row labels.  Without it
    # each file restarts at 0, and the index-based join below would pair
    # every row with all rows from other files that share its label.
    mutation_df = pd.concat([
        pd.read_csv(os.path.join(result_dir, maf_file),
                    sep='\t',
                    na_values='[Not Available]') for maf_file in maf_files
    ],
                            ignore_index=True,
                            copy=False)

    # Expand out the TCGA barcode to retrieve the TCGA ID
    tcga_info = mutation_df['Tumor_Sample_Barcode'].str.rsplit('-',
                                                               n=4,
                                                               expand=True)
    tcga_info.columns = [
        'TCGA_ID', 'SampleID', 'PortionID', 'PlateID', 'CenterID'
    ]

    mutations = mutation_df.join(tcga_info, how='left')

    # Lower-case only for recognising the 'all' / 'indel' keywords; the
    # caller's original value is still used for the verbatim comparison.
    if hasattr(variant_type, 'lower'):
        variant_keyword = variant_type.lower()
    else:
        variant_keyword = variant_type

    if variant_keyword != 'all':
        if variant_keyword == 'indel':
            mutations = mutations[(mutations['Variant_Type'] == 'INS') |
                                  (mutations['Variant_Type'] == 'DEL')]
        else:
            mutations = mutations[mutations['Variant_Type'] == variant_type]

    logging.info("Loaded {} mutations for {} tumors from {} patients".format(
        len(mutations), mutations['Tumor_Sample_Barcode'].nunique(),
        mutations['TCGA_ID'].nunique()))

    if with_clinical:
        patient_data_df = load_clinical_data(disease_code)
        # Outer merge keeps patients that have no mutation rows.
        merged = mutations.merge(patient_data_df,
                                 how='outer',
                                 left_on='TCGA_ID',
                                 right_on='bcr_patient_barcode')

        logging.info("Patients: {}, Tumor Samples: {}, Mutations {}".format(
            merged['bcr_patient_barcode'].nunique(),
            merged['Tumor_Sample_Barcode'].nunique(),
            len(merged[~merged['TCGA_ID'].isnull()])))
        return merged
    else:
        return mutations
예제 #4
0
def load_mutation_data(disease_code,
                       with_clinical=False,
                       variant_type='all',
                       wait_time=30):
    """Load variants from TCGA

    The archive returned by ``prefetch_mutation_data`` is extracted into
    ``<archive dir>/<disease_code>/mutations`` and every ``.maf`` file found
    directly in that directory is read and concatenated.

    Parameters
    ----------
    disease_code : str
        Passed to ``prefetch_mutation_data`` and used as the name of the
        extraction sub-directory.
    with_clinical : bool, optional
        If True, attach the clinical information
    variant_type : str, optional
        'all' keeps every row.  'indel' keeps rows whose ``Variant_Type`` is
        'INS' or 'DEL'.  Both keywords are matched case-insensitively.  Any
        other value is compared verbatim against ``Variant_Type``
        (e.g. 'SNP').
    wait_time : int, optional
        Time to wait for response from TCGA; forwarded to
        ``prefetch_mutation_data``.

    Returns
    -------
    mutations : Pandas dataframe
        A dataframe of mutations

    Raises
    ------
    ValueError
        If the extraction directory contains no ``.maf`` file.
    """
    archive_path = prefetch_mutation_data(disease_code,
                                          wait_time=wait_time,
                                          cache=True)

    result_dir = os.path.join(os.path.dirname(archive_path), disease_code, 'mutations')

    # Unpack tar file; the context manager closes the archive handle.
    # NOTE(review): extractall trusts the member paths stored in the
    # archive; consider tarfile extraction filters if the archive source is
    # not fully trusted.
    with tarfile.open(archive_path) as archive:
        if not os.path.exists(result_dir):
            os.makedirs(result_dir)
        archive.extractall(path=result_dir)

    # Filter to MAF files
    maf_files = [f
                 for f in os.listdir(result_dir)
                 if f.endswith('.maf')]
    if not maf_files:
        raise ValueError(
            "No .maf files found in {}".format(result_dir))

    # ignore_index gives the combined frame unique row labels.  Without it
    # each file restarts at 0, and the index-based join below would pair
    # every row with all rows from other files that share its label.
    mutation_df = pd.concat([pd.read_csv(os.path.join(result_dir, maf_file),
                                         sep='\t',
                                         na_values='[Not Available]')
                             for maf_file in maf_files],
                            ignore_index=True,
                            copy=False)

    # Expand out the TCGA barcode to retrieve the TCGA ID
    tcga_info = mutation_df['Tumor_Sample_Barcode'].str.rsplit('-', n=4, expand=True)
    tcga_info.columns = ['TCGA_ID', 'SampleID', 'PortionID', 'PlateID', 'CenterID']

    mutations = mutation_df.join(tcga_info, how='left')

    # Lower-case only for recognising the 'all' / 'indel' keywords; the
    # caller's original value is still used for the verbatim comparison.
    if hasattr(variant_type, 'lower'):
        variant_keyword = variant_type.lower()
    else:
        variant_keyword = variant_type

    if variant_keyword != 'all':
        if variant_keyword == 'indel':
            mutations = mutations[
                (mutations['Variant_Type'] == 'INS') |
                (mutations['Variant_Type'] == 'DEL')
            ]
        else:
            mutations = mutations[mutations['Variant_Type'] == variant_type]

    logging.info("Loaded {} mutations for {} tumors from {} patients".format(
        len(mutations),
        mutations['Tumor_Sample_Barcode'].nunique(),
        mutations['TCGA_ID'].nunique()
    ))

    if with_clinical:
        patient_data_df = load_clinical_data(disease_code)
        # Outer merge keeps patients that have no mutation rows.
        merged = mutations.merge(patient_data_df,
                                 how='outer',
                                 left_on='TCGA_ID',
                                 right_on='bcr_patient_barcode')

        logging.info("Patients: {}, Tumor Samples: {}, Mutations {}".format(
            merged['bcr_patient_barcode'].nunique(),
            merged['Tumor_Sample_Barcode'].nunique(),
            len(merged[~merged['TCGA_ID'].isnull()])
        ))
        return merged
    else:
        return mutations