    def test_sample_names_period(self):
        """ Test the sample names function with period included in name"""

        files = ["s1.1.fastq", "s2.1.fastq"]

        self.assertEqual(utilities.sample_names(files, "fastq"),
                         ["s1.1", "s2.1"])
    def test_sample_names_pair_identifier(self):
        """ Test the sample names function with a pair identifier """

        files = ["s1.R1.fastq", "s2.R1.fastq"]

        self.assertEqual(utilities.sample_names(files, ".fastq", ".R1"),
                         ["s1", "s2"])
    def test_sample_names_pair_identifier_duplicate(self):
        """ Test the sample names function with a pair identifier included in the sample name"""

        files = ["s_1_1.fastq.gz", "s1_1.fastq.gz"]

        self.assertEqual(utilities.sample_names(files, ".fastq.gz", "_1"),
                         ["s_1", "s1"])
    def test_sample_names_pair_identifier_not_found(self):
        """ Test the sample names function with a pair identifier that is not included in the names """

        files = ["s1.R1.fastq", "s2.R1.fastq"]

        self.assertEqual(utilities.sample_names(files, ".fastq", "_R1"),
                         ["s1.R1", "s2.R1"])
Example no. 5
def main(workflow):
    args = workflow.parse_args()
    conf = parse_cfg_file(args.config_file, section='MVX')
    knead_human_genome_db = conf.get('databases').get('knead_dna')

    ## Parse the manifest file containing all data files from this submission
    manifest = parse_cfg_file(args.manifest_file)
    project = manifest.get('project')
    data_files = manifest.get('submitted_files')
    submission_date = manifest.get('submission_date')

    if data_files and data_files.get('MVX', {}).get('input'):
        input_files = data_files.get('MVX').get('input')
        input_file_ext = data_files.get('MVX').get('input_file_extension')
        pair_identifier = data_files.get('MVX').get('pair_identifier')

        project_dirs = create_project_dirs([
            conf.get('deposition_dir'),
            conf.get('processing_dir'),
            conf.get('public_dir')
        ], project, submission_date, 'MVX')

        deposited_files = stage_files(workflow,
                                      input_files,
                                      project_dirs[0],
                                      symlink=True)

        mvx_qc_output = shotgun.quality_control(
            workflow,
            input_files,
            project_dirs[1],
            args.threads, [knead_human_genome_db],
            pair_identifier=pair_identifier,
            remove_intermediate_output=True)

        paired_fastq_files = deinterleave_fastq(workflow, input_files,
                                                project_dirs[1])

        paired_fastq_tars = []
        for (mate_1, mate_2) in zip(paired_fastq_files[0],
                                    paired_fastq_files[1]):
            sample_name = sample_names(mate_1,
                                       input_file_ext,
                                       pair_identifier=pair_identifier)
            tar_path = os.path.join(project_dirs[-1], "%s.tar" % sample_name)
            paired_fastq_tar = tar_files(workflow, [mate_1, mate_2],
                                         tar_path,
                                         depends=[mate_1, mate_2],
                                         compress=False)
            paired_fastq_tars.append(paired_fastq_tar)

    workflow.go()
Example no. 6
def batch_convert_tsv_to_biom(workflow, tsv_files):
    """Batch converts tsv files to the biom format. BIOM files will be 
    deposited in the same folder as source TSV files and will carry the 
    same filenames.

    Args:
        workflow (anadama2.Workflow): The workflow object.
        tsv_files (list): A list containing all TSV files to be converted 
            to BIOM format.
    
    Requires:
        Biom v2: A tool for general use formatting of biological data.

    Returns: 
        list: A list containing paths to all converted BIOM files.

    Example:
        from anadama2 import Workflow
        from hmp2_workflows.tasks import common

        workflow = anadama2.Workflow()

        tsv_files = ['/tmp/foo.tsv', '/tmp/bar.tsv', '/tmp/baz.tsv']
        biom_files = common.batch_convert_tsv_to_biom(workflow, tsv_files)

        print biom_files
        ## ['/tmp/foo.biom', '/tmp/bar.biom', '/tmp/baz.biom']
    """
    biom_files = []

    tsv_fnames = bb_utils.sample_names(tsv_files, '.tsv')
    tsv_dir = os.path.dirname(tsv_files[0])

    biom_dir = os.path.join(tsv_dir, 'biom')
    bb_utils.create_folders(biom_dir)

    biom_files = [
        os.path.join(biom_dir, biom_fname) for biom_fname in
        bb_utils.name_files(tsv_fnames, biom_dir, extension='biom')
    ]

    for (tsv_file, biom_file) in zip(tsv_files, biom_files):
        convert_to_biom_from_tsv(workflow, tsv_file, biom_file)

    return biom_files
Example no. 7
def main(workflow):
    args = workflow.parse_args()
    conf = parse_cfg_file(args.config_file, section='HG')

    ## Parse the manifest file containing all data files from this submission
    manifest = parse_cfg_file(args.manifest_file)
    project = manifest.get('project')
    data_files = manifest.get('submitted_files')
    submission_date = manifest.get('submission_date')

    if data_files and data_files.get('HG', {}).get('input'):
        input_files = data_files.get('HG').get('input')

        project_dirs = create_project_dirs([
            conf.get('deposition_dir'),
            conf.get('processing_dir'),
            conf.get('public_dir')
        ], project, submission_date, 'HG')

        deposited_files = stage_files(workflow,
                                      input_files,
                                      project_dirs[0],
                                      symlink=True)
        fastq_files = bam_to_fastq(workflow,
                                   input_files,
                                   project_dirs[1],
                                   paired_end=True,
                                   threads=args.threads,
                                   compress=False)
        paired_fastq_files = paired_files(fastq_files, '_R1')

        paired_fastq_tars = []
        for (mate_1, mate_2) in zip(paired_fastq_files[0],
                                    paired_fastq_files[1]):
            sample_name = sample_names(mate_1, pair_identifier="_R1")
            tar_path = os.path.join(project_dirs[-1], "%s.tar" % sample_name)
            paired_fastq_tar = tar_files(workflow, [mate_1, mate_2],
                                         tar_path,
                                         depends=[mate_1, mate_2])
            paired_fastq_tars.append(paired_fastq_tar)

        md5sum_files = generate_md5_checksums(workflow, paired_fastq_tars)

        workflow.go()
Example no. 8
def generate_md5_checksums(workflow, files):
    """Generates MD5 checksums for the provided set of files. All checksums 
    are written to a file containing the same name as the input but with the 
    "md5" extension appended.

    Args:
        workflow (anadama2.Workflow): The workflow object.
        files (list): A list of files to generate MD5 checksums for.

    Requires:
        None

    Returns:
        list: A list of the generated md5 checksum files.

    Example:
        from anadama2 import Workflow
        from hmp2_workflows.tasks import common

        workflow = anadama2.Workflow()

        files = ['/tmp/foo.txt', '/tmp/bar.txt']

        md5sum_files = common.generate_md5_checksums(workflow, files)
    """
    output_dir = os.path.dirname(files[0])
    checksum_files = bb_utils.name_files(bb_utils.sample_names(files),
                                         output_dir,
                                         extension=".md5")

    workflow.add_task_gridable('md5sum [depends[0]] > [targets[0]]',
                               depends=files,
                               targets=checksum_files)

    return checksum_files
Example no. 9
    def _generate_metadata_file(task):
        input_files = [seq_file.name for seq_file in task.depends[:-3]]
        studytrax_metadata = task.depends[-4].name
        broad_sample_sheet = task.depends[-3].name
        auxillary_metadata = task.depends[-1].name if task.depends[-1].name != '/dev/null' else None
        metadata_out_file = task.targets[0].name

        data_type_map = config.get('dtype_mapping')

        studytrax_df = pd.read_csv(studytrax_metadata)
        broad_sample_df = pd.read_csv(broad_sample_sheet, 
                                      na_values=['destroyed', 'missed'],
                                      parse_dates=['Actual Date of Receipt'])

        collection_dates_dict = m_utils.get_collection_dates(broad_sample_df)

        if pair_identifier:
            (input_pair1, input_pair2) = bb_utils.paired_files(input_files, pair_identifier)
            input_files = input_pair1 if input_pair1 else input_files

        sample_mapping = dict(zip(bb_utils.sample_names(input_files, pair_identifier),
                                  map(get_sample_id_from_fname, input_files)))
        sample_ids = sample_mapping.values()
        if pair_identifier:
            sample_ids = [sid.replace(pair_identifier, '') for sid in sample_ids]

        sample_subset_df = broad_sample_df[(broad_sample_df['Parent Sample A'].isin(sample_ids)) |
                                           (broad_sample_df['Proteomics'].isin(sample_ids)) |
                                           (broad_sample_df['MbX'].isin(sample_ids)) |
                                           (broad_sample_df['Site/Sub/Coll']).isin(sample_ids)]        

        metadata_df = sample_subset_df.merge(studytrax_df,
                                            left_on='Parent Sample A',
                                            right_on='st_q4',
                                            how='left')

        ## We sometimes get a situation where our studytrax metadata is missing
        ## some of the proteomics sample ID's so we need to make sure we replicate
        ## them
        metadata_df.loc[metadata_df['st_q17'].isnull(), 'st_q17'] = metadata_df['Proteomics']
        metadata_df.loc[metadata_df['st_q11'].isnull(), 'st_q11'] = metadata_df['MbX']
        metadata_df['data_type'] = data_type_map.get(data_type)

        if proteomics_metadata:
            proteomics_df = m_utils.add_proteomics_metadata(sample_subset_df, 
                                                            proteomics_metadata,
                                                            sample_mapping)
            metadata_df = metadata_df.merge(proteomics_df,
                                            on='Parent Sample A',
                                            how='left')


        metadata_df['External ID'] = metadata_df.apply(generate_external_id, axis=1)

        metadata_df['Site/Sub/Coll ID'] = metadata_df['Site/Sub/Coll'].map(lambda sid: str(sid))
        metadata_df['Site'] = metadata_df['SiteName']
        metadata_df['Participant ID'] = metadata_df['Subject'].map(lambda subj: 'C' + str(subj))
        metadata_df['visit_num'] = metadata_df['Collection #']
        metadata_df['Research Project'] = config.get('research_project')
        metadata_df['Project'] = metadata_df.apply(m_utils.get_project_id, axis=1)
        metadata_df = generate_collection_statistics(metadata_df,
                                                     collection_dates_dict)
        metadata_df = metadata_df.drop(config.get('drop_cols'), axis=1)
 
        if auxillary_metadata:
            ## Auxillary metadata are columns that will be added into our
            ## existing metadata rows. 
            metadata_df = m_utils.add_auxiliary_metadata(metadata_df, auxillary_metadata)

        metadata_df.to_csv(metadata_out_file, index=False)
Example no. 10
def match_tax_profiles(mtx_fastqs,
                       mtx_ext,
                       mtx_col_id,
                       tax_profiles,
                       tax_col_id,
                       metadata_file,
                       tags=None,
                       tax_tag='_taxonomic_profile.tsv'):
    """Takes two sets of files and attempts to match them together based on 
    the supplied HMP2 metadata file. Test

    Args:
        files_a (list): The first set of files to match against.
        files_a_id (string): Column to search for filename/ID in.
        files_b (list): The second set of files to match against.
        files_b_id (string): Column to search for filename/ID in. 
        data_type (string): Data-type that files "B" are.
        metadata_file (string): Path to a tab-delimited look-up file that 
            could provide a way to match any files up.
        tags (list): Any tags that are attached to files that can be 
            stripped prior to matching.

    Requires:
        None

    Returns:
        list: The FASTQ files that matched a taxonomic profile, in order.
        list: The corresponding taxonomic profiles, in the same order.

    Example:
        from hmp2_workflows.utils import files

        files_a = ['sampleA.fastq', 'sampleC.fastq', 'sampleD.fastq']
        files_b = ['sampleC_tax.tsv', 'sampleA_tax.tsv', 'sampleD_tax.tsv']

        (matched_files_a, matched_files_b) = files.match_files(files_a, files_b)
        # matched_files_a
        # ['sampleA.fastq', 'sampleC.fastq', 'sampleD.fastq']
        # matched_files_b
        # ['sampleA_tax.tsv', 'sampleC_tax.tsv', 'sampleD_tax.tsv']
    """
    matching_mtx_fastq = []
    matching_tax_profiles = []

    # Making the assumption here that the sample ID we will use for lookup in
    # our metadata file is the filename once we remove the extension
    mtx_sample_names = bb_utils.sample_names(mtx_fastqs, mtx_ext)

    if tags:
        mtx_sample_names = [
            name.replace(tag, '') for tag in tags for name in mtx_sample_names
            if tag in name
        ]

    mtx_sample_map = dict(zip(mtx_sample_names, mtx_fastqs))

    tax_profiles_fnames = map(os.path.basename, tax_profiles)
    tax_profiles_map = dict(zip(tax_profiles_fnames, tax_profiles))

    metadata_df = pd.read_csv(metadata_file)
    metadata_df_subset = metadata_df[
        (metadata_df.data_type == "metatranscriptomics")
        & (metadata_df[mtx_col_id].isin(mtx_sample_names))]

    for (idx, row) in metadata_df_subset.iterrows():
        mtx_id = row.get(mtx_col_id)
        tax_profile_fname = row.get(tax_col_id) + tax_tag

        if tax_profile_fname in tax_profiles_fnames:
            matching_mtx_fastq.append(mtx_sample_map.get(mtx_id))
            matching_tax_profiles.append(
                tax_profiles_map.get(tax_profile_fname))

    return (matching_mtx_fastq, matching_tax_profiles)
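A usage sketch for match_tax_profiles itself (the paths and column names below are hypothetical and chosen only to illustrate the call; the 'External ID' columns mirror the defaults used elsewhere in these examples):

# Hypothetical usage sketch of match_tax_profiles; all paths are made up.
# The metadata CSV is assumed to carry a 'data_type' column with
# "metatranscriptomics" rows plus the two ID columns referenced below.
mtx_fastqs = ['/seq/mtx/sampleA.fastq', '/seq/mtx/sampleB.fastq']
tax_profiles = ['/seq/mgx/sampleA_taxonomic_profile.tsv',
                '/seq/mgx/sampleB_taxonomic_profile.tsv']

(matched_fastqs, matched_profiles) = match_tax_profiles(
    mtx_fastqs,
    '.fastq',
    'External ID',
    tax_profiles,
    'External ID',
    '/seq/metadata/hmp2_metadata.csv')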
    def test_sample_names(self):
        """ Test the sample names function """

        files = ["s1.fastq", "s2.fastq"]

        self.assertEqual(utilities.sample_names(files, ".fastq"), ["s1", "s2"])
    def test_sample_names_extension(self):
        """ Test the sample names function without leading period"""

        files = ["s1.fastq", "s2.fastq"]

        self.assertEqual(utilities.sample_names(files, "fastq"), ["s1", "s2"])
Example no. 13
workflow.add_argument("strain-profiling-options", desc="additional options when running the strain profiling step", default="")
workflow.add_argument("max-strains", desc="the max number of strains to profile", default=20, type=int)

# get the arguments from the command line
args = workflow.parse_args()

# get all input files with the input extension provided on the command line
# return an error if no files are found
input_files = utilities.find_files(args.input, extension=args.input_extension, exit_if_not_found=True)

### STEP #1: Run taxonomic profiling on all of the filtered files ###
if not args.bypass_taxonomic_profiling:
    merged_taxonomic_profile, taxonomy_tsv_files, taxonomy_sam_files = shotgun.taxonomic_profile(workflow,
        input_files,args.output,args.threads,args.input_extension)
else:
    sample_names = utilities.sample_names(input_files,args.input_extension)
    tsv_profiles = utilities.name_files(sample_names, demultiplex_output_folder, tag="taxonomic_profile", extension="tsv")
    # check all of the expected profiles are found
    if len(tsv_profiles) != len(list(filter(os.path.isfile,tsv_profiles))):
        sys.exit("ERROR: Bypassing taxonomic profiling but all of the tsv taxonomy profile files are not found in the input folder. Expecting the following input files:\n"+"\n".join(tsv_profiles))
    # run taxonomic profile steps bypassing metaphlan2
    merged_taxonomic_profile, taxonomy_tsv_files, taxonomy_sam_files = shotgun.taxonomic_profile(workflow,
        tsv_profiles,args.output,args.threads,"tsv",already_profiled=True)
    # look for the sam profiles
    taxonomy_sam_files = utilities.name_files(sample_names, demultiplex_output_folder, tag="bowtie2", extension="sam")
    # if they do not all exist, then bypass strain profiling if not already set
    if len(taxonomy_sam_files) != len(list(filter(os.path.isfile,taxonomy_sam_files))):
        print("Warning: Bypassing taxonomic profiling but not all taxonomy sam files are present in the input folder. Strain profiling will be bypassed. Expecting the following input files:\n"+"\n".join(taxonomy_sam_files))
        args.bypass_strain_profiling = True

### STEP #2: Run strain profiling
Example no. 14
    # if the input files are fasta, bypass quality control
    qc_output_files = demultiplexed_files

### STEP #2: Run taxonomic profiling on all of the filtered files ###
if not args.bypass_taxonomic_profiling:
    merged_taxonomic_profile, taxonomy_tsv_files, taxonomy_sam_files = shotgun.taxonomic_profile(
        workflow, qc_output_files, args.output, args.threads,
        args.input_extension)

elif not args.bypass_functional_profiling or not args.bypass_strain_profiling:
    # get the names of the taxonomic profiling files allowing for pairs
    input_pair1, input_pair2 = utilities.paired_files(demultiplexed_files,
                                                      original_extension,
                                                      args.pair_identifier)
    sample_names = utilities.sample_names(
        input_pair1 if input_pair1 else input_files, original_extension,
        args.pair_identifier)
    tsv_profiles = utilities.name_files(sample_names,
                                        demultiplex_output_folder,
                                        tag="taxonomic_profile",
                                        extension="tsv")
    # check all of the expected profiles are found
    if len(tsv_profiles) != len(list(filter(os.path.isfile, tsv_profiles))):
        sys.exit(
            "ERROR: Bypassing taxonomic profiling but all of the tsv taxonomy profile files are not found in the input folder. Expecting the following input files:\n"
            + "\n".join(tsv_profiles))
    # run taxonomic profile steps bypassing metaphlan2
    merged_taxonomic_profile, taxonomy_tsv_files, taxonomy_sam_files = shotgun.taxonomic_profile(
        workflow,
        tsv_profiles,
        args.output,
        args.threads,
        "tsv",
        already_profiled=True)
def get_metadata_rows(config, studytrax_df, sample_df, proteomics_df,
                      data_type, sequence_files, pair_identifier):
    """Extracts metadata from the supplied sources of metadata for the
    provided sequence files. 

    Args:
        config (dict): Configuration parameters for metadata
        studytrax_df (pandas.DataFrame): StudyTrax clinical metadata
        sample_df (pandas.DataFrame): Broad sample status metadata
        proteomics_df (pandas.DataFrame): Proteomics metadata.
        data_type (string): Data type of the provided sequence files
        sequence_files (list): A list of sequence files that metadata
            should be pulled for if available.
        pair_identifier (string): If working with paired-end files the 
            identifier to distinguish the first file from its pair.            

    Requires:
        None

    Returns:
        pandas.DataFrame: Slice of metadata for files provided.
    """
    metadata_df = None

    sample_mapping = dict(
        zip(bb_utils.sample_names(sequence_files, pair_identifier),
            map(get_sample_id_from_fname, sequence_files)))
    sample_ids = sample_mapping.values()

    if pair_identifier:
        sample_ids = [sid.replace(pair_identifier, '') for sid in sample_ids]

    sample_ids_techreps = [
        sid for (k, sid) in sample_mapping.iteritems() if "techrep" in k
    ]
    sample_ids = set(sample_ids) - set(sample_ids_techreps)

    data_type_mapping = config.get('dtype_mapping')

    ## Grab subset of Broad sample tracking spreadsheet
    sample_subset_df = pd.DataFrame()
    if data_type != "HTX":
        sample_subset_df = sample_df[
            (sample_df['Parent Sample A'].isin(sample_ids)) |
            (sample_df['Proteomics'].isin(sample_ids)) |
            (sample_df['MbX'].isin(sample_ids)) |
            (sample_df['Viromics'].isin(sample_ids)) |
            (sample_df['Site/Sub/Coll']).isin(sample_ids)]

    if sample_ids_techreps:
        sample_ids_techreps = [
            sample_id.replace('_techrep', '')
            for sample_id in sample_ids_techreps
        ]
        sample_subset_techreps = sample_df[
            (sample_df['Parent Sample A'].isin(sample_ids_techreps)) |
            (sample_df['Proteomics'].isin(sample_ids_techreps)) |
            (sample_df['MbX'].isin(sample_ids_techreps)) |
            (sample_df['Viromics'].isin(sample_ids_techreps)) |
            (sample_df['Site/Sub/Coll']).isin(sample_ids_techreps)]

        sample_subset_techreps['External ID'] = sample_subset_techreps[
            'Parent Sample A'].map(lambda sid: sid.replace('-', '') + "_TR")
        sample_subset_techreps['External ID'] = sample_subset_techreps.apply(
            lambda row: row.get('Site/Sub/Coll')[0] + row.get('External ID'),
            axis=1)
        sample_subset_df = pd.concat(
            [sample_subset_df, sample_subset_techreps], ignore_index=True)

    ## TODO: Figure out if we have any samples that did not have associated metadata
    #join_how = 'outer' if full_join else 'left'
    if len(sample_subset_df) == 0:
        other_loc_map = {
            '0': 'Terminal ileum',
            '1': 'Neo-ileum',
            '2': 'Ileocecal Valve',
            '3': 'Cecum',
            '4': 'Ascending (right-sided) colon',
            '5': 'Transverse colon',
            '6': 'Descending (left-sided) colon',
            '7': 'Sigmoid Colon',
            '8': 'Rectum'
        }

        if data_type == "HTX" or data_type == "RRBS":
            # TODO: Add these to config file
            biopsy_map = {
                'bx_q5': 'Rectum',
                'bx_q6': 'Ileum',
                'bx_q7': 'Other Inflamed',
                'bx_q9': 'Non-inflamed'
            }

            new_meta_dfs = []
            for (studytrax_col, location) in biopsy_map.iteritems():
                new_meta_df = studytrax_df[studytrax_df[studytrax_col].isin(
                    sample_ids)]
                new_meta_df['biopsy_location'] = location
                new_meta_df['External ID'] = new_meta_df[studytrax_col].map(
                    lambda sid: sid.replace('-', ''))

                if not new_meta_df['biopsy_location'].empty:
                    if location == "Other Inflamed":
                        new_loc_col = "bx_q8"
                    elif location == "Non-inflamed":
                        new_loc_col = "bx_q10"
                    if location not in ['Rectum', 'Ileum']:
                        new_meta_df['biopsy_location'] = [
                            other_loc_map.get(x)
                            for x in new_meta_df[new_loc_col]
                        ]

                new_meta_dfs.append(new_meta_df)

            if data_type == "RRBS":
                blood_df = studytrax_df[studytrax_df['bl_q4'].isin(sample_ids)]
                blood_df['External ID'] = blood_df['bl_q4'].map(
                    lambda sid: sid.replace('-', ''))
                new_meta_dfs.append(blood_df)

            metadata_df = pd.concat([sample_subset_df] + new_meta_dfs,
                                    ignore_index=True)
            metadata_df = metadata_df.drop_duplicates(
                subset=['External ID', 'biopsy_location'], keep='first')
            metadata_df['Site/Sub/Coll'] = metadata_df.apply(
                _get_non_stool_site_sub_coll, axis=1)
            resolve_dupe_ssc_ids(metadata_df)
        elif data_type == "HG":
            studytrax_col = "bl_q4"
            blood_df = studytrax_df[studytrax_df[studytrax_col].isin(
                sample_ids)]
            blood_df['External ID'] = blood_df[studytrax_col].map(
                lambda sid: sid.replace('-', ''))

            metadata_df = pd.concat([sample_subset_df, blood_df],
                                    ignore_index=True)

            metadata_df['Site/Sub/Coll'] = metadata_df.apply(
                _get_non_stool_site_sub_coll, axis=1)
            resolve_dupe_ssc_ids(metadata_df)
        elif data_type == "SER":
            new_metadata_dfs = []
            studytrax_cols = {'bl_q5': None}

            # We get some really weird stuff going on in the studytrax mapping column here so let's
            # clean things up first
            studytrax_df['bl_q5'] = studytrax_df['bl_q5'].map(
                _clean_blood_sample_ids)

            for (col, label) in studytrax_cols.iteritems():
                new_metadata_df = studytrax_df[studytrax_df[col].isin(
                    sample_ids)]
                new_metadata_df['External ID'] = new_metadata_df[col]
                new_metadata_dfs.append(new_metadata_df)

            metadata_df = pd.concat([sample_subset_df] + new_metadata_dfs,
                                    ignore_index=True)
            metadata_df['Site/Sub/Coll'] = metadata_df.apply(
                _get_non_stool_site_sub_coll, axis=1)
            resolve_dupe_ssc_ids(metadata_df)
        elif data_type == "16SBP":
            biopsy_map = {
                'bx_q13': 'Rectum',
                'bx_q14': 'Ileum',
                'bx_q15': 'Other Inflamed',
                'bx_q17': 'Non-inflamed'
            }

            biopsy_dfs = []
            for (studytrax_col, location) in biopsy_map.iteritems():
                biopsy_df = studytrax_df[studytrax_df[studytrax_col].isin(
                    sample_ids)]
                biopsy_df['biopsy_location'] = location
                biopsy_df['External ID'] = biopsy_df[studytrax_col].map(
                    lambda sid: sid.replace('-', ''))

                if not biopsy_df['biopsy_location'].empty:
                    if location == "Other Inflamed":
                        new_loc_col = "bx_q16"
                    elif location == "Non-inflamed":
                        new_loc_col = "bx_q18"

                    if location not in ['Rectum', 'Ileum']:
                        biopsy_df['biopsy_location'] = [
                            other_loc_map.get(x)
                            for x in biopsy_df[new_loc_col]
                        ]

                biopsy_dfs.append(biopsy_df)

            metadata_df = pd.concat([sample_subset_df] + biopsy_dfs,
                                    ignore_index=True)
            metadata_df = metadata_df.drop_duplicates(
                subset=['External ID', 'biopsy_location'], keep='first')
            metadata_df['Site/Sub/Coll'] = metadata_df.apply(
                _get_non_stool_site_sub_coll, axis=1)
            resolve_dupe_ssc_ids(metadata_df)

    else:
        metadata_df = sample_subset_df.merge(studytrax_df,
                                             left_on='Parent Sample A',
                                             right_on='st_q4',
                                             how='left')

        ## We sometimes get a situation where our studytrax metadata is missing
        ## some of the proteomics sample ID's so we need to make sure we
        ## replicate them.
        metadata_df.loc[metadata_df['st_q17'].isnull(),
                        'st_q17'] = metadata_df['Proteomics']
        metadata_df.loc[metadata_df['st_q11'].isnull(),
                        'st_q11'] = metadata_df['MbX']
        metadata_df.loc[metadata_df['st_q12'].isnull(),
                        'st_q12'] = metadata_df['Viromics']

        if proteomics_df is not None:
            ## In order to merge our proteomics data properly we'll need to
            ## first create a subset of our Broad sample tracking sheet
            ## that isolates just rows related to Proteomics data (the column
            ## Proteomics Status should be EXPORTED).
            sample_filter_df = sample_subset_df[
                sample_subset_df['Proteomics status'] == 'EXPORTED']
            sample_filter_df = sample_filter_df[[
                'Parent Sample A', 'Proteomics'
            ]]

            proteomics_df['sample_ids'] = proteomics_df['Dataset'].replace(
                sample_mapping)
            proteomics_df['PDO Number'] = proteomics_df['Dataset'].map(
                lambda did: did.replace('-', '_').split('_')[0])
            proteomics_df = sample_filter_df.merge(proteomics_df,
                                                   left_on='Proteomics',
                                                   right_on='sample_ids',
                                                   how='right')
            proteomics_df = proteomics_df.drop('Proteomics', 1)

            metadata_df = metadata_df.merge(proteomics_df,
                                            on='Parent Sample A',
                                            how='left')
            metadata_df['External ID'] = None

    ## Now if we have techreps in our samples we need to add them in.
    metadata_df['data_type'] = data_type_mapping.get(data_type)

    return metadata_df
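A call sketch for get_metadata_rows (the config values, file paths, and data type below are assumptions used only to illustrate the expected inputs; the Broad sheet read options mirror those used earlier in these examples):

# Hypothetical call sketch; all paths, config values and the data type are
# illustrative assumptions, not real inputs.
import pandas as pd

config = {'dtype_mapping': {'MGX': 'metagenomics'}}
studytrax_df = pd.read_csv('/seq/metadata/studytrax.csv')
broad_sample_df = pd.read_csv('/seq/metadata/broad_sample_tracking.csv',
                              na_values=['destroyed', 'missed'],
                              parse_dates=['Actual Date of Receipt'])

metadata_df = get_metadata_rows(config,
                                studytrax_df,
                                broad_sample_df,
                                None,                  # no proteomics metadata
                                'MGX',
                                ['/seq/mgx/sampleA_R1.fastq'],
                                pair_identifier='_R1')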
Example no. 16
def main(workflow):
    args = workflow.parse_args()

    conf_mtx = parse_cfg_file(args.config_file, section='MTX')
    conf_mgx = parse_cfg_file(args.config_file, section='MGX')
    manifest = parse_cfg_file(args.manifest_file)

    data_files = manifest.get('submitted_files')
    project = manifest.get('project')
    creation_date = manifest.get('submission_date')
    adapters_file = manifest.get('adapters_file')

    contaminate_db = conf_mtx.get('databases').get('knead_dna')
    mtx_db = conf_mtx.get('databases').get('knead_mtx')
    rrna_db = conf_mtx.get('databases').get('knead_rrna')
    adapter_sequences = conf_mtx.get('adapter_sequences')

    qc_threads = args.threads_kneaddata if args.threads_kneaddata else args.threads
    tax_threads = args.threads_metaphlan if args.threads_metaphlan else args.threads
    func_threads = args.threads_humann if args.threads_humann else args.threads

    if data_files and data_files.get('MTX', {}).get('input'):
        input_files_mtx = data_files.get('MTX').get('input')
        file_extension_mtx = data_files.get('MTX').get('input_extension', '.fastq')
        pair_identifier_mtx = data_files.get('MTX').get('pair_identifier')
        input_file_tags = data_files.get('MTX').get('tags')
        input_tax_profiles = []

        project_dirs_mtx = create_project_dirs([conf_mtx.get('deposition_dir'),
                                                conf_mtx.get('processing_dir'),
                                                conf_mtx.get('public_dir')],
                                               project,
                                               creation_date,
                                               'MTX')
        public_dir_mtx = project_dirs_mtx[-1]
        base_depo_dir = os.path.abspath(os.path.join(project_dirs_mtx[0], '..'))

        manifest_file = stage_files(workflow, 
                                    [args.manifest_file],
                                    base_depo_dir)
        deposited_files_mtx = stage_files(workflow,
                                          input_files_mtx,
                                          project_dirs_mtx[0],
                                          symlink=True)

        if file_extension_mtx == ".bam":
            ## Need to sort our BAM files to be sure here...
            paired_end_seqs = bam_to_fastq(workflow, 
                                            deposited_files_mtx, 
                                            project_dirs_mtx[1],
                                            paired_end=True,
                                            compress=False,
                                            threads=args.threads)
            pair_identifier_mtx = "_R1"                                            
        else:
            paired_end_seqs = deposited_files_mtx

        adapter_trim_opts = ""
        if adapters_file:
            adapter_trim_opts = (" --trimmomatic-options \"ILLUMINACLIP:%s:2:30:10:8:TRUE "
                                 "SLIDINGWINDOW:4:20 MINLEN:50\"" % adapters_file)

        (cleaned_fastqs_mtx, read_counts_mtx) = quality_control(workflow,
                                                                paired_end_seqs,
                                                                file_extension_mtx,
                                                                project_dirs_mtx[1],
                                                                qc_threads,
                                                                databases=[contaminate_db,
                                                                           rrna_db,
                                                                           mtx_db],
                                                                pair_identifier=pair_identifier_mtx,
                                                                additional_options=adapter_trim_opts,
                                                                remove_intermediate_output=True)

        sample_names_mtx = sample_names(cleaned_fastqs_mtx, file_extension_mtx)

        ##########################################
        #          MGX FILE PROCESSING           #
        ##########################################
        # Ideally we would be passed in a set of corresponding metagenome
        # sequence(s) to go with our metatranscriptomic files but we also
        # have two other scenarios:
        #
        #       1.) No accompanying metagenomic sequences exist; in this
        #           case we will proceed just using the metatranscriptomic
        #           data.
        #       2.) Taxonomic profiles are passed in directly via our MANIFEST
        #           file; here we remove these from our input files and
        #           prevent them from running through the kneaddata ->
        #           metaphlan2 portions of our pipeline
        if data_files.get('MGX', {}).get('input'):
            input_files_mgx = data_files.get('MGX').get('input')
            file_extension_mgx = data_files.get('MGX').get('file_ext')
            pair_identifier_mgx = data_files.get('MGX').get('pair_identifier')
            input_tax_profiles = [in_file for in_file in input_files_mgx
                                  if 'taxonomic_profile.tsv' in in_file]
            input_files_mgx = set(input_files_mgx) - set(input_tax_profiles)

            if input_files_mgx:
                sample_names_mgx = sample_names(input_files_mgx, file_extension_mgx, pair_identifier_mgx)

                project_dirs_mgx = create_project_dirs([conf_mgx.get('deposition_dir'),
                                                        conf_mgx.get('processing_dir'),
                                                        conf_mgx.get('public_dir')],
                                                       project,
                                                       creation_date,
                                                       'WGS')
                public_dir_mgx = project_dirs_mgx[-1]

                deposited_files_mgx = stage_files(workflow,
                                                  input_files_mgx,
                                                  project_dirs_mgx[0],
                                                  symlink=True)

                if file_extension_mgx == ".bam":
                    ## Need to sort our BAM files to be sure here...
                    paired_end_seqs_mgx = bam_to_fastq(workflow,
                                                    deposited_files_mgx, 
                                                    project_dirs_mgx[1],
                                                    paired_end=True,
                                                    compress=False,
                                                    threads=args.threads)
                    pair_identifier_mgx = "_R1"                                            
                else:
                    paired_end_seqs_mgx = paired_files(deposited_files_mgx, pair_identifier_mgx)  

                (cleaned_fastqs_mgx, read_counts_mgx) = quality_control(workflow,
                                                                        paired_end_seqs_mgx,
                                                                        project_dirs_mgx[1],
                                                                        qc_threads,
                                                                        [contaminate_db,
                                                                        rrna_db],
                                                                        remove_intermediate_output=True)

                tax_outs_mgx = taxonomic_profile(workflow,
                                                 cleaned_fastqs_mgx,
                                                 project_dirs_mgx[1],
                                                 tax_threads,
                                                 '*.fastq')

                func_outs_mgx = functional_profile(workflow,
                                                   cleaned_fastqs_mgx,
                                                   project_dirs_mgx[1],
                                                   func_threads,
                                                   tax_outs_mgx[1],
                                                   remove_intermediate_output=True)
                input_tax_profiles.extend(tax_outs_mgx[1])

                pub_wgs_raw_dir = os.path.join(public_dir_mgx, 'raw')
                pub_wgs_tax_profile_dir = os.path.join(public_dir_mgx, 'tax_profile')
                pub_wgs_func_profile_dir = os.path.join(public_dir_mgx, 'func_profile')
                map(create_folders, [pub_wgs_raw_dir, pub_wgs_tax_profile_dir,
                                    pub_wgs_func_profile_dir])

                norm_genefamilies_mgx = name_files(sample_names_mgx,
                                                project_dirs_mgx[1],
                                                subfolder='genes',
                                                tag='genefamilies_relab',
                                                extension='tsv')
                norm_ecs_files_mgx = name_files(sample_names_mgx,
                                                project_dirs_mgx[1],
                                                subfolder='ecs',
                                                tag='genefamilies_ecs_relab',
                                                extension='tsv')
                norm_path_files_mgx = name_files(sample_names_mgx,
                                                project_dirs_mgx[1],
                                                subfolder='pathways',
                                                tag='pathabundance_relab',
                                                extension='tsv')

                pcl_files = add_metadata_to_tsv(workflow,
                                                [tax_outs_mgx[1]] 
                                                + func_outs_mgx,
                                                'metagenomics',
                                                conf_mgx.get('metadata_id_col'),
                                                conf_mgx.get('analysis_col_patterns'),
                                                conf_mgx.get('target_metadata_cols'))
                                      
                func_tar_files_wgs = []
                for (sample, gene_file, ecs_file, path_file) in zip(sample_names_mgx,
                                                                    norm_genefamilies_mgx,
                                                                    norm_ecs_files_mgx,
                                                                    norm_path_files_mgx):
                    tar_path = os.path.join(pub_wgs_func_profile_dir, 
                                            "%s_humann2.tgz" % sample)
                    func_tar_file = tar_files(workflow,
                                            [gene_file, ecs_file, path_file],
                                            tar_path,
                                            depends=func_outs_mgx)
                    func_tar_files_wgs.append(func_tar_file)

        ##########################################
        #          MTX FILE PROCESSING           #
        ##########################################
        # Here we want to see if we can create a set of matching cleaned
        # MTX files to corresponding MGX taxonomic profiles. If these exist
        # we want to run functional profiling with the corresponding MGX
        # taxonomic profile otherwise we will run a taxonomic profiling
        # on the MTX sequences and run functional profiling with the produced
        # taxonomic profile.
        func_outs_match_mtx = []
        if input_tax_profiles:
            (matched_fqs, matched_tax_profiles) = match_tax_profiles(cleaned_fastqs_mtx,
                                                                     '.fastq',
                                                                     data_files.get('MTX').get('metadata_id_col', 'External ID'),
                                                                     input_tax_profiles,
                                                                     data_files.get('MGX').get('tax_profile_id', 'External ID'),
                                                                     args.metadata_file,
                                                                     tags=input_file_tags)

            func_outs_match_mtx = functional_profile(workflow,
                                                     matched_fqs,
                                                     project_dirs_mtx[1],
                                                     func_threads,
                                                     matched_tax_profiles,
                                                     remove_intermediate_output=True)

            # Reset the remaining MTX files left over here so that we can run them through
            # the metaphlan2 -> humann2 pipeline.
            cleaned_fastqs_mtx = set(cleaned_fastqs_mtx) - set(matched_fqs)

        if cleaned_fastqs_mtx:
            tax_outs_mtx = taxonomic_profile(workflow,
                                             cleaned_fastqs_mtx,
                                             project_dirs_mtx[1],
                                             tax_threads,
                                             '*.fastq')
            func_outs_mtx = functional_profile(workflow,
                                               cleaned_fastqs_mtx,
                                               file_extension_mtx,
                                               project_dirs_mtx[1],
                                               func_threads,
                                               tax_outs_mtx[1],
                                               remove_intermediate_output=True)
            func_outs_mtx = list(func_outs_mtx) + func_outs_match_mtx
        else:
            func_outs_mtx = func_outs_match_mtx

        # We'll need to generate DNA/RNA normalized files to be displayed 
        # in our visualization output.
        (norm_gene_ratio, norm_ecs_ratio, norm_path_ratio) = norm_ratio(workflow,
                                                                        func_outs_mgx[0],
                                                                        func_outs_mgx[1],
                                                                        func_outs_mgx[2],
                                                                        func_outs_mtx[0],
                                                                        func_outs_mtx[1],
                                                                        func_outs_mtx[2],
                                                                        project_dirs_mtx[1])

        pub_mtx_raw_dir = os.path.join(public_dir_mtx, 'raw')
        pub_mtx_tax_profile_dir = os.path.join(public_dir_mtx, 'tax_profile')
        pub_mtx_func_profile_dir = os.path.join(public_dir_mtx, 'func_profile')
        map(create_folders, [pub_mtx_raw_dir, pub_mtx_tax_profile_dir,
                             pub_mtx_func_profile_dir])

        norm_genefamilies_mtx = name_files(sample_names_mtx,
                                           project_dirs_mtx[1],
                                           subfolder='genes',
                                           tag='genefamilies_relab',
                                           extension='tsv')
        norm_ecs_files_mtx = name_files(sample_names_mtx,
                                        project_dirs_mtx[1],
                                        subfolder='ecs',
                                        tag='genefamilies_ecs_relab',
                                        extension='tsv')
        norm_path_files_mtx = name_files(sample_names_mtx,
                                         project_dirs_mtx[1],
                                         subfolder='pathways',
                                         tag='pathabundance_relab',
                                         extension='tsv')

        func_tar_files_mtx = []
        for (sample, gene_file, ecs_file, path_file) in zip(sample_names_mtx,
                                                            norm_genefamilies_mtx,
                                                            norm_ecs_files_mtx,
                                                            norm_path_files_mtx):
            tar_path = os.path.join(pub_mtx_func_profile_dir,
                                    "%s_humann2.tgz" % sample)
            func_tar_file = tar_files(workflow,
                                      [gene_file, ecs_file, path_file],
                                      tar_path,
                                      depends=func_outs_mtx)
            func_tar_files_mtx.append(func_tar_file)
    
        workflow.go()
    default="")
workflow.add_argument("dbcan-path",
                      desc="the path to the run_dbcan.py script",
                      default="/app/")

# get the arguments from the command line
args = workflow.parse_args()

# get all input files with the input extension provided on the command line
# return an error if no files are found
input_files = utilities.find_files(args.input,
                                   extension=args.input_extension,
                                   exit_if_not_found=True)

### STEP #1: Run quality control on all input files ###
sample_names = utilities.sample_names(input_files, args.input_extension)
input_pair1, input_pair2 = utilities.paired_files(input_files,
                                                  args.input_extension,
                                                  args.pair_identifier)
paired = False
if input_pair1:
    sample_names = utilities.sample_names(input_pair1, args.input_extension,
                                          args.pair_identifier)
    qc_targets = [
        utilities.name_files([
            name + ".trimmed.1.fastq", name + ".trimmed.2.fastq",
            name + ".trimmed.single.1.fastq", name + ".trimmed.single.2.fastq",
            name + ".trimmed.single.12.fastq"
        ],
                             args.output,
                             subfolder="kneaddata",
Example no. 18
def generate_sample_metadata(workflow, data_type, in_files, metadata_file, 
                             output_dir, id_column = 'External ID'):
    """Generates a series of individual metadata files in CSV format 
    from the provided merged metadata file. Each of the provided samples
    has a metadata file generated to accompany any product files generated 
    by the analysis pipelines.

    Args:
        workflow (anadama2.Workflow): The workflow object.
        data_type (string): The data type of the provided samples. One of 
            'metagenomics', 'proteomics', or 'amplicon'.
        in_files (list): A list of files that should have corresponding 
            metadata files written.
        metadata_file (string): Path to the merged metadata file.
        output_dir (string): Path to the output directory to write each
            sample metadata file to.
        id_column (string): The ID column to attempt to map sample names to
            in the merged metadata file. Defaults to "External ID" but
            can change depending on the data type.

    Requires:
        None

    Returns:
        list: A list containing the path to all sample metadata files created.

    Example:
        from anadama2 import Workflow
        from hmp2_workflows.tasks import metadata

        workflow = anadama2.Workflow()
        
        samples = ['sampleA', 'sampleB']
        metadata_file = '/tmp/merged_metadata.csv'
        output_dir = '/tmp/metadata'

        metadata_files = metadata.generate_sample_metadata(workflow, 
                                                           'metagenomics',
                                                           samples, 
                                                           metadata_file, 
                                                           output_dir)
        print metadata_files
        ## ['/tmp/metadata/sampleA.csv', '/tmp/metadata/sampleB.csv']
    """
    metadata_df = pd.read_csv(metadata_file)
    samples = bb_utils.sample_names(in_files)

    output_metadata_files = bb_utils.name_files(samples, 
                                                output_dir, 
                                                extension = 'csv',
                                                subfolder = 'metadata',
                                                create_folder = True)
    sample_metadata_dict = dict(zip(samples, output_metadata_files))

    def _workflow_gen_metadata(task):
        metadata_subset = metadata_df.loc[(metadata_df[id_column].isin(samples)) &
                                          (metadata_df['data_type'] == data_type)]
    
        if metadata_subset.empty:
            raise ValueError('Could not find metadata associated with samples.',
                             ",".join(samples))

        for (sample_id, row) in metadata_subset.iterrows():
            sample_metadata_file = sample_metadata_dict.get(row[id_column])
            metadata_subset.xs(sample_id).to_csv(sample_metadata_file, index=False)
    
    workflow.add_task(_workflow_gen_metadata,
                      targets=output_metadata_files,  
                      depends=in_files + [metadata_file],
                      name='Generate sample metadata')

    return sample_metadata_dict.values()
Example no. 19
def bam_to_fastq(workflow,
                 input_files,
                 output_dir,
                 paired_end=False,
                 compress=True,
                 threads=1):
    """Converts BAM sequence files to a single interleaved FASTQ file using
    the samtools bam2fq utility.

    Args:
        workflow (anadama2.Workflow): The AnADAMA2 Workflow object to append 
            the BAM to FASTQ conversion step to.
        input_files (list): A list containing all BAM files to be converted.
        output_dir (string): The output directory to write converted files to.
        paired_end (bool): If True generated paired end files.
        compress (bool): Compress fastq files generated by samtools. 
        threads (int): The number of threads/cores to use for BAM -> FASTQ 
            conversion.

    Requires:
        sambamba, BBMap (reformat.sh), and pigz

    Returns:
        list: A list of the newly-converted FASTQ files.

    Example:
        from anadama2 import Workflow

        from hmp2_workflows.tasks.file_conv import bam_to_fastq

 
        workflow = Workflow()
        fastq_files = bam_to_fastq(workflow,
                                   ['/tmp/fooA.bam', '/tmp/fooB.bam'],
                                   '/seq/ibdmdbd/out_dir')
    """
    sample_names = bb_utils.sample_names(input_files, '.bam')
    sorted_bams = bb_utils.name_files(sample_names,
                                      output_dir,
                                      subfolder="sort",
                                      tag="sorted",
                                      extension="bam",
                                      create_folder=True)

    ## Gotta make sure our BAM file is sorted first
    workflow.add_task_group_gridable(
        'sambamba sort -n -t [args[0]] -m 4GB -o [targets[0]] [depends[0]]',
        depends=input_files,
        targets=[os.path.splitext(bam)[0] for bam in sorted_bams],
        args=[threads],
        time=30 * 60,
        cores=threads,
        mem=4098)

    reformat_cmd = (
        "reformat.sh t=[args[0]] in=[depends[0]] out=stdout.fq primaryonly | "
        "reformat.sh t=[args[0]] in=stdin.fq out1=[targets[0]] ")
    if paired_end:
        mate_1_files = bb_utils.name_files(map(os.path.basename, input_files),
                                           output_dir,
                                           tag="R1",
                                           subfolder="fastq",
                                           extension="fastq",
                                           create_folder=True)
        mate_2_files = bb_utils.name_files(map(os.path.basename, input_files),
                                           output_dir,
                                           tag="R2",
                                           subfolder="fastq",
                                           extension="fastq",
                                           create_folder=True)

        mate_1_files = [
            fname.replace('.fastq_R1', '_R1.fastq') for fname in mate_1_files
        ]
        mate_2_files = [
            fname.replace('.fastq_R2', '_R2.fastq') for fname in mate_2_files
        ]
        output_files = zip(mate_1_files, mate_2_files)
        reformat_cmd += "out2=[targets[1]] "
    else:
        output_files = bb_utils.name_files(map(os.path.basename, input_files),
                                           output_dir,
                                           extension=".fastq")

    reformat_cmd += "interleaved addslash=t spaceslash=f"
    workflow.add_task_group_gridable(reformat_cmd,
                                     depends=input_files,
                                     targets=output_files,
                                     args=[threads],
                                     cores=threads,
                                     time=20 * 60,
                                     mem=4098)

    fastq_files = list(
        chain.from_iterable(output_files)) if paired_end else output_files

    if compress:
        fastq_files_compress = [
            "%s.gz" % fastq_file for fastq_file in fastq_files
        ]

        workflow.add_task_group_gridable(
            "pigz --best -p [args[0]] [depends[0]]",
            depends=fastq_files,
            targets=fastq_files_compress,
            args=[threads],
            cores=threads,
            time=10 * 60,
            mem=4098)
        fastq_files = fastq_files_compress

        workflow.add_task_group("rm -rf [targets[0]]",
                                targets=sorted_bams,
                                depends=fastq_files_compress)

    return fastq_files