Example #1
def deblur(qclient, job_id, parameters, out_dir):
    """Run deblur with the given parameters

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to run deblur
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    boolean, list, str
        The results of the job

    Notes
    -----
    The code will check if the artifact has a preprocessed_demux element; if
    not, it will use the preprocessed_fastq. We prefer to work with the
    preprocessed_demux as the running time is greatly improved
    """
    out_dir = join(out_dir, 'deblur_out')
    # Step 1: get the rest of the information needed to run deblur
    qclient.update_job_step(job_id, "Step 1 of 4: Collecting information")
    artifact_id = parameters['Demultiplexed sequences']
    # removing input from parameters so it's not part of the final command
    del parameters['Demultiplexed sequences']

    # Get the artifact filepath information
    artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)
    fps = artifact_info['files']

    # Step 2 generating command deblur
    if 'preprocessed_demux' in fps:
        qclient.update_job_step(job_id, "Step 2 of 4: Generating per sample "
                                "from demux (1/2)")

        if not exists(out_dir):
            mkdir(out_dir)
        split_out_dir = join(out_dir, 'split')
        if not exists(split_out_dir):
            mkdir(split_out_dir)

        # using the same number of parallel jobs as defined by the command
        n_jobs = int(parameters['Jobs to start'])
        # [0] because there should be only 1 file
        to_per_sample_files(fps['preprocessed_demux'][0],
                            out_dir=split_out_dir, n_jobs=n_jobs)

        qclient.update_job_step(job_id, "Step 2 of 4: Generating per sample "
                                "from demux (2/2)")
        out_dir = join(out_dir, 'deblured')
        cmd = generate_deblur_workflow_commands([split_out_dir],
                                                out_dir, parameters)
    else:
        qclient.update_job_step(job_id, "Step 2 of 4: Generating deblur "
                                "command")
        cmd = generate_deblur_workflow_commands(fps['preprocessed_fastq'],
                                                out_dir, parameters)

    # Step 3 execute deblur
    qclient.update_job_step(job_id, "Step 3 of 4: Executing deblur job")
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error running deblur:\nStd out: %s\nStd err: %s"
                     % (std_out, std_err))
        return False, None, error_msg

    # Generating artifact
    pb = partial(join, out_dir)

    # Generate the filepaths
    final_biom = pb('all.biom')
    final_seqs = pb('all.seqs.fa')
    final_biom_hit = pb('reference-hit.biom')
    final_seqs_hit = pb('reference-hit.seqs.fa')

    if not exists(final_biom_hit):
        # Create an empty table. We need to send something to Qiita that is
        # a valid BIOM, so we are going to create an empty table
        t = Table([], [], [])
        with biom_open(final_biom_hit, 'w') as f:
            t.to_hdf5(f, 'qp-deblur generated')

    if not exists(final_seqs_hit):
        # Same as before, create an empty sequence file so we can send it
        with open(final_seqs_hit, 'w') as f:
            f.write("")

    # Step 4, communicate with archive to check and generate placements
    qclient.update_job_step(job_id, "Step 4 of 4 (1/4): Retrieving "
                            "observations information")
    features = list(load_table(final_biom_hit).ids(axis='observation'))

    fp_phylogeny = None
    if features:
        observations = qclient.post(
            "/qiita_db/archive/observations/", data={'job_id': job_id,
                                                     'features': features})
        novel_fragments = list(set(features) - set(observations.keys()))

        qclient.update_job_step(job_id, "Step 4 of 4 (2/4): Generating %d new "
                                "placements" % len(novel_fragments))

        # Once we support alternative reference phylogenies for SEPP in the
        # future, we need to translate the reference name here into
        # filepaths pointing to the correct reference alignment and
        # reference tree. If left 'None' the Greengenes 13.8 reference
        # shipped with the fragment-insertion conda package will be used.
        fp_reference_alignment = None
        fp_reference_phylogeny = None
        fp_reference_template = None
        fp_reference_rename = None
        if 'Reference phylogeny for SEPP' in parameters:
            if parameters['Reference phylogeny for SEPP'] == 'tiny':
                fp_reference_alignment = qp_deblur.get_data(join(
                    'sepp', 'reference_alignment_tiny.fasta'))
                fp_reference_phylogeny = qp_deblur.get_data(join(
                    'sepp', 'reference_phylogeny_tiny.nwk'))
                fp_reference_template = qp_deblur.get_data(join(
                    'sepp', 'tmpl_tiny_placement.json'))
                fp_reference_rename = qp_deblur.get_data(join(
                    'sepp', 'tmpl_tiny_rename-json.py'))
        try:
            new_placements = generate_sepp_placements(
                novel_fragments, out_dir, parameters['Threads per sample'],
                reference_alignment=fp_reference_alignment,
                reference_phylogeny=fp_reference_phylogeny)
        except ValueError as e:
            return False, None, str(e)

        qclient.update_job_step(job_id, "Step 4 of 4 (3/4): Archiving %d "
                                "new placements" % len(novel_fragments))
        # values need to be json strings as well
        for fragment in new_placements.keys():
            new_placements[fragment] = json.dumps(new_placements[fragment])

        # fragments that get rejected by a SEPP run don't show up in
        # the placement file; however, being rejected is valuable
        # information and should be stored in the archive as well.
        # Thus, we avoid re-computation for rejected fragments in the
        # future.
        for fragment in novel_fragments:
            if fragment not in new_placements:
                new_placements[fragment] = ""
        if len(new_placements.keys()) > 0:
            qclient.patch(url="/qiita_db/archive/observations/", op="add",
                          path=job_id, value=json.dumps(new_placements))

        # retrieve all fragments and create the actual tree
        qclient.update_job_step(job_id, "Step 4 of 4 (4/4): Composing "
                                "phylogenetic insertion tree")
        placements = qclient.post(
            "/qiita_db/archive/observations/", data={'job_id': job_id,
                                                     'features': features})
        # remove fragments that have been rejected by SEPP, i.e. whose
        # placement is the empty string, and
        # convert all other placements from string to json
        placements = {frag: json.loads(placements[frag])
                      for frag, plc
                      in placements.items()
                      if plc != ''}
        try:
            fp_phylogeny = generate_insertion_trees(
                placements, out_dir,
                reference_template=fp_reference_template,
                reference_rename=fp_reference_rename)
        except ValueError as e:
            return False, None, str(e)
    else:
        new_placements = None

    ainfo = [ArtifactInfo('deblur final table', 'BIOM',
                          [(final_biom, 'biom'),
                           (final_seqs, 'preprocessed_fasta')])]
    if fp_phylogeny is not None:
        ainfo.append(ArtifactInfo('deblur reference hit table', 'BIOM',
                     [(final_biom_hit, 'biom'),
                      (final_seqs_hit, 'preprocessed_fasta'),
                      (fp_phylogeny, 'plain_text')], new_placements))

    return True, ainfo, ""
Example #2
def call_qiime2(qclient, job_id, parameters, out_dir):
    """helper method to call Qiime2

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to process
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    boolean, list, str
        The results of the job
    """
    qclient.update_job_step(job_id, "Step 1 of 4: Collecting information")
    q2plugin = parameters.pop('qp-hide-plugin')
    q2method = parameters.pop('qp-hide-method').replace('-', '_')
    pm = qiime2.sdk.PluginManager()
    method = pm.plugins[q2plugin].actions[q2method]

    out_dir = join(out_dir, q2method)

    # making sure that we always start with an empty folder
    if not exists(out_dir):
        mkdir(out_dir)

    # let's generate the parameters; first, remove the hidden parameters. We
    # are going to separate them into q2params and q2inputs, as the inputs
    # need to be retrieved from Qiita and converted to qza
    label = 'qp-hide-param'
    label_len = len(label)
    q2params = {}
    q2inputs = {}
    method_inputs = method.signature.inputs.copy()
    method_params = method.signature.parameters.copy()
    artifact_id = None
    analysis_id = None
    biom_fp = None
    tree_fp = None
    tree_fp_check = False
    for k in list(parameters):
        if k in parameters and k.startswith(label):
            key = parameters.pop(k)
            val = parameters.pop(k[label_len:])
            if key in method_inputs.keys():
                if key == 'phylogeny':
                    if val == '':
                        continue
                    # there is a chance that we parse/loop over the phylogeny
                    # option before the artifact so tree_fp will still be
                    # None; thus we will need to check this after we are done
                    # with this loop
                    if val == 'Artifact tree, if exists':
                        tree_fp_check = True
                    fpath = val
                    qiita_name = QIITA_Q2_SEMANTIC_TYPE[key]
                    if qiita_name['expression']:
                        # for these cases we need an expression, so for
                        # simplicity we use the first one [0]
                        artifact_method = '%s[%s]' % (
                            qiita_name['name'], qiita_name['expression'][0])
                elif key in ('classifier', 'data'):
                    fpath = val
                    artifact_method = None
                    k = key
                else:
                    # this is going to be an artifact, so let's collect the
                    # filepath here; this will also allow us to collect the
                    # analysis_id
                    artifact_id = val
                    ainfo = qclient.get(
                        "/qiita_db/artifacts/%s/" % artifact_id)
                    if ainfo['analysis'] is None:
                        msg = ('Artifact "%s" is not an analysis '
                               'artifact.' % val)
                        return False, None, msg
                    analysis_id = ainfo['analysis']
                    dt = method_inputs[key].qiime_type.to_ast()['name']
                    if 'qza' not in ainfo['files']:
                        # at this stage in qiita we only have 2 types of
                        # artifacts: biom / plain_text
                        if Q2_QIITA_SEMANTIC_TYPE[dt] == 'BIOM':
                            fpath = ainfo['files']['biom'][0]
                            biom_fp = fpath
                        else:
                            fpath = ainfo['files']['plain_text'][0]
                    else:
                        fpath = ainfo['files']['qza'][0]
                    # if it's a BIOM and there is a plain_text, it is the
                    # result of the archive at this stage: a tree
                    if Q2_QIITA_SEMANTIC_TYPE[dt] == 'BIOM':
                        if 'plain_text' in ainfo['files']:
                            tree_fp = ainfo['files']['plain_text'][0]
                    if biom_fp is None and 'biom' in ainfo['files']:
                        biom_fp = ainfo['files']['biom'][0]

                    q2artifact_name = Q2_QIITA_SEMANTIC_TYPE[
                        method_inputs[key].qiime_type.to_ast()['name']]
                    qiita_name = QIITA_Q2_SEMANTIC_TYPE[q2artifact_name]
                    if qiita_name['expression']:
                        # for these cases we need an expression, so for
                        # simplicity we use the first one [0]
                        artifact_method = '%s[%s]' % (
                            qiita_name['name'], qiita_name['expression'][0])
                    else:
                        artifact_method = qiita_name['name']

                q2inputs[key] = (fpath, artifact_method)
            elif key == 'qp-hide-metadata-field':
                if val == '':
                    msg = ("Error: You didn't write a metadata field in "
                           "'%s'" % k[label_len:])
                    return False, None, msg
                q2inputs['metadata'] = (val, val)
            else:
                if val in ('', 'None'):
                    continue

                # let's bring back the original name of these parameters
                mkey = method_params[key]
                value_pair = (q2method, key)
                if (q2plugin == 'diversity' and value_pair in RENAME_COMMANDS):
                    val = RENAME_COMMANDS[value_pair][val]
                    # if the view_type is set, convert the value to a set
                    if mkey.view_type is set:
                        val = {val}
                else:
                    val = qiime2.sdk.util.parse_primitive(
                        mkey.qiime_type.to_ast(), val)

                q2params[key] = val
        elif k in ('qp-hide-metadata', 'qp-hide-FeatureData[Taxonomy]'):
            # remember, if we need metadata, we will always have
            # qp-hide-metadata and optionally we will have
            # qp-hide-metadata-field
            key = parameters.pop(k)
            if key in parameters:
                q2params['metadata'] = qiime2.Artifact.load(
                    parameters.pop(key)).view(qiime2.Metadata)
            else:
                q2inputs[key] = ('', '')

    # if 'metadata' is in q2inputs but 'where' exists in q2params and is
    # empty, remove the metadata parameter
    # NOTE: AFAIK there is no way to differentiate between sample and prep
    #       metadata in Q2, hence the need to remove it for filter_features
    if ('metadata' in q2inputs and 'where' in q2params
            and not q2params['where']):
        q2inputs.pop('metadata')

    # if we are here, we need to use the internal tree from the artifact
    if tree_fp_check:
        q2inputs['phylogeny'] = (tree_fp, q2inputs['phylogeny'][1])

    # let's process/import inputs
    qclient.update_job_step(
        job_id, "Step 2 of 4: Converting Qiita artifacts to Q2 artifact")
    for k, (fpath, dt) in q2inputs.items():
        if k in ('metadata', 'sample_metadata'):
            metadata = qclient.get(
                "/qiita_db/analysis/%s/metadata/" % str(analysis_id))
            metadata = pd.DataFrame.from_dict(metadata, orient='index')
            # the reason we need to save and load the mapping file is
            # so Qiime2 assigns the expected data types to the columns
            metadata_fp = join(out_dir, 'metadata.txt')
            metadata.to_csv(metadata_fp, index_label='#SampleID', na_rep='',
                            sep='\t', encoding='utf-8')
            q2Metadata = qiime2.Metadata.load(metadata_fp)
            if fpath:
                q2params[k] = q2Metadata.get_column(fpath)
            else:
                q2params[k] = q2Metadata
        elif k == 'FeatureData[Taxonomy]':
            try:
                qza = qiime2.Artifact.import_data(
                    'FeatureData[Taxonomy]', biom_fp, 'BIOMV210Format')
            except Exception:
                return False, None, ('Error generating taxonomy. Are you '
                                     'sure this artifact has taxonomy?')
            q2params['taxonomy'] = qza
        elif fpath is not None:
            if not fpath.endswith('.qza'):
                try:
                    qza = qiime2.Artifact.import_data(dt, fpath)
                except Exception as e:
                    return False, None, 'Error converting "%s": %s' % (
                        str(dt), str(e))
            elif exists(fpath):
                qza = qiime2.Artifact.load(fpath)
            q2params[k] = qza
        else:
            # adding an else for completeness: if we get here then we should
            # ignore the passed parameter/input. By design, this should only
            # happen in one scenario: the user selected an artifact,
            # specifically a tree, that doesn't exist. This was added while
            # solving https://github.com/biocore/qiita/issues/3039. However,
            # in the future it might be useful to always ignore anything that
            # doesn't exist.
            pass

    # if feature_classifier and classify_sklearn we need to transform the
    # input data to sequences
    if q2plugin == 'feature-classifier' and q2method == 'classify_sklearn':
        ainfo = qclient.get("/qiita_db/artifacts/%s/" %
                            parameters['The feature data to be classified.'])
        biom_fp = ainfo['files']['biom'][0]
        plain_text_fp = None
        if 'plain_text' in ainfo['files']:
            plain_text_fp = ainfo['files']['plain_text'][0]
        biom_table = load_table(biom_fp)
        fna_fp = join(out_dir, 'sequences.fna')
        with open(fna_fp, 'w') as f:
            for _id in biom_table.ids(axis='observation'):
                f.write('>{0}\n{0}\n'.format(_id))
        try:
            q2params['reads'] = qiime2.Artifact.import_data(
                'FeatureData[Sequence]', fna_fp)
        except (ValueError, qiime2.core.exceptions.ValidationError) as e:
            msg = str(e)
            if 'DNAFASTAFormat file' in msg:
                msg = ('Table IDs are not sequences; please confirm that this '
                       'is not a closed reference table.')
            return False, None, 'Error converting "%s": %s' % (
                'Input Table', msg)

    qclient.update_job_step(
        job_id, "Step 3 of 4: Running '%s %s'" % (q2plugin, q2method))
    try:
        results = method(**q2params)
    except Exception as e:
        return False, None, 'Error running: %s' % str(e)

    qclient.update_job_step(job_id, "Step 4 of 4: Processing results")
    out_info = []

    # if feature_classifier and classify_sklearn we need to add the taxonomy
    # to the original table and generate the new artifact
    if q2plugin == 'feature-classifier' and q2method == 'classify_sklearn':
        new_biom = join(out_dir, 'feature-table-with-taxonomy.biom')
        new_qza = join(out_dir, 'feature-table-with-taxonomy.qza')
        df = results[0].view(pd.DataFrame)
        df.rename(columns={'Taxon': 'taxonomy'}, inplace=True)
        df['taxonomy'] = [[y.strip() for y in x]
                          for x in df['taxonomy'].str.split(';')]
        biom_table.add_metadata(df.to_dict(orient='index'), axis='observation')
        with biom_open(new_biom, 'w') as bf:
            biom_table.to_hdf5(bf, 'Generated in Qiita')

        qza = qiime2.Artifact.import_data(
            'FeatureTable[Frequency]', new_biom, 'BIOMV210Format')
        qza.save(new_qza)
        ftc_fps = [(new_biom, 'biom'), (new_qza, 'qza')]
        if plain_text_fp is not None:
            # if we enter here, it means that the input artifact had a tree
            # (saved as plain_text); thus, we need to make sure we make a copy
            # so we don't move the original file
            bn = basename(plain_text_fp)
            new_tree_fp = join(out_dir, bn)
            copyfile(ainfo['files']['plain_text'][0], new_tree_fp)
            ftc_fps.append((new_tree_fp, 'plain_text'))
        out_info.append(ArtifactInfo(
            'Feature Table with Classification', 'BIOM', ftc_fps))

    for aname, q2artifact in zip(results._fields, results):
        aout = join(out_dir, aname)
        if isinstance(q2artifact, qiime2.Visualization):
            qzv_fp = q2artifact.save(aout)
            out_info.append(
                ArtifactInfo(aname, 'q2_visualization', [(qzv_fp, 'qzv')]))
        else:
            qza_fp = q2artifact.save(aout + '.qza')
            q2artifact.export_data(output_dir=aout)
            files = listdir(aout)
            if len(files) != 1:
                msg = ('Error processing results: There are some unexpected '
                       'files: "%s"' % ', '.join(files))
                return False, None, msg
            fp = join(aout, files[0])
            # making sure the newly created file comes with the correct
            # permissions for nginx
            chmod(fp, 0o664)

            if (q2artifact.type.name == 'FeatureTable'):
                # Re-add the observation metadata if it exists in the input
                # and if this is not one of the plugins/methods that actually
                # changes that information
                if biom_fp is not None and (q2plugin, q2method) not in [
                        ('taxa', 'collapse')]:
                    fin = load_table(biom_fp)
                    fout = load_table(fp)

                    # making sure that the resulting biom is not empty
                    if fout.shape == (0, 0):
                        msg = ('The resulting table is empty, please review '
                               'your parameters')
                        return False, None, msg

                    metadata = {
                        i: fin.metadata(i, axis='observation')
                        for i in fout.ids(axis='observation')}
                    fout.add_metadata(metadata, axis='observation')
                    with biom_open(fp, 'w') as bf:
                        fout.to_hdf5(bf, "Qiita's Qiime2 plugin with "
                                     "observation metadata")

                # if there is a tree, let's copy it and then add it to
                # the new artifact
                if tree_fp is not None:
                    bn = basename(tree_fp)
                    new_tree_fp = join(
                        out_dir, aout, 'from_%s_%s' % (artifact_id, bn))
                    copyfile(tree_fp, new_tree_fp)
                    ai = ArtifactInfo(aname, 'BIOM', [
                        (fp, 'biom'),
                        (new_tree_fp, 'plain_text'),
                        (qza_fp, 'qza')])
                else:
                    ai = ArtifactInfo(
                        aname, 'BIOM', [(fp, 'biom'), (qza_fp, 'qza')])

            else:
                atype = Q2_QIITA_SEMANTIC_TYPE[q2artifact.type.name]
                ai = ArtifactInfo(
                    aname, atype, [(fp, 'plain_text'), (qza_fp, 'qza')])
            out_info.append(ai)

    return True, out_info, ""
Example #3
    def test_validate_per_sample_FASTQ_preprocessed_fastq(self):
        f1 = join(self.source_dir, 'SKB2.640194_file.fastq')
        f2 = join(self.source_dir, 'SKM4.640180_file.fastq')
        f3 = join(self.source_dir, 'SKB3.640195_file.fastq')
        copyfile(self.fastq, f1)
        copyfile(self.fastq, f2)
        copyfile(self.fastq, f3)
        self._clean_up_files.append(f1)
        self._clean_up_files.append(f2)
        self._clean_up_files.append(f3)

        prep_info = {
            "1.SKB2.640194": {
                "not_a_run_prefix": "prefix1"
            },
            "1.SKM4.640180": {
                "not_a_run_prefix": "prefix1"
            },
            "1.SKB3.640195": {
                "not_a_run_prefix": "prefix2"
            }
        }
        files = {'preprocessed_fastq': [f1, f2, f3]}
        job_id, _ = self._create_template_and_job(prep_info, files,
                                                  "per_sample_FASTQ")
        obs_success, obs_ainfo, obs_error = _validate_per_sample_FASTQ(
            self.qclient, job_id, prep_info, files)
        self.assertTrue(obs_success)
        filepaths = [(f1 + '.gz', 'preprocessed_fastq'),
                     (f2 + '.gz', 'preprocessed_fastq'),
                     (f3 + '.gz', 'preprocessed_fastq')]
        exp = [ArtifactInfo(None, "per_sample_FASTQ", filepaths)]
        self.assertEqual(obs_ainfo, exp)
        self.assertEqual(obs_error, "")
        # making sure the regular fastq files don't exist anymore but
        # the gz files do
        self.assertFalse(exists(f1))
        self.assertTrue(exists(f1 + '.gz'))

        f1 = join(self.source_dir, 'SKB2.640194_file_R1.fastq')
        f2 = join(self.source_dir, 'SKB2.640194_file_R2.fastq')
        f3 = join(self.source_dir, 'SKB2.640194_file_unmatched_R1.fastq')
        f4 = join(self.source_dir, 'SKB2.640194_file_unmatched_R2.fastq')
        f5 = join(self.source_dir, 'SKM4.640180_file_R1.fastq')
        f6 = join(self.source_dir, 'SKM4.640180_file_R2.fastq')
        f7 = join(self.source_dir, 'SKM4.640180_file_unmatched_R1.fastq')
        f8 = join(self.source_dir, 'SKM4.640180_file_unmatched_R2.fastq')
        f9 = join(self.source_dir, 'SKB3.640195_file_R1.fastq')
        fA = join(self.source_dir, 'SKB3.640195_file_R2.fastq')
        fB = join(self.source_dir, 'SKB3.640195_file_unmatched_R1.fastq')
        fC = join(self.source_dir, 'SKB3.640195_file_unmatched_R2.fastq')
        raw_files = [f1, f2, f3, f4, f5, f6, f7, f8, f9, fA, fB, fC]
        for x in raw_files:
            copyfile(self.fastq, x)
            self._clean_up_files.append(x)

        prep_info = {
            "1.SKB2.640194": {
                "not_a_run_prefix": "prefix1"
            },
            "1.SKM4.640180": {
                "not_a_run_prefix": "prefix1"
            },
            "1.SKB3.640195": {
                "not_a_run_prefix": "prefix2"
            }
        }
        files = {'preprocessed_fastq': raw_files}
        job_id, _ = self._create_template_and_job(prep_info, files,
                                                  "per_sample_FASTQ")
        obs_success, obs_ainfo, obs_error = _validate_per_sample_FASTQ(
            self.qclient, job_id, prep_info, files)
        self.assertEqual(obs_error, "")
        self.assertTrue(obs_success)
        filepaths = [('%s.gz' % x, 'preprocessed_fastq') for x in raw_files]
        exp = [ArtifactInfo(None, "per_sample_FASTQ", filepaths)]
        self.assertEqual(obs_ainfo, exp)
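
The assertions at the end of the first block expect each per-sample FASTQ to be gzipped and the plain file removed. A stdlib-only sketch of that gzip-then-remove step, independent of whatever helper the plugin actually uses for it (a later example shows a _gzip_file helper):

import gzip
import shutil
from os import close, remove
from os.path import exists
from tempfile import mkstemp


def gzip_and_remove(fp):
    # write fp.gz from fp, then drop the uncompressed original
    with open(fp, 'rb') as src, gzip.open(fp + '.gz', 'wb') as dst:
        shutil.copyfileobj(src, dst)
    remove(fp)
    return fp + '.gz'


fd, fastq_fp = mkstemp(suffix='.fastq')
close(fd)
with open(fastq_fp, 'w') as f:
    f.write('@seq1\nACGT\n+\nIIII\n')
gz_fp = gzip_and_remove(fastq_fp)
assert not exists(fastq_fp) and exists(gz_fp)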
Example #4
    def test_validate(self):
        # Test artifact type error
        job_id, params = self._create_job(
            'NotAType', {'plan_text': 'Will fail before checking this'}, 1)
        obs_success, obs_ainfo, obs_error = validate(self.qclient, job_id,
                                                     params, self.out_dir)
        self.assertFalse(obs_success)
        self.assertIsNone(obs_ainfo)
        self.assertEqual(
            obs_error, "Unknown artifact type NotAType. Supported types: "
            "alpha_vector, distance_matrix, ordination_results")

        # Test missing metadata error - to be fair, I don't know how this error
        # can happen in the live system, but better be safe than sorry
        job_id, params = self._create_job(
            'distance_matrix', {'plan_text': 'Will fail before checking this'},
            None)
        obs_success, obs_ainfo, obs_error = validate(self.qclient, job_id,
                                                     params, self.out_dir)
        self.assertFalse(obs_success)
        self.assertIsNone(obs_ainfo)
        self.assertEqual(obs_error, "Missing metadata information")

        # Test distance matrix success
        sample_ids = [
            '1.SKM4.640180', '1.SKB8.640193', '1.SKD8.640184', '1.SKM9.640192',
            '1.SKB7.640196'
        ]
        dm_fp = self._create_distance_matrix(sample_ids)
        job_id, params = self._create_job('distance_matrix',
                                          {'plain_text': [dm_fp]}, 1)
        obs_success, obs_ainfo, obs_error = validate(self.qclient, job_id,
                                                     params, self.out_dir)
        self.assertTrue(obs_success)
        html_fp = join(self.out_dir, 'index.html')
        exp_ainfo = [
            ArtifactInfo(None, "distance_matrix", [(dm_fp, 'plain_text'),
                                                   (html_fp, 'html_summary')])
        ]
        self.assertEqual(obs_ainfo, exp_ainfo)
        self.assertEqual(obs_error, "")

        # Test ordination results success
        ord_res_fp = self._create_ordination_results(sample_ids)
        job_id, params = self._create_job('ordination_results',
                                          {'plain_text': [ord_res_fp]}, 1)
        obs_success, obs_ainfo, obs_error = validate(self.qclient, job_id,
                                                     params, self.out_dir)
        self.assertTrue(obs_success)
        html_fp = join(self.out_dir, 'index.html')
        esf_fp = join(self.out_dir, 'emperor_support_files')
        exp_ainfo = [
            ArtifactInfo(None,
                         "ordination_results", [(ord_res_fp, 'plain_text'),
                                                (html_fp, 'html_summary'),
                                                (esf_fp, 'html_summary_dir')])
        ]
        self.assertEqual(obs_ainfo, exp_ainfo)
        self.assertEqual(obs_error, "")

        # Test alpha vector success
        alpha_vector_fp = self._create_alpha_vector(sample_ids)
        job_id, params = self._create_job('alpha_vector',
                                          {'plain_text': [alpha_vector_fp]}, 1)
        obs_success, obs_ainfo, obs_error = validate(self.qclient, job_id,
                                                     params, self.out_dir)
        self.assertTrue(obs_success)
        html_fp = join(self.out_dir, 'index.html')
        sf_fp = join(self.out_dir, 'support_files')
        exp_ainfo = [
            ArtifactInfo(None,
                         "alpha_vector", [(alpha_vector_fp, 'plain_text'),
                                          (html_fp, 'html_summary'),
                                          (sf_fp, 'html_summary_dir')])
        ]
        self.assertEqual(obs_ainfo, exp_ainfo)
        self.assertEqual(obs_error, "")
Example #5
def woltka(qclient, job_id, parameters, out_dir):
    """Run Woltka with the given parameters

    Parameters
    ----------
    qclient : tgp.qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to run Woltka
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    bool, list, str
        The results of the job
    """
    database_taxonomy, database_gene_coordinates = _process_database_files(
        parameters['Database'])

    errors = []
    ainfo = []
    fp_biom = f'{out_dir}/free.biom'
    fp_alng = f'{out_dir}/alignment.tar'
    if exists(fp_biom) and exists(fp_alng):
        ainfo = [
            ArtifactInfo('Alignment Profile', 'BIOM', [(fp_biom, 'biom'),
                                                       (fp_alng, 'log')])
        ]
    else:
        ainfo = []
        errors.append('Missing files from the "Alignment Profile"; please '
                      'contact [email protected] for more information')

    for rank in ['phylum', 'genus', 'species']:
        fp = f'{out_dir}/{rank}.biom'

        if exists(fp):
            # making sure that the tables have taxonomy
            bt = load_table(fp)
            metadata = {
                x: {
                    'taxonomy': x.split(';')
                }
                for x in bt.ids(axis='observation')
            }
            bt.add_metadata(metadata, axis='observation')
            with biom_open(fp, 'w') as f:
                bt.to_hdf5(f, "woltka")

            ainfo.append(
                ArtifactInfo(f'Taxonomic Predictions - {rank}', 'BIOM',
                             [(fp, 'biom')]))
        else:
            errors.append(f'Table {rank} was not created, please contact '
                          '[email protected] for more information')

    fp_biom = f'{out_dir}/none.biom'
    if exists(fp_biom):
        ainfo.append(
            ArtifactInfo('Per genome Predictions', 'BIOM',
                         [(fp_biom, 'biom')]))
    else:
        errors.append('Table none/per-genome was not created, please contact '
                      '[email protected] for more information')

    if database_gene_coordinates is not None:
        fp_biom = f'{out_dir}/per-gene.biom'
        if exists(fp_biom):
            ainfo.append(
                ArtifactInfo('Per gene Predictions', 'BIOM',
                             [(fp_biom, 'biom')]))
        else:
            errors.append('Table per-gene was not created, please contact '
                          '[email protected] for more information')

    if errors:
        return False, ainfo, '\n'.join(errors)
    else:
        return True, ainfo, ""
Example #6
def validate(qclient, job_id, parameters, out_dir):
    """Validate and fix a new BIOM artifact

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to validate and create the artifact
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    bool, list of qiita_client.ArtifactInfo , str
        Whether the job is successful
        The artifact information, if successful
        The error message, if not successful
    """
    prep_id = parameters.get('template')
    analysis_id = parameters.get('analysis')
    files = loads(parameters['files'])
    a_type = parameters['artifact_type']

    if a_type != "BIOM":
        return (False, None, "Unknown artifact type %s. Supported types: BIOM"
                             % a_type)

    qclient.update_job_step(job_id, "Step 1: Collecting metadata")
    if prep_id is not None:
        metadata = qclient.get("/qiita_db/prep_template/%s/data/" % prep_id)
        metadata = metadata['data']
    elif analysis_id is not None:
        metadata = qclient.get("/qiita_db/analysis/%s/metadata/" % analysis_id)
    else:
        return (False, None, "Missing metadata information")

    # Check if the biom table has the same sample ids as the prep info
    qclient.update_job_step(job_id, "Step 2: Validting BIOM file")
    new_biom_fp = biom_fp = files['biom'][0]
    table = load_table(biom_fp)
    metadata_ids = set(metadata)
    biom_sample_ids = set(table.ids())

    if not metadata_ids.issuperset(biom_sample_ids):
        # The BIOM sample ids are different from the ones in the prep template
        qclient.update_job_step(job_id, "Step 3: Fixing BIOM sample ids")
        # Attempt 1: the user provided the run prefix column - in this case
        # the run prefix column holds the sample ids present in the BIOM file
        if 'run_prefix' in metadata[next(iter(metadata_ids))]:
            id_map = {v['run_prefix']: k for k, v in metadata.items()}
        else:
            # Attempt 2: the sample ids in the BIOM table are the same as in
            # the prep template but without the prefix
            prefix = next(iter(metadata_ids)).split('.', 1)[0]
            prefixed = set("%s.%s" % (prefix, s) for s in biom_sample_ids)
            if metadata_ids.issuperset(prefixed):
                id_map = {s: "%s.%s" % (prefix, s) for s in biom_sample_ids}
            else:
                # There is nothing we can do. The samples in the BIOM table do
                # not match the ones in the prep template and we can't fix it
                error_msg = ('The sample ids in the BIOM table do not match '
                             'the ones in the prep information. Please, '
                             'provide the column "run_prefix" in the prep '
                             'information to map the existing sample ids to '
                             'the prep information sample ids.')
                return False, None, error_msg

        # Fix the sample ids
        try:
            table.update_ids(id_map, axis='sample')
        except TableException:
            missing = biom_sample_ids - set(id_map)
            error_msg = ('Your prep information is missing samples that are '
                         'present in your BIOM table: %s' % ', '.join(missing))
            return False, None, error_msg

        new_biom_fp = join(out_dir, basename(biom_fp))
        with biom_open(new_biom_fp, 'w') as f:
            table.to_hdf5(f, "Qiita BIOM type plugin")

    filepaths = [(new_biom_fp, 'biom')]

    # Validate the representative set, if it exists
    if 'preprocessed_fasta' in files:
        repset_fp = files['preprocessed_fasta'][0]

        # The observation ids of the biom table should be the same
        # as the representative sequence ids found in the representative set
        observation_ids = table.ids(axis='observation').tolist()
        extra_ids = []
        for record in load([repset_fp], constructor=FastaIterator):
            rec_id = record['SequenceID'].split()[0]
            try:
                observation_ids.remove(rec_id)
            except ValueError:
                extra_ids.append(rec_id)

        error_msg = []
        if extra_ids:
            error_msg.append("The representative set sequence file includes "
                             "observations not found in the BIOM table: %s"
                             % ', '.join(extra_ids))
        if observation_ids:
            error_msg.append("The representative set sequence file is missing "
                             "observation ids found in the BIOM tabe: %s" %
                             ', '.join(observation_ids))

        if error_msg:
            return False, None, '\n'.join(error_msg)

        filepaths.append((repset_fp, 'preprocessed_fasta'))

    for fp_type, fps in files.items():
        if fp_type not in ('biom', 'preprocessed_fasta'):
            for fp in fps:
                filepaths.append((fp, fp_type))

    return True, [ArtifactInfo(None, 'BIOM', filepaths)], ""
Example #7
def deblur(qclient, job_id, parameters, out_dir):
    """Run deblur with the given parameters

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to run deblur
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    boolean, list, str
        The results of the job

    Notes
    -----
    The code will check if the artifact has a preprocessed_demux element; if
    not, it will use the preprocessed_fastq. We prefer to work with the
    preprocessed_demux as the running time is greatly improved
    """
    out_dir = join(out_dir, 'deblur_out')
    # Step 1: get the rest of the information needed to run deblur
    qclient.update_job_step(job_id, "Step 1 of 3: Collecting information")
    artifact_id = parameters['seqs-fp']
    # removing input from parameters so it's not part of the final command
    del parameters['seqs-fp']

    # Get the artifact filepath information
    artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)
    fps = artifact_info['files']

    # Step 2 generating command deblur
    if 'preprocessed_demux' in fps:
        qclient.update_job_step(
            job_id, "Step 2 of 3: Generating per sample "
            "from demux (1/2)")

        if not exists(out_dir):
            mkdir(out_dir)
        split_out_dir = join(out_dir, 'split')
        if not exists(split_out_dir):
            mkdir(split_out_dir)

        # using the same number of parallel jobs as defined by the command
        n_jobs = parameters['jobs-to-start']
        # [0] because there should be only 1 file
        to_per_sample_files(fps['preprocessed_demux'][0],
                            out_dir=split_out_dir,
                            n_jobs=n_jobs)

        qclient.update_job_step(
            job_id, "Step 2 of 3: Generating per sample "
            "from demux (2/2)")
        out_dir = join(out_dir, 'deblured')
        cmd = generate_deblur_workflow_commands([split_out_dir], out_dir,
                                                parameters)
    else:
        qclient.update_job_step(job_id, "Step 2 of 3: Generating deblur "
                                "command")
        cmd = generate_deblur_workflow_commands(fps['preprocessed_fastq'],
                                                out_dir, parameters)

    # Step 3 execute deblur
    qclient.update_job_step(job_id, "Step 3 of 3: Executing deblur job")
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error running deblur:\nStd out: %s\nStd err: %s" %
                     (std_out, std_err))
        return False, None, error_msg

    # Generating artifact
    pb = partial(join, out_dir)

    # Generate the filepaths
    final_biom = pb('final.biom')
    final_seqs = pb('final.seqs.fa')
    final_biom_16s = pb('final.only-16s.biom')
    final_seqs_na = pb('final.seqs.fa.no_artifacts')

    if not exists(final_biom_16s):
        # Create an empty table. We need to send something to Qiita that is
        # a valid BIOM, so we are going to create an empty table
        t = Table([], [], [])
        with biom_open(final_biom_16s, 'w') as f:
            t.to_hdf5(f, 'qp-deblur generated')

    if not exists(final_seqs_na):
        # Same as before, create an empty sequence file so we can send it
        with open(final_seqs_na, 'w') as f:
            f.write("")

    ainfo = [
        ArtifactInfo('deblur final table',
                     'BIOM', [(final_biom, 'biom'),
                              (final_seqs, 'preprocessed_fasta')]),
        ArtifactInfo('deblur 16S only table', 'BIOM',
                     [(final_biom_16s, 'biom'),
                      (final_seqs_na, 'preprocessed_fasta')])
    ]

    return True, ainfo, ""
Example #8
def _validate_demux_file(qclient,
                         job_id,
                         prep_info,
                         out_dir,
                         demux_fp,
                         fastq_fp=None,
                         fasta_fp=None,
                         log_fp=None):
    """Validate and fix a 'demux' file and regenerate fastq and fasta files

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    prep_info : dict of {str: dict of {str: str}}
        The prep information keyed by sample id
    out_dir : str
        The output directory
    demux_fp : str
        The demux file path
    fastq_fp : str, optional
        The original fastq filepath. If demux is correct, it will not be
        regenerated
    fasta_fp : str, optional
        The original fasta filepath. If demux is correct, it will not be
        regenerated
    log_fp : str, optional
        The original log filepath

    Returns
    -------
    dict
        The results of the job
    """
    pt_sample_ids = set(prep_info)
    with open_file(demux_fp) as f:
        demux_sample_ids = set(f.keys())

    if not pt_sample_ids.issuperset(demux_sample_ids):
        # The demux sample ids are different from the ones in the prep template
        qclient.update_job_step(job_id, "Step 3: Fixing sample ids")
        # Attempt 1: the user provided the run prefix column - in this case
        # the run prefix column holds the sample ids present in the demux file
        if 'run_prefix' in prep_info[next(iter(pt_sample_ids))]:
            id_map = {v['run_prefix']: k for k, v in prep_info.items()}
            if not set(id_map).issuperset(demux_sample_ids):
                error_msg = ('The sample ids in the "run_prefix" columns '
                             'from the prep information do not match the '
                             'ones in the demux file. Please, correct the '
                             'column "run_prefix" in the prep information to '
                             'map the existing sample ids to the prep '
                             'information sample ids.')
                return False, None, error_msg
        else:
            # Attempt 2: the sample ids in the demux table are the same as
            # in the prep template but without the prefix
            prefix = next(iter(pt_sample_ids)).split('.', 1)[0]
            prefixed = set("%s.%s" % (prefix, s) for s in demux_sample_ids)
            if pt_sample_ids.issuperset(prefixed):
                id_map = {s: "%s.%s" % (prefix, s) for s in demux_sample_ids}
            else:
                # There is nothing we can do. The samples in the demux file do
                # not match the ones in the prep template and we can't fix it
                error_msg = ('The sample ids in the demultiplexed files do '
                             'not match the ones in the prep information. '
                             'Please, provide the column "run_prefix" in '
                             'the prep information to map the existing sample'
                             ' ids to the prep information sample ids.')
                return False, None, error_msg

        # Fix the sample ids
        # Do not modify the original demux file, copy it to a new location
        new_demux_fp = join(out_dir, basename(demux_fp))
        # this if is important so we don't regenerate the demux file if the
        # user uploads fastq or fna
        if demux_fp != new_demux_fp:
            copy(demux_fp, new_demux_fp)
            demux_fp = new_demux_fp

        with open_file(demux_fp, 'r+') as f:
            for old in f:
                f.move(old, id_map[old])
        # When we fix, we always regenerate the FASTQ and FASTA files.
        # By setting them to None, they will be generated below
        fastq_fp = None
        fasta_fp = None

    # If we didn't fix anything, we only generate the files if they don't
    # already exist
    name = splitext(basename(demux_fp))[0]
    if not fastq_fp:
        fastq_fp = join(out_dir, "%s.fastq" % name)
        to_ascii_file(demux_fp, fastq_fp, out_format='fastq')
        fastq_fp, error_msg = _gzip_file(fastq_fp)
        if error_msg is not None:
            return False, None, error_msg

    if not fasta_fp:
        fasta_fp = join(out_dir, "%s.fasta" % name)
        to_ascii_file(demux_fp, fasta_fp, out_format='fasta')
        fasta_fp, error_msg = _gzip_file(fasta_fp)
        if error_msg is not None:
            return False, None, error_msg

    filepaths = [(fastq_fp, 'preprocessed_fastq'),
                 (fasta_fp, 'preprocessed_fasta'),
                 (demux_fp, 'preprocessed_demux')]
    if log_fp:
        filepaths.append((log_fp, 'log'))
    return True, [ArtifactInfo(None, 'Demultiplexed', filepaths)], ""
Example #9
    def test_validate_representative_set(self):
        sample_ids = [
            '1.SKB2.640194', '1.SKM4.640180', '1.SKB3.640195', '1.SKB6.640176',
            '1.SKD6.640190', '1.SKM6.640187', '1.SKD9.640182', '1.SKM8.640201',
            '1.SKM2.640199'
        ]
        biom_fp, job_id, parameters = self._create_job_and_biom(sample_ids,
                                                                template=1)

        fd, fasta_fp = mkstemp(suffix=".fna")
        close(fd)
        with open(fasta_fp, 'w') as f:
            f.write(">O1 something\nACTG\n>O2\nATGC\n")
        self._clean_up_files.append(fasta_fp)
        exp_fp = partial(join, self.out_dir)
        exp_index_fp = exp_fp('index.html')
        exp_viz_fp = exp_fp('support_files')
        exp_qza_fp = exp_fp('feature-table.qza')
        with open(exp_index_fp, 'w') as f:
            f.write("my html")
        mkdir(exp_viz_fp)

        parameters = {
            'template': parameters['template'],
            'files': dumps({
                'biom': [biom_fp],
                'preprocessed_fasta': [fasta_fp]
            }),
            'artifact_type': 'BIOM'
        }

        obs_success, obs_ainfo, obs_error = validate(self.qclient, job_id,
                                                     parameters, self.out_dir)
        self.assertTrue(obs_success)
        files = [(biom_fp, 'biom'), (fasta_fp, 'preprocessed_fasta'),
                 (exp_index_fp, 'html_summary'),
                 (exp_viz_fp, 'html_summary_dir'), (exp_qza_fp, 'qza')]
        self.assertEqual(obs_ainfo, [ArtifactInfo(None, 'BIOM', files)])
        self.assertEqual(obs_error, "")

        # Extra ids
        with open(fasta_fp, 'w') as f:
            f.write(">O1 something\nACTG\n>O2\nATGC\n>O3\nATGC\n")
        obs_success, obs_ainfo, obs_error = validate(self.qclient, job_id,
                                                     parameters, self.out_dir)
        self.assertFalse(obs_success)
        self.assertIsNone(obs_ainfo)
        self.assertEqual(
            obs_error,
            "The representative set sequence file includes observations not "
            "found in the BIOM table: O3")

        # Missing ids
        with open(fasta_fp, 'w') as f:
            f.write(">O1 something\nACTG\n")
        obs_success, obs_ainfo, obs_error = validate(self.qclient, job_id,
                                                     parameters, self.out_dir)
        self.assertFalse(obs_success)
        self.assertIsNone(obs_ainfo)
        self.assertEqual(
            obs_error,
            "The representative set sequence file is missing observation ids "
            "found in the BIOM tabe: O2")
Example #10
def shogun(qclient, job_id, parameters, out_dir):
    """Run Shogun with the given parameters

    Parameters
    ----------
    qclient : tgp.qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to run Shogun
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    bool, list, str
        The results of the job
    """
    # Step 1: get the rest of the information needed to run Shogun
    qclient.update_job_step(job_id, "Step 1 of 7: Collecting information")
    artifact_id = parameters['input']
    del parameters['input']

    # Get the artifact filepath information
    artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)
    fps = artifact_info['files']

    # Get the artifact metadata
    prep_info = qclient.get('/qiita_db/prep_template/%s/' %
                            artifact_info['prep_information'][0])
    qiime_map = prep_info['qiime-map']

    # Step 2 converting to fna
    qclient.update_job_step(job_id,
                            "Step 2 of 7: Converting to FNA for Shogun")

    with TemporaryDirectory(dir=out_dir, prefix='shogun_') as temp_dir:
        rs = fps['raw_reverse_seqs'] if 'raw_reverse_seqs' in fps else []
        samples = make_read_pairs_per_sample(fps['raw_forward_seqs'], rs,
                                             qiime_map)

        # Combining files
        comb_fp = generate_fna_file(temp_dir, samples)

        # Formatting parameters
        parameters = _format_params(parameters, SHOGUN_PARAMS)

        # Step 3 align
        sys_msg = "Step 3 of 7: Aligning FNA with Shogun (%d/{0})"
        align_cmd = generate_shogun_align_commands(comb_fp, temp_dir,
                                                   parameters)
        success, msg = _run_commands(qclient, job_id, align_cmd, sys_msg,
                                     'Shogun Align')

        if not success:
            return False, None, msg

        # Step 4 taxonomic profile
        sys_msg = "Step 4 of 7: Taxonomic profile with Shogun (%d/{0})"
        assign_cmd, profile_fp = generate_shogun_assign_taxonomy_commands(
            temp_dir, parameters)
        success, msg = _run_commands(qclient, job_id, assign_cmd, sys_msg,
                                     'Shogun taxonomy assignment')

        if not success:
            return False, None, msg

        # Step 5 redistribute profile
        sys_msg = "Step 5 of 7: Redistributed profile with Shogun (%d/{0})"
        levels = ['genus', 'species', 'strain']
        redist_fps = []
        for level in levels:
            redist_cmd, output = generate_shogun_redist_commands(
                profile_fp, temp_dir, parameters, level)
            redist_fps.append(output)
            success, msg = _run_commands(qclient, job_id, redist_cmd, sys_msg,
                                         'Shogun redistribute')
            if not success:
                return False, None, msg

        # Step 6 functional profile
        sys_msg = "Step 6 of 7: Functional profile with Shogun (%d/{0})"
        levels = ['species']
        func_fp = ''
        for level in levels:
            func_cmd, output = generate_shogun_functional_commands(
                profile_fp, temp_dir, parameters, level)
            func_fp = output
            success, msg = _run_commands(qclient, job_id, func_cmd, sys_msg,
                                         'Shogun functional')
            if not success:
                return False, None, msg
        # Step 7 convert results to BIOM
        sys_msg = "Step 7 of 7: Converting results to BIOM (%d/{0})"
        func_biom_outputs = []
        redist_biom_outputs = []
        # Converting redistributed files to biom
        redist_levels = ['genus', 'species', 'strain']
        for redist_fp, level in zip(redist_fps, redist_levels):
            biom_cmd, output = generate_biom_conversion_commands(
                redist_fp, out_dir, level, 'redist')
            success, msg = _run_commands(qclient, job_id, biom_cmd, sys_msg,
                                         'Redistribute Biom conversion')
            if not success:
                return False, None, msg
            else:
                redist_biom_outputs.append(output)
        # Converting functional files to biom
        for level in levels:

            func_to_biom_fps = [
                "kegg.modules.coverage", "kegg.modules",
                "kegg.pathways.coverage", "kegg.pathways", "kegg", "normalized"
            ]
            for biom_in in func_to_biom_fps:
                biom_in_fp = join(func_fp,
                                  "profile.%s.%s.txt" % (level, biom_in))
                biom_cmd, output = generate_biom_conversion_commands(
                    biom_in_fp, out_dir, level, biom_in)
                success, msg = _run_commands(qclient, job_id, biom_cmd,
                                             sys_msg,
                                             ' Functional Biom conversion')
                if not success:
                    return False, None, msg
                else:
                    func_biom_outputs.append(output)
    func_files_type_name = 'Functional Predictions'
    redist_files_type_name = 'Taxonomic Predictions'
    ainfo = [
        ArtifactInfo(func_files_type_name, 'BIOM', func_biom_outputs),
        ArtifactInfo(redist_files_type_name, 'BIOM', redist_biom_outputs)
    ]

    return True, ainfo, ""
Example #11
    def test_validate_per_sample_FASTQ_preprocessed_fastq(self):
        prep_info = {
            "1.SKB2.640194": {
                "not_a_run_prefix": "prefix1"
            },
            "1.SKM4.640180": {
                "not_a_run_prefix": "prefix1"
            },
            "1.SKB3.640195": {
                "not_a_run_prefix": "prefix2"
            }
        }
        files = {
            'preprocessed_fastq': [
                '/path/to/SKB2.640194_file.fastq',
                '/path/to/SKM4.640180_file.fastq',
                '/path/to/SKB3.640195_file.fastq'
            ]
        }
        job_id = self._create_template_and_job(prep_info, files,
                                               "per_sample_FASTQ")
        obs_success, obs_ainfo, obs_error = _validate_per_sample_FASTQ(
            self.qclient, job_id, prep_info, files)
        self.assertTrue(obs_success)
        filepaths = [('/path/to/SKB2.640194_file.fastq', 'preprocessed_fastq'),
                     ('/path/to/SKM4.640180_file.fastq', 'preprocessed_fastq'),
                     ('/path/to/SKB3.640195_file.fastq', 'preprocessed_fastq')]
        exp = [ArtifactInfo(None, "per_sample_FASTQ", filepaths)]
        self.assertEqual(obs_ainfo, exp)
        self.assertEqual(obs_error, "")

        prep_info = {
            "1.SKB2.640194": {
                "not_a_run_prefix": "prefix1"
            },
            "1.SKM4.640180": {
                "not_a_run_prefix": "prefix1"
            },
            "1.SKB3.640195": {
                "not_a_run_prefix": "prefix2"
            }
        }
        files = {
            'preprocessed_fastq': [
                '/path/to/SKB2.640194_file_R1.fastq',
                '/path/to/SKB2.640194_file_R2.fastq',
                '/path/to/SKB2.640194_file_unmatched_R1.fastq',
                '/path/to/SKB2.640194_file_unmatched_R2.fastq',
                '/path/to/SKM4.640180_file_R1.fastq',
                '/path/to/SKM4.640180_file_R2.fastq',
                '/path/to/SKM4.640180_file_unmatched_R1.fastq',
                '/path/to/SKM4.640180_file_unmatched_R2.fastq',
                '/path/to/SKB3.640195_file_R1.fastq',
                '/path/to/SKB3.640195_file_R2.fastq',
                '/path/to/SKB3.640195_file_unmatched_R1.fastq',
                '/path/to/SKB3.640195_file_unmatched_R2.fastq'
            ]
        }
        job_id = self._create_template_and_job(prep_info, files,
                                               "per_sample_FASTQ")
        obs_success, obs_ainfo, obs_error = _validate_per_sample_FASTQ(
            self.qclient, job_id, prep_info, files)
        self.assertTrue(obs_success)
        filepaths = [
            ('/path/to/SKB2.640194_file_R1.fastq', 'preprocessed_fastq'),
            ('/path/to/SKB2.640194_file_R2.fastq', 'preprocessed_fastq'),
            ('/path/to/SKB2.640194_file_unmatched_R1.fastq',
             'preprocessed_fastq'),
            ('/path/to/SKB2.640194_file_unmatched_R2.fastq',
             'preprocessed_fastq'),
            ('/path/to/SKM4.640180_file_R1.fastq', 'preprocessed_fastq'),
            ('/path/to/SKM4.640180_file_R2.fastq', 'preprocessed_fastq'),
            ('/path/to/SKM4.640180_file_unmatched_R1.fastq',
             'preprocessed_fastq'),
            ('/path/to/SKM4.640180_file_unmatched_R2.fastq',
             'preprocessed_fastq'),
            ('/path/to/SKB3.640195_file_R1.fastq', 'preprocessed_fastq'),
            ('/path/to/SKB3.640195_file_R2.fastq', 'preprocessed_fastq'),
            ('/path/to/SKB3.640195_file_unmatched_R1.fastq',
             'preprocessed_fastq'),
            ('/path/to/SKB3.640195_file_unmatched_R2.fastq',
             'preprocessed_fastq')
        ]
        exp = [ArtifactInfo(None, "per_sample_FASTQ", filepaths)]
        self.assertEqual(obs_ainfo, exp)
        self.assertEqual(obs_error, "")
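
As a side note, the long hand-written filepaths list in this test can be derived directly from the files dict; a small equivalent sketch for illustration only (shortened file list, not part of the original test):

files = {
    'preprocessed_fastq': [
        '/path/to/SKB2.640194_file_R1.fastq',
        '/path/to/SKB2.640194_file_R2.fastq',
    ]
}
filepaths = [(fp, fp_type)
             for fp_type, fps in files.items()
             for fp in fps]
assert filepaths == [
    ('/path/to/SKB2.640194_file_R1.fastq', 'preprocessed_fastq'),
    ('/path/to/SKB2.640194_file_R2.fastq', 'preprocessed_fastq'),
]
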
Exemplo n.º 12
0
def _validate_multiple(qclient, job_id, prep_info, files, atype):
    """Validate and fix a new 'SFF', 'FASTQ', 'FASTA' or 'FASTA_Sanger' artifact

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    prep_info : dict of {str: dict of {str: str}}
        The prep information keyed by sample id
    files : dict of {str: list of str}
        The files to add to the new artifact, keyed by filepath type
    atype: str
        The type of the artifact

    Returns
    -------
    bool, list, str
        The results of the job
    """
    qclient.update_job_step(job_id, "Step 2: Validating '%s' files" % atype)
    req_fp_types, opt_fp_types = FILEPATH_TYPE_DICT[atype]
    all_fp_types = req_fp_types | opt_fp_types

    # Check if there is any filepath type that is not supported
    unsupported_fp_types = set(files) - all_fp_types
    if unsupported_fp_types:
        error_msg = ("Filepath type(s) %s not supported by artifact "
                     "type %s. Supported filepath types: %s" %
                     (', '.join(unsupported_fp_types), atype, ', '.join(
                         sorted(all_fp_types))))
        return False, None, error_msg

    # Check if the run_prefix column is present in the prep info
    offending = {}
    types_seen = set()
    if 'run_prefix' in prep_info[next(iter(prep_info))]:
        # We can potentially have more than one lane in the prep information
        # so check that the provided files are prefixed with the values in
        # the run_prefix column
        run_prefixes = set(v['run_prefix'] for k, v in prep_info.items())
        num_prefixes = len(run_prefixes)

        # Check those filepath types that are required
        for ftype, t_files in files.items():
            # SFF is a special case because we can have multiple files with
            # the same prefix
            if num_prefixes != len(t_files) and atype != 'SFF':
                offending[ftype] = (
                    "The number of provided files (%d) doesn't match the "
                    "number of run prefix values in the prep info (%d): %s" %
                    (len(t_files), num_prefixes, ', '.join(
                        basename(f) for f in t_files)))
            else:
                rps = []
                fps = []
                for fp in t_files:
                    bn = basename(fp)
                    found = [rp for rp in run_prefixes if bn.startswith(rp)]
                    if found:
                        rps.extend(found)
                    else:
                        fps.append(bn)
                if fps:
                    offending[ftype] = (
                        "The provided files do not match the run prefix "
                        "values in the prep information: %s" % ', '.join(fps))
                else:
                    rps = run_prefixes - set(rps)
                    if rps:
                        offending[ftype] = (
                            "The following run prefixes in the prep "
                            "information file do not match any file: %s" %
                            ', '.join(rps))

            types_seen.add(ftype)
    else:
        # If the run prefix column is not provided, we only allow a single
        # lane, so check that we have a single file for each provided
        # filepath type
        for ftype, t_files in files.items():
            if len(t_files) != 1:
                offending[ftype] = (
                    "Only one file per type is allowed. Please provide the "
                    "column 'run_prefix' if you need more than one file per "
                    "type: %s" % ', '.join(basename(fp) for fp in t_files))

            types_seen.add(ftype)

    # Check that all required filepath types were present
    missing = req_fp_types - types_seen
    if missing:
        error_msg = ("Missing required filepath type(s): %s" %
                     ', '.join(missing))
        return False, None, error_msg

    # Check if there was any offending file
    if offending:
        error_list = ["%s: %s" % (k, v) for k, v in offending.items()]
        error_msg = ("Error creating artifact. Offending files:\n%s" %
                     '\n'.join(error_list))
        return False, None, error_msg

    # Everything is ok
    filepaths = []
    for fps_type, fps in files.items():
        filepaths.extend([(fp, fps_type) for fp in fps])

    return True, [ArtifactInfo(None, atype, filepaths)], ""
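
FILEPATH_TYPE_DICT is unpacked above as a (required, optional) pair of filepath-type sets per artifact type. The layout below is a plausible minimal sketch, assumed from how the function uses it rather than copied from the validator:

FILEPATH_TYPE_DICT = {
    'SFF': ({'raw_sff'}, set()),
    'FASTQ': ({'raw_forward_seqs', 'raw_barcodes'}, {'raw_reverse_seqs'}),
    'FASTA': ({'raw_fasta'}, {'raw_qual'}),
    'FASTA_Sanger': ({'raw_fasta'}, set()),
}

# usage matching the function above
req_fp_types, opt_fp_types = FILEPATH_TYPE_DICT['FASTQ']
all_fp_types = req_fp_types | opt_fp_types
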
Exemplo n.º 13
0
def shogun(qclient, job_id, parameters, out_dir):
    """Run Shogun with the given parameters

    Parameters
    ----------
    qclient : tgp.qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to run Shogun
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    bool, list, str
        The results of the job
    """
    # Step 1 get the rest of the information needed to run Shogun
    qclient.update_job_step(job_id, "Step 1 of 6: Collecting information")
    artifact_id = parameters['input']
    del parameters['input']

    # Get the artifact filepath information
    artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)
    fps = artifact_info['files']

    # Get the artifact metadata
    prep_info = qclient.get('/qiita_db/prep_template/%s/' %
                            artifact_info['prep_information'][0])
    qiime_map = prep_info['qiime-map']

    # Step 2 converting to fna
    qclient.update_job_step(job_id,
                            "Step 2 of 6: Converting to FNA for Shogun")

    rs = fps['raw_reverse_seqs'] if 'raw_reverse_seqs' in fps else []
    samples = make_read_pairs_per_sample(fps['raw_forward_seqs'], rs,
                                         qiime_map)

    # Combining files
    comb_fp = generate_fna_file(out_dir, samples)

    # Formatting parameters
    parameters = _format_params(parameters, SHOGUN_PARAMS)

    # Step 3 align
    align_cmd = generate_shogun_align_commands(comb_fp, out_dir, parameters)
    sys_msg = "Step 3 of 6: Aligning FNA with Shogun (%d/{0})".format(
        len(align_cmd))
    success, msg = _run_commands(qclient, job_id, align_cmd, sys_msg,
                                 'Shogun Align')

    if not success:
        return False, None, msg

    # Step 4 taxonomic profile
    sys_msg = "Step 4 of 6: Taxonomic profile with Shogun (%d/{0})"
    assign_cmd, profile_fp = generate_shogun_assign_taxonomy_commands(
        out_dir, parameters)
    success, msg = _run_commands(qclient, job_id, assign_cmd, sys_msg,
                                 'Shogun taxonomy assignment')
    if not success:
        return False, None, msg

    sys_msg = "Step 5 of 6: Compressing and converting alignment to BIOM"
    qclient.update_job_step(job_id, msg)
    alignment_fp = join(
        out_dir, 'alignment.%s.%s' %
        (parameters['aligner'], ALN2EXT[parameters['aligner']]))
    xz_cmd = 'xz -9 -T%s %s' % (parameters['threads'], alignment_fp)
    std_out, std_err, return_value = system_call(xz_cmd)
    if return_value != 0:
        error_msg = ("Error during %s:\nStd out: %s\nStd err: %s"
                     "\n\nCommand run was:\n%s" %
                     (sys_msg, std_out, std_err, xz_cmd))
        return False, None, error_msg
    output = run_shogun_to_biom(profile_fp, [None, None, None, True], out_dir,
                                'profile')

    ainfo = [
        ArtifactInfo('Shogun Alignment Profile', 'BIOM',
                     [(output, 'biom'), ('%s.xz' % alignment_fp, 'log')])
    ]

    # Step 6 redistribute profile
    sys_msg = "Step 6 of 6: Redistributed profile with Shogun (%d/{0})"
    levels = ['phylum', 'genus', 'species']
    redist_fps = []
    for level in levels:
        redist_cmd, output = generate_shogun_redist_commands(
            profile_fp, out_dir, parameters, level)
        redist_fps.append(output)
        success, msg = _run_commands(qclient, job_id, redist_cmd, sys_msg,
                                     'Shogun redistribute')
        if not success:
            return False, None, msg
    # Converting redistributed files to biom
    for redist_fp, level in zip(redist_fps, levels):
        biom_in = ["redist", None, '', True]
        output = run_shogun_to_biom(redist_fp, biom_in, out_dir, level,
                                    'redist')
        aname = 'Taxonomic Predictions - %s' % level
        ainfo.append(ArtifactInfo(aname, 'BIOM', [(output, 'biom')]))

    return True, ainfo, ""
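
_format_params and SHOGUN_PARAMS are not shown in this example. A minimal sketch of the renaming they presumably perform, mapping Qiita-facing parameter labels to the CLI names used later ('aligner' and 'threads' appear in the code above; the label strings and the 'database' entry are assumptions):

SHOGUN_PARAMS = {
    'Database': 'database',
    'Aligner tool': 'aligner',
    'Number of threads': 'threads',
}


def _format_params(parameters, param_map):
    """Return a new dict keyed by the tool-facing parameter names."""
    return {cli_name: parameters[label]
            for label, cli_name in param_map.items()
            if label in parameters}
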
Exemplo n.º 14
0
def humann2(qclient, job_id, parameters, out_dir):
    """Run humann2 with the given parameters

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to run HUMAnN2
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    boolean, list, str
        The results of the job
    """
    # Step 1 get the rest of the information needed to run humann2
    qclient.update_job_step(job_id, "Step 1 of 6: Collecting information")
    artifact_id = parameters['input']
    # removing input from parameters so it's not part of the final command
    del parameters['input']

    # Get the artifact filepath information
    artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)
    fps = artifact_info['files']

    # Get the artifact metadata
    prep_info = qclient.get('/qiita_db/prep_template/%s/' %
                            artifact_info['prep_information'][0])
    qiime_map = prep_info['qiime-map']

    # Step 2 generating command humann2
    qclient.update_job_step(job_id, "Step 2 of 6: Generating HUMANn2 command")
    rs = fps['raw_reverse_seqs'] if 'raw_reverse_seqs' in fps else []
    commands = generate_humann2_analysis_commands(fps['raw_forward_seqs'], rs,
                                                  qiime_map, out_dir,
                                                  parameters)

    # Step 3 execute humann2
    msg = "Step 3 of 6: Executing HUMANn2 job (%d/{0})".format(len(commands))
    success, msg = _run_commands(qclient, job_id, commands, msg)
    if not success:
        return False, None, msg

    # Step 4 merge tables
    commands = []
    commands.append(('humann2_join_tables -i {0} -o {0}/genefamilies.biom '
                     '--file_name genefamilies --search-subdirectories '
                     '--verbose').format(out_dir))
    commands.append(('humann2_join_tables -i {0} -o {0}/pathcoverage.biom '
                     '--file_name pathcoverage --search-subdirectories '
                     '--verbose').format(out_dir))
    commands.append(('humann2_join_tables -i {0} -o {0}/pathabundance.biom '
                     '--file_name pathabundance --search-subdirectories '
                     '--verbose').format(out_dir))
    msg = "Step 4 of 6: Merging resulting tables job (%d/3)"
    success, msg = _run_commands(qclient, job_id, commands, msg)
    if not success:
        return False, None, msg

    # Step 5 generating re-normalized tables
    commands = []
    commands.append(('humann2_renorm_table -i {0}/genefamilies.biom -u cpm '
                     '-o {0}/genefamilies_cpm.biom').format(out_dir))
    commands.append(('humann2_renorm_table -i {0}/pathcoverage.biom -u relab '
                     '-o {0}/pathcoverage_relab.biom').format(out_dir))
    commands.append(('humann2_renorm_table -i {0}/pathabundance.biom -u relab '
                     '-o {0}/pathabundance_relab.biom').format(out_dir))
    msg = "Step 5 of 6: Re-normalizing tables (%d/3)"
    success, msg = _run_commands(qclient, job_id, commands, msg)
    if not success:
        return False, None, msg

    # Step 6 stratifying re-normalized tables
    commands = []
    pb = partial(join, out_dir)
    cmd = "humann2_split_stratified_table --input %s --output %s"
    commands.append(cmd % (pb(out_dir, 'genefamilies_cpm.biom'), out_dir))
    commands.append(cmd % (pb(out_dir, 'pathcoverage_relab.biom'), out_dir))
    commands.append(cmd % (pb(out_dir, 'pathabundance_relab.biom'), out_dir))
    msg = "Step 6 of 6: Stratifiying re-normalizing tables (%d/3)"
    success, msg = _run_commands(qclient, job_id, commands, msg)
    if not success:
        return False, None, msg

    # Generating 12 artifacts; the separation is important for analysis
    ainfo = [
        ArtifactInfo('Gene family table', 'BIOM',
                     [(pb('genefamilies.biom'), 'biom')]),
        ArtifactInfo('Path coverage table', 'BIOM',
                     [(pb('pathcoverage.biom'), 'biom')]),
        ArtifactInfo('Path abundance table', 'BIOM',
                     [(pb('pathabundance.biom'), 'biom')]),
        ArtifactInfo('Gene family CMP table', 'BIOM',
                     [(pb('genefamilies_cpm.biom'), 'biom')]),
        ArtifactInfo('Path coverage RELAB table', 'BIOM',
                     [(pb('pathcoverage_relab.biom'), 'biom')]),
        ArtifactInfo('Path abundance RELAB table', 'BIOM',
                     [(pb('pathabundance_relab.biom'), 'biom')]),
        ArtifactInfo('Gene family CMP table - stratified', 'BIOM',
                     [(pb('genefamilies_cpm_stratified.biom'), 'biom')]),
        ArtifactInfo('Path coverage RELAB table - stratified', 'BIOM',
                     [(pb('pathcoverage_relab_stratified.biom'), 'biom')]),
        ArtifactInfo('Path abundance RELAB table - stratified', 'BIOM',
                     [(pb('pathabundance_relab_stratified.biom'), 'biom')]),
        ArtifactInfo('Gene family CMP table - unstratified', 'BIOM',
                     [(pb('genefamilies_cpm_unstratified.biom'), 'biom')]),
        ArtifactInfo('Path coverage RELAB table - unstratified', 'BIOM',
                     [(pb('pathcoverage_relab_unstratified.biom'), 'biom')]),
        ArtifactInfo('Path abundance RELAB table - unstratified', 'BIOM',
                     [(pb('pathabundance_relab_unstratified.biom'), 'biom')])
    ]

    return True, ainfo, ""
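
The pb = partial(join, out_dir) idiom used above simply pre-binds the output directory, so pb('genefamilies.biom') is join(out_dir, 'genefamilies.biom'). A tiny self-contained illustration (the directory name is arbitrary):

from functools import partial
from os.path import join

out_dir = '/tmp/humann2_out'
pb = partial(join, out_dir)
assert pb('genefamilies.biom') == join(out_dir, 'genefamilies.biom')
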
Exemplo n.º 15
0
    def test_woltka_to_array_wol(self):
        # inserting new prep template
        prep_info_dict = {
            'SKB8.640193': {
                'run_prefix': 'S22205_S104_L001_R1'
            },
            'SKD8.640184': {
                'run_prefix': 'S22282_S102_L001_R1'
            }
        }
        database = join(self.db_path, 'wol/WoLmin')

        pid, aid, job_id = self._helper_woltka_bowtie(prep_info_dict, database)

        out_dir = mkdtemp()
        self._clean_up_files.append(out_dir)

        # retrieving info of the prep/artifact just created
        artifact_info = self.qclient.get("/qiita_db/artifacts/%s/" % aid)
        directory = {
            dirname(ffs)
            for _, fs in artifact_info['files'].items() for ffs in fs
        }
        directory = directory.pop()
        prep_info = artifact_info['prep_information']
        prep_info = self.qclient.get('/qiita_db/prep_template/%s/' %
                                     prep_info[0])
        prep_file = prep_info['prep-file']

        url = 'this-is-my-url'
        main_qsub_fp, merge_qsub_fp = woltka_to_array(directory, out_dir,
                                                      database, prep_file, url,
                                                      job_id)

        self.assertEqual(join(out_dir, f'{job_id}.qsub'), main_qsub_fp)
        self.assertEqual(join(out_dir, f'{job_id}.merge.qsub'), merge_qsub_fp)

        with open(main_qsub_fp) as f:
            main_qsub = f.readlines()
        with open(merge_qsub_fp) as f:
            merge_qsub = f.readlines()

        exp_main_qsub = [
            '#!/bin/bash\n', '#PBS -M [email protected]\n',
            f'#PBS -N {job_id}\n', '#PBS -l nodes=1:ppn=8\n',
            '#PBS -l walltime=10:00:00\n', '#PBS -l mem=64g\n',
            f'#PBS -o {out_dir}/{job_id}_'
            '${PBS_ARRAYID}.log\n', f'#PBS -e {out_dir}/{job_id}_'
            '${PBS_ARRAYID}.err\n', '#PBS -t 1-2%8\n', f'cd {out_dir}\n',
            f'{self.environment}\n', 'date\n', 'hostname\n',
            'offset=${PBS_ARRAYID}\n', 'step=$(( $offset - 0 ))\n',
            'if [[ $step -gt 2 ]]; then exit 0; fi\n',
            f'args0=$(head -n $step {out_dir}/{job_id}.array-details'
            ' | tail -n 1)\n',
            "infile0=$(echo -e $args0 | awk '{ print $1 }')\n",
            "outfile0=$(echo -e $args0 | awk '{ print $2 }')\n", 'set -e\n',
            'cat $infile0*.fastq.gz > $outfile0.fastq.gz; bowtie2 -p 8 -x '
            f'{database} -q $outfile0.fastq.gz -S $outfile0.sam --seed 42 '
            '--very-sensitive -k 16 --np 1 --mp "1,1" --rdg "0,1" --rfg "0,1" '
            '--score-min "L,0,-0.05" --no-head --no-unal; woltka classify '
            '-i $outfile0.sam -o $outfile0.woltka-taxa --no-demux '
            f'--lineage {database}.tax --rank phylum,genus,species,free,none; '
            f'woltka classify -i $outfile0.sam -c {database}.coords '
            '-o $outfile0.woltka-per-gene --no-demux; xz -9 -T8 -c '
            '$outfile0.sam > $outfile0.xz\n', 'set +e\n', 'date\n'
        ]
        self.assertEqual(main_qsub, exp_main_qsub)

        exp_merge_qsub = [
            '#!/bin/bash\n', '#PBS -M [email protected]\n',
            f'#PBS -N merge-{job_id}\n', '#PBS -l nodes=1:ppn=6\n',
            '#PBS -l walltime=4:00:00\n', '#PBS -l mem=48g\n',
            f'#PBS -o {out_dir}/merge-{job_id}.log\n',
            f'#PBS -e {out_dir}/merge-{job_id}.err\n', f'cd {out_dir}\n',
            f'{self.environment}\n', 'date\n', 'hostname\n', 'set -e\n',
            f'woltka_merge --prep {prep_file} --base {out_dir}  --name '
            'phylum --glob "*.woltka-taxa/phylum.biom" &\n',
            f'woltka_merge --prep {prep_file} --base {out_dir}  --name '
            'genus --glob "*.woltka-taxa/genus.biom" &\n',
            f'woltka_merge --prep {prep_file} --base {out_dir}  --name '
            'species --glob "*.woltka-taxa/species.biom" &\n',
            f'woltka_merge --prep {prep_file} --base {out_dir}  --name '
            'free --glob "*.woltka-taxa/free.biom" &\n',
            f'woltka_merge --prep {prep_file} --base {out_dir}  --name '
            'none --glob "*.woltka-taxa/none.biom" &\n',
            f'woltka_merge --prep {prep_file} --base {out_dir}  --name '
            'per-gene --glob "*.woltka-per-gene" --rename &\n', 'wait\n',
            f'cd {out_dir}; tar -cvf alignment.tar *.sam.xz\n',
            f'finish_woltka {url} {job_id} {out_dir}\n', 'date\n'
        ]
        self.assertEqual(merge_qsub, exp_merge_qsub)

        # now let's test that it finished correctly
        sdir = 'qp_woltka/support_files/'
        copyfile(f'{sdir}/genus.biom', f'{out_dir}/genus.biom')
        copyfile(f'{sdir}/none.biom', f'{out_dir}/none.biom')
        copyfile(f'{sdir}/per-gene.biom', f'{out_dir}/per-gene.biom')
        copyfile(f'{sdir}/species.biom', f'{out_dir}/species.biom')
        copyfile(f'{sdir}/phylum.biom', f'{out_dir}/phylum.biom')
        copyfile(f'{sdir}/free.biom', f'{out_dir}/free.biom')
        copyfile(f'{sdir}/alignment.tar', f'{out_dir}/alignment.tar')

        success, ainfo, msg = woltka(self.qclient, job_id, self.params,
                                     out_dir)

        self.assertEqual("", msg)
        self.assertTrue(success)

        exp = [
            ArtifactInfo('Alignment Profile', 'BIOM',
                         [(f'{out_dir}/free.biom', 'biom'),
                          (f'{out_dir}/alignment.tar', 'log')]),
            ArtifactInfo('Taxonomic Predictions - phylum', 'BIOM',
                         [(f'{out_dir}/phylum.biom', 'biom')]),
            ArtifactInfo('Taxonomic Predictions - genus', 'BIOM',
                         [(f'{out_dir}/genus.biom', 'biom')]),
            ArtifactInfo('Taxonomic Predictions - species', 'BIOM',
                         [(f'{out_dir}/species.biom', 'biom')]),
            ArtifactInfo('Per genome Predictions', 'BIOM',
                         [(f'{out_dir}/none.biom', 'biom')]),
            ArtifactInfo('Per gene Predictions', 'BIOM',
                         [(f'{out_dir}/per-gene.biom', 'biom')])
        ]

        self.assertCountEqual(ainfo, exp)

        # check that the produced table have feature taxonomy
        bt = load_table(f'{out_dir}/phylum.biom')
        self.assertCountEqual(
            bt.metadata_to_dataframe('observation').columns,
            ['taxonomy_0', 'taxonomy_1'])
Exemplo n.º 16
0
def _validate_per_sample_FASTQ(qclient, job_id, prep_info, files, test=False):
    """Validate and fix a new 'per_sample_FASTQ' artifact

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    prep_info : dict of {str: dict of {str: str}}
        The prep information keyed by sample id
    files : dict of {str: list of str}
        The files to add to the new artifact, keyed by filepath type
    test : boolean, optional
        If True this is being called by a test

    Returns
    -------
    bool, list, str
        The results of the job
    """
    qclient.update_job_step(job_id,
                            "Step 2: Validating 'per_sample_FASTQ' files")

    samples = list(prep_info.keys())
    samples_count = len(samples)

    # Check if there is any filepath type that is not supported
    unsupported_fp_types = set(files) - {
        'raw_forward_seqs', 'raw_reverse_seqs', 'preprocessed_fastq'
    }
    if unsupported_fp_types:
        error_msg = ("Filepath type(s) %s not supported by artifact "
                     "type per_sample_FASTQ. Supported filepath types: "
                     "raw_forward_seqs, raw_reverse_seqs, preprocessed_fastq" %
                     ', '.join(unsupported_fp_types))
        return False, None, error_msg

    if 'raw_forward_seqs' in files:
        if 'preprocessed_fastq' in files:
            error_msg = ("If raw_forward_seqs is provided, preprocessed_fastq "
                         "should not be provided")
            return False, None, error_msg
        read_files = files['raw_forward_seqs']
        read_files_count = len(read_files)
        counts_match = read_files_count == samples_count
    elif 'preprocessed_fastq' in files:
        if 'raw_reverse_seqs' in files:
            error_msg = ("If preprocessed_fastq is provided, raw_reverse_seqs "
                         "should not be provided")
            return False, None, error_msg
        read_files = files['preprocessed_fastq']
        read_files_count = len(read_files)
        # In the preprocessed_fastq case, we either have 1 file per sample
        # or 4 files per sample
        counts_match = ((read_files_count == samples_count)
                        or (read_files_count == 4 * samples_count))
    else:
        error_msg = ("Missing required filepath type: raw_forward_seqs or "
                     "preprocessed_fastq")
        return False, None, error_msg

    # Make sure that we have the same number of files as samples
    if 'raw_reverse_seqs' in files:
        rev_count = len(files['raw_reverse_seqs'])
        counts_match = counts_match and (rev_count == samples_count)
    else:
        rev_count = 0

    if not counts_match:
        error_msg = ("The number of provided files doesn't match the "
                     "number of samples (%d): %d raw_forward_seqs, "
                     "%d raw_reverse_seqs (optional, 0 is ok)" %
                     (samples_count, read_files_count, rev_count))
        return False, None, error_msg

    def _check_files(run_prefixes, read_files, rev_count, files):
        # Check that the provided files match the run prefixes
        fwd_fail = [
            basename(fp) for fp in read_files
            if not basename(fp).startswith(tuple(run_prefixes))
        ]
        if rev_count > 0:
            rev_fail = [
                basename(fp) for fp in files['raw_reverse_seqs']
                if not basename(fp).startswith(tuple(run_prefixes))
            ]
        else:
            rev_fail = []
        return fwd_fail, rev_fail

    # first let's check via sample names
    run_prefixes = [sid.split('.', 1)[1] for sid in samples]
    fwd_fail, rev_fail = _check_files(run_prefixes, read_files, rev_count,
                                      files)

    # if that doesn't work, let's test via run_prefix
    run_prefix_present = 'run_prefix' in prep_info[samples[0]]
    if (fwd_fail or rev_fail) and run_prefix_present:
        run_prefixes = [v['run_prefix'] for k, v in prep_info.items()]
        if samples_count != len(set(run_prefixes)):
            repeated = [
                "%s (%d)" % (p, run_prefixes.count(p))
                for p in set(run_prefixes) if run_prefixes.count(p) > 1
            ]
            error_msg = ("The values for the column 'run_prefix' are not "
                         "unique for each sample. Repeated values: %s" %
                         ', '.join(repeated))
            return False, None, error_msg

        fwd_fail, rev_fail = _check_files(run_prefixes, read_files, rev_count,
                                          files)

    if fwd_fail or rev_fail:
        error_msg = "The provided files are not prefixed by sample id"
        if run_prefix_present:
            error_msg += (" or do not match the run prefix values in the "
                          "prep information.")
        else:
            error_msg += "."
        error_msg += (" Offending files:\n raw_forward_seqs: %s\n"
                      "raw_reverse_seqs: %s" %
                      (', '.join(fwd_fail), ', '.join(rev_fail)))
        return False, None, error_msg

    filepaths = []
    empty_files = []
    for fps_type, fps in files.items():
        for fp in fps:
            try:
                fp_size = getsize(fp)
            except OSError:
                fp_size = 0
            # 62 bytes is the size of the empty gzip files we generate, so
            # anything at or below this threshold is treated as empty
            if fp_size <= 100:
                empty_files.append(basename(fp))

            if fps_type in MUST_GZ:
                fp, error_msg = _gzip_file(fp, test)
                if error_msg is not None:
                    return False, None, error_msg

            filepaths.append((fp, fps_type))

    if empty_files:
        error_msg = "Some of the files are empty: %s" % ', '.join(empty_files)
        return False, None, error_msg

    return True, [ArtifactInfo(None, 'per_sample_FASTQ', filepaths)], ""
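
MUST_GZ and _gzip_file are referenced above but not shown. Below is a minimal sketch under the assumption that the helper compresses a not-yet-gzipped file and returns (new_fp, error_msg); the real validator may differ, for instance in how it honors the test flag, which this sketch simply ignores. The MUST_GZ contents are likewise an assumption.

import gzip
import shutil

# filepath types whose files should end up gzip-compressed (assumed contents)
MUST_GZ = {'raw_forward_seqs', 'raw_reverse_seqs', 'preprocessed_fastq'}


def _gzip_file(fp, test=False):
    """Gzip fp if needed; return (filepath, error_msg)."""
    if fp.endswith('.gz'):
        return fp, None
    gz_fp = fp + '.gz'
    try:
        with open(fp, 'rb') as f_in, gzip.open(gz_fp, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    except OSError as e:
        return fp, 'Could not gzip %s: %s' % (fp, e)
    return gz_fp, None
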
Exemplo n.º 17
0
def shogun(qclient, job_id, parameters, out_dir):
    """Run Shogun with the given parameters

    Parameters
    ----------
    qclient : tgp.qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to run Shogun
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    bool, list, str
        The results of the job
    """
    # Step 1 get the rest of the information needed to run Shogun
    qclient.update_job_step(job_id, "Step 1 of 5: Collecting information")
    artifact_id = parameters['input']
    del parameters['input']

    # Get the artifact filepath information
    artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)
    fps = artifact_info['files']

    # Get the artifact metadata
    prep_info = qclient.get('/qiita_db/prep_template/%s/' %
                            artifact_info['prep_information'][0])
    qiime_map = prep_info['qiime-map']

    # Step 2 converting to fna
    qclient.update_job_step(job_id,
                            "Step 2 of 5: Converting to FNA for Shogun")

    with TemporaryDirectory(dir=out_dir, prefix='shogun_') as temp_dir:
        rs = fps['raw_reverse_seqs'] if 'raw_reverse_seqs' in fps else []
        samples = make_read_pairs_per_sample(fps['raw_forward_seqs'], rs,
                                             qiime_map)

        # Combining files
        comb_fp = generate_fna_file(temp_dir, samples)

        # Formatting parameters
        parameters = _format_params(parameters, SHOGUN_PARAMS)

        # Step 3 align
        align_cmd = generate_shogun_align_commands(comb_fp, temp_dir,
                                                   parameters)
        sys_msg = "Step 3 of 5: Aligning FNA with Shogun (%d/{0})".format(
            len(align_cmd))
        success, msg = _run_commands(qclient, job_id, align_cmd, sys_msg,
                                     'Shogun Align')

        if not success:
            return False, None, msg

        # Step 4 taxonomic profile
        sys_msg = "Step 4 of 5: Taxonomic profile with Shogun (%d/{0})"
        assign_cmd, profile_fp = generate_shogun_assign_taxonomy_commands(
            temp_dir, parameters)
        success, msg = _run_commands(qclient, job_id, assign_cmd, sys_msg,
                                     'Shogun taxonomy assignment')
        if not success:
            return False, None, msg

        sys_msg = "Step 5 of 5: Converting output to BIOM"
        qclient.update_job_step(job_id, msg)
        output = run_shogun_to_biom(profile_fp, [None, None, None, True],
                                    out_dir, 'profile')

        ainfo = [
            ArtifactInfo('Shogun Alignment Profile', 'BIOM',
                         [(output, 'biom')])
        ]

    return True, ainfo, ""
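
Because the pipeline above runs inside TemporaryDirectory(dir=out_dir, prefix='shogun_'), every intermediate Shogun file is removed when the with-block exits; only outputs written to out_dir itself (the BIOM from run_shogun_to_biom) survive. A tiny illustration of that cleanup behavior:

from os.path import exists, join
from tempfile import TemporaryDirectory

with TemporaryDirectory(prefix='shogun_') as temp_dir:
    scratch = join(temp_dir, 'combined.fna')
    open(scratch, 'w').close()
    assert exists(scratch)
# the scratch file and its directory are gone once the block exits
assert not exists(scratch)
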
Exemplo n.º 18
0
def _validate_multiple(qclient, job_id, prep_info, files, atype, test=False):
    """Validate and fix a new 'SFF', 'FASTQ', 'FASTA' or 'FASTA_Sanger' artifact

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    prep_info : dict of {str: dict of {str: str}}
        The prep information keyed by sample id
    files : dict of {str: list of str}
        The files to add to the new artifact, keyed by filepath type
    atype: str
        The type of the artifact
    test : boolean, optional
        If True this is being called by a test

    Returns
    -------
    bool, list, str
        The results of the job
    """
    qclient.update_job_step(job_id, "Step 2: Validating '%s' files" % atype)
    req_fp_types, opt_fp_types = FILEPATH_TYPE_DICT[atype]
    all_fp_types = req_fp_types | opt_fp_types

    # Check if there is any filepath type that is not supported
    unsupported_fp_types = set(files) - all_fp_types
    if unsupported_fp_types:
        error_msg = ("Filepath type(s) %s not supported by artifact "
                     "type %s. Supported filepath types: %s" %
                     (', '.join(unsupported_fp_types), atype, ', '.join(
                         sorted(all_fp_types))))
        return False, None, error_msg

    # Check if the run_prefix column is present in the prep info
    offending = {}
    types_seen = set()
    if 'run_prefix' in prep_info[next(iter(prep_info))]:
        # We can potentially have more than one lane in the prep information
        # so check that the provided files are prefixed with the values in
        # the run_prefix column
        run_prefixes = set(v['run_prefix'] for k, v in prep_info.items())
        num_prefixes = len(run_prefixes)

        # Check those filepath types that are required
        for ftype, t_files in files.items():
            # SFF is a special case because we can have multiple files with
            # the same prefix
            if num_prefixes != len(t_files) and atype != 'SFF':
                offending[ftype] = (
                    "The number of provided files (%d) doesn't match the "
                    "number of run prefix values in the prep info (%d): %s" %
                    (len(t_files), num_prefixes, ', '.join(
                        basename(f) for f in t_files)))
            else:
                rps = []
                fps = []
                for fp in t_files:
                    bn = basename(fp)
                    found = [rp for rp in run_prefixes if bn.startswith(rp)]
                    if found:
                        rps.extend(found)
                    else:
                        fps.append(bn)
                if fps:
                    offending[ftype] = (
                        "The provided files do not match the run prefix "
                        "values in the prep information: %s" % ', '.join(fps))
                else:
                    rps = run_prefixes - set(rps)
                    if rps:
                        offending[ftype] = (
                            "The following run prefixes in the prep "
                            "information file do not match any file: %s" %
                            ', '.join(rps))

            types_seen.add(ftype)
    else:
        # If the run prefix column is not provided, we only allow a single
        # lane, so check that we have a single file for each provided
        # filepath type
        for ftype, t_files in files.items():
            if len(t_files) != 1:
                offending[ftype] = (
                    "Only one file per type is allowed. Please provide the "
                    "column 'run_prefix' if you need more than one file per "
                    "type: %s" % ', '.join(basename(fp) for fp in t_files))

            types_seen.add(ftype)

    # Check that all required filepath types were present
    missing = req_fp_types - types_seen
    if missing:
        error_msg = ("Missing required filepath type(s): %s" %
                     ', '.join(missing))
        return False, None, error_msg

    # Check if there was any offending file
    if offending:
        error_list = ["%s: %s" % (k, v) for k, v in offending.items()]
        error_msg = ("Error creating artifact. Offending files:\n%s" %
                     '\n'.join(error_list))
        return False, None, error_msg

    # Everything is ok
    filepaths = []
    for fps_type, fps in files.items():
        for fp in fps:
            if fps_type in MUST_GZ:
                fp, error_msg = _gzip_file(fp, test)
                if error_msg is not None:
                    return False, None, error_msg
            filepaths.append((fp, fps_type))

    # let's count sequences; this is basically the last check
    errors = []
    artifact_information = []
    if atype not in FILEPATH_TYPE_NO_FQTOOLS:
        for fp, fpt in filepaths:
            cmd = f'fqtools count {fp}'
            std_out, std_err, return_value = system_call(cmd)
            fn = basename(fp)
            if std_err or return_value != 0:
                errors.append(f'{fn}: {std_err}')
            else:
                reads = int(std_out)
                artifact_information.append({
                    'filename': fn,
                    'reads': reads,
                    'file_type': fpt
                })

        if errors:
            raise ValueError('Found errors: \n %s' % '\n'.join(errors))
        dname = dirname(fp)
        pd.DataFrame(artifact_information).to_csv(
            f'{dname}/qtp-sequencing-validate-data.csv', index=False)

    return True, [ArtifactInfo(None, atype, filepaths)], ""
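
FILEPATH_TYPE_NO_FQTOOLS is assumed to list the artifact types (for instance SFF) whose files fqtools cannot count. The fqtools count call itself just reports the number of reads in a FASTQ file; a pure-Python equivalent for illustration (not what the validator runs):

import gzip


def count_fastq_reads(fp):
    """Count reads in a plain or gzipped FASTQ file (4 lines per read)."""
    opener = gzip.open if fp.endswith('.gz') else open
    with opener(fp, 'rt') as f:
        return sum(1 for _ in f) // 4
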
Exemplo n.º 19
0
def _validate_per_sample_FASTQ(qclient, job_id, prep_info, files):
    """Validate and fix a new 'per_sample_FASTQ' artifact

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    prep_info : dict of {str: dict of {str: str}}
        The prep information keyed by sample id
    files : dict of {str: list of str}
        The files to add to the new artifact, keyed by filepath type

    Returns
    -------
    bool, list, str
        The results of the job
    """
    qclient.update_job_step(job_id,
                            "Step 2: Validating 'per_sample_FASTQ' files")

    samples = list(prep_info.keys())
    samples_count = len(samples)

    # Check if there is any filepath type that is not supported
    unsupported_fp_types = set(files) - {
        'raw_forward_seqs', 'raw_reverse_seqs', 'preprocessed_fastq'
    }
    if unsupported_fp_types:
        error_msg = ("Filepath type(s) %s not supported by artifact "
                     "type per_sample_FASTQ. Supported filepath types: "
                     "raw_forward_seqs, raw_reverse_seqs, preprocessed_fastq" %
                     ', '.join(unsupported_fp_types))
        return False, None, error_msg

    if 'raw_forward_seqs' in files:
        if 'preprocessed_fastq' in files:
            error_msg = ("If raw_forward_seqs is provided, preprocessed_fastq "
                         "should not be provided")
            return False, None, error_msg
        read_files = files['raw_forward_seqs']
        read_files_count = len(read_files)
        counts_match = read_files_count == samples_count
    elif 'preprocessed_fastq' in files:
        if 'raw_reverse_seqs' in files:
            error_msg = ("If preprocessed_fastq is provided, raw_reverse_seqs "
                         "should not be provided")
            return False, None, error_msg
        read_files = files['preprocessed_fastq']
        read_files_count = len(read_files)
        # In the preprocessed_fastq case, we either have 1 file per sample
        # or 4 files per sample
        counts_match = ((read_files_count == samples_count)
                        or (read_files_count == 4 * samples_count))
    else:
        error_msg = ("Missing required filepath type: raw_forward_seqs or "
                     "preprocessed_fastq")
        return False, None, error_msg

    # Make sure that we have the same number of files as samples
    if 'raw_reverse_seqs' in files:
        rev_count = len(files['raw_reverse_seqs'])
        counts_match = counts_match and (rev_count == samples_count)
    else:
        rev_count = 0

    if not counts_match:
        error_msg = ("The number of provided files doesn't match the "
                     "number of samples (%d): %d raw_forward_seqs, "
                     "%d raw_reverse_seqs (optional, 0 is ok)" %
                     (samples_count, read_files_count, rev_count))
        return False, None, error_msg

    if 'run_prefix' in prep_info[samples[0]]:
        # The column 'run_prefix' is present in the prep information.
        # Make sure that we have the same number of run_prefix values
        # as the number of samples
        run_prefixes = [v['run_prefix'] for k, v in prep_info.items()]
        if samples_count != len(set(run_prefixes)):
            repeated = [
                "%s (%d)" % (p, run_prefixes.count(p))
                for p in set(run_prefixes) if run_prefixes.count(p) > 1
            ]
            error_msg = ("The values for the column 'run_prefix' are not "
                         "unique for each sample. Repeated values: %s" %
                         ', '.join(repeated))
            return False, None, error_msg

        error_msg = ("The provided files do not match the run prefix values "
                     "in the prep information. Offending files: "
                     "raw_forward_seqs: %s, raw_reverse_seqs: %s")
    else:
        # The column 'run_prefix' is not in the prep template. In this case,
        # check that the files are prefixed by the sample ids without the
        # study id
        run_prefixes = [sid.split('.', 1)[1] for sid in samples]
        error_msg = ("The provided files are not prefixed by sample id. "
                     "Please provide the 'run_prefix' column in your prep "
                     "information. Offending files: raw_forward_seqs: %s, "
                     "raw_reverse_seqs: %s")

    # Check that the provided files match the run prefixes
    fwd_fail = [
        basename(fp) for fp in read_files
        if not basename(fp).startswith(tuple(run_prefixes))
    ]
    if rev_count > 0:
        rev_fail = [
            basename(fp) for fp in files['raw_reverse_seqs']
            if not basename(fp).startswith(tuple(run_prefixes))
        ]
    else:
        rev_fail = []

    if fwd_fail or rev_fail:
        error_msg = error_msg % (', '.join(fwd_fail), ', '.join(rev_fail))
        return False, None, error_msg

    filepaths = []
    for fps_type, fps in files.items():
        filepaths.extend([(fp, fps_type) for fp in fps])
    return True, [ArtifactInfo(None, 'per_sample_FASTQ', filepaths)], ""
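
The prefix matching above leans on str.startswith accepting a tuple of candidate prefixes; a tiny illustration of that matching logic in isolation (the filenames are made up):

run_prefixes = ['SKB2.640194', 'SKM4.640180']
filenames = ['SKB2.640194_R1.fastq', 'other_R1.fastq']
fails = [fn for fn in filenames if not fn.startswith(tuple(run_prefixes))]
assert fails == ['other_R1.fastq']
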