Exemplo n.º 1
0
    def test_to_ascii(self):
        # Write the FASTQ fixture to a named temp file. delete=False keeps it
        # on disk once the handle is closed, so it can be passed on by path.
        fastq_tmp = tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                                delete=False)
        with fastq_tmp:
            fastq_tmp.write(fqdata)

        fastq_path = fastq_tmp.name
        # Queue the file on self.to_remove (presumably cleaned up by the
        # fixture teardown) before converting it.
        self.to_remove.append(fastq_path)
        to_hdf5(fastq_path, self.hdf5_file)

        # One FASTQ record per sequence of the requested samples
        expected = [
            b"@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\nABC\n",
            b"@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\nDFG\n",
            b"@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\nDEF\n",
        ]

        observed = list(to_ascii(self.hdf5_file, samples=['a', 'b']))
        self.assertEqual(observed, expected)
Exemplo n.º 2
0
    def test_to_ascii(self):
        # delete=False keeps the fixture on disk after the handle is closed,
        # so to_hdf5 can be given its path.
        with tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                         delete=False) as f:
            # Register the file for cleanup as soon as it exists, so it is
            # not leaked if the write or the conversion below fails.
            self.to_remove.append(f.name)
            f.write(fqdata)

        # Leaving the `with` block flushes and closes the handle, so the
        # data is fully on disk before the conversion reads it.
        to_hdf5(f.name, self.hdf5_file)

        exp = [b"@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\nABC\n",
               b"@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\nDFG\n",
               b"@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\nDEF\n"]

        obs = list(to_ascii(self.hdf5_file, samples=['a', 'b']))
        self.assertEqual(obs, exp)
Exemplo n.º 3
0
    def test_to_ascii_fasta(self):
        # Write the FASTA fixture to a named temp file. delete=False keeps it
        # on disk once the handle is closed, so it can be passed on by path.
        fasta_tmp = tempfile.NamedTemporaryFile('r+', suffix='.fna',
                                                delete=False)
        with fasta_tmp:
            fasta_tmp.write(seqdata)

        fasta_path = fasta_tmp.name
        # Queue the file on self.to_remove (presumably cleaned up by the
        # fixture teardown) before converting it.
        self.to_remove.append(fasta_path)
        to_hdf5(fasta_path, self.hdf5_file)

        # One FASTA record per sequence of the requested samples
        expected = [
            b">a_0 orig_bc=abc new_bc=abc bc_diffs=0\nx\n",
            b">a_1 orig_bc=aby new_bc=ybc bc_diffs=2\nxy\n",
            b">a_2 orig_bc=abz new_bc=zbc bc_diffs=3\nxyz\n",
            b">b_0 orig_bc=abx new_bc=xbc bc_diffs=1\nxyz\n",
            b">b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nabcd\n",
        ]

        observed = list(to_ascii(self.hdf5_file, samples=['a', 'b']))
        self.assertEqual(observed, expected)
Exemplo n.º 4
0
    def test_to_ascii_fasta(self):
        # The fixture must survive the closing of its handle (delete=False)
        # because to_hdf5 receives the file path rather than the open handle.
        handle = tempfile.NamedTemporaryFile('r+', suffix='.fna',
                                             delete=False)
        with handle:
            handle.write(seqdata)

        source_fp = handle.name
        self.to_remove.append(source_fp)
        to_hdf5(source_fp, self.hdf5_file)

        # Records expected back when asking for samples 'a' and 'b'
        expected_records = [b">a_0 orig_bc=abc new_bc=abc bc_diffs=0\nx\n",
                            b">a_1 orig_bc=aby new_bc=ybc bc_diffs=2\nxy\n",
                            b">a_2 orig_bc=abz new_bc=zbc bc_diffs=3\nxyz\n",
                            b">b_0 orig_bc=abx new_bc=xbc bc_diffs=1\nxyz\n",
                            b">b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nabcd\n"]

        records = to_ascii(self.hdf5_file, samples=['a', 'b'])
        self.assertEqual(list(records), expected_records)
Exemplo n.º 5
0
    def test_to_ascii(self):
        # delete=False keeps the fixture on disk after the handle is closed,
        # so to_hdf5 can be given its path.
        with tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                         delete=False) as f:
            # Register the file for cleanup as soon as it exists, so it is
            # not leaked if the write or the conversion below fails.
            self.to_remove.append(f.name)
            f.write(fqdata)

        # Leaving the `with` block flushes and closes the handle, so the
        # data is fully on disk before the conversion reads it.
        to_hdf5(f.name, self.hdf5_file)

        # Every fragment carries the b prefix: Python 3 refuses to
        # concatenate bytes and str literals implicitly. The expected quality
        # line has each character followed by seven NUL bytes.
        exp = [(b"@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\n"
                b"A\x00\x00\x00\x00\x00\x00\x00"
                b"B\x00\x00\x00\x00\x00\x00\x00"
                b"C\x00\x00\x00\x00\x00\x00\x00\n"),
               (b"@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\n"
                b"D\x00\x00\x00\x00\x00\x00\x00"
                b"F\x00\x00\x00\x00\x00\x00\x00"
                b"G\x00\x00\x00\x00\x00\x00\x00\n"),
               (b"@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\n"
                b"D\x00\x00\x00\x00\x00\x00\x00"
                b"E\x00\x00\x00\x00\x00\x00\x00"
                b"F\x00\x00\x00\x00\x00\x00\x00\n")]

        obs = list(to_ascii(self.hdf5_file, samples=['a', 'b']))
        self.assertEqual(obs, exp)
Exemplo n.º 6
0
def _validate_demux_file(qclient, job_id, prep_info, out_dir, demux_fp,
                         fastq_fp=None, fasta_fp=None, log_fp=None):
    """Validate and fix a 'demux' file and regenerate fastq and fasta files

    The sample ids found in the demux file must be a subset of the sample ids
    of the prep information. When they are not, the ids are fixed on a copy of
    the demux file placed in `out_dir`, using either the "run_prefix" column
    or the prefix of the prep information sample ids.

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    prep_info : dict of {str: dict of {str: str}}
        The prep information keyed by sample id
    out_dir : str
        The output directory
    demux_fp : str
        The demux file path
    fastq_fp : str, optional
        The original fastq filepath. If demux is correct, it will not be
        regenerated
    fasta_fp : str, optional
        The original fasta filepath. If demux is correct, it will not be
        regenerated
    log_fp : str, optional
        The original log filepath

    Returns
    -------
    dict
        The results of the job, as built by `format_payload`
    """
    pt_sample_ids = set(prep_info)
    with File(demux_fp) as f:
        demux_sample_ids = set(f.keys())

    if not pt_sample_ids.issuperset(demux_sample_ids):
        # The demux sample ids are different from the ones in the prep template
        qclient.update_job_step(job_id, "Step 3: Fixing sample ids")
        # Attempt 1: the user provided the run prefix column - in this case
        # the run prefix column holds the sample ids present in the demux file
        if 'run_prefix' in prep_info[next(iter(pt_sample_ids))]:
            id_map = {v['run_prefix']: k for k, v in prep_info.items()}
            if not set(id_map).issuperset(demux_sample_ids):
                return format_payload(
                    success=False,
                    error_msg='The sample ids in the "run_prefix" columns '
                              'from the prep information do not match the '
                              'ones in the demux file. Please, correct the '
                              'column "run_prefix" in the prep information to '
                              'map the existing sample ids to the prep '
                              'information sample ids.')
        else:
            # Attempt 2: the sample ids in the demux table are the same that
            # in the prep template but without the prefix
            prefix = next(iter(pt_sample_ids)).split('.', 1)[0]
            prefixed = set("%s.%s" % (prefix, s) for s in demux_sample_ids)
            if pt_sample_ids.issuperset(prefixed):
                id_map = {s: "%s.%s" % (prefix, s) for s in demux_sample_ids}
            else:
                # There is nothing we can do. The samples in the demux file do
                # not match the ones in the prep template and we can't fix it
                return format_payload(
                    success=False,
                    error_msg='The sample ids in the demultiplexed files do '
                              'not match the ones in the prep information. '
                              'Please, provide the column "run_prefix" in '
                              'the prep information to map the existing sample'
                              ' ids to the prep information sample ids.')
        # Fix the sample ids
        # Do not modify the original demux file, copy it to a new location
        new_demux_fp = join(out_dir, basename(demux_fp))
        copy(demux_fp, new_demux_fp)
        # NOTE: errors raised while renaming are not caught here
        with File(new_demux_fp, 'r+') as f:
            # Snapshot the names first so that the renames below never
            # interact with the iteration over the file being modified
            for old in list(f):
                new = id_map[old]
                # An id that is already correct needs no rename; a move onto
                # the same name is presumably rejected by File.move - TODO
                # confirm
                if new != old:
                    f.move(old, new)

        # When we fix, we always generate the FASTQ and FASTA file
        # By setting them to None, below will be generated
        demux_fp = new_demux_fp
        fastq_fp = None
        fasta_fp = None

    # If we didn't fix anything, we only generate the files if they don't
    # already exist
    name = splitext(basename(demux_fp))[0]
    if not fastq_fp:
        fastq_fp = join(out_dir, "%s.fastq" % name)
        with open(fastq_fp, 'w') as fq:
            with File(demux_fp, 'r') as dx:
                for record in to_ascii(dx):
                    fq.write(record)

    if not fasta_fp:
        # The fasta file is derived from the fastq file, not from the demux
        fasta_fp = join(out_dir, "%s.fasta" % name)
        with open(fasta_fp, 'w') as f:
            for r in load(fastq_fp):
                f.write(format_fasta_record(r['SequenceID'], r['Sequence'],
                                            r['Qual']))

    filepaths = [[[fastq_fp], 'preprocessed_fastq'],
                 [[fasta_fp], 'preprocessed_fasta'],
                 [[demux_fp], 'preprocessed_demux']]
    if log_fp:
        filepaths.append([[log_fp], 'log'])
    return format_payload(
        success=True, artifacts_info=[[None, 'Demultiplexed', filepaths]])
Exemplo n.º 7
0
def _validate_demux_file(qclient, job_id, prep_info, out_dir, demux_fp,
                         fastq_fp=None, fasta_fp=None, log_fp=None):
    """Validate and fix a 'demux' file and regenerate fastq and fasta files

    The sample ids found in the demux file must be a subset of the sample ids
    of the prep information. When they are not, the ids are fixed on a copy of
    the demux file placed in `out_dir`, using either the "run_prefix" column
    or the prefix of the prep information sample ids.

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    prep_info : dict of {str: dict of {str: str}}
        The prep information keyed by sample id
    out_dir : str
        The output directory
    demux_fp : str
        The demux file path
    fastq_fp : str, optional
        The original fastq filepath. If demux is correct, it will not be
        regenerated
    fasta_fp : str, optional
        The original fasta filepath. If demux is correct, it will not be
        regenerated
    log_fp : str, optional
        The original log filepath

    Returns
    -------
    bool
        Whether the validation succeeded
    list or None
        On success, a single-element list holding
        ``[None, 'Demultiplexed', filepaths]``; None on failure
    str
        The error message on failure, the empty string on success
    """
    pt_sample_ids = set(prep_info)
    with File(demux_fp) as f:
        demux_sample_ids = set(f.keys())

    if not pt_sample_ids.issuperset(demux_sample_ids):
        # The demux sample ids are different from the ones in the prep template
        qclient.update_job_step(job_id, "Step 3: Fixing sample ids")
        # Attempt 1: the user provided the run prefix column - in this case
        # the run prefix column holds the sample ids present in the demux file
        if 'run_prefix' in prep_info[next(iter(pt_sample_ids))]:
            id_map = {v['run_prefix']: k for k, v in prep_info.items()}
            if not set(id_map).issuperset(demux_sample_ids):
                error_msg = ('The sample ids in the "run_prefix" columns '
                             'from the prep information do not match the '
                             'ones in the demux file. Please, correct the '
                             'column "run_prefix" in the prep information to '
                             'map the existing sample ids to the prep '
                             'information sample ids.')
                return False, None, error_msg
        else:
            # Attempt 2: the sample ids in the demux table are the same that
            # in the prep template but without the prefix
            prefix = next(iter(pt_sample_ids)).split('.', 1)[0]
            prefixed = set("%s.%s" % (prefix, s) for s in demux_sample_ids)
            if pt_sample_ids.issuperset(prefixed):
                id_map = {s: "%s.%s" % (prefix, s) for s in demux_sample_ids}
            else:
                # There is nothing we can do. The samples in the demux file do
                # not match the ones in the prep template and we can't fix it
                error_msg = ('The sample ids in the demultiplexed files do '
                             'not match the ones in the prep information. '
                             'Please, provide the column "run_prefix" in '
                             'the prep information to map the existing sample'
                             ' ids to the prep information sample ids.')
                return False, None, error_msg

        # Fix the sample ids
        # Do not modify the original demux file, copy it to a new location
        new_demux_fp = join(out_dir, basename(demux_fp))
        copy(demux_fp, new_demux_fp)
        # NOTE: errors raised while renaming are not caught here
        with File(new_demux_fp, 'r+') as f:
            # Snapshot the names first so that the renames below never
            # interact with the iteration over the file being modified
            for old in list(f):
                new = id_map[old]
                # An id that is already correct needs no rename; a move onto
                # the same name is presumably rejected by File.move - TODO
                # confirm
                if new != old:
                    f.move(old, new)

        # When we fix, we always generate the FASTQ and FASTA file
        # By setting them to None, below will be generated
        demux_fp = new_demux_fp
        fastq_fp = None
        fasta_fp = None

    # If we didn't fix anything, we only generate the files if they don't
    # already exist
    name = splitext(basename(demux_fp))[0]
    if not fastq_fp:
        fastq_fp = join(out_dir, "%s.fastq" % name)
        with open(fastq_fp, 'w') as fq:
            with File(demux_fp, 'r') as dx:
                for record in to_ascii(dx):
                    fq.write(record)

    if not fasta_fp:
        # The fasta file is derived from the fastq file, not from the demux
        fasta_fp = join(out_dir, "%s.fasta" % name)
        with open(fasta_fp, 'w') as f:
            for r in load(fastq_fp):
                f.write(format_fasta_record(r['SequenceID'], r['Sequence'],
                                            r['Qual']))

    filepaths = [[[fastq_fp], 'preprocessed_fastq'],
                 [[fasta_fp], 'preprocessed_fasta'],
                 [[demux_fp], 'preprocessed_demux']]
    if log_fp:
        filepaths.append([[log_fp], 'log'])
    return True, [[None, 'Demultiplexed', filepaths]], ""