Exemplo n.º 1
0
def hisat2_fq(tmp_dir, sample, fq_file_name, description):
    """
    Count the reads in the FASTQ file produced by ``hisat2``.

    ``<tmp_dir>/<sample>`` is globbed for files matching
    ``fq_file_name`` and the reads in the first match are counted.

    The result is a single-row ``pandas.DataFrame`` with columns
    ``HEADER`` (``SampleName``, ``Program``, ``File``, ``NumReads``,
    ``Description``), where ``Program`` is ``hisat2``.

    :param tmp_dir: Directory
    :type tmp_dir: str or unicode
    :param sample: Sample name
    :type sample: str or unicode
    :param fq_file_name: FASTQ file name pattern
    :type fq_file_name: str or unicode
    :param description: Description of this step
    :type description: str or unicode
    :return: single-row ``pandas.DataFrame``, or ``None`` if no file \
    matches or the reads cannot be counted
    :rtype: pandas.DataFrame or None
    """
    pattern = os.path.join(tmp_dir, sample, fq_file_name)
    matches = glob.glob(pattern)
    if not matches:
        return None
    # A single match is expected, so only the first one is used.
    matched_file = matches[0]
    print(matched_file)
    try:
        read_count = fastq.count_sequences(matched_file)
    except Exception as error:
        # Best effort: report the problem and signal "no result".
        print(error)
        return None
    record = [sample, "hisat2", matched_file, read_count, description]
    return pd.DataFrame([record], columns=HEADER)
Exemplo n.º 2
0
def _input_fq_count(sample_name, file_name):
    """
    Extract names of FASTQ input files from workflow configuration
    file and count the number of reads in each file.

    ``sample_name`` is a sample name e.g. "JEC21" and ``file_name``
    is the path to file.

    :param sample_name: sample name
    :type sample_name: str or unicode
    :param file_name: path to file
    :type file_name: str or unicode
    :return: ``pandas.core.frame.Series``, or ``None``
    :rtype: pandas.core.frame.Series
    """
    try:
        num_reads = fastq.count_sequences(file_name)
        return pd.DataFrame(
            [[sample_name, INPUT, file_name, num_reads, INPUT]],
            columns=HEADER)
    except Exception as e:
        print(e)
        # Return None so that this sample/file combination will be
        # treated as invalid in later processing.
        return None
Exemplo n.º 3
0
def input_fq(config_file, input_dir):
    """
    Read the FASTQ input file names from a workflow configuration
    file and count the number of reads in each file.

    The YAML configuration is loaded and two keys are checked
    independently:

    * ``params.FQ_FILES``: a mapping from sample names to sample
      files (relative to ``input_dir``).
    * ``params.MULTIPLEX_FQ_FILES``: a list of multiplexed input
      files (relative to ``input_dir``), recorded with an empty
      sample name.

    Files from both keys are traversed, sample files first. If
    neither key has a value then no files are traversed.

    For each file whose reads can be counted a single-row
    ``pandas.DataFrame`` is created with columns ``HEADER``
    (``SampleName``, ``Program``, ``File``, ``NumReads``,
    ``Description``), with ``Program`` and ``Description`` both set
    to ``INPUT``. Files that raise an error are reported and skipped.

    :param config_file: Configuration file
    :type config_file: str or unicode
    :param input_dir: Directory
    :type input_dir: str or unicode
    :return: list of single-row ``pandas.DataFrame``, or ``[]``
    :rtype: list(pandas.DataFrame)
    """
    with open(config_file, 'r') as config_stream:
        config = yaml.load(config_stream, yaml.SafeLoader)

    sample_files = []
    if utils.value_in_dict(params.FQ_FILES, config):
        for sample_name, file_name in config[params.FQ_FILES].items():
            sample_files.append(
                (sample_name, os.path.join(input_dir, file_name)))

    multiplex_files = []
    if utils.value_in_dict(params.MULTIPLEX_FQ_FILES, config):
        for file_name in config[params.MULTIPLEX_FQ_FILES]:
            # Multiplexed files are not tied to a single sample.
            multiplex_files.append(("", os.path.join(input_dir, file_name)))

    rows = []
    for sample_name, fq_path in sample_files + multiplex_files:
        print(fq_path)
        try:
            read_count = fastq.count_sequences(fq_path)
            record = [sample_name, INPUT, fq_path, read_count, INPUT]
            rows.append(pd.DataFrame([record], columns=HEADER))
        except Exception as error:
            # Best effort: report the problem and move to the next file.
            print(error)
    return rows
Exemplo n.º 4
0
def test_count_sequences(tmp_file, count):
    """
    Test :py:func:`riboviz.fastq.count_sequences` with FASTQ files.

    ``count`` test sequences are written to ``tmp_file`` in FASTQ
    format and the number of sequences counted in that file is
    checked against ``count``.

    :param tmp_file: path to temporary file
    :type tmp_file: str or unicode
    :param count: Number of sequences
    :type count: int
    """
    # NOTE(review): the meaning of the leading 4 is defined by
    # get_test_fastq_sequences, which is not visible here - confirm.
    records = get_test_fastq_sequences(4, count)
    with open(tmp_file, "wt") as fastq_handle:
        SeqIO.write(records, fastq_handle, "fastq")
    observed = fastq.count_sequences(tmp_file)
    assert observed == count
Exemplo n.º 5
0
def cutadapt_fq(tmp_dir, sample=""):
    """
    Count the reads in the FASTQ file produced by ``cutadapt``.

    ``<tmp_dir>/<sample>`` is globbed for files ending with
    :py:const:`riboviz.workflow_files.ADAPTER_TRIM_FQ`. Files that
    also end with :py:const:`riboviz.workflow_files.UMI_EXTRACT_FQ`
    are discarded (these file names overlap) and the reads in the
    first remaining file are counted.

    The result is a single-row ``pandas.DataFrame`` with columns
    ``HEADER`` (``SampleName``, ``Program``, ``File``, ``NumReads``,
    ``Description``), where ``Program`` is ``cutadapt``.

    :param tmp_dir: Directory
    :type tmp_dir: str or unicode
    :param sample: Sample name
    :type sample: str or unicode
    :return: single-row ``pandas.DataFrame``, or ``None`` if no file \
    matches or the reads cannot be counted
    :rtype: pandas.DataFrame or None
    """
    trimmed_pattern = os.path.join(
        tmp_dir, sample, "*" + workflow_files.ADAPTER_TRIM_FQ)
    trimmed_files = glob.glob(trimmed_pattern)
    # The adapter-trim pattern can also match UMI-extraction outputs
    # because the file name patterns overlap, so those are excluded.
    umi_pattern = os.path.join(
        tmp_dir, sample, "*" + workflow_files.UMI_EXTRACT_FQ)
    excluded = set(glob.glob(umi_pattern))
    candidates = [path for path in trimmed_files if path not in excluded]
    if not candidates:
        return None
    # A single match is expected, so only the first one is used.
    trimmed_file = candidates[0]
    print(trimmed_file)
    try:
        read_count = fastq.count_sequences(trimmed_file)
    except Exception as error:
        # Best effort: report the problem and signal "no result".
        print(error)
        return None
    description = "Reads after removal of sequencing library adapters"
    record = [sample, "cutadapt", trimmed_file, read_count, description]
    return pd.DataFrame([record], columns=HEADER)
Exemplo n.º 6
0
def umi_tools_deplex_fq(tmp_dir):
    """
    Count number of reads in the FASTQ files output by
    :py:mod:`riboviz.tools.demultiplex_fastq`.

    ``tmp_dir`` is searched for directories matching
    :py:const:`riboviz.workflow_files.DEPLEX_DIR_FORMAT`.
    Each of these directories is traversed to identify FASTQ
    files (any file ending with one of ``fastq.FASTQ_EXTS``) and a
    TSV file matching
    :py:const:`riboviz.demultiplex_fastq.NUM_READS_FILE`.
    Directories with no FASTQ files are skipped.

    If, for a directory, the TSV file exists it is parsed and the
    number of reads for each FASTQ file is looked up using the file's
    tag (the file name up to its first ``.``). If the TSV file cannot
    be found, cannot be parsed, or lacks an entry for any FASTQ file,
    then no TSV-derived counts are used for that directory and the
    reads in the FASTQ files themselves are counted instead. This
    guarantees each FASTQ file contributes at most one row.

    For each file a single-row ``pandas.DataFrame`` is created with
    columns ``HEADER`` (``SampleName``, ``Program``, ``File``,
    ``NumReads``, ``Description``).

    :param tmp_dir: Directory
    :type tmp_dir: str or unicode
    :return: list of single-row ``pandas.DataFrame``, or ``[]``
    :rtype: list(pandas.DataFrame)
    """
    deplex_dirs = glob.glob(
        os.path.join(tmp_dir, workflow_files.DEPLEX_DIR_FORMAT.format("*")))
    if not deplex_dirs:
        return []
    description = "Demultiplexed reads"
    rows = []
    for deplex_dir in deplex_dirs:
        fq_files = [
            fq_file
            for ext in fastq.FASTQ_EXTS
            for fq_file in glob.glob(os.path.join(deplex_dir, "*" + ext))
        ]
        if not fq_files:
            continue
        fq_files.sort()
        tsv_files = glob.glob(
            os.path.join(deplex_dir, demultiplex_fastq.NUM_READS_FILE))
        # Rows derived from the TSV file are staged here and only
        # committed if every FASTQ file was resolved. None means the
        # TSV file was missing or unusable.
        tsv_rows = None
        if tsv_files:
            num_reads_file = tsv_files[0]
            print(num_reads_file)
            try:
                deplex_df = pd.read_csv(num_reads_file,
                                        delimiter="\t",
                                        comment="#")
                staged_rows = []
                for fq_file in fq_files:
                    tag = os.path.basename(fq_file).split(".")[0]
                    tag_df = deplex_df[deplex_df[sample_sheets.SAMPLE_ID] ==
                                       tag]
                    # Raises if the TSV file has no entry for this tag.
                    num_reads = tag_df.iloc[0][sample_sheets.NUM_READS]
                    staged_rows.append(pd.DataFrame([[
                        tag, demultiplex_fastq_tools_module.__name__, fq_file,
                        num_reads, description
                    ]],
                                                    columns=HEADER))
                tsv_rows = staged_rows
            except Exception as e:
                # Best effort: report the problem, discard any partial
                # TSV results and fall back to counting the files.
                print(e)
        if tsv_rows is not None:
            rows.extend(tsv_rows)
            continue
        # Traverse FASTQ files directly.
        for fq_file in fq_files:
            print(fq_file)
            tag = os.path.basename(fq_file).split(".")[0]
            try:
                num_reads = fastq.count_sequences(fq_file)
            except Exception as e:
                print(e)
                continue
            rows.append(pd.DataFrame([[
                tag, demultiplex_fastq_tools_module.__name__, fq_file,
                num_reads, description
            ]],
                                     columns=HEADER))
    return rows