def hisat2_fq(tmp_dir, sample, fq_file_name, description):
    """
    Count number of reads in the FASTQ file output by ``hisat2``.

    ``<tmp_dir>/<sample>`` is searched for a FASTQ file matching
    ``fq_file_name``. The number of reads in the file are counted.

    A single-row ``pandas.core.frame.DataFrame`` is created with
    columns ``SampleName``, ``Program``, ``File``, ``NumReads``,
    ``Description``.

    :param tmp_dir: Directory
    :type tmp_dir: str or unicode
    :param sample: Sample name
    :type sample: str or unicode
    :param fq_file_name: FASTQ file name pattern
    :type fq_file_name: str or unicode
    :param description: Description of this step
    :type description: str or unicode
    :return: ``pandas.core.frame.DataFrame``, or ``None`` if no
    matching file was found or its reads could not be counted
    :rtype: pandas.core.frame.DataFrame
    """
    fq_files = glob.glob(os.path.join(tmp_dir, sample, fq_file_name))
    if not fq_files:
        return None
    fq_file = fq_files[0]  # Only 1 match expected.
    print(fq_file)
    try:
        num_reads = fastq.count_sequences(fq_file)
    except Exception as e:
        # Best effort: an unreadable or invalid FASTQ file is
        # reported but does not abort the caller's processing.
        print(e)
        return None
    row = pd.DataFrame(
        [[sample, "hisat2", fq_file, num_reads, description]],
        columns=HEADER)
    return row
def _input_fq_count(sample_name, file_name):
    """
    Count the number of reads in a single FASTQ input file.

    A single-row ``pandas.core.frame.DataFrame`` is created with
    columns ``SampleName``, ``Program``, ``File``, ``NumReads``,
    ``Description`` (``Program`` and ``Description`` are both set to
    ``input``).

    :param sample_name: sample name (may be ``''`` for multiplexed
    files)
    :type sample_name: str or unicode
    :param file_name: path to FASTQ file
    :type file_name: str or unicode
    :return: ``pandas.core.frame.DataFrame``, or ``None`` if the
    reads could not be counted
    :rtype: pandas.core.frame.DataFrame
    """
    try:
        num_reads = fastq.count_sequences(file_name)
        return pd.DataFrame(
            [[sample_name, INPUT, file_name, num_reads, INPUT]],
            columns=HEADER)
    except Exception as e:
        print(e)
        # Return None so that this sample/file combination will be
        # treated as invalid in later processing.
        return None
def input_fq(config_file, input_dir):
    """
    Extract names of FASTQ input files from workflow configuration
    file and count the number of reads in each file.

    The configuration file is checked to see if it has an
    ``fq_files`` key whose value is mappings from sample names to
    sample files (relative to ``input_dir``). Each FASTQ file has its
    reads counted.

    If there is no ``fq_files`` key but there is a
    ``multiplex_fq_files`` key then the value of this key is assumed
    to be a list of multiplexed input files (relative to
    ``input_dir``). Each FASTQ file has its reads counted.

    If both keys exist then both sets of input files are traversed.

    If neither key exists then no input files are traversed.

    For each file a single-row ``pandas.core.frame.DataFrame`` is
    created with columns ``SampleName`` (sample name recorded in
    configuration or, for multiplexed files, ``''``), ``Program``
    (set to ``input``), ``File``, ``NumReads``, ``Description``
    (``input``).

    :param config_file: Configuration file
    :type config_file: str or unicode
    :param input_dir: Directory
    :type input_dir: str or unicode
    :return: list of ``pandas.core.frame.DataFrame``, or ``[]``
    :rtype: list(pandas.core.frame.DataFrame)
    """
    with open(config_file, 'r') as f:
        config = yaml.load(f, yaml.SafeLoader)
    rows = []
    if utils.value_in_dict(params.FQ_FILES, config):
        sample_files = [
            (sample_name, os.path.join(input_dir, file_name))
            for sample_name, file_name in config[params.FQ_FILES].items()
        ]
    else:
        sample_files = []
    if utils.value_in_dict(params.MULTIPLEX_FQ_FILES, config):
        # Multiplexed files have no per-sample name, so use "".
        multiplex_files = [("", os.path.join(input_dir, file_name))
                           for file_name in config[params.MULTIPLEX_FQ_FILES]]
    else:
        multiplex_files = []
    files = sample_files + multiplex_files
    for (sample_name, file_name) in files:
        print(file_name)
        # Delegate counting to the shared helper; it returns None for
        # files whose reads cannot be counted, which are skipped.
        row = _input_fq_count(sample_name, file_name)
        if row is not None:
            rows.append(row)
    return rows
def test_count_sequences(tmp_file, count):
    """
    Test :py:func:`riboviz.fastq.count_sequences` with FASTQ files.

    :param tmp_file: path to temporary file
    :type tmp_file: str or unicode
    :param count: Number of sequences
    :type count: int
    """
    records = get_test_fastq_sequences(4, count)
    with open(tmp_file, "wt") as handle:
        SeqIO.write(records, handle, "fastq")
    assert fastq.count_sequences(tmp_file) == count
def cutadapt_fq(tmp_dir, sample=""):
    """
    Count number of reads in the FASTQ file output by ``cutadapt``.

    ``<tmp_dir>/<sample>`` is searched for a FASTQ file matching
    :py:const:`riboviz.workflow_files.ADAPTER_TRIM_FQ`. Any file also
    matching :py:const:`riboviz.workflow_files.UMI_EXTRACT_FQ` is
    then removed (these file names overlap). The number of reads in
    the resulting file are counted.

    A single-row ``pandas.core.frame.DataFrame`` is created with
    columns ``SampleName``, ``Program``, ``File``, ``NumReads``,
    ``Description``.

    :param tmp_dir: Directory
    :type tmp_dir: str or unicode
    :param sample: Sample name
    :type sample: str or unicode
    :return: ``pandas.core.frame.DataFrame``, or ``None`` if no
    matching file was found or its reads could not be counted
    :rtype: pandas.core.frame.DataFrame
    """
    fq_files = glob.glob(
        os.path.join(tmp_dir, sample, "*" + workflow_files.ADAPTER_TRIM_FQ))
    # If using with FASTQ files then there may be a
    # file with extension "_extract_trim.fq" which also will be
    # caught by the glob above, so remove this file name.
    umi_files = glob.glob(
        os.path.join(tmp_dir, sample, "*" + workflow_files.UMI_EXTRACT_FQ))
    fq_files = [
        file_name for file_name in fq_files if file_name not in umi_files
    ]
    if not fq_files:
        return None
    fq_file = fq_files[0]  # Only 1 match expected.
    print(fq_file)
    try:
        num_reads = fastq.count_sequences(fq_file)
    except Exception as e:
        # Best effort: report and return None rather than abort.
        print(e)
        return None
    description = "Reads after removal of sequencing library adapters"
    row = pd.DataFrame(
        [[sample, "cutadapt", fq_file, num_reads, description]],
        columns=HEADER)
    return row
def umi_tools_deplex_fq(tmp_dir):
    """
    Count number of reads in the FASTQ files output by
    :py:mod:`riboviz.tools.demultiplex_fastq`.

    ``tmp_dir`` is searched for directories matching
    :py:const:`riboviz.workflow_files.DEPLEX_DIR_FORMAT`. Each of
    these directories is traversed to identify FASTQ files. Each of
    these directories is also traversed to identify TSV files
    matching :py:const:`riboviz.demultiplex_fastq.NUM_READS_FILE`.

    If, for a directory, the TSV file exists it is parsed and the
    number of reads in each FASTQ file extracted. If the TSV file
    cannot be found, or cannot be parsed, then the number of reads in
    the FASTQ files themselves are counted.

    For each file a single-row ``pandas.core.frame.DataFrame`` is
    created with columns ``SampleName``, ``Program``, ``File``,
    ``NumReads``, ``Description``.

    :param tmp_dir: Directory
    :type tmp_dir: str or unicode
    :return: list of ``pandas.core.frame.DataFrame``, or ``[]``
    :rtype: list(pandas.core.frame.DataFrame)
    """
    deplex_dirs = glob.glob(
        os.path.join(tmp_dir, workflow_files.DEPLEX_DIR_FORMAT.format("*")))
    if not deplex_dirs:
        return []
    description = "Demultiplexed reads"
    rows = []
    for deplex_dir in deplex_dirs:
        fq_files = [
            glob.glob(os.path.join(deplex_dir, "*" + ext))
            for ext in fastq.FASTQ_EXTS
        ]
        # Flatten the per-extension lists into a single list.
        fq_files = [f for files in fq_files for f in files]
        if not fq_files:
            continue
        fq_files.sort()
        tsv_files = glob.glob(
            os.path.join(deplex_dir, demultiplex_fastq.NUM_READS_FILE))
        is_tsv_problem = False
        if tsv_files:
            num_reads_file = tsv_files[0]
            print(num_reads_file)
            try:
                deplex_df = pd.read_csv(num_reads_file,
                                        delimiter="\t",
                                        comment="#")
                # Buffer rows locally and commit only if every file's
                # count is found in the TSV. Appending directly to
                # "rows" would leave partial results behind on error,
                # and the fallback below would then add duplicates.
                tsv_rows = []
                for fq_file in fq_files:
                    # Sample tag is the file name up to the first ".".
                    tag = os.path.basename(fq_file).split(".")[0]
                    tag_df = deplex_df[
                        deplex_df[sample_sheets.SAMPLE_ID] == tag]
                    num_reads = tag_df.iloc[0][sample_sheets.NUM_READS]
                    tsv_rows.append(pd.DataFrame(
                        [[tag, demultiplex_fastq_tools_module.__name__,
                          fq_file, num_reads, description]],
                        columns=HEADER))
                rows.extend(tsv_rows)
            except Exception as e:
                # TSV is missing a tag or otherwise malformed; fall
                # back to counting the FASTQ files directly.
                print(e)
                is_tsv_problem = True
        if is_tsv_problem or not tsv_files:
            # Traverse FASTQ files directly.
            for fq_file in fq_files:
                print(fq_file)
                tag = os.path.basename(fq_file).split(".")[0]
                try:
                    num_reads = fastq.count_sequences(fq_file)
                except Exception as e:
                    print(e)
                    continue
                rows.append(pd.DataFrame(
                    [[tag, demultiplex_fastq_tools_module.__name__,
                      fq_file, num_reads, description]],
                    columns=HEADER))
    return rows