def finalize_result(
        original_sequences: SingleLanePerSampleSingleEndFastqDirFmt,
        result_sequences: SingleLanePerSampleSingleEndFastqDirFmt,
        stats_df: pd.DataFrame) -> SingleLanePerSampleSingleEndFastqDirFmt:
    """Write the manifest and metadata files of a filtered result.

    Args:
        original_sequences: Input directory format. It is used to look up
            each sample's fastq.gz path (through
            ``return_fastqgz_path_for_sample``) and as the source of the
            metadata YAML that is copied into the result.
        result_sequences: Directory format that receives the manifest and
            the metadata. It is modified in place and returned.
        stats_df: Per-sample statistics; the ``sample-id`` and
            ``n_seqs_kept`` columns are read.

    Returns:
        ``result_sequences`` with manifest and metadata written.

    Raises:
        ValueError: If no sample has ``n_seqs_kept`` greater than zero.
    """
    print("in finalize result")
    demux = original_sequences
    result = result_sequences

    # Exclude samples with zero sequences left after filtering, since a
    # fastq file with zero sequences is invalid.
    kept_rows = stats_df[stats_df['n_seqs_kept'] > 0]
    sample_ids_to_include = list(kept_rows['sample-id'])
    if not sample_ids_to_include:
        raise ValueError(
            "All sequences from all samples were filtered out through abundance-filtering."
        )

    # Manifest: one row per retained sample.
    manifest = FastqManifestFormat()
    manifest_fh = manifest.open()
    try:
        manifest_fh.write('sample-id,filename,direction\n')
        manifest_fh.write(
            '# direction is not meaningful in this file as these\n')
        manifest_fh.write(
            '# data may be derived from forward, reverse, or \n')
        manifest_fh.write('# joined reads\n')
        for sample_id in sample_ids_to_include:
            path = return_fastqgz_path_for_sample(demux,
                                                  sample_id=sample_id)
            manifest_fh.write('{sample_id},{filename},{direction}\n'.format(
                sample_id=sample_id, filename=path.name,
                direction='forward'))
    finally:
        # Close the handle even when a write or path lookup fails.
        manifest_fh.close()
    result.manifest.write_data(manifest, FastqManifestFormat)

    # Metadata: copy the input's YAML content unchanged.
    demux_metadata_view = demux.metadata.view(YamlFormat)
    with open(str(demux_metadata_view)) as demux_metadata_fh:
        # safe_load: yaml.load() without an explicit Loader is rejected by
        # PyYAML >= 6 and used an unsafe loader in older releases.
        demux_metadata_dict = yaml.safe_load(demux_metadata_fh)
    metadata = YamlFormat()
    metadata.path.write_text(yaml.dump(demux_metadata_dict))
    result.metadata.write_data(metadata, YamlFormat)

    return result
def test_validate_negative(self):
    """Each invalid manifest fixture must fail validation."""
    invalid_manifests = (
        'no-data-MANIFEST',
        'not-MANIFEST',
        'relative_manifests/jagged-MANIFEST',
    )
    for manifest_name in invalid_manifests:
        manifest_fp = self.get_data_path(manifest_name)
        # Construction stays inside the context so an error raised by the
        # constructor is treated the same way as one raised by validate().
        with self.assertRaisesRegex(ValidationError,
                                    'FastqManifestFormat'):
            FastqManifestFormat(manifest_fp, mode='r').validate()
def test_validate_positive(self):
    """Each well-formed manifest fixture must validate without raising."""
    valid_manifests = ('single-MANIFEST', 'paired-MANIFEST', 'long-MANIFEST')
    for manifest_name in valid_manifests:
        manifest_fp = self.get_data_path(
            'relative_manifests/%s' % manifest_name)
        fmt = FastqManifestFormat(manifest_fp, mode='r')
        fmt.validate()
def emp_paired(
    seqs: BarcodePairedSequenceFastqIterator,
    barcodes: qiime2.MetadataCategory,
    rev_comp_barcodes: bool = False,
    rev_comp_mapping_barcodes: bool = False
) -> SingleLanePerSamplePairedEndFastqDirFmt:
    """Demultiplex paired-end reads into per-sample gzipped fastq files.

    Args:
        seqs: Iterable yielding ``(barcode_record, forward_record,
            reverse_record)`` triples. Element 1 of the barcode record is
            used as the barcode read; the forward/reverse records are
            joined with newlines and written out unchanged.
        barcodes: Barcode metadata passed to ``_make_barcode_map`` to build
            the barcode-to-sample mapping.
        rev_comp_barcodes: Reverse complement each barcode read before the
            lookup.
        rev_comp_mapping_barcodes: Forwarded to ``_make_barcode_map``.

    Returns:
        Directory format holding one forward and one reverse fastq.gz per
        observed sample, plus the manifest and metadata files.

    Raises:
        ValueError: If no read could be assigned to a sample.
    """
    result = SingleLanePerSamplePairedEndFastqDirFmt()
    barcode_map, barcode_len = _make_barcode_map(barcodes,
                                                 rev_comp_mapping_barcodes)

    manifest = FastqManifestFormat()
    manifest_fh = manifest.open()
    # sample id -> (forward handle, reverse handle)
    per_sample_fastqs = {}
    try:
        manifest_fh.write('sample-id,filename,direction\n')

        for barcode_record, forward_record, reverse_record in seqs:
            barcode_read = barcode_record[1]
            if rev_comp_barcodes:
                barcode_read = str(
                    skbio.DNA(barcode_read).reverse_complement())
            barcode_read = barcode_read[:barcode_len]

            try:
                sample_id = barcode_map[barcode_read]
            except KeyError:
                # TODO: this should ultimately be logged, but we don't
                # currently have support for that.
                continue

            if sample_id not in per_sample_fastqs:
                barcode_id = len(per_sample_fastqs) + 1
                fwd_path = result.sequences.path_maker(
                    sample_id=sample_id, barcode_id=barcode_id,
                    lane_number=1, read_number=1)
                rev_path = result.sequences.path_maker(
                    sample_id=sample_id, barcode_id=barcode_id,
                    lane_number=1, read_number=2)
                _maintain_open_fh_count(per_sample_fastqs, paired=True)
                per_sample_fastqs[sample_id] = (
                    gzip.open(str(fwd_path), mode='a'),
                    gzip.open(str(rev_path), mode='a'))
                manifest_fh.write(
                    '%s,%s,%s\n' % (sample_id, fwd_path.name, 'forward'))
                manifest_fh.write(
                    '%s,%s,%s\n' % (sample_id, rev_path.name, 'reverse'))

            # A handle may have been closed since it was last used
            # (presumably by _maintain_open_fh_count -- confirm against
            # the helper); reopen both files in append mode.
            if per_sample_fastqs[sample_id][0].closed:
                _maintain_open_fh_count(per_sample_fastqs, paired=True)
                fwd, rev = per_sample_fastqs[sample_id]
                per_sample_fastqs[sample_id] = (
                    gzip.open(fwd.name, mode='a'),
                    gzip.open(rev.name, mode='a'))

            fwd, rev = per_sample_fastqs[sample_id]
            fwd.write(('\n'.join(forward_record) + '\n').encode('utf-8'))
            rev.write(('\n'.join(reverse_record) + '\n').encode('utf-8'))

        if not per_sample_fastqs:
            raise ValueError('No sequences were mapped to samples. Check that '
                             'your barcodes are in the correct orientation (see '
                             'the rev_comp_barcodes and/or '
                             'rev_comp_mapping_barcodes options).')
    finally:
        # Release every handle on both the success and the error path;
        # closing an already-closed file is a no-op.
        for fwd, rev in per_sample_fastqs.values():
            fwd.close()
            rev.close()
        manifest_fh.close()

    result.manifest.write_data(manifest, FastqManifestFormat)
    _write_metadata_yaml(result)

    return result
def emp_single(
    seqs: BarcodeSequenceFastqIterator,
    barcodes: qiime2.MetadataCategory,
    rev_comp_barcodes: bool = False,
    rev_comp_mapping_barcodes: bool = False
) -> SingleLanePerSampleSingleEndFastqDirFmt:
    """Demultiplex single-end reads into per-sample gzipped fastq files.

    Args:
        seqs: Iterable yielding ``(barcode_record, sequence_record)``
            pairs. Element 1 of the barcode record is used as the barcode
            read; the sequence record is joined with newlines and written
            out unchanged.
        barcodes: Barcode metadata passed to ``_make_barcode_map`` to build
            the barcode-to-sample mapping.
        rev_comp_barcodes: Reverse complement each barcode read before the
            lookup.
        rev_comp_mapping_barcodes: Forwarded to ``_make_barcode_map``.

    Returns:
        Directory format holding one fastq.gz per observed sample, plus the
        manifest and metadata files.

    Raises:
        ValueError: If no read could be assigned to a sample.
    """
    result = SingleLanePerSampleSingleEndFastqDirFmt()
    barcode_map, barcode_len = _make_barcode_map(barcodes,
                                                 rev_comp_mapping_barcodes)

    manifest = FastqManifestFormat()
    manifest_fh = manifest.open()
    # sample id -> open gzip handle
    per_sample_fastqs = {}
    try:
        manifest_fh.write('sample-id,filename,direction\n')
        manifest_fh.write(
            '# direction is not meaningful in this file as these\n')
        manifest_fh.write(
            '# data may be derived from forward, reverse, or \n')
        manifest_fh.write('# joined reads\n')

        for barcode_record, sequence_record in seqs:
            barcode_read = barcode_record[1]
            if rev_comp_barcodes:
                barcode_read = str(
                    skbio.DNA(barcode_read).reverse_complement())
            barcode_read = barcode_read[:barcode_len]

            try:
                sample_id = barcode_map[barcode_read]
            except KeyError:
                # TODO: this should ultimately be logged, but we don't
                # currently have support for that.
                continue

            if sample_id not in per_sample_fastqs:
                # The barcode id, lane number and read number are not
                # relevant here. We might ultimately want to use a dir
                # format other than SingleLanePerSampleSingleEndFastqDirFmt
                # which doesn't care about this information. Similarly, the
                # direction of the read isn't relevant here anymore.
                barcode_id = len(per_sample_fastqs) + 1
                path = result.sequences.path_maker(sample_id=sample_id,
                                                   barcode_id=barcode_id,
                                                   lane_number=1,
                                                   read_number=1)
                _maintain_open_fh_count(per_sample_fastqs)
                per_sample_fastqs[sample_id] = gzip.open(str(path), mode='a')
                manifest_fh.write(
                    '%s,%s,%s\n' % (sample_id, path.name, 'forward'))

            # A handle may have been closed since it was last used
            # (presumably by _maintain_open_fh_count -- confirm against
            # the helper); reopen the file in append mode.
            if per_sample_fastqs[sample_id].closed:
                _maintain_open_fh_count(per_sample_fastqs)
                per_sample_fastqs[sample_id] = gzip.open(
                    per_sample_fastqs[sample_id].name, mode='a')

            fastq_lines = '\n'.join(sequence_record) + '\n'
            fastq_lines = fastq_lines.encode('utf-8')
            per_sample_fastqs[sample_id].write(fastq_lines)

        if not per_sample_fastqs:
            raise ValueError('No sequences were mapped to samples. Check that '
                             'your barcodes are in the correct orientation (see '
                             'the rev_comp_barcodes and/or '
                             'rev_comp_mapping_barcodes options).')
    finally:
        # Release every handle on both the success and the error path;
        # closing an already-closed file is a no-op.
        for fh in per_sample_fastqs.values():
            fh.close()
        manifest_fh.close()

    result.manifest.write_data(manifest, FastqManifestFormat)
    _write_metadata_yaml(result)

    return result
def q_score(demux: SingleLanePerSampleSingleEndFastqDirFmt,
            min_quality: int = _default_params['min_quality'],
            quality_window: int = _default_params['quality_window'],
            min_length_fraction: float =
            _default_params['min_length_fraction'],
            max_ambiguous: int = _default_params['max_ambiguous']) \
        -> (SingleLanePerSampleSingleEndFastqDirFmt, pd.DataFrame):
    """Quality-filter demultiplexed single-end reads sample by sample.

    A read is truncated at the start of the first run of more than
    ``quality_window`` consecutive quality scores below ``min_quality``.
    It is dropped when the truncated/original length ratio (rounded to
    three decimals) is ``<= min_length_fraction``, or when it contains more
    than ``max_ambiguous`` ``N`` characters.

    Args:
        demux: Input reads with their manifest and metadata.
        min_quality: Minimum acceptable quality score.
        quality_window: Longest tolerated run of low-quality scores.
        min_length_fraction: Length-fraction threshold applied after
            truncation.
        max_ambiguous: Maximum tolerated number of ``N`` characters.

    Returns:
        A tuple of the filtered directory format and a DataFrame indexed by
        ``sample-id`` with the per-sample read counts.

    Raises:
        ValueError: If at least one sample was processed and no read was
            retained in any sample.
    """
    result = SingleLanePerSampleSingleEndFastqDirFmt()

    log_records_truncated_counts = {}
    log_records_max_ambig_counts = {}
    log_records_tooshort_counts = {}
    log_records_totalread_counts = {}
    log_records_totalkept_counts = {}

    # safe_load: yaml.load() without an explicit Loader is rejected by
    # PyYAML >= 6 and used an unsafe loader in older releases.
    metadata_fh = demux.metadata.view(YamlFormat).open()
    try:
        phred_offset = yaml.safe_load(metadata_fh)['phred-offset']
    finally:
        metadata_fh.close()

    demux_manifest_fh = demux.manifest.view(demux.manifest.format).open()
    try:
        demux_manifest = pd.read_csv(demux_manifest_fh, dtype=str)
    finally:
        demux_manifest_fh.close()
    demux_manifest.set_index('filename', inplace=True)

    manifest = FastqManifestFormat()
    manifest_fh = manifest.open()
    writer = None
    try:
        manifest_fh.write('sample-id,filename,direction\n')
        manifest_fh.write(
            '# direction is not meaningful in this file as these\n')
        manifest_fh.write(
            '# data may be derived from forward, reverse, or \n')
        manifest_fh.write('# joined reads\n')

        iterator = demux.sequences.iter_views(FastqGzFormat)
        for bc_id, (fname, fp) in enumerate(iterator):
            sample_id = demux_manifest.loc[str(fname)]['sample-id']

            log_records_truncated_counts[sample_id] = 0
            log_records_max_ambig_counts[sample_id] = 0
            log_records_tooshort_counts[sample_id] = 0
            log_records_totalread_counts[sample_id] = 0
            log_records_totalkept_counts[sample_id] = 0

            # per q2-demux, barcode ID, lane number and read number are
            # not relevant here
            path = result.sequences.path_maker(sample_id=sample_id,
                                               barcode_id=bc_id,
                                               lane_number=1,
                                               read_number=1)

            # we do not open a writer by default in the event that all
            # sequences for a sample are filtered out; an empty fastq file
            # is not a valid fastq file.
            writer = None
            for sequence_record in _read_fastq_seqs(str(fp), phred_offset):
                log_records_totalread_counts[sample_id] += 1

                # determine the length of the runs below quality threshold
                # NOTE: QIIME 1.x used <= the quality threshold and the
                # parameter -q was interpreted as the maximum unacceptable
                # PHRED score. In QIIME 2.x, we're now interpreting this as
                # the minimum acceptable score.
                qual_below_threshold = sequence_record[4] < min_quality
                run_starts, run_lengths = _runs_of_ones(qual_below_threshold)
                bad_windows = np.argwhere(run_lengths > quality_window)

                # if there is a run of sufficient size, truncate it
                if bad_windows.size > 0:
                    log_records_truncated_counts[sample_id] += 1

                    full_length = len(sequence_record[1])
                    sequence_record = _truncate(
                        sequence_record, run_starts[bad_windows[0]][0])
                    trunc_length = len(sequence_record[1])

                    # do not keep the read if it is too short following
                    # truncation
                    kept_fraction = round(trunc_length / full_length, 3)
                    if kept_fraction <= min_length_fraction:
                        log_records_tooshort_counts[sample_id] += 1
                        continue

                # do not keep the read if there are too many ambiguous bases
                if sequence_record[1].count('N') > max_ambiguous:
                    log_records_max_ambig_counts[sample_id] += 1
                    continue

                fastq_lines = '\n'.join(sequence_record[:4]) + '\n'
                fastq_lines = fastq_lines.encode('utf-8')

                if writer is None:
                    writer = gzip.open(str(path), mode='w')
                writer.write(fastq_lines)

                log_records_totalkept_counts[sample_id] += 1

            if writer is not None:
                manifest_fh.write(
                    '%s,%s,%s\n' % (sample_id, path.name, 'forward'))
                writer.close()

        if set(log_records_totalkept_counts.values()) == {0, }:
            raise ValueError("All sequences from all samples were filtered out. "
                             "The parameter choices may be too stringent for the "
                             "data.")
    finally:
        # Release the handles on the error path as well; closing an
        # already-closed file is a no-op.
        if writer is not None:
            writer.close()
        manifest_fh.close()

    result.manifest.write_data(manifest, FastqManifestFormat)

    metadata = YamlFormat()
    metadata.path.write_text(yaml.dump({'phred-offset': phred_offset}))
    result.metadata.write_data(metadata, YamlFormat)

    columns = ['sample-id', 'total-input-reads', 'total-retained-reads',
               'reads-truncated',
               'reads-too-short-after-truncation',
               'reads-exceeding-maximum-ambiguous-bases']
    stats = [[id_,
              log_records_totalread_counts[id_],
              log_records_totalkept_counts[id_],
              log_records_truncated_counts[id_],
              log_records_tooshort_counts[id_],
              log_records_max_ambig_counts[id_]]
             for id_ in sorted(log_records_truncated_counts)]

    stats = pd.DataFrame(stats, columns=columns).set_index('sample-id')

    return result, stats
def emp_paired(
    seqs: BarcodePairedSequenceFastqIterator,
    barcodes: qiime2.CategoricalMetadataColumn,
    golay_error_correction: bool = True,
    rev_comp_barcodes: bool = False,
    rev_comp_mapping_barcodes: bool = False,
    ignore_description_mismatch: bool = False
) -> (SingleLanePerSamplePairedEndFastqDirFmt, ErrorCorrectionDetailsFmt):
    """Demultiplex paired-end reads, optionally Golay-correcting barcodes.

    Args:
        seqs: Iterable yielding ``(barcode_record, forward_record,
            reverse_record)`` triples. Its ``ignore_description_mismatch``
            attribute is set from the parameter of the same name.
        barcodes: Barcode metadata passed to ``_make_barcode_map`` to build
            the barcode-to-sample mapping.
        golay_error_correction: Decode every barcode read with
            ``GolayDecoder`` before the lookup.
        rev_comp_barcodes: Reverse complement each barcode read first.
        rev_comp_mapping_barcodes: Forwarded to ``_make_barcode_map``.
        ignore_description_mismatch: Stored on ``seqs`` (see above).

    Returns:
        A tuple of the per-sample paired-end directory format and the
        error-correction details format; one details row is written for
        every input record, matched or not.

    Raises:
        ValueError: If no read could be assigned to a sample.
    """
    seqs.ignore_description_mismatch = ignore_description_mismatch
    result = SingleLanePerSamplePairedEndFastqDirFmt()
    barcode_map, barcode_len = _make_barcode_map(
        barcodes, rev_comp_mapping_barcodes)

    if golay_error_correction:
        decoder = GolayDecoder()

    manifest = FastqManifestFormat()
    manifest_fh = manifest.open()
    # sample id -> (forward handle, reverse handle)
    per_sample_fastqs = {}
    try:
        manifest_fh.write('sample-id,filename,direction\n')

        ec_details_fmt = ErrorCorrectionDetailsFmt()
        ec_details = ECDetails(ec_details_fmt)

        for i, record in enumerate(seqs, start=1):
            barcode_record, forward_record, reverse_record = record
            barcode_read = barcode_record[1]
            if rev_comp_barcodes:
                barcode_read = str(
                    skbio.DNA(barcode_read).reverse_complement())
            raw_barcode_read = barcode_read[:barcode_len]

            if golay_error_correction:
                # A three bit filter is implicitly used by the decoder. See
                # Hamady and Knight 2009 Genome Research for the
                # justification:
                #
                # https://genome.cshlp.org/content/19/7/1141.full
                #
                # Specifically that "...Golay codes of 12 bases can correct
                # all triple-bit errors and detect all quadruple-bit
                # errors."
                barcode_read, ecc_errors = decoder.decode(raw_barcode_read)
                golay_stats = [barcode_read, ecc_errors]
            else:
                barcode_read = raw_barcode_read
                golay_stats = [None, None]

            sample_id = barcode_map.get(barcode_read)

            # Named separately so the loop variable is not rebound.
            details_row = [
                f'record-{i}',
                sample_id,
                barcode_record[0],
                raw_barcode_read,
            ]
            ec_details.write(details_row + golay_stats)

            if sample_id is None:
                continue

            if sample_id not in per_sample_fastqs:
                barcode_id = len(per_sample_fastqs) + 1
                fwd_path = result.sequences.path_maker(
                    sample_id=sample_id, barcode_id=barcode_id,
                    lane_number=1, read_number=1)
                rev_path = result.sequences.path_maker(
                    sample_id=sample_id, barcode_id=barcode_id,
                    lane_number=1, read_number=2)

                _maintain_open_fh_count(per_sample_fastqs, paired=True)
                per_sample_fastqs[sample_id] = (
                    gzip.open(str(fwd_path), mode='a'),
                    gzip.open(str(rev_path), mode='a'))
                manifest_fh.write(
                    '%s,%s,%s\n' % (sample_id, fwd_path.name, 'forward'))
                manifest_fh.write(
                    '%s,%s,%s\n' % (sample_id, rev_path.name, 'reverse'))

            # A handle may have been closed since it was last used
            # (presumably by _maintain_open_fh_count -- confirm against
            # the helper); reopen both files in append mode.
            if per_sample_fastqs[sample_id][0].closed:
                _maintain_open_fh_count(per_sample_fastqs, paired=True)
                fwd, rev = per_sample_fastqs[sample_id]
                per_sample_fastqs[sample_id] = (
                    gzip.open(fwd.name, mode='a'),
                    gzip.open(rev.name, mode='a'))

            fwd, rev = per_sample_fastqs[sample_id]
            fwd.write(('\n'.join(forward_record) + '\n').encode('utf-8'))
            rev.write(('\n'.join(reverse_record) + '\n').encode('utf-8'))

        if not per_sample_fastqs:
            raise ValueError('No sequences were mapped to samples. Check that '
                             'your barcodes are in the correct orientation (see '
                             'the rev_comp_barcodes and/or '
                             'rev_comp_mapping_barcodes options). If barcodes are '
                             'NOT Golay format set golay_error_correction '
                             'to False.')
    finally:
        # Release every handle on both the success and the error path;
        # closing an already-closed file is a no-op.
        for fwd, rev in per_sample_fastqs.values():
            fwd.close()
            rev.close()
        manifest_fh.close()

    result.manifest.write_data(manifest, FastqManifestFormat)

    _write_metadata_yaml(result)

    return result, ec_details_fmt
def test_fastq_manifest_format_validate_positive(self):
    """The single-end MANIFEST fixture must validate without raising."""
    manifest_fp = self.get_data_path('single_end_data/MANIFEST')
    manifest_fmt = FastqManifestFormat(manifest_fp, mode='r')

    manifest_fmt.validate()
def _join_pairs_w_command_output(
    demultiplexed_seqs: SingleLanePerSamplePairedEndFastqDirFmt,
    truncqual: int = _jp_defaults['truncqual'],
    minlen: int = _jp_defaults['minlen'],
    maxns: int = _jp_defaults['maxns'],
    allowmergestagger: bool = _jp_defaults['allowmergestagger'],
    minovlen: int = _jp_defaults['minovlen'],
    maxdiffs: int = _jp_defaults['maxdiffs'],
    minmergelen: int = _jp_defaults['minmergelen'],
    maxmergelen: int = _jp_defaults['maxmergelen'],
    maxee: float = _jp_defaults['maxee'],
    qmin: int = _jp_defaults['qmin'],
    qminout: int = _jp_defaults['qminout'],
    qmax: int = _jp_defaults['qmax'],
    qmaxout: int = _jp_defaults['qmaxout'],
) -> (List[str], SingleLanePerSampleSingleEndFastqDirFmt):
    """Merge each sample's read pair with ``vsearch --fastq_mergepairs``.

    This function exists only to simplify unit testing: in addition to the
    result it returns the last vsearch command that was run.

    Args:
        demultiplexed_seqs: Paired-end input; its manifest and metadata
            files are read directly from the directory.
        truncqual, maxns, minmergelen, maxmergelen, maxee: Optional vsearch
            settings; the matching ``--fastq_*`` flag is only passed when
            the value is not ``None``.
        minlen, minovlen, maxdiffs, qmin, qminout, qmax, qmaxout: vsearch
            settings that are always passed.
        allowmergestagger: Append ``--fastq_allowmergestagger`` when true.

    Returns:
        A tuple of the last vsearch command (an empty list when the input
        manifest lists no samples) and the joined-read directory format.
    """
    result = SingleLanePerSampleSingleEndFastqDirFmt()

    input_dir = str(demultiplexed_seqs)
    manifest = pd.read_csv(
        os.path.join(input_dir, demultiplexed_seqs.manifest.pathspec),
        header=0, comment='#')
    manifest.filename = manifest.filename.apply(
        lambda x: os.path.join(input_dir, x))

    # safe_load + context manager: yaml.load() without a Loader is rejected
    # by PyYAML >= 6, and the file was previously never closed.
    metadata_fp = os.path.join(input_dir,
                               demultiplexed_seqs.metadata.pathspec)
    with open(metadata_fp) as metadata_fh:
        phred_offset = yaml.safe_load(metadata_fh)['phred-offset']

    id_to_fps = manifest.pivot(index='sample-id', columns='direction',
                               values='filename')

    # Pre-bound so an input without samples returns an empty command
    # instead of failing with an unbound local at the return statement.
    cmd = []

    output_manifest = FastqManifestFormat()
    output_manifest_fh = output_manifest.open()
    try:
        output_manifest_fh.write('sample-id,filename,direction\n')
        output_manifest_fh.write('# direction is not meaningful in this file '
                                 'as these\n')
        output_manifest_fh.write('# data may be derived from forward, reverse, '
                                 'or \n')
        output_manifest_fh.write('# joined reads\n')

        for i, (sample_id, (fwd_fp, rev_fp)) in \
                enumerate(id_to_fps.iterrows()):
            # The barcode id, lane number and read number are not relevant
            # here. We might ultimately want to use a dir format other than
            # SingleLanePerSampleSingleEndFastqDirFmt which doesn't care
            # about this information. Similarly, the direction of the read
            # isn't relevant here anymore.
            path = result.sequences.path_maker(sample_id=sample_id,
                                               barcode_id=i,
                                               lane_number=1,
                                               read_number=1)

            # Drop the '.gz' suffix only. str.strip('.gz') removes any of
            # the characters '.', 'g', 'z' from BOTH ends of the string.
            uncompressed_path = str(path)
            if uncompressed_path.endswith('.gz'):
                uncompressed_path = uncompressed_path[:-len('.gz')]

            cmd = ['vsearch',
                   '--fastq_mergepairs', fwd_fp,
                   '--reverse', rev_fp,
                   '--fastqout', uncompressed_path,
                   '--fastq_ascii', str(phred_offset),
                   '--fastq_minlen', str(minlen),
                   '--fastq_minovlen', str(minovlen),
                   '--fastq_maxdiffs', str(maxdiffs),
                   '--fastq_qmin', str(qmin),
                   '--fastq_qminout', str(qminout),
                   '--fastq_qmax', str(qmax),
                   '--fastq_qmaxout', str(qmaxout),
                   ]
            if truncqual is not None:
                cmd += ['--fastq_truncqual', str(truncqual)]
            if maxns is not None:
                cmd += ['--fastq_maxns', str(maxns)]
            if minmergelen is not None:
                cmd += ['--fastq_minmergelen', str(minmergelen)]
            if maxmergelen is not None:
                cmd += ['--fastq_maxmergelen', str(maxmergelen)]
            if maxee is not None:
                cmd += ['--fastq_maxee', str(maxee)]
            if allowmergestagger:
                cmd.append('--fastq_allowmergestagger')
            run_command(cmd)
            # gzip is run on the merged output so the '.gz' file named in
            # the manifest row below exists.
            run_command(['gzip', uncompressed_path])
            output_manifest_fh.write(
                '%s,%s,%s\n' % (sample_id, path.name, 'forward'))
    finally:
        # Close the manifest even when vsearch or gzip fails.
        output_manifest_fh.close()

    result.manifest.write_data(output_manifest, FastqManifestFormat)

    metadata = YamlFormat()
    metadata.path.write_text(yaml.dump({'phred-offset': phred_offset}))
    result.metadata.write_data(metadata, YamlFormat)

    return cmd, result
def main(per_sample_sequences: _SingleLanePerSampleFastqDirFmt,
         threads: int,
         taxa: str,
         region: str,
         paired: bool,
         cluster_id: float):
    """The main communication between the plugin and the ITSxpress program.

    Args:
        per_sample_sequences (SingleLanePerSampleSingleEndFastqDirFmt): The
            SingleLanePerSampleSingleEndFastqDirFmt type of the input.
        threads (int) : The number of threads to use.
        taxa (str): The taxa to be used for the search.
        region (str) : The region to be used for the search.
        paired (bool): If True, write paired-end output (forward and
            reverse trimmed reads); otherwise write single-end output.
        cluster_id (float): The percent identity for clustering reads, set
            to 1 for exact dereplication.

    Returns:
        (SingleLanePerSamplePairedEndFastqDirFmt or
        SingleLanePerSampleSingleEndFastqDirFmt): The trimmed reads;
        paired-end when ``paired`` is True, single-end otherwise.

    Raises:
        ValueError1: hmmsearch error.

    """
    # Finding the artifact type.
    artifact_type = _view_artifact_type(
        per_sample_sequence=per_sample_sequences)
    # Setting the taxa
    taxa = _taxa_prefix_to_taxa(taxa)
    # Writing the manifest for the output qza
    manifest = FastqManifestFormat()
    manifest_fn = manifest.open()
    manifest_fn.write('sample-id,filename,direction\n')
    # Getting the sequences from the manifest
    sequences, single_end = _fastq_id_maker(
        per_sample_sequences=per_sample_sequences,
        artifact_type=artifact_type)
    # Creating result dir
    if paired:
        results = SingleLanePerSamplePairedEndFastqDirFmt()
    else:
        results = SingleLanePerSampleSingleEndFastqDirFmt()
    # Running the for loop for each sample; the running index doubles as
    # the barcode id of the output file names.
    for barcode, sequence in enumerate(sequences):
        # writing fastqs and their attributes and checking the files
        sequence_id, sobj = _set_fastqs_and_check(
            per_sample_sequences=per_sample_sequences,
            artifact_type=artifact_type,
            sequence=sequence,
            single_end=single_end,
            threads=threads)

        # Deduplicate exactly when cluster_id is (numerically) 1,
        # otherwise cluster at the requested identity.
        if math.isclose(cluster_id, 1, rel_tol=1e-05):
            sobj.deduplicate(threads=threads)
        else:
            sobj.cluster(threads=threads, cluster_id=cluster_id)
        try:
            # HMMSearch for ITS regions
            hmmfile = os.path.join(ROOT_DIR, "ITSx_db", "HMMs",
                                   taxa_dict[taxa])
            sobj._search(hmmfile=hmmfile, threads=threads)
        except (ModuleNotFoundError,
                FileNotFoundError,
                NotADirectoryError) as err:
            # Chain the cause so the underlying failure stays visible.
            raise ValueError(
                "hmmsearch was not found, make sure HMMER3 is installed and executable"
            ) from err

        # Parse HMMsearch output.
        its_pos = itsxpress.ItsPosition(domtable=sobj.dom_file,
                                        region=region)
        # Create deduplication object.
        dedup_obj = itsxpress.Dedup(uc_file=sobj.uc_file,
                                    rep_file=sobj.rep_file,
                                    seq_file=sobj.seq_file,
                                    fastq=sobj.r1,
                                    fastq2=sobj.fastq2)

        path_forward = results.sequences.path_maker(sample_id=sequence_id,
                                                    barcode_id=barcode,
                                                    lane_number=1,
                                                    read_number=1)
        path_reverse = results.sequences.path_maker(sample_id=sequence_id,
                                                    barcode_id=barcode,
                                                    lane_number=1,
                                                    read_number=2)

        manifest_fn.write(
            "{},{},forward\n".format(sequence_id, path_forward.name))
        # Create trimmed sequences.
        if paired:
            # Paired output also writes a reverse file, so it needs its
            # own manifest row.
            manifest_fn.write(
                "{},{},reverse\n".format(sequence_id, path_reverse.name))
            dedup_obj.create_paired_trimmed_seqs(str(path_forward),
                                                 str(path_reverse),
                                                 gzipped=True,
                                                 itspos=its_pos)
        else:
            dedup_obj.create_trimmed_seqs(str(path_forward),
                                          gzipped=True,
                                          itspos=its_pos)
        # Deleting the temp files.
        shutil.rmtree(sobj.tempdir)
    # Writing out the results.
    manifest_fn.close()
    _write_metadata(results=results)
    results.manifest.write_data(manifest, FastqManifestFormat)
    return results
def main(fastq, fastq2, singleEnd, threads, taxa, region):
    """The main communication between the plugin and the ITSxpress program.

    Args:
        fastq (str) : The first fastq location.
        fastq2 (str) : The second fastq location.
        singleEnd (bool) : boolean for if singleEnd is used or not.
        threads (int) : The number of threads to use.
        taxa (str): The taxa to be used for the search.
        region (str) : The region to be used for the search.

    Returns:
        (SingleLanePerSampleSingleEndFastqDirFmt): The
        SingleLanePerSampleSingleEndFastqDirFmt type of the output.

    Raises:
        ValueError1: BBTools error or fastq format issue.
        ValueError2: BBmerge error.
        ValueError3: hmmsearch error.

    """
    dirt = "/tmp"
    try:
        itsx._check_fastqs(fastq, fastq2)
        # Parse input types
        paired_end, interleaved = itsx._is_paired(fastq, fastq2, singleEnd)
    except Exception as err:
        # Chain the cause so the underlying failure stays visible.
        raise ValueError("There is a problem with the fastq file(s) you selected or\n"
                         "BBtools was not found. check that the BBtools reformat.sh package is executable.") from err
    # Create SeqSample objects and merge if needed.
    try:
        if paired_end and interleaved:
            sobj = itsx.SeqSamplePairedInterleaved(fastq=fastq, tempdir=dirt)
            sobj._merge_reads(threads=threads)
        elif paired_end and not interleaved:
            sobj = itsx.SeqSamplePairedNotInterleaved(fastq=fastq,
                                                      fastq2=fastq2,
                                                      tempdir=dirt)
            # Two-file paired reads are merged just like interleaved ones.
            sobj._merge_reads(threads=threads)
        else:
            # Only unpaired input uses the non-paired sample type; it must
            # not replace a paired sample object created above.
            sobj = itsx.SeqSampleNotPaired(fastq=fastq, tempdir=dirt)
    except Exception as err:
        raise ValueError("BBmerge was not found. check that the BBmerge reformat.sh package is executible") from err

    # Deduplicate
    sobj._deduplicate(threads=threads)
    try:
        # HMMSearch for ITS regions
        hmmfile = os.path.join(ROOT_DIR, "ITSx_db", "HMMs", taxa_dict[taxa])
        sobj._search(hmmfile=hmmfile, threads=threads)
    except Exception as err:
        raise ValueError("hmmsearch was not found, make sure HMMER3 is installed and executible") from err

    # Parse HMMsearch output.
    its_pos = itsx.ItsPosition(domtable=sobj.dom_file, region=region)
    # Create deduplication object.
    dedup_obj = itsx.Dedup(uc_file=sobj.uc_file, rep_file=sobj.rep_file,
                           seq_file=sobj.seq_file)
    results = SingleLanePerSampleSingleEndFastqDirFmt()
    path = results.sequences.path_maker(sample_id="seq",
                                        barcode_id=1,
                                        lane_number=1,
                                        read_number=1)
    # Writing the manifest for the output qza
    manifest = FastqManifestFormat()
    manifest_fn = manifest.open()
    manifest_fn.write('sample-id,filename,direction\n')
    # The manifest row names the file relative to the result directory
    # (not by its absolute path), is newline-terminated, and uses the
    # 'forward' direction that matches the read_number=1 output file.
    manifest_fn.write("seq,{},forward\n".format(path.name))
    manifest_fn.close()
    # Create trimmed sequences.
    dedup_obj.create_trimmed_seqs(str(path), gzipped=True, itspos=its_pos)
    # Writing out the results.
    _write_metadata(results)
    results.manifest.write_data(manifest, FastqManifestFormat)
    # Deleting the temp files.
    itsx.shutil.rmtree(sobj.tempdir)
    return results
def join_pairs(demultiplexed_seqs: SingleLanePerSamplePairedEndFastqDirFmt,
               threads: int = 1) -> SingleLanePerSampleSingleEndFastqDirFmt:
    """Assemble each sample's read pair with ``pear``.

    For every sample, ``pear`` is run with the sample id as its output
    prefix; the ``<sample>.assembled.fastq`` file is renamed to the
    result's fastq path and gzipped, and the discarded/unassembled files
    are removed.

    Args:
        demultiplexed_seqs: Paired-end input; its manifest and metadata
            files are read directly from the directory.
        threads: Value passed to ``pear --threads``.

    Returns:
        Directory format holding one joined fastq.gz per sample, plus the
        manifest and metadata files.
    """
    result = SingleLanePerSampleSingleEndFastqDirFmt()

    input_dir = str(demultiplexed_seqs)
    manifest = pd.read_csv(
        os.path.join(input_dir, demultiplexed_seqs.manifest.pathspec),
        header=0, comment='#')
    manifest.filename = manifest.filename.apply(
        lambda x: os.path.join(input_dir, x))

    # safe_load + context manager: yaml.load() without a Loader is rejected
    # by PyYAML >= 6, and the file was previously never closed.
    metadata_fp = os.path.join(input_dir,
                               demultiplexed_seqs.metadata.pathspec)
    with open(metadata_fp) as metadata_fh:
        phred_offset = yaml.safe_load(metadata_fh)['phred-offset']

    id_to_fps = manifest.pivot(index='sample-id', columns='direction',
                               values='filename')

    output_manifest = FastqManifestFormat()
    output_manifest_fh = output_manifest.open()
    try:
        output_manifest_fh.write('sample-id,filename,direction\n')
        output_manifest_fh.write('# direction is not meaningful in this file '
                                 'as these\n')
        output_manifest_fh.write('# data may be derived from forward, reverse, '
                                 'or \n')
        output_manifest_fh.write('# joined reads\n')

        for i, (sample_id, (fwd_fp, rev_fp)) in \
                enumerate(id_to_fps.iterrows()):
            # The barcode id, lane number and read number are not relevant
            # here. We might ultimately want to use a dir format other than
            # SingleLanePerSampleSingleEndFastqDirFmt which doesn't care
            # about this information. Similarly, the direction of the read
            # isn't relevant here anymore.
            path = result.sequences.path_maker(sample_id=sample_id,
                                               barcode_id=i,
                                               lane_number=1,
                                               read_number=1)

            # Drop the '.gz' suffix only. str.strip('.gz') removes any of
            # the characters '.', 'g', 'z' from BOTH ends of the string.
            uncompressed_path = str(path)
            if uncompressed_path.endswith('.gz'):
                uncompressed_path = uncompressed_path[:-len('.gz')]

            parent_pth = Path(path).parent
            sample_id_path = str(parent_pth / sample_id)
            assembled_pth = parent_pth / "{}.assembled.fastq".format(
                sample_id)
            discarded_pth = parent_pth / "{}.discarded.fastq".format(
                sample_id)
            unassembled_fwd_pth = (
                parent_pth / "{}.unassembled.forward.fastq".format(sample_id))
            unassembled_rev_pth = (
                parent_pth / "{}.unassembled.reverse.fastq".format(sample_id))

            cmd = [
                'pear', '-f', fwd_fp, '-r', rev_fp, '-o', sample_id_path,
                '--threads', str(threads)
            ]
            run_command(cmd)
            assembled_pth.rename(Path(uncompressed_path))
            run_command(['gzip', uncompressed_path])

            # Delete the extra pear outputs. This is best-effort cleanup,
            # so only filesystem errors are ignored.
            for f_pth in (discarded_pth, unassembled_fwd_pth,
                          unassembled_rev_pth):
                try:
                    os.remove(str(f_pth))
                except OSError:
                    pass

            output_manifest_fh.write(
                '%s,%s,%s\n' % (sample_id, Path(path).name, 'forward'))
    finally:
        # Close the manifest even when pear or gzip fails.
        output_manifest_fh.close()

    result.manifest.write_data(output_manifest, FastqManifestFormat)

    metadata = YamlFormat()
    metadata.path.write_text(yaml.dump({'phred-offset': phred_offset}))
    result.metadata.write_data(metadata, YamlFormat)

    return result
def test_fastq_manifest_format_validate_negative(self):
    """A non-manifest fixture must be rejected by validate()."""
    bad_manifest_fp = self.get_data_path('not-MANIFEST')
    manifest_fmt = FastqManifestFormat(bad_manifest_fp, mode='r')

    # Only validate() is expected to raise; construction happens outside
    # the assertion context.
    with self.assertRaisesRegex(ValueError, 'FastqManifestFormat'):
        manifest_fmt.validate()
def emp_single(
    seqs: BarcodeSequenceFastqIterator,
    barcodes: qiime2.CategoricalMetadataColumn,
    golay_error_correction: bool = True,
    rev_comp_barcodes: bool = False,
    rev_comp_mapping_barcodes: bool = False,
    ignore_description_mismatch: bool = False
) -> (SingleLanePerSampleSingleEndFastqDirFmt, ErrorCorrectionDetailsFmt):
    """Demultiplex single-end reads, optionally Golay-correcting barcodes.

    Args:
        seqs: Iterable yielding ``(barcode_record, sequence_record)``
            pairs. Its ``ignore_description_mismatch`` attribute is set
            from the parameter of the same name.
        barcodes: Barcode metadata passed to ``_make_barcode_map`` to build
            the barcode-to-sample mapping.
        golay_error_correction: Decode every barcode read with
            ``GolayDecoder`` before the lookup.
        rev_comp_barcodes: Reverse complement each barcode read first.
        rev_comp_mapping_barcodes: Forwarded to ``_make_barcode_map``.
        ignore_description_mismatch: Stored on ``seqs`` (see above).

    Returns:
        A tuple of the per-sample single-end directory format and the
        error-correction details format; one details row is written for
        every input record, matched or not.

    Raises:
        ValueError: If no read could be assigned to a sample.
    """
    seqs.ignore_description_mismatch = ignore_description_mismatch
    result = SingleLanePerSampleSingleEndFastqDirFmt()
    barcode_map, barcode_len = _make_barcode_map(
        barcodes, rev_comp_mapping_barcodes)

    if golay_error_correction:
        decoder = GolayDecoder()

    manifest = FastqManifestFormat()
    manifest_fh = manifest.open()
    # sample id -> open gzip handle
    per_sample_fastqs = {}
    try:
        manifest_fh.write('sample-id,filename,direction\n')
        manifest_fh.write(
            '# direction is not meaningful in this file as these\n')
        manifest_fh.write(
            '# data may be derived from forward, reverse, or \n')
        manifest_fh.write('# joined reads\n')

        ec_details_fmt = ErrorCorrectionDetailsFmt()
        ec_details = ECDetails(ec_details_fmt)

        for i, (barcode_record, sequence_record) in enumerate(seqs, start=1):
            barcode_read = barcode_record[1]
            if rev_comp_barcodes:
                barcode_read = str(
                    skbio.DNA(barcode_read).reverse_complement())
            raw_barcode_read = barcode_read[:barcode_len]

            if golay_error_correction:
                # A three bit filter is implicitly used by the decoder. See
                # Hamady and Knight 2009 Genome Research for the
                # justification:
                #
                # https://genome.cshlp.org/content/19/7/1141.full
                #
                # Specifically that "...Golay codes of 12 bases can correct
                # all triple-bit errors and detect all quadruple-bit
                # errors."
                barcode_read, ecc_errors = decoder.decode(raw_barcode_read)
                golay_stats = [barcode_read, ecc_errors]
            else:
                barcode_read = raw_barcode_read
                golay_stats = [None, None]

            sample_id = barcode_map.get(barcode_read)

            details_row = [
                f'record-{i}',
                sample_id,
                barcode_record[0],
                raw_barcode_read,
            ]
            ec_details.write(details_row + golay_stats)

            if sample_id is None:
                continue

            if sample_id not in per_sample_fastqs:
                # The barcode id, lane number and read number are not
                # relevant here. We might ultimately want to use a dir
                # format other than SingleLanePerSampleSingleEndFastqDirFmt
                # which doesn't care about this information. Similarly, the
                # direction of the read isn't relevant here anymore.
                barcode_id = len(per_sample_fastqs) + 1
                path = result.sequences.path_maker(sample_id=sample_id,
                                                   barcode_id=barcode_id,
                                                   lane_number=1,
                                                   read_number=1)
                _maintain_open_fh_count(per_sample_fastqs)
                per_sample_fastqs[sample_id] = gzip.open(str(path), mode='a')
                manifest_fh.write(
                    '%s,%s,%s\n' % (sample_id, path.name, 'forward'))

            # A handle may have been closed since it was last used
            # (presumably by _maintain_open_fh_count -- confirm against
            # the helper); reopen the file in append mode.
            if per_sample_fastqs[sample_id].closed:
                _maintain_open_fh_count(per_sample_fastqs)
                per_sample_fastqs[sample_id] = gzip.open(
                    per_sample_fastqs[sample_id].name, mode='a')

            fastq_lines = '\n'.join(sequence_record) + '\n'
            fastq_lines = fastq_lines.encode('utf-8')
            per_sample_fastqs[sample_id].write(fastq_lines)

        if not per_sample_fastqs:
            raise ValueError('No sequences were mapped to samples. Check that '
                             'your barcodes are in the correct orientation (see '
                             'the rev_comp_barcodes and/or '
                             'rev_comp_mapping_barcodes options). If barcodes are '
                             'NOT Golay format set golay_error_correction '
                             'to False.')
    finally:
        # Release every handle on both the success and the error path;
        # closing an already-closed file is a no-op.
        for fh in per_sample_fastqs.values():
            fh.close()
        manifest_fh.close()

    result.manifest.write_data(manifest, FastqManifestFormat)

    _write_metadata_yaml(result)

    return result, ec_details_fmt
def main(per_sample_sequences, threads, taxa, region):
    """The main communication between the plugin and the ITSxpress program.

    Args:
        per_sample_sequences (SingleLanePerSampleSingleEndFastqDirFmt): The
            SingleLanePerSampleSingleEndFastqDirFmt type of the input.
        threads (int) : The number of threads to use.
        taxa (str): The taxa to be used for the search.
        region (str) : The region to be used for the search.

    Returns:
        (SingleLanePerSampleSingleEndFastqDirFmt): The
        SingleLanePerSampleSingleEndFastqDirFmt type of the output.

    Raises:
        ValueError1: BBTools error or fastq format issue.
        ValueError2: BBmerge error.
        ValueError3: hmmsearch error.

    """
    # Setting a temp folder. gettempdir() always returns a directory,
    # whereas the module attribute tempfile.tempdir stays None until
    # gettempdir() has been called at least once.
    dirt = tempfile.gettempdir()
    # Setting the current dir (NOTE: this changes the working directory of
    # the whole process).
    os.chdir(str(per_sample_sequences.path))
    # Finding the artifact type.
    artifactType = _view_artifact_type()
    # Setting the taxa
    taxa = _taxa_prefix_to_taxa(taxa)
    # Writing the manifest for the output qza
    manifest = FastqManifestFormat()
    manifest_fn = manifest.open()
    manifest_fn.write('sample-id,filename,direction\n')
    sequences, singleEnd = _fastq_id_maker(per_sample_sequences, artifactType)
    # Duplicate entries are processed once; a set has no defined order, so
    # barcode ids are assigned in arbitrary order.
    sequenceList = set(sequences)
    barcode = 0
    # Creating result dir
    results = SingleLanePerSampleSingleEndFastqDirFmt()
    # Running the for loop for each sample
    for sequence in sequenceList:
        # Setting the fastq files and if singleEnd is used.
        fastq = os.path.join(str(per_sample_sequences.path),
                             str(sequence[0]))
        if "SampleData[PairedEndSequencesWithQuality]" in artifactType:
            fastq2 = os.path.join(str(per_sample_sequences.path),
                                  str(sequence[1]))
        else:
            fastq2 = sequence[1]
        sequenceID = sequence[2]
        # Running the main ITSxpress program.
        try:
            itsx._check_fastqs(fastq, fastq2)
            # Parse input types
            paired_end, interleaved = itsx._is_paired(fastq, fastq2,
                                                      singleEnd)
        except Exception as err:
            # Chain the cause so the underlying failure stays visible.
            raise ValueError("There is a problem with the fastq file(s) you selected or\n"
                             "BBtools was not found. check that the BBtools reformat.sh package is executable.") from err
        # Create SeqSample objects and merge if needed.
        try:
            if paired_end and interleaved:
                sobj = itsx.SeqSamplePairedInterleaved(fastq=fastq,
                                                       tempdir=dirt)
                sobj._merge_reads(threads=threads)
            elif paired_end and not interleaved:
                sobj = itsx.SeqSamplePairedNotInterleaved(fastq=fastq,
                                                          fastq2=fastq2,
                                                          tempdir=dirt)
                sobj._merge_reads(threads=threads)
            elif not paired_end and not interleaved:
                sobj = itsx.SeqSampleNotPaired(fastq=fastq, tempdir=dirt)
        except Exception as err:
            raise ValueError("BBmerge was not found. check that the BBmerge reformat.sh package is executible") from err

        # Deduplicate
        sobj._deduplicate(threads=threads)
        try:
            # HMMSearch for ITS regions
            hmmfile = os.path.join(ROOT_DIR, "ITSx_db", "HMMs",
                                   taxa_dict[taxa])
            sobj._search(hmmfile=hmmfile, threads=threads)
        except Exception as err:
            raise ValueError("hmmsearch was not found, make sure HMMER3 is installed and executible") from err

        # Parse HMMsearch output.
        its_pos = itsx.ItsPosition(domtable=sobj.dom_file, region=region)
        # Create deduplication object.
        dedup_obj = itsx.Dedup(uc_file=sobj.uc_file, rep_file=sobj.rep_file,
                               seq_file=sobj.seq_file)
        pathForward = results.sequences.path_maker(sample_id=sequenceID,
                                                   barcode_id=barcode,
                                                   lane_number=1,
                                                   read_number=1)

        manifest_fn.write(
            "{},{},forward\n".format(sequenceID, pathForward.name))
        # Create trimmed sequences.
        dedup_obj.create_trimmed_seqs(str(pathForward), gzipped=True,
                                      itspos=its_pos)
        # Deleting the temp files.
        itsx.shutil.rmtree(sobj.tempdir)
        barcode += 1
    # Writing out the results.
    manifest_fn.close()
    _write_metadata(results)
    results.manifest.write_data(manifest, FastqManifestFormat)
    return results