def test_2(self):
    """This file has many errors"""
    validators = [ValidateTxtCatRecord(), ValidateTxtDogRecord(),
                  ValidateTxtFile()]
    file_path = "file.doc"
    contents = ["cat dog ", "cat dog bird", "cat dog tree"]
    # records with errors
    contents.extend(["fish"] * 5)
    _write_example_file(contents, file_path)
    errors, metrics = run_validators(
        _to_max_errors(2), file_path, TextFileReader, validators)
    self.assertEqual(len(errors), 2)
    errors, metrics = run_validators(
        _to_max_records(1), file_path, TextFileReader, validators)
    self.assertEqual(len(errors), 1)
    # remove only after the second pass, which re-reads the file
    os.remove(file_path)

def test_1(self):
    """This is a valid file format"""
    validators = [ValidateTxtCatRecord(), ValidateTxtDogRecord(),
                  ValidateTxtFile()]
    file_path = "file.txt"
    contents = ["cat dog ", "cat dog bird", "cat dog tree"]
    _write_example_file(contents, file_path)
    errors, metrics = run_validators(
        ValidatorContextFirstError, file_path, TextFileReader, validators)
    os.remove(file_path)
    self.assertEqual(len(errors), 0)

def test_3(self):
    """Test for consistent behavior when a validator is broken"""
    validators = [ValidateTxtCatRecord(), ValidateTxtDogRecord(),
                  ValidateTxtFile(), ValidateBad()]
    file_path = "file.txt"
    contents = ["cat dog ", "cat dog bird", "cat dog tree"]
    contents.extend(["fish"] * 1)
    _write_example_file(contents, file_path)
    errors, metrics = run_validators(
        ValidatorErrorContext, file_path, TextFileReader, validators)
    # clean up before asserting so the file is removed even on failure
    os.remove(file_path)
    self.assertEqual(len(errors), 6)

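# A minimal sketch of the helpers the tests above rely on
# (_write_example_file, _to_max_errors, _to_max_records).  The real
# definitions live elsewhere in this test module; the bodies below are
# assumptions reconstructed from the call sites, mirroring the
# get_context_class() call used by validate_bam() below.

def _write_example_file(contents, file_path):
    # Assumed: one record per line, as consumed by TextFileReader.
    with open(file_path, "w") as f:
        f.write("\n".join(contents) + "\n")


def _to_max_errors(n):
    # Assumed: a validator context class that stops after n errors.
    return get_context_class(quick=False, max_errors=n, max_records=None)


def _to_max_records(n):
    # Assumed: a validator context class that stops after reading n records.
    return get_context_class(quick=False, max_errors=None, max_records=n)
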
def validate_bam(file_name, reference=None, aligned=None, contents=None,
                 quick=False, max_errors=None, max_records=None,
                 validate_index=False):
    """
    Main API entry point for running BAM validation.

    Example:

    .. doctest::

        >>> from pbcoretools.pbvalidate.bam import validate_bam
        >>> from pbcore import data
        >>> bam_file = data.getBamAndCmpH5()[0]
        >>> errors, metrics = validate_bam(file_name=bam_file)
        >>> len(errors)
        231
        >>> print(errors[0])
        Mismatch between specified and expected read group ID: a9a22406c5 in file, but computed as b89a4406
        >>> unmapped_file = data.getUnalignedBam()
        >>> errors, metrics = validate_bam(file_name=unmapped_file)
        >>> len(errors)
        118
        >>> print(errors[0])
        This file has not been sorted by QNAME, or the header has not been updated.
        >>> errors, metrics = validate_bam(file_name=unmapped_file,
        ...                                aligned=True, contents="CCS")
        >>> len(errors)
        120
    """
    validators = get_validators(aligned=aligned, contents=contents,
                                validate_index=validate_index)
    e, m = run_validators(
        context_class=get_context_class(quick=quick, max_errors=max_errors,
                                        max_records=max_records),
        path=file_name,
        reader_class=_get_reader(file_name, reference),
        validators=validators,
        additional_validation_function=validate_read_groups)
    return e, m

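# Usage sketch (not part of the API): shows how the max_errors/max_records
# caps exercised in the tests above map onto this entry point.  The
# keyword arguments come from validate_bam's own signature;
# "movie.subreads.bam" is a hypothetical input path.

def _example_validate_bam():
    # Stop scanning after the first 10 errors; useful for quick triage of
    # very large BAM files (the context class enforces the cap).
    errors, metrics = validate_bam(file_name="movie.subreads.bam",
                                   quick=True, max_errors=10)
    for err in errors:
        print(err)
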
def validate_dataset(
        file_name,
        dataset_type=None,
        reference=None,
        quick=False,
        max_errors=None,
        max_records=None,
        contents=None,
        aligned=None,
        validate_index=False,
        strict=False,
        permissive_headers=False):
    assert os.path.isfile(os.path.realpath(file_name))
    ds = None
    ReaderClass = getattr(pbcore.io, str(dataset_type), pbcore.io.openDataSet)
    log.debug("ReaderClass: %s" % ReaderClass.__name__)
    try:
        ds = ReaderClass(file_name, strict=True)
    except Exception as e:
        # In strict mode the reader raises an IOError if the requested
        # dataset type does not agree with the XML; it can also raise other
        # errors if something is wrong with the underlying files and it
        # tries to read them immediately.  Either way there is no point in
        # doing any additional validation, so report this as a read error
        # (although it may indicate bugs).
        _, _, ex_traceback = sys.exc_info()
        tb_lines = traceback.format_exception(e.__class__, e, ex_traceback)
        log.debug("\n".join(tb_lines))
        errors = [ReaderError.from_args(file_name, str(e))]
        return errors, {}
    log.debug("Dataset type: %s" % ds.__class__.__name__)
    actual_dataset_type = _dataset_type(ds)
    log.debug("Actual type: %s" % actual_dataset_type)
    if contents is None:
        if isinstance(ds, pbcore.io.SubreadSet):
            contents = "SUBREAD"
        elif isinstance(ds, pbcore.io.ConsensusReadSet):
            contents = "CCS"
    validators = [
        ValidateEncoding(),
        ValidateRootTag(),
        ValidateResources(),
        ValidateDatasetType(dataset_type),
        ValidateMetadata(),
        ValidateNamespace(),
        ValidateRandomAccess(),
    ]
    if actual_dataset_type not in DatasetTypes.HDF5_DATASET:
        validators.extend([
            ValidateResourcesOpen(),
            ValidateNumRecords(),
        ])
        if validate_index:
            validators.append(ValidateIndex())
    if strict:
        validators.extend([
            ValidateXML(),
            ValidateFileName(file_name),
        ])
    additional_validation_function = None
    opened_class_name = ds.__class__.__name__
    # XXX not sure this is ideal - what if it opens as a ReferenceSet but we
    # asked for an AlignmentSet?  This is caught by ValidateDatasetType, but
    # we'd still check for Fasta file errors.
    if opened_class_name in DatasetTypes.FASTA_DATASET:
        validators_ = fasta.get_validators(validate_raw_format=False)
        validators_.insert(0, ValidateFastaRaw())
        validators.extend(validators_)
    elif opened_class_name in DatasetTypes.BAM_DATASET:
        validators_ = bam.get_validators(aligned=aligned,
                                         contents=contents,
                                         include_file_validators=False,
                                         permissive_headers=permissive_headers)
        validators_.insert(0, ValidateSorting())
        validators_.insert(0, ValidateContents(aligned=aligned,
                                               content_type=contents))
        validators.extend(validators_)
        additional_validation_function = _validate_read_groups

    def ReaderClass_wrapper(*args, **kwds):
        # Suppress reader log noise while the validators open the file.
        logging.disable(logging.CRITICAL)
        try:
            return DatasetReader(ReaderClass, *args, **kwds)
        finally:
            logging.disable(logging.NOTSET)
    context_class = get_context_class(
        quick=quick,
        max_errors=max_errors,
        max_records=max_records)
    errors, metrics = run_validators(
        context_class=context_class,
        path=file_name,
        reader_class=ReaderClass_wrapper,
        validators=validators,
        additional_validation_function=additional_validation_function)
    return errors, metrics

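# Usage sketch (not part of the API): validating a dataset XML with index
# checking and strict mode enabled.  "mapped.alignmentset.xml" is a
# hypothetical input path, and the keyword arguments come from
# validate_dataset's own signature.

def _example_validate_dataset():
    errors, metrics = validate_dataset("mapped.alignmentset.xml",
                                       dataset_type="AlignmentSet",
                                       validate_index=True,
                                       strict=True)
    print("%d errors" % len(errors))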