def from_line(cls, line, line_number=None):
    """Parse a single line of the MAF file header.

    Formatting errors include:
    - the line does not start with the correct symbol (i.e. #)
    - the line is missing a space separator for the key and value
    - the line has an empty key
    - the line has an empty value
    - the sort order record could not be built from the value

    :param line: the header line to parse.
    :param line_number: optional line number attached to any error.
    :return: a ``(record, error)`` tuple; exactly one of the two is
        ``None``: ``(record, None)`` on success and ``(None, error)`` when a
        formatting error is encountered.
    """
    start_symbol = MafHeader.HeaderLineStartSymbol
    if not line.startswith(start_symbol):
        return None, MafValidationError(
            MafValidationErrorType.HEADER_LINE_MISSING_START_SYMBOL,
            "Header line did not start with a '#'",
            line_number=line_number)

    # Drop the start symbol, then split once: the value may contain spaces.
    tokens = line[len(start_symbol):].split(" ", 1)
    if len(tokens) != 2:
        return None, MafValidationError(
            MafValidationErrorType.HEADER_LINE_MISSING_SEPARATOR,
            "Header line did not have a key and value separated by a "
            "space",
            line_number=line_number)

    key, value = tokens
    value = value.rstrip()
    if not key:
        return None, MafValidationError(
            MafValidationErrorType.HEADER_LINE_EMPTY_KEY,
            "Header line had an empty key",
            line_number=line_number)
    if not value:
        return None, MafValidationError(
            MafValidationErrorType.HEADER_LINE_EMPTY_VALUE,
            "Header line had an empty value",
            line_number=line_number)

    if key == MafHeader.VersionKey:
        return MafHeaderVersionRecord(value=value), None
    if key == MafHeader.AnnotationSpecKey:
        return MafHeaderAnnotationSpecRecord(value=value), None
    if key == MafHeader.SortOrderKey:
        try:
            return MafHeaderSortOrderRecord(value=value), None
        except Exception:
            # Any failure to build the record is reported as an unsupported
            # sort order. ``Exception`` (rather than a bare ``except``) lets
            # KeyboardInterrupt and SystemExit propagate.
            return None, MafValidationError(
                MafValidationErrorType.HEADER_UNSUPPORTED_SORT_ORDER,
                "Sort order '%s' was not recognized" % value,
                line_number=line_number)
    if key == MafHeader.ContigKey:
        return MafHeaderContigRecord(value=value), None
    return MafHeaderRecord(key=key, value=value), None
def test_process_validation_errors_strict(self):
    """Strict stringency must raise ``MafFormatException``.

    The raised exception is expected to carry the message and the type of
    the first error in the fixture.
    """
    logger = Logger.get_logger("test_process_validation_errors_strict")
    with self.assertRaises(MafFormatException) as context:
        MafValidationError.process_validation_errors(
            validation_errors=TestMafValidationError.__errors,
            validation_stringency=ValidationStringency.Strict,
            logger=logger,
        )
    self.assertIn("Error with line number", str(context.exception))
    # assertEqual, not assertTrue: assertTrue(a, b) uses ``b`` as the failure
    # message and only checks that ``a`` is truthy, so it compared nothing.
    self.assertEqual(context.exception.tpe,
                     MafValidationErrorType.HEADER_LINE_EMPTY_KEY)
def validate(self, reset_errors=True, scheme=None, line_number=None):
    """
    This method should not be overridden by sub-classes.

    Skips the custom check when the value is one of the nullable values
    returned by ``__nullable_values__``; otherwise calls ``__validate__`` and
    records a wrong-format error if it returns a message. Finally delegates
    to ``validate`` on the super-class without resetting the errors again.

    :param reset_errors: when true, start from an empty error list.
    :param scheme: passed through to the super-class ``validate``.
    :param line_number: attached to the error and passed to the super-class.
    :return: whatever the super-class ``validate`` returns.
    """
    if reset_errors:
        self.validation_errors = []

    allowed_nulls = self.__nullable_values__()
    value_is_null = allowed_nulls is not None and self.value in allowed_nulls
    problem = None if value_is_null else self.__validate__()

    if problem is not None:
        self.validation_errors.append(
            MafValidationError(
                MafValidationErrorType.RECORD_COLUMN_WRONG_FORMAT,
                "%s in column with name '%s'" % (problem, self.key),
                line_number=line_number))

    # The errors were already reset above, so the parent must not reset them.
    parent = super(MafCustomColumnRecord, self)
    return parent.validate(reset_errors=False,
                           scheme=scheme,
                           line_number=line_number)
def __update_scheme__(self, scheme=None, column_names=None):
    """Choose the scheme for this object and store it in ``self.__scheme``.

    Starts from the scheme reported by the header. If ``scheme`` is given it
    replaces the header's scheme; a version mismatch between the two is
    recorded as a validation error first. If ``column_names`` is given and
    the resulting scheme is missing or is a ``NoRestrictionsScheme``, a
    ``NoRestrictionsScheme`` built from those column names is used instead.

    :param scheme: optional scheme that overrides the header's scheme.
    :param column_names: optional column names for the fallback scheme.
    """
    self.__scheme = self.__header.scheme()

    # Set the scheme if given, but check that they match, otherwise,
    # add an error
    if scheme is not None:
        if self.__scheme is not None \
                and scheme.version() != self.__scheme.version():
            self.validation_errors.append(MafValidationError(
                MafValidationErrorType.HEADER_MISMATCH_SCHEME,
                "Version in the header '%s' did not match the expected "
                "version '%s'"
                % (self.__scheme.version(), scheme.version())
            ))
        self.__scheme = scheme

    # If there are column names, and either there is no scheme or the scheme
    # is the "no restrictions anything goes" scheme, then use the "no
    # restrictions" scheme with the given column names.
    if column_names is not None and \
            (self.__scheme is None
             or isinstance(self.__scheme, NoRestrictionsScheme)):
        if self.validation_stringency is not ValidationStringency.Silent:
            # ``warning`` replaces the deprecated ``warn`` alias.
            self.__logger.warning(
                "No matching scheme was found in the header, defaulting "
                "to the least restrictive scheme.")
        self.__scheme = NoRestrictionsScheme(column_names=column_names)
def validate(self, reset_errors=True, scheme=None, line_number=None):
    """
    Check this column against the given scheme, if any.

    At most one error is recorded, in this order of precedence: the column
    name is unknown to the scheme, the column sits at a different index than
    the scheme expects, or the column is not an instance of the class the
    scheme expects.

    :param reset_errors: when true, start from an empty error list.
    :param scheme: the scheme to check against; nothing is checked when it
        is falsy.
    :param line_number: attached to any error that is recorded.
    :return: the list of validation errors found, if any.
    """
    if reset_errors:
        self.validation_errors = []

    if not scheme:
        return self.validation_errors

    expected_index = scheme.column_index(name=self.key)
    expected_class = scheme.column_class(name=self.key)

    failure = None
    if expected_index is None:
        failure = MafValidationError(
            MafValidationErrorType.SCHEME_MISMATCHING_COLUMN_NAMES,
            "No column '%s' present in the scheme '%s'"
            % (self.key, scheme.version()),
            line_number=line_number)
    elif self.column_index is not None \
            and expected_index != self.column_index:
        failure = MafValidationError(
            MafValidationErrorType.RECORD_COLUMN_OUT_OF_ORDER,
            "Column with name '%s' was found in the %dth column"
            ", but expected the %dth column with scheme "
            "'%s''"
            % (self.key, self.column_index, expected_index,
               scheme.version()),
            line_number=line_number)
    elif not isinstance(self, expected_class):
        failure = MafValidationError(
            MafValidationErrorType.RECORD_COLUMN_WRONG_FORMAT,
            "Column with name '%s' is in the wrong format. "
            "Found '%s' expected '%s'"
            % (self.key, str(self.__class__), str(expected_class)),
            line_number=line_number)

    if failure is not None:
        self.validation_errors.append(failure)
    return self.validation_errors
def test_process_validation_errors_lenient(self):
    """Lenient stringency must log one ignore-message per error.

    The logger writes to a temporary file, which is read back and compared
    line by line with ``MafValidationError.ignore_message`` for each fixture
    error. The temporary file is removed afterwards.
    """
    import os  # local import: only needed to remove the temporary log file

    err_stream = tempfile.NamedTemporaryFile(delete=False, mode="w")
    err_file_name = err_stream.name
    try:
        logger = Logger.get_logger(err_file_name, stream=err_stream)
        MafValidationError.process_validation_errors(
            validation_errors=TestMafValidationError.__errors,
            validation_stringency=ValidationStringency.Lenient,
            logger=logger,
        )
        # Close first so everything written is on disk before reading.
        err_stream.close()
        with open(err_file_name, "r") as reader:
            actual_lines = reader.readlines()
    finally:
        # Closing twice is harmless; delete=False means the file has to be
        # removed by hand.
        err_stream.close()
        os.remove(err_file_name)

    expected_lines = [
        MafValidationError.ignore_message(error)
        for error in TestMafValidationError.__errors
    ]
    self.assertEqual(len(actual_lines), len(expected_lines))
    for actual_line, expected_line in zip(actual_lines, expected_lines):
        self.assertIn(expected_line, actual_line)
def from_lines(
    cls,
    lines: List[str],
    validation_stringency: ValidationStringency = None,
    logger: logging.Logger = Logger.RootLogger,
) -> 'MafHeader':
    """
    Build a header from its lines, collecting errors along the way.

    Each line is parsed with ``MafHeaderRecord.from_line``; parse errors and
    duplicate keys are appended to the header's validation errors. If the
    header has contigs and a coordinate-based sort order, the sort order
    record is rebuilt with those contigs. The header is validated last.

    :param lines: a sequence of lines
    :param validation_stringency: optionally the validation stringency to
        use; passed as-is to the header constructor
    :param logger: the logger to which to write errors
    :return: a MafHeader
    """
    header = cls(validation_stringency=validation_stringency)

    # Line numbers reported in errors are 1-based.
    for line_number, line in enumerate(lines, start=1):
        record, error = MafHeaderRecord.from_line(line, line_number)

        if error:
            assert record is None
            header.validation_errors.append(error)
            continue

        assert record is not None
        if record.key not in header:
            header[record.key] = record
            continue

        header.validation_errors.append(
            MafValidationError(
                MafValidationErrorType.HEADER_DUPLICATE_KEYS,
                "Multiple header lines with key '%s' found" % record.key,
                line_number=line_number,
            )
        )

    if (header.contigs()
            and header.sort_order()
            and issubclass(header.sort_order().__class__, Coordinate)):
        sort_order_name = header[MafHeader.SortOrderKey].value.name()
        header[MafHeader.SortOrderKey] = MafHeaderSortOrderRecord(
            value=sort_order_name, contigs=header.contigs()
        )

    # Keep the errors gathered above when validating.
    header.validate(logger=logger, reset_errors=False)
    return header
class TestMafValidationError(unittest.TestCase):
    """Tests for ``MafValidationError`` string output and error processing."""

    # Shared fixture: one error without a line number and one with.
    __errors = (
        MafValidationError(
            tpe=MafValidationErrorType.HEADER_LINE_EMPTY_KEY,
            message="Error with line number",
        ),
        MafValidationError(
            tpe=MafValidationErrorType.HEADER_LINE_MISSING_START_SYMBOL,
            message="Error without line number",
            line_number=42,
        ),
    )

    def test_str(self):
        """``str(error)`` contains the message, prefixed by the line number
        when one was given."""
        actual_strings = [
            str(error) for error in TestMafValidationError.__errors
        ]
        expected_strings = [
            "Error with line number",
            "On line number 42: Error without line number",
        ]
        self.assertEqual(len(actual_strings), len(expected_strings))
        for actual, expect in zip(actual_strings, expected_strings):
            self.assertIn(expect, actual)

    def test_process_validation_errors_strict(self):
        """Strict stringency must raise ``MafFormatException`` carrying the
        message and type of the first fixture error."""
        logger = Logger.get_logger("test_process_validation_errors_strict")
        with self.assertRaises(MafFormatException) as context:
            MafValidationError.process_validation_errors(
                validation_errors=TestMafValidationError.__errors,
                validation_stringency=ValidationStringency.Strict,
                logger=logger,
            )
        self.assertIn("Error with line number", str(context.exception))
        # assertEqual, not assertTrue: assertTrue(a, b) uses ``b`` as the
        # failure message and only checks that ``a`` is truthy.
        self.assertEqual(context.exception.tpe,
                         MafValidationErrorType.HEADER_LINE_EMPTY_KEY)

    def test_process_validation_errors_lenient(self):
        """Lenient stringency must log one ignore-message per error to the
        logger's stream instead of raising."""
        import os  # local import: only needed to remove the temp log file

        err_stream = tempfile.NamedTemporaryFile(delete=False, mode="w")
        err_file_name = err_stream.name
        try:
            logger = Logger.get_logger(err_file_name, stream=err_stream)
            MafValidationError.process_validation_errors(
                validation_errors=TestMafValidationError.__errors,
                validation_stringency=ValidationStringency.Lenient,
                logger=logger,
            )
            # Close first so everything written is on disk before reading.
            err_stream.close()
            with open(err_file_name, "r") as reader:
                actual_lines = reader.readlines()
        finally:
            # Closing twice is harmless; delete=False means the file has to
            # be removed by hand.
            err_stream.close()
            os.remove(err_file_name)

        expected_lines = [
            MafValidationError.ignore_message(error)
            for error in TestMafValidationError.__errors
        ]
        self.assertEqual(len(actual_lines), len(expected_lines))
        for actual_line, expected_line in zip(actual_lines, expected_lines):
            self.assertIn(expected_line, actual_line)
def __init__(self, lines, closeable=None, validation_stringency=None,
             scheme=None):
    """
    Initializes a MAF reader and reads in the header and column
    definitions.

    If no scheme is provided, the scheme will be determined from the
    version and annotation pragmas in the header, and matched against the
    known set of schemes. If the scheme is not recognized, then the column
    names will determine a custom scheme and no assumption is made about
    the values of each column.

    :param lines: the lines (iterable) from the MAF file.
    :param closeable: any closeable object (has a ``close()`` method) that
        will be closed when ``close()`` is called.
    :param validation_stringency: the validation stringency; defaults to
        ``ValidationStringency.Silent`` when ``None``.
    :param scheme: a scheme that should be used to override the scheme in
        the header.
    """
    self.__iter = iter(lines)
    self.__closeable = closeable
    if validation_stringency is None:
        validation_stringency = ValidationStringency.Silent
    self.validation_stringency = validation_stringency
    self.__logger = Logger.get_logger(self.__class__.__name__)
    self.validation_errors = list()
    self.__next_line = None
    self.__line_number = 0

    # Consume lines while they start with the header symbol; the first
    # line that does not is left in ``self.__next_line``.
    header_lines = []
    self.__next_line__()
    while self.__next_line is not None \
            and self.__next_line.startswith(MafHeader.HeaderLineStartSymbol):
        header_lines.append(self.__next_line)
        self.__next_line__()

    self.__header = MafHeader.from_lines(
        lines=header_lines,
        validation_stringency=self.validation_stringency)
    self.validation_errors.extend(self.__header.validation_errors)

    # The line right after the header, if any, holds the column names.
    column_names = None
    if self.__next_line is not None:
        column_names = self.__next_line.split(MafRecord.ColumnSeparator)
        self.__next_line__()

    self.__update_scheme__(scheme=scheme, column_names=column_names)

    if column_names is None:
        self.validation_errors.append(MafValidationError(
            MafValidationErrorType.HEADER_MISSING_COLUMN_NAMES,
            "Found no column names",
            line_number=self.__line_number+1
        ))
    else:
        # Compare the column names found with those of the scheme.
        scheme_column_names = self.__scheme.column_names()
        if len(column_names) != len(scheme_column_names):
            self.validation_errors.append(MafValidationError(
                MafValidationErrorType.SCHEME_MISMATCHING_NUMBER_OF_COLUMN_NAMES,
                "Found '%d' columns but expected '%d'"
                % (len(column_names), len(scheme_column_names)),
                line_number=self.__line_number - 1
            ))
        else:
            pairs = zip(column_names, scheme_column_names)
            for position, (found, expected) in enumerate(pairs, start=1):
                if found == expected:
                    continue
                self.validation_errors.append(MafValidationError(
                    MafValidationErrorType.SCHEME_MISMATCHING_COLUMN_NAMES,
                    "Found column with name '%s' but expected '%s' for "
                    "the '%d'th column" % (found, expected, position),
                    line_number=self.__line_number - 1
                ))

    # process validation errors so far
    MafValidationError.process_validation_errors(
        validation_errors=self.validation_errors,
        validation_stringency=self.validation_stringency,
        name=self.__class__.__name__,
        logger=self.__logger
    )
def validate(
    self,
    validation_stringency: ValidationStringency = None,
    logger: logging.Logger = Logger.RootLogger,
    reset_errors: bool = True,
) -> List[MafValidationError]:
    """Validates the header and returns a list of errors.

    Ensures that:
    - there is a version line in the header
    - the version is supported
    - the annotation specification is not in the header if the scheme is
      basic
    - the annotation specification is in the header if the scheme is not
      basic (or there is no scheme)
    - the annotation specification, when present in that case, is supported

    :param validation_stringency: stringency used to process the errors;
        falls back to ``self.validation_stringency`` when falsy.
    :param logger: the logger handed to the error processing step.
    :param reset_errors: when true, start from an empty error list.
    :return: the list of validation errors.
    """
    if reset_errors:
        self.validation_errors = []

    scheme = self.scheme()

    if not validation_stringency:
        validation_stringency = self.validation_stringency

    # Version: must be present and supported.
    if MafHeader.VersionKey in self:
        version = self[MafHeader.VersionKey].value
        if version not in MafHeader.SupportedVersions:
            self.validation_errors.append(
                MafValidationError(
                    MafValidationErrorType.HEADER_UNSUPPORTED_VERSION,
                    "The version '%s' is not supported" % version,
                )
            )
    else:
        self.validation_errors.append(
            MafValidationError(
                MafValidationErrorType.HEADER_MISSING_VERSION,
                "No version line found in the header",
            )
        )

    # Annotation spec:
    # 1. basic schemes should not have one in the header
    # 2. otherwise it should be present (in the header) and have a known
    #    value
    if scheme is not None and scheme.is_basic():
        if MafHeader.AnnotationSpecKey in self:
            self.validation_errors.append(
                MafValidationError(
                    MafValidationErrorType.HEADER_UNSUPPORTED_ANNOTATION_SPEC,
                    "Unexpected annotation.spec line found in the header",
                )
            )
    elif MafHeader.AnnotationSpecKey not in self:
        self.validation_errors.append(
            MafValidationError(
                MafValidationErrorType.HEADER_MISSING_ANNOTATION_SPEC,
                "No annotation.spec line found in the header",
            )
        )
    else:
        annotation = self[MafHeader.AnnotationSpecKey].value
        if annotation not in MafHeader.SupportedAnnotationSpecs:
            self.validation_errors.append(
                MafValidationError(
                    MafValidationErrorType.HEADER_UNSUPPORTED_ANNOTATION_SPEC,
                    "The annotation.spec '%s' is not supported" % annotation,
                )
            )

    # process validation errors
    MafValidationError.process_validation_errors(
        validation_errors=self.validation_errors,
        validation_stringency=validation_stringency,
        logger=logger,
    )
    return self.validation_errors
def from_line(
    cls,
    line: str,
    column_names: Optional[List[str]] = None,
    scheme: Optional['MafScheme'] = None,
    line_number: Optional[int] = None,
    validation_stringency: ValidationStringency = ValidationStringency.Strict,
    logger: logging.Logger = Logger.RootLogger,
) -> 'MafRecord':
    """
    Parses a record from a single tab-delimited line.

    Columns that fail to build, or that report validation errors, are left
    out of the record; their errors are added to the record's errors.

    :param line: the line to parse.
    :param column_names: the expected names of the columns, in order,
        otherwise will use the scheme.
    :param scheme: an optional MafScheme
    :param line_number: the optional line number.
    :param validation_stringency: the optional validation stringency for
        the record
    :param logger: the logger to which to write errors
    :return: the parsed record.
    :raises ValueError: if neither ``column_names`` nor ``scheme`` is given.
    """
    record = cls(line_number=line_number,
                 validation_stringency=validation_stringency)

    if column_names is None:
        if scheme is None:
            raise ValueError("Either column_names or scheme must be given")
        column_names = scheme.column_names()

    values = line.rstrip("\r\n").split(cls.ColumnSeparator)

    if len(column_names) != len(values):
        record.validation_errors.append(
            MafValidationError(
                MafValidationErrorType.RECORD_MISMATCH_NUMBER_OF_COLUMNS,
                f"Found '{len(values)}' columns but expected '{len(column_names)}'",
                line_number=line_number,
            ))
    else:
        for column_index, (column_name, column_value) in enumerate(
                zip(column_names, values)):
            column = None
            if scheme:
                column_class = scheme.column_class(name=column_name)
            else:
                column_class = None

            # A validation error will be found later if we don't find the
            # column name
            if column_class is None:
                column = MafColumnRecord(key=column_name,
                                         value=column_value,
                                         column_index=column_index)
            else:
                try:
                    column_class = scheme.column_class(
                        name=column_name)  # type: ignore
                    column = column_class.build(  # type: ignore
                        name=column_name,
                        value=column_value,
                        column_index=column_index,
                    )
                except Exception as error:
                    record.validation_errors.append(
                        MafValidationError(
                            MafValidationErrorType.RECORD_INVALID_COLUMN_VALUE,
                            f"Could not build column '{column_index+1}' with name '{column_name}' scheme '{scheme.version()}': {error}",  # type: ignore
                            line_number=line_number,
                        ))

            if column is None:
                continue

            column_errors = column.validate(scheme=scheme,
                                            line_number=line_number)
            record.validation_errors.extend(column_errors)  # type: ignore
            # Only columns without errors become part of the record.
            if len(column_errors) == 0:
                record[column_name] = column

    # process validation errors
    record.validate(logger=logger, reset_errors=False)
    return record
def validate(
    self,
    validation_stringency: Optional[ValidationStringency] = None,
    logger: logging.Logger = Logger.RootLogger,
    reset_errors: bool = True,
    scheme: Optional['MafScheme'] = None,
) -> List[MafValidationError]:
    """
    Collects a list of validation errors.

    Checks the number of columns against the scheme (when given), reports
    falsy entries in the column list, gathers the errors of every other
    column, and, when no falsy entry was found, asserts that the internal
    list and dictionary of columns agree with each other.

    :param validation_stringency: stringency used to process the errors;
        falls back to ``self.validation_stringency`` when falsy.
    :param logger: the logger handed to the error processing step.
    :param reset_errors: when true, start from an empty error list; also
        forwarded to each column's ``validate``.
    :param scheme: optional scheme to validate against.
    :return: the list of validation errors, if any.
    """
    if reset_errors:
        self.validation_errors = []

    if not validation_stringency:
        validation_stringency = self.validation_stringency

    # Validate the # of columns against the given scheme
    if scheme and len(scheme) != len(self):
        self.validation_errors.append(
            MafValidationError(
                MafValidationErrorType.RECORD_MISMATCH_NUMBER_OF_COLUMNS,
                f"Found '{len(self)}' columns but expected '{len(scheme)}'",
            ))

    missing_column_seen = False
    for position, column in enumerate(self.__columns_list, start=1):
        if column:
            # add any validation errors from the column itself.
            self.validation_errors.extend(
                column.validate(reset_errors=reset_errors,
                                scheme=scheme)  # type: ignore
            )
            continue
        # NB: it is unclear how useful this error is when the column could
        # not be built successfully in the first place.
        missing_column_seen = True
        self.validation_errors.append(
            MafValidationError(
                MafValidationErrorType.RECORD_COLUMN_WITH_NO_VALUE,
                f"Column '{position}' had no value",
                line_number=self.__line_number,
            ))

    # Internal self-consistency checks only make sense when every column
    # is present.
    if not missing_column_seen:
        # the dictionary must not hold None values
        for name in self.__columns_dict:
            assert self.__columns_dict[name] is not None
        # same # of columns in the list as in the dict
        assert len(self.__columns_dict) == len(self.__columns_list)
        # same columns in the list as in the dict
        assert (sorted(self.__columns_dict.values(),
                       key=lambda r: r.column_index)  # type: ignore
                == self.__columns_list)
        # every column's column_index matches its position in the list
        for (column_index, column) in enumerate(self.__columns_list):
            assert column_index == column.column_index  # type: ignore

    # TODO: validate cross-column constraints (ex. Mutation_Status)
    # TODO: validate that chromosome/start/end are defined

    # process validation errors
    MafValidationError.process_validation_errors(
        validation_errors=self.validation_errors,
        validation_stringency=validation_stringency,
        logger=logger,
    )
    return self.validation_errors
def from_line(cls, line, column_names=None, scheme=None, line_number=None,
              validation_stringency=None, logger=Logger.RootLogger):
    """
    Parses a record from a single tab-delimited line.

    Columns that fail to build, or that report validation errors, are left
    out of the record; their errors are added to the record's errors.

    :param line: the line to parse.
    :param column_names: the expected names of the columns, in order,
        otherwise will use the scheme.
    :param scheme: an optional MafScheme
    :param line_number: the optional line number.
    :param validation_stringency: the optional validation stringency for
        the record
    :param logger: the logger to which to write errors
    :return: the parsed record.
    :raises ValueError: if neither ``column_names`` nor ``scheme`` is given.
    """
    record = MafRecord(line_number=line_number,
                       validation_stringency=validation_stringency)

    if column_names is None:
        if scheme is None:
            raise ValueError("Either column_names or scheme must be given")
        column_names = scheme.column_names()

    values = line.rstrip("\r\n").split(MafRecord.ColumnSeparator)

    # A wrong number of columns means nothing can be paired up: report it
    # and validate what we have.
    if len(column_names) != len(values):
        record.validation_errors.append(
            MafValidationError(
                MafValidationErrorType.RECORD_MISMATCH_NUMBER_OF_COLUMNS,
                "Found '%d' columns but expected '%d'"
                % (len(values), len(column_names)),
                line_number=line_number))
        record.validate(logger=logger, reset_errors=False)
        return record

    for column_index, (column_name, column_value) in \
            enumerate(zip(column_names, values)):
        column = None
        if scheme:
            column_class = scheme.column_class(name=column_name)
        else:
            column_class = None

        # A validation error will be found later if we don't find the
        # column name
        if column_class is None:
            column = MafColumnRecord(key=column_name,
                                     value=column_value,
                                     column_index=column_index)
        else:
            try:
                column_class = scheme.column_class(name=column_name)
                column = column_class.build(name=column_name,
                                            value=column_value,
                                            column_index=column_index)
            except Exception as error:
                record.validation_errors.append(
                    MafValidationError(
                        MafValidationErrorType.RECORD_INVALID_COLUMN_VALUE,
                        "Could not build column '%d' with name '%s' "
                        "with the scheme '%s': %s"
                        % (column_index + 1, column_name,
                           scheme.version(), str(error)),
                        line_number=line_number,
                    ))

        if column is None:
            continue

        column_errors = column.validate(scheme=scheme,
                                        line_number=line_number)
        record.validation_errors.extend(column_errors)
        # Only columns without errors become part of the record.
        if len(column_errors) == 0:
            record[column_name] = column

    # process validation errors
    record.validate(logger=logger, reset_errors=False)
    return record