예제 #1
0
    def from_string(cls, string, valid=False):
        """
        Create a numeric array from a string.

        The string is split on commas: the first element is the subtype
        code, the remaining elements are the values.

        Parameters
        ----------
        string : str
        valid : optional bool
          *(default:* **False** *)*
          If **False**, check that the string is not empty, does not end
          with a comma, and that integer values lie in the range of the
          array subtype. If **True** the string is guaranteed to be valid
          and these checks are skipped.

        Raises
        ------
        gfapy.FormatError
          If the string is empty or ends with a comma (only if not valid).
        gfapy.TypeError
          If the subtype code is invalid.
        gfapy.ValueError
          If an integer value cannot be parsed, or (only if not valid)
          lies outside of the subtype range.
        ValueError
          Builtin error propagated from ``float()`` if a value of a float
          array cannot be parsed.

        Returns
        -------
        gfapy.NumericArray
          The numeric array
        """
        if not valid:
            if len(string) == 0:
                raise gfapy.FormatError(
                    "Numeric array string shall not be empty")
            if string[-1] == ",":
                raise gfapy.FormatError(
                    "Numeric array string ends with comma\n" +
                    "String: {}".format(string))
        elems = string.split(",")
        subtype = elems[0]
        if subtype not in NumericArray.SUBTYPE:
            raise gfapy.TypeError("Subtype {} unknown".format(subtype))
        is_float = subtype == "f"
        # Float arrays have no range table; the limits are only looked up
        # (and used) for the integer subtypes.
        limits = None if is_float else NumericArray.SUBTYPE_RANGE[subtype]

        def gen():
            for e in elems[1:]:
                if is_float:
                    yield float(e)
                    continue
                try:
                    value = int(e)
                except ValueError as err:
                    # Only conversion failures are translated; anything
                    # else (e.g. KeyboardInterrupt) must propagate.
                    raise gfapy.ValueError(
                        "Value is not valid: {}\n".format(e) +
                        "Numeric array string: {}".format(string)) from err
                # The upper limit is exclusive.
                if not valid and not (limits[0] <= value < limits[1]):
                    raise gfapy.ValueError(
                        ("NumericArray: " +
                         "value is outside of subtype {0} range\n" +
                         "Value: {1}\n" + "Range: {2}\n" +
                         "Content: {3}").format(subtype, value, repr(limits),
                                                repr(elems)))
                yield value

        return cls(list(gen()))
예제 #2
0
    def _from_string(cls, string, valid=False, version="gfa1"):
        """Create a CIGAR instance from its string representation.

        Parameters:
          string (str)
          valid (bool): If **True** the string is guaranteed to be valid
            and no format check is done. (Defaults to **False**)
          version (str): 'gfa1' or 'gfa2'; selects the set of operation
            codes accepted by the format check. For any other value no
            format check is performed.

        Returns:
          ~gfapy.alignment.cigar.CIGAR or
          ~gfapy.alignment.placeholder.AlignmentPlaceholder

        Raises:
          ~gfapy.error.FormatError: If the string is not a valid CIGAR
            string for the given version (only if not valid).
        """
        if string == "*":
            return gfapy.AlignmentPlaceholder()
        cigar = CIGAR()
        if not valid:
            if version == "gfa1":
                if not re.match(r"^([0-9]+[MIDNSHPX=])+$", string):
                    raise gfapy.FormatError(
                        "{} is not a valid GFA1 CIGAR string\n".format(
                            repr(string)) +
                        "(it does not match ([0-9]+[MIDNSHPX=])+)")
            elif version == "gfa2":
                if not re.match(r"^([0-9]+[MIDP])+$", string):
                    raise gfapy.FormatError(
                        "{} is not a valid GFA2 CIGAR string\n".format(
                            repr(string)) +
                        "(it does not match ([0-9]+[MIDP])+)")
        # Each <length><code> pair becomes one operation.
        for m in re.finditer("([0-9]+)([MIDNSHPX=])", string):
            cigar.append(CIGAR.Operation(int(m.group(1)), m.group(2)))
        return cigar
예제 #3
0
  def _from_string(cls, string, version = "gfa2", valid = False):
    """
    Parses an alignment field

    The kind of alignment is recognized from the first character which
    follows the leading run of digits: a comma denotes a trace, a CIGAR
    operation code denotes a CIGAR. A lone ``*`` is a placeholder.

    Parameters
    ----------
    string : str
      The string to parse.
    version : str
      GFA version (gfa1 or gfa2)
      If *gfa1*, then CIGARs and Placeholders are supported.
      If *gfa2*, also Traces are supported.
      Defaults to *gfa2*.
    valid : bool
      If *True*, the string is guaranteed to be valid, and
      further checks are skipped.
      Defaults to *False*.

    Returns
    -------
    gfapy.CIGAR or gfapy.Trace or gfapy.AlignmentPlaceholder

    Raises
    ------
    gfapy.FormatError
      If the content of the field cannot be parsed.
    gfapy.VersionError
      If a wrong value is provided for the version parameter.
    """
    if version not in ("gfa1", "gfa2"):
      raise gfapy.VersionError(
          "Version error: {}".format(repr(version)))
    if len(string) == 1 and string[0] == "*":
      return gfapy.AlignmentPlaceholder()
    # Length of the leading run of digits.
    n_digits = 0
    for char in string:
      if not char.isdigit():
        break
      n_digits += 1
    # At least one digit is required, and something must follow the digits.
    if 0 < n_digits < len(string):
      marker = string[n_digits]
      if marker == ",":
        if version != "gfa2":
          raise gfapy.FormatError(
                "Trace alignments are not allowed in GFA1: {}"
                .format(repr(string)))
        trace = gfapy.Trace._from_string(string)
        if not valid:
          trace.validate()
        return trace
      is_common_op = marker in ("M", "I", "D", "P")
      is_gfa1_only_op = marker in ("=", "X", "S", "H", "N")
      if is_common_op or (is_gfa1_only_op and version == "gfa1"):
        return gfapy.CIGAR._from_string(string, valid=valid, version=version)
    raise gfapy.FormatError("Alignment field contains invalid data {}"
                            .format(repr(string)))
예제 #4
0
def validate_encoded(string):
    """Check that ``string`` is a valid GFA1 segment name.

    The name must consist of printable non-space ASCII characters, must
    not start with ``*`` or ``=``, and must not contain ``+,`` or ``-,``.

    Raises
    ------
    gfapy.FormatError
      If the string is not a valid GFA1 segment name.
    """
    # fullmatch is used instead of match + "$", because "$" also matches
    # just before a trailing newline, which would let "name\n" through.
    if not re.fullmatch(r"[!-)+-<>-~][!-~]*", string):
        raise gfapy.FormatError(
            "{} is not a valid GFA1 segment name\n".format(repr(string)) +
            "(it does not match the regular expression [!-)+-<>-~][!-~]*")
    elif re.search(r"[+-],", string):
        raise gfapy.FormatError(
            "{} is not a valid GFA1 segment name\n".format(repr(string)) +
            "(it contains + or - followed by ,)")
예제 #5
0
def validate_encoded(string):
    """Check that ``string`` is a valid custom record type.

    The string must be a non-empty sequence of printable non-space ASCII
    characters and must differ from the predefined GFA2 record types.

    Raises
    ------
    gfapy.FormatError
      If the string is not a valid custom record type.
    """
    # fullmatch is used instead of match + "$", because "$" also matches
    # just before a trailing newline, which would let "X\n" through.
    if not re.fullmatch(r"[!-~]+", string):
        raise gfapy.FormatError(
            "{} is not a valid custom record type\n".format(repr(string)) +
            "(it contains spaces and/or non-printable characters)")
    elif string in ["E", "G", "F", "O", "U", "H", "#", "S"]:
        raise gfapy.FormatError(
            "{} is not a valid custom record type\n".format(repr(string)) +
            "(it is a predefined GFA2 record type)")
예제 #6
0
def validate_decoded(obj):
  """Check that ``obj`` is an oriented line with a valid name and orientation.

  Raises
  ------
  gfapy.TypeError
    If ``obj`` is not a gfapy.OrientedLine.
  gfapy.FormatError
    If the name contains characters outside of the printable non-space
    ASCII range, or if the orientation is neither ``+`` nor ``-``.
  """
  if not isinstance(obj, gfapy.OrientedLine):
    raise gfapy.TypeError(
      "the class {} is incompatible with the datatype\n"
      .format(obj.__class__.__name__)+
      "(accepted classes: gfapy.OrientedLine)")
  if re.match("^[!-~]+$", obj.name) is None:
    raise gfapy.FormatError(
        "{} is not a valid oriented GFA2 identifier\n".format(repr(obj.name)))
  if obj.orient not in ("+", "-"):
    raise gfapy.FormatError(
        "{} is not a valid orientation\n".format(repr(obj.orient)))
def validate_decoded(iterable):
  """Check that every element of ``iterable`` is a valid oriented line.

  Each element must be a gfapy.OrientedLine; its own ``validate`` method
  is called, then its name and orientation are checked.

  Raises
  ------
  gfapy.TypeError
    If an element is not a gfapy.OrientedLine.
  gfapy.FormatError
    If a name contains spaces or non-printable characters, or if an
    orientation is neither ``+`` nor ``-``.
  """
  identifier = re.compile(r"^[!-~]+$")
  for item in iterable:
    if not isinstance(item, gfapy.OrientedLine):
      raise gfapy.TypeError(
            "the list contains an object of class {}\n".format(type(item))+
            "(accepted classes: gfapy.OrientedLine)")
    item.validate()
    if identifier.match(item.name) is None:
      raise gfapy.FormatError(
        "the list contains an invalid GFA2 identifier {}\n".format(item.name)+
        "(it contains spaces and/or non-printable characters)")
    if item.orient not in ["+", "-"]:
      raise gfapy.FormatError(
        "{} is not a valid orientation".format(item.orient))
예제 #8
0
파일: parser.py 프로젝트: ujjwalsh/gfapy
    def _parse_gfa_tag(tag):
        """
    Parses a GFA tag in the form **xx:d:content** into its components.
    The **content** is not decoded (see :func:`_parse_gfa_field`).

    Parameters
    ----------
    tag : str
      the GFA tag to parse

    Raises
    ------
    gfapy.FormatError
      if the string does not represent a valid GFA tag

    Returns
    -------
    list of (str, gfapy.Field.FIELD_DATATYPE)
      the parsed content of the field
    """
        match = re.match(r"^([A-Za-z][A-Za-z0-9]):([AifZJHB]):(.+)$", tag)
        if match:
            return [match.group(1), match.group(2), match.group(3)]
        else:
            raise gfapy.FormatError("Expected GFA tag, found: {}".format(
                repr(tag)))
예제 #9
0
  def _substring_type(self, begpos, endpos):
    """Classify the substring delimited by a begin and an end position.

    Returns a pair ``(kind, flag)`` where ``kind`` is one of ``"pfx"``,
    ``"sfx"``, ``"whole"`` or ``"internal"``:

    - begin and end are both first positions: ``("pfx", True)``
    - begin is first, end is last: ``("whole", False)``
    - begin is first, end is anything else: ``("pfx", False)``
    - begin and end are both last positions: ``("sfx", True)``
    - only end is a last position: ``("sfx", False)``
    - otherwise: ``("internal", flag)``, with ``flag`` true if the two
      position values are equal

    Raises
    ------
    gfapy.ValueError
      If the value of begin is larger than the value of end.
    gfapy.FormatError
      If begin is a last position, but end is not.
    """
    if gfapy.posvalue(begpos) > gfapy.posvalue(endpos):
      raise gfapy.ValueError(
        "Line: {}\n".format(str(self))+
        "begin > end: {}$ > {}".format(gfapy.posvalue(begpos),
                                       gfapy.posvalue(endpos)))

    # Substring starting at the first position.
    if gfapy.isfirstpos(begpos):
      if gfapy.isfirstpos(endpos):
        return ("pfx", True)
      if gfapy.islastpos(endpos):
        return ("whole", False)
      return ("pfx", False)

    # Substring starting at the last position: it must also end there.
    if gfapy.islastpos(begpos):
      if gfapy.islastpos(endpos):
        return ("sfx", True)
      raise gfapy.FormatError(
        "Line: {}\n".format(str(self))+
        "Wrong use of $ marker\n"+
        "{} >= {}$".format(gfapy.posvalue(endpos),
                           gfapy.posvalue(begpos)))

    # Substring starting somewhere in between.
    if gfapy.islastpos(endpos):
      return ("sfx", False)
    same_value = gfapy.posvalue(begpos) == gfapy.posvalue(endpos)
    return ("internal", same_value)
예제 #10
0
 def _from_string(cls, string):
     """Create a Trace from a comma-separated list of integers.

     Raises
     ------
     gfapy.FormatError
       If the string cannot be split, an element is not an integer, or
       the Trace cannot be constructed from the resulting list.
     """
     try:
         return Trace([int(v) for v in string.split(",")])
     except Exception as err:
         # "except Exception" instead of a bare except, so that
         # KeyboardInterrupt and SystemExit are not turned into a
         # FormatError; the original error is kept as the cause.
         raise gfapy.FormatError(
             "string does not encode" +
             " a valid trace alignment: {}".format(string)) from err
예제 #11
0
    def set_datatype(self, fieldname, datatype):
        """
        Set the datatype of a tag.

        If an existing tag datatype is changed, its content may become
        invalid (call **validate_field** if necessary).

        Parameters
        ----------
        fieldname : str
          The field name (it is not required that the field exists already)
        datatype : gfapy.Field.FIELD_DATATYPE
          The datatype.

        Raises
        ------
        gfapy.RuntimeError
          If **fieldname** is a predefined tag and **datatype** differs
          from its datatype.
        gfapy.FormatError
          If **fieldname** is not a valid custom tag name and the
          validation level is larger than zero.
        gfapy.ArgumentError
          If **datatype** is not a valid datatype for tags.
        """
        if self._is_predefined_tag(fieldname):
            current = self.get_datatype(fieldname)
            if current != datatype:
                raise gfapy.RuntimeError(
                    "Cannot set the datatype of {} to {}\n".format(
                        fieldname, datatype) +
                    "The datatype of a predefined tag cannot be changed")
        else:
            name_is_valid = self._is_valid_custom_tagname(fieldname)
            if not name_is_valid and self.vlevel > 0:
                raise gfapy.FormatError(
                    "{} is not a valid custom tag name".format(fieldname))
        if datatype not in gfapy.Field.TAG_DATATYPE:
            raise gfapy.ArgumentError("Unknown datatype: {}".format(datatype))
        self._datatype[fieldname] = datatype
예제 #12
0
def validate_encoded(string):
    """Check that ``string`` encodes an optional integer.

    Accepted are the placeholder ``*`` and decimal integers with an
    optional leading sign.

    Raises
    ------
    gfapy.FormatError
      If the string is neither ``*`` nor an integer.
    """
    # fullmatch is used instead of match + "$", because "$" also matches
    # just before a trailing newline, which would let "12\n" through.
    if not re.fullmatch(r"\*|[-+]?[0-9]+", string):
        raise gfapy.FormatError(
            "{} does not represent a valid optional integer value\n".format(
                repr(string)) +
            "(it is not * and does not match the regular expression [-+]?[0-9]+)"
        )
예제 #13
0
 def _validate_tagnames_and_types(self):
     for n in self.tagnames:
         if self._is_predefined_tag(n):
             self._validate_predefined_tag_type(n, self._field_datatype(n))
         elif not self._is_valid_custom_tagname(n):
             raise gfapy.FormatError("Custom tags must be lower case\n" +
                                     "Found: {}".format(n))
예제 #14
0
def validate_decoded(iterable):
  """Check that each element of ``iterable`` is a valid GFA1 oriented segment.

  Each element is converted to a gfapy.OrientedLine, whose ``validate``
  method is called; then the name is checked against the pattern of
  GFA1 segment names.

  Raises
  ------
  gfapy.FormatError
    If a name is not a valid GFA1 segment name.
  """
  segment_name = re.compile(r"^[!-)+-<>-~][!-~]*$")
  for item in iterable:
    oriented = gfapy.OrientedLine(item)
    oriented.validate()
    if segment_name.match(oriented.name) is None:
      raise gfapy.FormatError(
        "{} is not a valid GFA1 segment name\n".format(oriented.name)+
        "(it does not match [!-)+-<>-~][!-~]*)")
예제 #15
0
 def _validate_tagnames_and_types(self):
     for n in self.tagnames:
         if self._is_predefined_tag(n):
             self._validate_predefined_tag_type(n, self._field_datatype(n))
         elif not self._is_valid_custom_tagname(n):
             raise gfapy.FormatError(
                 "Custom tag names must consist in a letter " +
                 "and a digit or two letters\nFound: {}".format(n))
예제 #16
0
def validate_encoded(string):
    """Check that ``string`` is a comma-separated list of ``*`` or CIGARs.

    Raises
    ------
    gfapy.FormatError
      If an element of the list is neither ``*`` nor a CIGAR string.
    """
    # fullmatch is used instead of match + "$", because "$" also matches
    # just before a trailing newline, which would let "10M\n" through.
    if not re.fullmatch(
            r"(\*|(([0-9]+[MIDNSHPX=])+))(,(\*|(([0-9]+[MIDNSHPX=])+)))*",
            string):
        raise gfapy.FormatError(
            "{} is not a comma separated list of * or CIGARs\n".format(
                repr(string)) +
            "(CIGAR strings must match ([0-9]+[MIDNSHPX=])+)")
예제 #17
0
def validate_encoded(string):
  """Check that ``string`` is a list of oriented GFA1 segment names.

  The list must contain at least two comma-separated elements, each
  consisting of a segment name directly followed by ``+`` or ``-``.

  Raises
  ------
  gfapy.FormatError
    If the string is not such a list.
  """
  # fullmatch is used instead of match + "$", because "$" also matches
  # just before a trailing newline, which would let "a+,b-\n" through.
  if not re.fullmatch(r"[!-)+-<>-~][!-~]*[+-](,[!-)+-<>-~][!-~]*[+-])+", string):
    raise gfapy.FormatError(
      "{} is not a valid list of GFA1 segment names ".format(repr(string))+
      "and orientations\n"+
      "(the segment names must match [!-)+-<>-~][!-~]*;\n"+
      " the orientations must be + or -;\n"+
      " the list must be comma-separated "+
      "NameOrient,NameOrient[,NameOrient...])")
예제 #18
0
def decode(string):
    """Decode an optional integer field.

    Returns a gfapy.Placeholder if the string is ``*``, otherwise the
    integer value of the string.

    Raises
    ------
    gfapy.FormatError
      If the string cannot be converted to an integer.
    """
    if string == "*":
        return gfapy.Placeholder()
    try:
        return int(string)
    except (TypeError, ValueError) as err:
        # Only conversion failures are translated (a bare except would
        # also swallow KeyboardInterrupt and SystemExit).
        raise gfapy.FormatError(
            "the string does not represent a valid integer") from err
예제 #19
0
def validate_encoded(string):
    """Check that ``string`` is a valid numeric array string.

    The string must consist of a subtype code (one of ``fcsiCSI``)
    followed by one or more comma-prefixed values: floats for ``f``,
    signed integers for ``c``, ``s``, ``i`` and unsigned integers for
    ``C``, ``S``, ``I``.

    Raises
    ------
    gfapy.FormatError
      If the string does not have this format.
    """
    # fullmatch is used instead of match + "$", because "$" also matches
    # just before a trailing newline, which would let "i,1\n" through.
    if not re.fullmatch(
            r"f(,[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)+|[CSI](,\+?[0-9]+)+|[csi](,[-+]?[0-9]+)+",
            string):
        raise gfapy.FormatError(
            "{} is not a valid numeric array string\n".format(repr(string)) +
            "(it must be one of [fcsiCSI] followed by a comma-separated list of:"
            +
            " for f: floats; for csi: signed integers; for CSI: unsigned integers)"
        )
예제 #20
0
 def _init_comment_data(data):
     if isinstance(data, list) and (data[0] != "#"):
         # unproperly splitten, rejoin
         data = "\t".join(data)
     if isinstance(data, str):
         match = re.match(r"^#(\s*)(.*)$", data)
         if match is None:
             raise gfapy.FormatError("Comment lines must begin with #\n" +
                                     "Line: {}".format(data))
         data = ["#", match.group(2), match.group(1)]
     return data
예제 #21
0
 def _initialize_positional_fields(self, strings):
     """Initialize the positional fields from the elements of a line.

     ``strings[0]`` is the record type; the values of the positional
     fields follow, in the order given by ``POSFIELDS``.

     Raises
     ------
     gfapy.FormatError
       If the record type does not match the one of the class (unless
       the record type of the class is a newline), or if, at validation
       level 1 or higher, there are fewer values than positional fields.
     gfapy.AssertionError
       If the version of the line is not set.
     """
     found_type = strings[0]
     if found_type != self.RECORD_TYPE and self.RECORD_TYPE != "\n":
         raise gfapy.FormatError(
             "Record type of records of " +
             "class {} must be {} ({} found)".format(
                 self.__class__, self.RECORD_TYPE, found_type))
     if self.version is None:
         raise gfapy.AssertionError("Bug found, please report\n" +
                                    "strings: {}".format(repr(strings)))
     n_found = len(strings) - 1
     if self.vlevel >= 1 and n_found < self._n_positional_fields:
         raise gfapy.FormatError(
             "{} positional fields expected, ".format(
                 self._n_positional_fields) +
             "{} found\n{}".format(n_found, repr(strings)))
     # Values start at index 1, after the record type.
     for position, fieldname in enumerate(self.POSFIELDS, start=1):
         self._init_field_value(fieldname,
                                self.__class__.DATATYPE[fieldname],
                                strings[position],
                                errmsginfo=strings)
예제 #22
0
 def __validate_line(self):
     """Check that the line reference is, or has, a valid GFA identifier.

     The reference is either a gfapy.Line (then its name is checked) or
     a string (checked directly).

     Raises
     ------
     gfapy.TypeError
       If the reference is neither a gfapy.Line nor a string.
     gfapy.FormatError
       If the identifier contains spaces or non-printable characters.
     """
     reference = self.line
     if isinstance(reference, gfapy.Line):
         identifier = reference.name
     elif isinstance(reference, str):
         identifier = reference
     else:
         raise gfapy.TypeError(
             "Invalid class ({}) for line reference ({})".format(
                 reference.__class__, reference))
     if re.match(r"^[!-~]+$", identifier) is None:
         raise gfapy.FormatError(
             "{} is not a valid GFA identifier\n".format(repr(identifier)) +
             "(it contains spaces or non-printable characters)")
예제 #23
0
    def set(self, fieldname, value):
        """Set the value of a field.

        Existing fields and predefined tags are set directly; aliases are
        resolved to the field they refer to. Otherwise a new custom tag is
        created: if no datatype was set for it, the default datatype for
        the assigned value is used (e.g. J for Hashes, i for Integer, etc).

        Parameters
        ----------
        fieldname : str
          The name of the field to set.
          (positional field, predefined tag (uppercase) or custom tag
          (lowercase))
        value : object
          The value to assign.

        Raises
        ------
        gfapy.RuntimeError
          If a new tag would have to be created on a virtual line.
        gfapy.FormatError
          If **fieldname** is not a valid predefined or
          custom tag name (and the validation level is not zero).

        Returns
        -------
        object
          **value** for a newly created tag, or the result of setting the
          existing field; **None** if a new tag without a datatype is
          assigned **None** (nothing is stored in that case).
        """
        klass = self.__class__
        if fieldname in self._data or self._is_predefined_tag(fieldname):
            return self._set_existing_field(fieldname, value)
        if fieldname in klass.FIELD_ALIAS:
            return self.set(klass.FIELD_ALIAS[fieldname], value)
        if self.virtual:
            raise gfapy.RuntimeError("Virtual lines do not have tags")
        if not (self.vlevel == 0 or self._is_valid_custom_tagname(fieldname)):
            message_parts = [
                "{} is not a positional field,".format(fieldname),
                "an existing tag, an alias, a predefined tag or a valid custom tag\n",
                "positional fields: {}\n".format(", ".join(
                    self.positional_fieldnames)),
                "existing tags: {}\n".format(", ".join(self.tagnames)),
                "aliases: {}\n".format(", ".join(klass.FIELD_ALIAS.keys())),
                "predefined tags: {}\n".format(", ".join(
                    klass.PREDEFINED_TAGS)),
            ]
            raise gfapy.FormatError("".join(message_parts))

        # From here on: creation of a new custom tag.
        self._define_field_methods(fieldname)
        if self._datatype.get(fieldname, None) is not None:
            return self._set_existing_field(fieldname, value)
        if value is None:
            return None
        self._datatype[fieldname] = \
            gfapy.Field._get_default_gfa_tag_datatype(value)
        self._data[fieldname] = value
        return self._data[fieldname]
예제 #24
0
파일: lastpos.py 프로젝트: ujjwalsh/gfapy
 def _from_string(cls, string, valid=False):
     if string[-1] == "$":
         return cls(int(string[:-1]), valid=valid)
     else:
         try:
             v = int(string)
         except:
             raise gfapy.FormatError(
                 "LastPos value has a wrong format: {}".format(string))
         if not valid:
             if v < 0:
                 raise gfapy.ValueError("LastPos value shall be >= 0," +
                                        " {} found".format(v))
         return v
예제 #25
0
 def _subclass(data):
     """Select the segment class from the number of positional fields.

     The elements of ``data`` after the first are scanned backwards;
     trailing elements which look like tags (``xx:x:...``) are not
     counted as positional fields.

     Returns
     -------
     gfapy.line.segment.GFA1 if there are two positional fields,
     gfapy.line.segment.GFA2 if there are three.

     Raises
     ------
     gfapy.FormatError
       If the number of positional fields is neither two nor three.
     """
     # Index of the last element which is not part of the trailing tags.
     last_positional = len(data) - 1
     while last_positional > 0 and re.search(r"^..:.:.*$",
                                             data[last_positional]):
         last_positional -= 1
     n_positionals = last_positional
     if n_positionals == 2:
         return gfapy.line.segment.GFA1
     if n_positionals == 3:
         return gfapy.line.segment.GFA2
     raise gfapy.FormatError(
         "Wrong number of positional fields for "
         "segment line; GFA1=2, GFA2=3, found={}\n".format(
             n_positionals))
예제 #26
0
def validate_decoded(obj):
    """Check that ``obj`` is a list of valid GFA2 identifiers.

    The elements may be strings, or gfapy.Line instances (in which case
    the name of the line is checked).

    Raises
    ------
    gfapy.TypeError
      If ``obj`` is not a list, or an element is neither a string nor a
      gfapy.Line.
    gfapy.FormatError
      If an identifier contains spaces or non-printable characters.
    """
    if isinstance(obj, list):
        for elem in obj:
            if isinstance(elem, gfapy.Line):
                elem = str(elem.name)
            elif not isinstance(elem, str):
                raise gfapy.TypeError("the list contains an obj of class {}\n".
                                      format(elem.__class__.__name__) +
                                      "(accepted classes: str, gfapy.Line)")
            if not re.match("^[!-~]+$", elem):
                # The offending identifier is elem (previously an
                # undefined name was used here, causing a NameError).
                raise gfapy.FormatError(
                    "the list contains an invalid GFA2 identifier ({})\n".
                    format(repr(elem)) +
                    "(it contains spaces and/or non-printable characters)")
    else:
        raise gfapy.TypeError(
            "the class {} is incompatible with the datatype\n".format(
                obj.__class__.__name__) + "(accepted classes: list)")
예제 #27
0
  def _from_list(cls, array, version = "gfa2", valid = True):
    """
    Converts an alignment array into a specific list type

    The type is selected from the first element of the array: integers
    give a trace, CIGAR operations give a CIGAR. An empty array gives a
    placeholder.

    Parameters
    ----------
    array : list
      The alignment array.
    version : str
      GFA version (gfa1 or gfa2)
      If *gfa1*, then CIGARs and Placeholders are supported.
      If *gfa2*, also Traces are supported.
      Defaults to *gfa2*.
    valid : bool
      Defaults to *True*. The value is not used by this method.

    Returns
    -------
    gfapy.CIGAR or gfapy.Trace or gfapy.AlignmentPlaceholder

    Raises
    ------
    gfapy.VersionError
      If a wrong value is provided for the version parameter, or if the
      array contains integers and the version is *gfa1*.
    gfapy.FormatError
      If the first element is neither an integer nor a CIGAR operation.
    """
    if version not in ("gfa1", "gfa2"):
      raise gfapy.VersionError(
          "Version error: {}".format(repr(version)))
    if not array:
      return gfapy.AlignmentPlaceholder()
    first = array[0]
    if isinstance(first, int):
      if version != "gfa2":
        raise gfapy.VersionError(
          "Trace alignments are not allowed in GFA1: {}".format(repr(array)))
      return gfapy.Trace(array)
    if isinstance(first, gfapy.CIGAR.Operation):
      return gfapy.CIGAR(array)
    raise gfapy.FormatError(
      "Array does not represent a valid alignment field: {}"
      .format(repr(array)))
예제 #28
0
def decode(string):
    """Decode an integer field.

    Returns
    -------
    int
      The integer value of the string.

    Raises
    ------
    gfapy.FormatError
      If the string cannot be converted to an integer.
    """
    try:
        return int(string)
    except (TypeError, ValueError) as err:
        # Only conversion failures are translated (a bare except would
        # also swallow KeyboardInterrupt and SystemExit).
        raise gfapy.FormatError(
            "the string does not represent a valid integer") from err
예제 #29
0
def validate_encoded(string):
    """Check that ``string`` is a valid GFA2 optional identifier.

    The string must be a non-empty sequence of printable non-space
    ASCII characters.

    Raises
    ------
    gfapy.FormatError
      If the string is empty or contains other characters.
    """
    # fullmatch is used instead of match + "$", because "$" also matches
    # just before a trailing newline, which would let "id\n" through.
    if not re.fullmatch("[!-~]+", string):
        raise gfapy.FormatError(
            "{} is not a valid GFA2 optional identifier\n".format(repr(
                string)) + "(it contains spaces or non-printable characters)")
예제 #30
0
def validate_encoded(string):
    """Check that ``string`` contains neither newlines nor tabs.

    Raises
    ------
    gfapy.FormatError
      If the string contains a newline or a tab.
    """
    for separator in ("\n", "\t"):
        if string.find(separator) != -1:
            raise gfapy.FormatError(
                "{} is not a valid field content\n".format(repr(string)) +
                "(it contains newlines and/or tabs)")