def from_line(cls, line: str) -> "TRNAScanRecord":
    """
    Parse a single whitespace-delimited line as a `TRNAScanRecord`.

    The line is right-stripped and split with `MULTISPACE_REGEX`. The
    resulting fields are paired, in order, with `cls.column_order`.
    The line may have one field fewer than `cls.column_order`; in that
    case the last column is simply absent and `note` is set to None.

    Keyword arguments:
    line -- The line to parse.

    Returns:
    A `TRNAScanRecord`

    Raises:
    LineParseError -- If the number of fields is neither
        `len(cls.column_order)` nor one less than that. The field
        parsing helpers are defined elsewhere and presumably raise the
        same error for invalid values (not visible from here).
    """

    expected = len(cls.column_order)
    fields = MULTISPACE_REGEX.split(line.rstrip())
    n_fields = len(fields)

    if n_fields not in (expected, expected - 1):
        raise LineParseError(
            "Line had the wrong number of columns. "
            f"Expected {expected} or "
            f"{expected - 1} but got {n_fields}."
        )

    # zip stops at the shorter input, so a missing trailing column just
    # never makes it into the mapping.
    record: Dict[str, str] = {}
    for column, value in zip(cls.column_order, fields):
        record[column.strip()] = value.strip()

    start = parse_int(record["start"], "start")
    end = parse_int(record["end"], "end")
    num = parse_int(record["num"], "num")
    infernal_score = parse_float(record["infernal_score"], "infernal_score")

    # A "0" in both intron columns means there are no introns.
    intron_starts: List[int] = []
    intron_ends: List[int] = []
    no_introns = (
        record["intron_starts"] == "0" and record["intron_ends"] == "0"
    )
    if not no_introns:
        for piece in record["intron_starts"].split(","):
            intron_starts.append(parse_int(piece.strip(), "intron_starts"))
        for piece in record["intron_ends"].split(","):
            intron_ends.append(parse_int(piece.strip(), "intron_ends"))

    seqid = parse_string_not_empty(record["seqid"], "seqid")
    trna_type = parse_string_not_empty(record["trna_type"], "trna_type")
    anticodon = parse_string_not_empty(record["anticodon"], "anticodon")

    return cls(
        seqid=seqid,
        start=start,
        end=end,
        trna_type=trna_type,
        anticodon=anticodon,
        num=num,
        intron_starts=intron_starts,
        intron_ends=intron_ends,
        infernal_score=infernal_score,
        note=record.get("note"),
    )
def from_line(cls, line: str) -> "Coords":
    """
    Parse a single tab-delimited line as a `Coords` record.

    At least 13 tab-separated columns are required; any extra columns
    are ignored. Both start positions are decremented by one
    (presumably converting 1-based inclusive input to 0-based,
    end-exclusive coordinates -- confirm against the file format).
    If the query start is greater than the query end, the two are
    swapped so that start <= end, and the strand is `Strand.MINUS`.
    Otherwise the strand is `Strand.PLUS`.

    Keyword arguments:
    line -- The line to parse.

    Returns:
    A `Coords`

    Raises:
    LineParseError -- If the line is empty, has fewer than 13 columns,
        or has a reference start greater than its reference end.
    """

    stripped = line.strip()
    if stripped == "":
        raise LineParseError("The line was empty")

    sline = stripped.split("\t")
    if len(sline) < 13:
        raise LineParseError(
            "The line had the wrong number of columns. "
            f"Expected at least 13 but got {len(sline)}."
        )

    rstart = parse_int(sline[0], "reference_start")
    rend = parse_int(sline[1], "reference_end")

    # Not expected to occur, but the value comes from external input,
    # so raise a real parse error rather than rely on `assert`, which
    # is stripped when Python runs with -O.
    if rstart > rend:
        raise LineParseError(
            "The reference start was greater than the reference end. "
            f"Got start {rstart} and end {rend}."
        )

    rstart -= 1

    qstart = parse_int(sline[2], "query_start")
    qend = parse_int(sline[3], "query_end")

    if qstart > qend:
        # Reversed query coordinates: the smaller value becomes the
        # (decremented) start and the larger one the end.
        strand = Strand.MINUS
        qstart, qend = qend - 1, qstart
    else:
        strand = Strand.PLUS
        qstart -= 1

    # Positional construction: the order here must match the field
    # order of the class, which is not checked by name.
    return cls(
        rstart,
        rend,
        qstart,
        qend,
        strand,
        parse_int(sline[4], "reference_alnlen"),
        parse_int(sline[5], "query_alnlen"),
        parse_float(sline[6], "pid"),
        parse_int(sline[7], "reference_len"),
        parse_int(sline[8], "query_len"),
        parse_float(sline[9], "reference_cov"),
        parse_float(sline[10], "query_cov"),
        parse_string_not_empty(sline[11], "reference"),
        parse_string_not_empty(sline[12], "query"),
    )
def from_line(cls, line: str) -> "DomTbl":
    """
    Parse a single whitespace-delimited line as a `DomTbl` record.

    The line is split with `MULTISPACE_REGEX` at most 22 times, so
    everything after the 22nd column is kept together as the
    description, which may itself contain spaces. The description is
    optional: a line with only 22 columns, or with a description of
    "-" or "", yields a description of None.

    Keyword arguments:
    line -- The line to parse.

    Returns:
    A `DomTbl`

    Raises:
    LineParseError -- If the line is empty or contains only
        whitespace, or if it has fewer than 22 columns.
    """

    stripped = line.strip()
    if stripped == "":
        raise LineParseError("The line was empty.")

    sline = MULTISPACE_REGEX.split(stripped, maxsplit=22)
    n_columns = len(sline)

    # maxsplit caps the result at 23 fields, so in practice this only
    # fires when there are too few columns.
    if n_columns not in (22, 23):
        raise LineParseError(
            "The line had the wrong number of columns. "
            f"Expected 22 or 23 but got {n_columns}"
        )

    # Only index the description column when it is actually present;
    # a 22-column line is accepted and simply has no description.
    description: Optional[str] = None
    if n_columns == 23 and sline[22] not in ("-", ""):
        description = sline[22]

    return cls(
        parse_string_not_empty(sline[0], "target_name"),
        parse_string_not_empty(sline[1], "target_acc"),
        parse_int(sline[2], "target_len"),
        parse_string_not_empty(sline[3], "query_name"),
        parse_string_not_empty(sline[4], "query_acc"),
        parse_int(sline[5], "query_len"),
        parse_float(sline[6], "full_evalue"),
        parse_float(sline[7], "full_score"),
        parse_float(sline[8], "full_bias"),
        parse_int(sline[9], "match_num"),
        parse_int(sline[10], "nmatches"),
        parse_float(sline[11], "domain_c_evalue"),
        parse_float(sline[12], "domain_i_evalue"),
        parse_float(sline[13], "domain_score"),
        parse_float(sline[14], "domain_bias"),
        parse_int(sline[15], "hmm_from"),
        parse_int(sline[16], "hmm_to"),
        parse_int(sline[17], "ali_from"),
        parse_int(sline[18], "ali_to"),
        parse_int(sline[19], "env_from"),
        parse_int(sline[20], "env_to"),
        parse_float(sline[21], "acc"),
        description,
    )
def from_line(cls, line: str) -> 'PAF':
    """
    Parse a single tab-delimited line as a `PAF` record.

    The leading fields are matched, in order, against the names
    returned by `cls.columns()`. Any fields beyond those are passed
    through unparsed, as a list of strings, in the final constructor
    argument.

    Keyword arguments:
    line -- The line to parse.

    Returns:
    A `PAF`

    Raises:
    LineParseError -- If the line has fewer fields than
        `cls.columns()` has names.
    """

    columns = cls.columns()
    n_required = len(columns)

    fields = line.strip().split("\t")
    n_fields = len(fields)

    if n_fields < n_required:
        raise LineParseError(
            "The line had the wrong number of columns. "
            f"Expected at least {n_required} but got {n_fields}."
        )

    named = dict(zip(columns, fields))
    extra = fields[n_required:]

    # Parsed one at a time, in constructor order, so the first invalid
    # field is the one reported.
    query = parse_string_not_empty(named["query"], "query")
    qlen = parse_int(named["qlen"], "qlen")
    qstart = parse_int(named["qstart"], "qstart")
    qend = parse_int(named["qend"], "qend")
    strand = is_one_of(named["strand"], ["+", "-"], "strand")
    target = parse_string_not_empty(named["target"], "target")
    tlen = parse_int(named["tlen"], "tlen")
    tstart = parse_int(named["tstart"], "tstart")
    tend = parse_int(named["tend"], "tend")
    nmatch = parse_int(named["nmatch"], "nmatch")
    alilen = parse_int(named["alilen"], "alilen")
    mq = parse_int(named["mq"], "mq")

    return cls(
        query,
        qlen,
        qstart,
        qend,
        strand,
        target,
        tlen,
        tstart,
        tend,
        nmatch,
        alilen,
        mq,
        extra,
    )
def parse(
    cls,
    string: str,
    attr: Type[AttrT] = cast(Type[AttrT], GFF3Attributes),
    strip_quote: bool = False,
    unescape: bool = False,
) -> "GFFRecord[AttrT]":
    """
    Parse a gff line string as a `GFFRecord`.

    Keyword arguments:
    string -- The gff line to parse.
    attr -- The class used to parse the attributes column. Its `parse`
        method is called with the raw column text plus `strip_quote`
        and `unescape`. Defaults to `GFF3Attributes`.
    strip_quote -- Strip quotes from attributes values. The specification
        says that they should not be stripped, so we don't by default.
    unescape -- Unescape reserved characters in the attributes to their
        original values. I.E. some commas, semicolons, newlines etc.

    Returns:
    A `GFFRecord`

    Raises:
    ValueError -- If the line is missing more than one column. A line
        missing exactly one column is assumed to lack the attributes
        and is parsed with empty attributes after logging a warning.
        Extra columns are ignored with a warning.
    """

    sline = string.strip().split("\t")
    sline_len = len(sline)
    columns_len = len(cls.columns)
    missing_attributes = sline_len == columns_len - 1

    if missing_attributes:
        logger.warning(
            "Line has too few columns. "
            "Probably it is missing the attributes"
        )
    elif sline_len < columns_len:
        raise ValueError(
            "Line has too few columns. "
            f"Expected: {columns_len}, Encountered: {sline_len}"
        )
    elif sline_len > columns_len:
        logger.warning(
            "Line has too many columns. Expected: %s, Encountered: %s",
            columns_len,
            sline_len,
        )

    fields: Dict[str, str] = dict(zip(cls.columns, sline))
    if missing_attributes:
        fields["attributes"] = ""

    # 0-based indexing exclusive
    start = parse_int(fields["start"], "start") - 1
    end = parse_int(fields["end"], "end")

    if start > end:
        # The coordinates were given in reverse order. `start` has
        # already been shifted to 0-based while `end` is still the
        # 1-based inclusive value, so a plain swap would drop one base
        # from each side. Undo the shift on the value that becomes the
        # end, and apply it to the value that becomes the start.
        start, end = end - 1, start + 1

    score = parse_or_none(fields["score"], "score", ".", parse_float)

    strand = Strand.parse(
        is_one_of(fields["strand"], ["-", "+", ".", "?"], "strand"))

    phase = Phase.parse(
        is_one_of(fields["phase"], ["0", "1", "2", "."], "phase"))

    attributes = cast(
        AttrT,
        attr.parse(
            fields["attributes"],
            strip_quote=strip_quote,
            unescape=unescape,
        ))

    return cls(
        parse_string_not_empty(fields["seqid"], "seqid"),
        parse_string_not_empty(fields["source"], "source"),
        parse_string_not_empty(fields["type"], "type"),
        start,
        end,
        score,
        strand,
        phase,
        attributes,
    )