def mp_parse_nfasta_header(header: str):
    """Tokenize a nucleotide FASTA header of the form
    '<vfid>|vfid|<vsiid>|vsiid|<ssid>|ssid|<feature names>' into a dict.

    E.g. "10001|vfid|47953|vsiid|68790|ssid|SubName: Full=Leader peptidase PilD; SubName: Full=Type 4 prepilin peptidase VcpD; SubName: Full=Type IV-A prepilin peptidase PilD;"

    Returns the tokenized fields with purely numeric IDs cast to int, the
    whole original header under 'former_id', and host/RecName/SubName
    annotations pulled out of the trailing free-text segment.
    """
    # Each value is an (extraction regex, removal regex) pair consumed in
    # insertion order by regex_based_tokenization.  Raw strings fix the
    # invalid escape sequences ('\|') that warn on CPython 3.12+.
    _REGEXES = {
        "vfid": (r"^([^|]+)[ ]*\|vfid[\|]*", r"^([^|]+[ ]*\|vfid[\|]*)"),
        "vsiid": (r"^([^|]+)[ ]*\|vsiid[\|]*", r"^([^|]+[ ]*\|vsiid[\|]*)"),
        "ssid": (r"^([^|]+)[ ]*\|ssid[\|]*", r"^([^|]+[ ]*\|ssid[\|]*)"),
        "feature_names": (r"^[ ]*([^|]+)$", r"^([ ]*[^|]+)$")
    }
    out = regex_based_tokenization(_REGEXES, header)
    # Cast purely numeric tokens (vfid/vsiid/ssid) to int.  Non-numeric
    # values are deliberately left untouched, whitespace included.
    for key, value in out.items():
        value = value.strip()
        if value.isnumeric():
            out[key] = int(value)
    out["former_id"] = out.pop("source_string")
    # Extract structured annotations from the free-text tail.
    # NOTE(review): assumes 'feature_names' is never purely numeric — if it
    # were, the loop above would have replaced it with an int and
    # safe_findall would receive a non-string; confirm against the inputs.
    out.update({
        k: safe_findall(v, out["feature_names"])
        for k, v in {
            "gene_host": r"\[([^\]]+)\] *$",
            "recname_full": r"[^_]*RecName:[_ ]Full=([^;]+);",
            "subname_full": r"[^_]*SubName:[_ ]Full=([^;]+);",
        }.items()
    })
    return out
def mp_parse_nfasta_header(header: str):
    """Tokenize a VFDB nucleotide FASTA header into a dict of fields.

    Returns the raw tokens plus 'former_id' (the whole original header) and
    'vfdb_number' (the integer portion of the VFID token).
    """
    # (extraction regex, removal regex) pairs, applied in insertion order by
    # regex_based_tokenization.  Raw strings fix the invalid escape
    # sequences ('\(', '\[', …) that warn on CPython 3.12+.
    _VFDB_REGEXES = {
        "VFID": (r"^([^\(\)]+)", r"^([^\(\)]+)"),
        "gene_host": (r"\[([^\]]+)\] *$", r"(\[[^\]]+\] *$)"),
        "gene_name": (r"\[([^\]]+)\] *$", r"(\[[^\]]+\] *$)"),
        "gene_description": (r"([^\(\)]+)$", r"([^\(\)]+)$"),
        # NOTE(review): the removal regex here is anchored at '^…$' while
        # the extraction regex is anchored only at '$' — every other entry
        # keeps both anchored the same way.  Confirm this asymmetry is
        # intentional in regex_based_tokenization's removal step.
        "gene_symbol": (r"\(([^\(\)]+)\) *$", r"^\([^\(\)]+\) *$"),
        "gene_accession_id": (r"^\(([^\(\)]+)\)", r"^\([^\(\)]+\) *"),
    }
    out = regex_based_tokenization(_VFDB_REGEXES, header)
    out["former_id"] = out.pop("source_string")
    out["vfdb_number"] = int(safe_findall(r"[0-9]+", out["VFID"]))
    return out
def mp_parse_pfasta_header(header: str):
    """Tokenize a TADB protein FASTA header into its component fields.

    Returns the tokenized fields plus 'protein_header', the whole original
    header string.
    """
    # (extraction regex, removal regex) pairs, applied in insertion order by
    # regex_based_tokenization.  Raw strings fix the invalid escape
    # sequences ('\|', '\[', …) that warn on CPython 3.12+.
    out = regex_based_tokenization(
        {
            "tadb_id": (r"^TADB\|([^ ]+) *", r"(^TADB\|[^ ]+ *)"),
            "protein_symbol": (r" *\[([^\[\]]+)\] *$", r"( *\[[^\[\]]+\] *$)"),
            "protein_host": (r" *\[([^\[\]]+)\] *$", r"( *\[[^\[\]]+\] *$)"),
            "protein_geninfo_id": (r"^gi\|([0-9]+)\|*", r"(^gi\|[0-9]+\|*)"),
            "protein_refseq_id": (r"^[\| ]*ref\|([^\|]+)[\| ]*", r"(^[\| ]*ref\|[^\|]+[\| ]*)"),
            # Whatever remains after the tokens above is the description.
            "protein_description": (r"(.*)", r"(.*)"),
        },
        header)
    out["protein_header"] = out.pop("source_string")
    return out
def tokenize_reads_file_name(s: str):
    """Split a sequencing-reads file name (Illumina-style) into its parts.

    E.g. 'Sample1_S1_L001_R1_001.fastq.gz' yields sample_name,
    sample_sheet_number, lane_number, read_index, last_segment and
    extension; the full input path is returned under 'reads_file'.
    """
    # (extraction regex, removal regex) pairs, matched right-to-left off the
    # basename in insertion order.  Raw strings fix the invalid escape
    # sequence ('\.') that warns on CPython 3.12+.
    d = regex_based_tokenization(
        {
            # E.g. '.fastq.gz'
            "extension": [r"\.(.{2,8})$", r"(\..{2,8})$"],
            # The last segment is always 001,
            "last_segment": [
                r"[^A-Za-z0-9]([A-Za-z0-9]+)$", r"([^A-Za-z0-9][A-Za-z0-9]+)$"
            ],
            "read_index": [r"[^A-Za-z0-9](R[0-9]+)$", r"([^A-Za-z0-9]R[0-9]+)$"],
            "lane_number": [r"[^A-Za-z0-9](L[0-9]+)$", r"([^A-Za-z0-9]L[0-9]+)$"],
            "sample_sheet_number": [r"[^A-Za-z0-9](S[0-9]+)$", r"([^A-Za-z0-9]S[0-9]+)$"],
            # Whatever remains is the sample name.
            "sample_name": [r"(.+)", r"(.+)"],
        },
        os.path.basename(s))
    d["reads_file"] = s
    return d
def mp_parse_nfasta_header(header: str):
    """Tokenize a TADB nucleotide FASTA header into its component fields.

    Adds 'former_id' (the whole original header), the boolean
    'is_antisense_dna_strand' (True when the locus prefix is ':c'), and
    'tadb_number' (the digits extracted from the TADB id).
    """
    # (extraction regex, removal regex) pairs, applied in insertion order by
    # regex_based_tokenization.  Raw strings fix the invalid escape
    # sequences ('\|', '\[', …) that warn on CPython 3.12+.
    out = regex_based_tokenization(
        {
            "tadb_id": (r"^TADB\|([^ ]+) *", r"(^TADB\|[^ ]+ *)"),
            "gene_symbol": (r" *\[([^\[\]]+)\] *$", r"( *\[[^\[\]]+\] *$)"),
            # NOTE(review): the extraction regex consumes trailing '[| ]*'
            # but the removal regex only consumes '[|]*' — confirm a
            # leftover space cannot desynchronize the following tokens
            # (gene_refseq_id tolerates leading '[| ]*', so it likely
            # absorbs it).
            "gene_geninfo_id": (r"^gi\|([0-9]+)[\| ]*", r"(^gi\|[0-9]+[\|]*)"),
            "gene_refseq_id": (r"^[\| ]*ref\|([^\|]+)[\| ]*", r"(^[\| ]*ref\|[^\|]+[\| ]*)"),
            # A ':c' prefix marks the antisense strand; ':' alone is sense.
            "dna_strand": (r"^[\| ]*:([c]*)", r"(^[\| ]*:[c]*)"),
            "start_locus": (r"^([0-9]+)[ -]*", r"(^[0-9]+[ -]*)"),
            "end_locus": (r"^[ -]*([0-9]+)[ -]*", r"(^[ -]*[0-9]+[ -]*)"),
            # Whatever remains after the tokens above is the description.
            "gene_description": (r"(.*)", r"(.*)"),
        },
        header)
    out["former_id"] = out.pop("source_string")
    out["is_antisense_dna_strand"] = out["dna_strand"] == "c"
    out["tadb_number"] = safe_findall(r"^[A-Z]*([0-9]+)", str(out["tadb_id"]).upper(), verbose=False)
    return out