def init_from_re_match(cls, match: Match, ent_class, num=None, increment_t=False):
    """
    Build a new Entity from a regex ``Match``.

    :param match: the ``Match`` object the entity is derived from
    :param ent_class: entity type tag to attach to the new entity
    :param num: explicit entity number; when omitted, the class-level
        counter ``cls.t`` is used instead
    :param increment_t: when True (and ``num`` was omitted), bump ``cls.t``
        after creating the entity
    :return: the freshly created Entity
    :raises TypeError: if ``match`` is not a ``Match`` object
    """
    if not isinstance(match, Match):
        raise TypeError("Argument is not a Match object.")
    use_counter = num is None
    span_start, span_end = match.start(), match.end()
    entity = cls(
        num=cls.t if use_counter else num,
        tag=ent_class,
        start=span_start,
        end=span_end,
        text=match.string[span_start:span_end],
    )
    if use_counter and increment_t:
        # Advance the shared class-level counter only when it was consumed.
        cls.t += 1
    return entity
def process_define(self, m: Match) -> None:
    """Handle a `define directive: scan the macro body and record it.

    Group indices are relative to ``m.lastindex`` — presumably the outer
    alternation group of the directive scanner; TODO confirm against the
    scanner pattern.
    """
    # Flush the raw text that precedes this directive.
    self.emit_segment(m.start(m.lastindex + 1))
    identifier = m.group(m.lastindex + 2)
    # Macro names that collide with preprocessor directives are rejected.
    reserved_macro_names = ("include", "ifdef", "ifndef", "elsif", "else",
                            "endif", "define", "undef", "line",
                            "__LINE__", "__FILE__")
    if identifier in reserved_macro_names:
        self.env.msg.fatal(
            "Macro name '%s' is a reserved keyword" % identifier,
            self.get_err_src_ref(m.start(m.lastindex + 2), m.end(m.lastindex + 2) - 1))
    # Continue scanning right after the macro name.
    self._scan_idx = m.end(m.lastindex + 2)
    # Presence of group lastindex+3 marks a parameterized macro (open paren).
    has_args = bool(m.group(m.lastindex + 3))
    if has_args:
        args = self.define_arg_scanner()
    else:
        args = []
    contents = self.define_contents_scanner()
    # Create macro definition object
    macro = Macro(contents, args)
    # Only register the macro if we are in an active `ifdef region.
    if self._conditional.is_active:
        self._macro_defs[identifier] = macro
    self._seg_start_idx = self._scan_idx
def process_else(self, m: Match) -> None:
    """Handle an `else directive inside a conditional region."""
    directive_group = m.lastindex + 1
    # Flush the text accumulated before this directive.
    self.emit_segment(m.start(directive_group))
    # `else is only legal inside an if-block that has not already seen `else.
    legal = self._conditional.is_in_if_block and not self._conditional.is_in_else_block
    if not legal:
        self.env.msg.fatal(
            "Unexpected `else",
            self.get_err_src_ref(m.start(directive_group), m.end(directive_group) - 1))
    self._conditional.do_else()
    # Resume scanning just past the directive.
    self._scan_idx = m.end()
    self._seg_start_idx = self._scan_idx
def footnote_reference(self, match: Match, lineno: int) -> DispatchResult:
    """Handles footnote/citation references, e.g. [1]_

    Returns the inline-dispatch 4-tuple:
    (text before, [nodes], remaining text, [system messages]).
    """
    label = match.group("footnotelabel")
    refname = normalize_name(label)
    string = match.string
    before = string[: match.start("whole")]
    remaining = string[match.end("whole") :]
    if match.group("citationlabel"):
        # Citation form: register a citation reference.
        refnode = nodes.citation_reference(f"[{label}]_", refname=refname)
        refnode += nodes.Text(label)
        self.document.note_citation_ref(refnode)
    else:
        refnode = nodes.footnote_reference(f"[{label}]_")
        if refname[0] == "#":
            # '#'-prefixed label: auto-numbered footnote.
            refname = refname[1:]
            refnode["auto"] = 1
            self.document.note_autofootnote_ref(refnode)
        elif refname == "*":
            # '*' label: auto-symbol footnote.
            refname = ""
            refnode["auto"] = "*"
            self.document.note_symbol_footnote_ref(refnode)
        else:
            # Manually numbered/named footnote.
            refnode += nodes.Text(label)
            if refname:
                refnode["refname"] = refname
            self.document.note_footnote_ref(refnode)
    # Optionally strip the whitespace preceding the reference.
    if get_trim_footnote_ref_space(self.document.settings):
        before = before.rstrip()
    return (before, [refnode], remaining, [])
def validate_match(self, match: Match, text: str):
    """Decide whether a candidate range match whose text ends in a "year"
    group is genuinely valid, or whether that year actually belongs to a
    following Date entity (making this match ambiguous).

    :param match: the candidate match
    :param text: the full source text the match came from
    :return: True if the match is valid
    """
    # If the match doesn't contains "year" part, it will not be ambiguous and it's a valid match
    is_valid_match = not RegExpUtility.get_group(match, Constants.YEAR_GROUP_NAME)
    if not is_valid_match:
        year_group = RegExpUtility.get_group(match, Constants.YEAR_GROUP_NAME)
        # If the "year" part is not at the end of the match, it's a valid match
        # NOTE(review): text.index() locates the *first* occurrence of the
        # substring, which may not be this match's occurrence when the year
        # or match text repeats — verify with repeated-substring inputs.
        if not text.index(year_group) + len(year_group) == text.index(
                match.group()) + (match.end() - match.start()):
            is_valid_match = True
        else:
            sub_text = text[text.index(year_group):]
            # If the following text (include the "year" part) doesn't start with a Date entity, it's a valid match
            if not self.starts_with_basic_date(sub_text):
                is_valid_match = True
            else:
                # If the following text (include the "year" part) starts with a Date entity,
                # but the following text (doesn't include the "year" part) also starts with a
                # valid Date entity, the current match is still valid
                # For example, "10-1-2018-10-2-2018". Match "10-1-2018" is valid because
                # though "2018-10-2" a valid match (indicates the first year "2018" might
                # belongs to the second Date entity), but "10-2-2018" is also
                sub_text = text[text.index(year_group) + len(year_group):].strip()
                sub_text = self.trim_start_range_connector_symbols(
                    sub_text)
                is_valid_match = self.starts_with_basic_date(sub_text)
    return is_valid_match
def match_group_replace(
        match: typing.Match,
        replace_func: typing.Callable[[typing.Match, int, int], str]) -> str:
    """Rebuild the full matched text with each captured group replaced.

    Args:
        match (Match): the regex match whose groups get substituted
        replace_func (function): callback ``(match, group_index,
            output_offset) -> str`` producing the replacement text for one
            group; the offset is the group's position within the output
            built so far

    Returns:
        str: the matched text with every participating group replaced
    """
    whole = match.group(0)
    base = match.start()
    out = ''
    consumed = 0
    for group_idx in range(1, len(match.groups()) + 1):
        if match.start(group_idx) < 0:
            # Group did not participate in the match; nothing to replace.
            continue
        # Copy the literal text between the previous group and this one.
        out += whole[consumed:match.start(group_idx) - base]
        out += replace_func(match, group_idx, match.start(group_idx) - len(out))
        # Groups may nest/overlap, so never move the consumed cursor backwards.
        consumed = max(match.end(group_idx) - base, consumed)
    out += whole[consumed:]
    return out
def process_endif(self, m: Match) -> None:
    """Handle an `endif directive, closing the current conditional region."""
    # Flush the text accumulated before this directive.
    self.emit_segment(m.start(m.lastindex + 1))
    # `endif with no open conditional anywhere (current or stacked) is fatal.
    if self._conditional.is_idle and not self._conditional_stack:
        self.env.msg.fatal(
            "Unexpected `endif",
            self.get_err_src_ref(m.start(m.lastindex + 1), m.end(m.lastindex + 1) - 1))
    self._conditional.do_endif()
    # Resume scanning just past the directive.
    self._scan_idx = m.end()
    self._seg_start_idx = self._scan_idx
    if self._conditional_stack:
        # Pop conditional context
        self._conditional = self._conditional_stack.pop()
def _arxiv_id_sub(match: Match, id_to_url: Callable[[str], str]) \
        -> Tuple[Markup, str]:
    """Return match.string transformed for a arxiv id match.

    :param match: match with 'arxiv_id' and optional 'arxiv_prefix' groups
    :param id_to_url: callable mapping an arXiv id to its URL
    :return: (linkified text up to and including the id, remaining text)
    """
    aid = match.group('arxiv_id')
    prefix = 'arXiv:' if match.group('arxiv_prefix') else ''
    if aid[-1] in _bad_endings:
        # Trailing punctuation is prose, not part of the id: strip it from
        # the id/URL and push it back onto the unprocessed remainder.
        arxiv_url = id_to_url(aid)[:-1]
        # BUG FIX: keep the 'arXiv:' prefix in the anchor text here too, so
        # this branch renders consistently with the no-punctuation branch.
        anchor = prefix + aid[:-1]
        back = aid[-1] + match.string[match.end():]
    else:
        arxiv_url = id_to_url(aid)
        anchor = prefix + aid
        back = match.string[match.end():]
    front = match.string[0:match.start()]
    return (Markup(f'{front}<a href="{arxiv_url}">{anchor}</a>'), back)
def _doi_sub(match: Match, doi_to_url: Callable[[str], str]) \
        -> Tuple[Markup, str]:
    """Return match.string transformed for a DOI match.

    :param match: match with a 'doi' group
    :param doi_to_url: callable mapping a DOI resolver URL to the final URL
    :return: (linkified text up to and including the DOI, remaining text)
    """
    doi = match.group('doi')
    if (doi[-1] in _bad_endings):
        # BUG FIX: the stripped trailing punctuation must be *prepended* to
        # the remaining text (as the sibling _arxiv_id_sub/_url_sub helpers
        # do), not appended after all of it — otherwise the punctuation is
        # relocated to the very end of the remaining text.
        back = doi[-1] + match.string[match.end():]
        doi = doi[:-1]
    else:
        back = match.string[match.end():]
    # Percent-encode the DOI, keeping its path slashes intact.
    quoted_doi = quote(doi, safe='/')
    doi_url = f'https://dx.doi.org/{quoted_doi}'
    doi_url = doi_to_url(doi_url)
    anchor = escape(doi)
    front = match.string[0:match.start()]
    return (Markup(f'{front}<a href="{doi_url}">{anchor}</a>'), back)
def process_undef(self, m: Match) -> None:
    """Handle an `undef directive: forget a previously defined macro."""
    # Flush the text accumulated before this directive.
    self.emit_segment(m.start(m.lastindex + 1))
    if self._conditional.is_active:
        # pop() with a default: undefining an unknown macro is not an error.
        self._macro_defs.pop(m.group(m.lastindex + 2), None)
    # Resume scanning just past the directive.
    self._scan_idx = m.end()
    self._seg_start_idx = self._scan_idx
def _encoding_filter(self, m: typing.Match) -> str: """ This will encode any illegal characters in an identifier using Python's re.sub function. """ matched_span = m.string[m.start():m.end()] if self._collapse_whitespace_when_encoding and matched_span.isspace(): if self._whitespace_encoding_char is not None: return self._whitespace_encoding_char else: return self.encode_character(" ") else: return "".join(map(self.encode_character, matched_span))
def _match_replacement(self, m: Match) -> str:
    """re.sub-style callback: replace the anchor captured in group 1 with a
    resolved href, preserving the rest of the matched text."""
    m_all = m.group(0)
    offset = m.start()
    anchor = m.group(1)
    try:
        href = '/{}'.format(self._refs.find(anchor))
    except RefNotFound as err:
        # Best effort: log and fall back to the raw anchor as the path.
        print('RefNotFound: {}'.format(err))
        href = '/{}'.format(anchor)
    # NOTE(review): the extra -1 presumably drops a one-character delimiter
    # immediately before group 1 (e.g. a quote) — confirm against the pattern.
    return m_all[:(m.start(1) - offset - 1)] + href + m_all[(m.end(1) - offset):]
def _url_sub(match: Match, url_to_url: Callable[[str], str]) \
        -> Tuple[Markup, str]:
    """Return match.string transformed for a URL match."""
    url = match.group('url')
    # Pick the anchor text by scheme; 'https' must be tested before 'http'
    # because startswith('http') also matches https URLs.
    anchor = 'this URL'
    for scheme, label in (('https', 'this https URL'),
                          ('http', 'this http URL'),
                          ('ftp', 'this ftp URL')):
        if url.startswith(scheme):
            anchor = label
            break
    front = match.string[0:match.start()]
    if url[-1] in _bad_endings:
        # Trailing punctuation is prose: strip it from the URL and prepend
        # it to the unprocessed remainder.
        back = url[-1] + match.string[match.end():]
        url = url[:-1]
    else:
        back = match.string[match.end():]
    url = url_to_url(url)
    return (Markup(f'{front}<a href="{url}">{anchor}</a>'), back)
def process_ifndef(self, m: Match) -> None:
    """Handle an `ifndef directive: open a region active only when the
    macro is NOT currently defined."""
    # Flush the text accumulated before this directive.
    self.emit_segment(m.start(m.lastindex + 1))
    identifier = m.group(m.lastindex + 2)
    if not self._conditional.is_idle:
        # Already inside a conditional: push conditional context for nesting.
        self._conditional_stack.append(self._conditional)
        self._conditional = ConditionalState(self._conditional.is_active)
    self._conditional.do_ifndef(identifier in self._macro_defs)
    # Resume scanning just past the directive.
    self._scan_idx = m.end()
    self._seg_start_idx = self._scan_idx
def _handle_match(html: str, match: Match, nested: bool) -> Tuple[str, int]:
    """Replace one matched formatting span in *html* with its HTML tag.

    :param html: the full text being converted
    :param match: match with (prefix, sigil, text, suffix) groups
    :param nested: whether to recursively convert formatting inside the span
    :return: (updated html, index just past the inserted closing tag)
    """
    start, end = match.start(), match.end()
    prefix, sigil, text, suffix = match.groups()
    if nested:
        text = _convert_formatting(text)
    # `tags` maps a formatting sigil to its HTML tag name (module-level).
    tag = tags[sigil]
    # We don't want to include the whitespace suffix length, as that could be used as the
    # whitespace prefix right after this formatting block.
    # (2 * len(tag) + 5) == len(f"<{tag}></{tag}>")
    pos = start + len(prefix) + (2 * len(tag) + 5) + len(text)
    html = (f"{html[:start]}{prefix}"
            f"<{tag}>{text}</{tag}>"
            f"{suffix}{html[end:]}")
    return html, pos
def replacer(match: Match) -> str:
    """re.sub callback: convert an @displayname mention into its Telegram
    text form, appending a matching MessageEntity* to the enclosing
    `entities` list as a side effect."""
    puppet = pu.Puppet.find_by_displayname(match.group(2))
    if puppet:
        offset = match.start()
        length = match.end() - offset
        if puppet.username:
            # Puppet has a public username: emit a plain @mention entity.
            entity = MessageEntityMention(offset, length)
            text = f"@{puppet.username}"
        else:
            # No username: mention by numeric user id instead.
            entity = MessageEntityMentionName(offset, length, user_id=puppet.tgid)
            text = puppet.displayname
        entities.append(entity)
        return text
    # Unknown displayname: re-emit the captured groups (presumably this
    # reconstructs the original matched text — depends on the pattern).
    return "".join(match.groups())
def quoted_start(self, match: Match) -> bool:
    """Test if inline markup start-string is 'quoted'.

    'Quoted' in this context means the start-string is enclosed in a pair
    of matching opening/closing delimiters (not necessarily quotes) or at
    the end of the match.
    """
    string = match.string
    start = match.start()
    if start == 0:
        # start-string at beginning of text
        return False
    prestart = string[start - 1]
    try:
        poststart = string[match.end()]
    except IndexError:
        # start-string at end of text
        return True  # not "quoted" but no markup start-string either
    # Delegate the open/close delimiter pairing test.
    return punctuation_chars.match_chars(prestart, poststart)
def reference(
    self, match: Match, lineno: int, anonymous: bool = False
) -> DispatchResult:
    """Handle simple references, e.g. reference_ and anonymous__

    Returns the inline-dispatch 4-tuple:
    (text before, [nodes], text after, [system messages]).
    """
    referencename = match.group("refname")
    refname = normalize_name(referencename)
    referencenode = nodes.reference(
        referencename + match.group("refend"),
        referencename,
        name=whitespace_normalize_name(referencename),
    )
    referencenode[0].rawsource = referencename
    if anonymous:
        referencenode["anonymous"] = 1
    else:
        # Named reference: record it for later target resolution.
        referencenode["refname"] = refname
        self.document.note_refname(referencenode)
    string = match.string
    matchstart = match.start("whole")
    matchend = match.end("whole")
    return (string[:matchstart], [referencenode], string[matchend:], [])
def inline_obj(
    self,
    match: Match,
    lineno: int,
    end_pattern: Pattern,
    nodeclass: nodes.TextElement,
    restore_backslashes: bool = False,
):
    """Create the node for an inline class, if the end string match can
    be found.

    Returns a 5-tuple: (text before, [nodes], text after,
    [system messages], end-string group or "").
    """
    string = match.string
    matchstart = match.start("start")
    matchend = match.end("start")
    if self.quoted_start(match):
        # "Quoted" start-string: treat as plain text, emit no node.
        return (string[:matchend], [], string[matchend:], [], "")
    endmatch = end_pattern.search(string[matchend:])
    if endmatch and endmatch.start(1):  # 1 or more chars
        _text = endmatch.string[: endmatch.start(1)]
        text = unescape(_text, restore_backslashes)
        textend = matchend + endmatch.end(1)
        rawsource = unescape(string[matchstart:textend], True)
        node = nodeclass(rawsource, text)
        node[0].rawsource = unescape(_text, True)
        return (
            string[:matchstart],
            [node],
            string[textend:],
            [],
            endmatch.group(1),
        )
    # No end-string found: warn and emit a "problematic" node instead.
    msg = self.reporter.warning(
        "Inline %s start-string without end-string." % nodeclass.__name__,
        line=lineno,
    )
    text = unescape(string[matchstart:matchend], True)
    rawsource = unescape(string[matchstart:matchend], True)
    prb = self.problematic(text, rawsource, msg)
    return string[:matchstart], [prb], string[matchend:], [msg], ""
def replace_token(token: Token, old: str, new: Any, match: Match, offset: int,
                  variables: dict, *, is_key = True) -> Any:
    """
    Replaces a token that is located at "match" param in the "old" string.
    "new" refers to the object to the partially replaced new value. This
    will be modified and returned. "offset" refers to the current string
    offset (used to add and remove string elements in the right place in
    "new"). "variables" refers to the variables to check against the token
    for a match. If replacing a key, "is_key" should be true (the behaviour
    is slightly different)

    Returns a (new, offset, flags) tuple.
    """
    ##### Check if escaped
    should_replace = True
    match_string = old[match.start():match.end()]
    # Extract the token's label (e.g. function/variable name) via the
    # token's own name regex.
    pattern = re.compile(token.value.name_reg)
    label_match = pattern.search(match_string)
    token_label = match_string[label_match.start(): label_match.end()] if label_match is not None else ""
    before_match = old[:match.start()]
    # Count trailing backslashes immediately before the token.
    escapes = re.search("\\\\+$", before_match)
    flags = MapFlags()
    if escapes is not None:
        length = escapes.end() - escapes.start()
        # Escape backslashes (essentially half them)
        new = new[:escapes.start() + offset] + ("\\" * int(length / 2)) + new[escapes.end() + offset:]
        offset -= length - int(length / 2)
        if length % 2 != 0:  # odd
            should_replace = False  # Dont replace if ${{}} is escaped
    ### Replace
    if should_replace:
        if token == Token.FUNC:
            # Resolve which known function name the label ends with.
            for func in Function:
                if token_label.find(func.value) + len(func.value) == len(token_label):
                    token_label = func.value
                    break
            # NOTE(review): comma-splitting assumes no nested commas inside
            # parameters — confirm against get_param's behaviour.
            params = [token.value.get_param(match_string, i).strip() for i in range(match_string.count(',') + 1)]
            if token_label.rfind(Function.BASE64.value) == len(token_label):
                pass
            elif token_label == Function.APPEND.value:
                if len(params) != 1:
                    err(f"The {Function.APPEND.value} function should only have one argument. 
{len(params)} arguments found...")
                # // TODO store this information in Functions class, so this can be auto checked
                for label, var_value in variables.items():
                    if label == params[0]:
                        flags.append = True
                        new = var_value
                        break
        elif token == Token.VAR:
            for label, var_value in variables.items():
                var_name = old[match.start() + 3:match.end() - 2].strip()  # Remove ${{ and }} // TODO Remove values 3 and -2, and calculate the number of chars until var_name instead
                if var_name == label:
                    if is_key:
                        # Key of an attribute
                        if type(var_value) != str and not isinstance(var_value, dict):
                            err(f"Cannot substitute the '{label}' in a key! Only strings or maps can be substituted into a key (maps will overwrite the value associated with the key)")
                        if type(var_value) == str:
                            # Splice the string value into "new" at the match site.
                            new = new[:match.start() + offset] + var_value + new[match.end() + offset:]
                            offset += len(var_value) - (match.end() - match.start())
                        elif isinstance(var_value, dict):
                            new = var_value
                            warn(f"The variable {label} is a map, and replaces a key. This appends the variable (map) to the parent map.\nA preferred syntax that leads to the same behaviour is to use '~append$(( {label} ))' as the value and any placeholder as the key (the key will be overwritten so it does not matter)")
                    else:
                        # The value of an attribute
                        if type(var_value) in [str, int, float, bool]:
                            new = new[:match.start() + offset] + str(var_value) + new[match.end() + offset:]
                            offset += len(str(var_value)) - (match.end() - match.start())
                        elif isinstance(var_value, dict) or type(var_value) == list:
                            # Container values replace "new" wholesale.
                            new = var_value
    return new, offset, flags
def __match_regex_in_prefix(self, source: str, match: Match) -> bool: return match and source[match.end()]
def _parse_optional_entities(match: Match, domain: Domain) -> List[Dict[Text, Any]]:
    """Extracts the optional entity information from the given pattern match.

    If no entities are specified or if the extraction fails, then an empty
    list is returned.

    Args:
        match: a match produced by `self.pattern`
        domain: the domain

    Returns:
        some list of entities
    """
    entities_str = match.group(ENTITIES)
    if entities_str is None:
        # No entity annotation present on this line.
        return []
    try:
        parsed_entities = json.loads(entities_str)
        if not isinstance(parsed_entities, dict):
            raise ValueError(
                f"Parsed value isn't a json object "
                f"(instead parser found '{type(parsed_entities)}')")
    except (JSONDecodeError, ValueError) as e:
        # Malformed annotation: warn and continue with no entities.
        rasa.shared.utils.io.raise_warning(
            f"Failed to parse arguments in line '{match.string}'. "
            f"Failed to decode parameters as a json object (dict). "
            f"Make sure the intent is followed by a proper json object (dict). "
            f"Continuing without entities. "
            f"Error: {e}",
            docs=DOCS_URL_STORIES,
        )
        parsed_entities = dict()
    # validate the given entity types
    entity_types = set(parsed_entities.keys())
    unknown_entity_types = entity_types.difference(domain.entities)
    if unknown_entity_types:
        # Drop entity types the domain does not know about.
        rasa.shared.utils.io.raise_warning(
            f"Failed to parse arguments in line '{match.string}'. "
            f"Expected entities from {domain.entities} "
            f"but found {unknown_entity_types}. "
            f"Continuing without unknown entity types. ",
            docs=DOCS_URL_STORIES,
        )
        parsed_entities = {
            key: value for key, value in parsed_entities.items()
            if key not in unknown_entity_types
        }
    # convert them into the list of dictionaries that we expect
    entities: List[Dict[Text, Any]] = []
    for entity_type, entity_values in parsed_entities.items():
        if not isinstance(entity_values, list):
            # A scalar is shorthand for a one-element list of values.
            entity_values = [entity_values]
        for entity_value in entity_values:
            entities.append({
                ENTITY_ATTRIBUTE_TYPE: entity_type,
                ENTITY_ATTRIBUTE_VALUE: entity_value,
                # Span covers the whole pattern match, not the entity text.
                ENTITY_ATTRIBUTE_START: match.start(),
                ENTITY_ATTRIBUTE_END: match.end(),
            })
    return entities
def del_match(m: Match):
    """Return the subject string with the matched span removed."""
    subject = m.string
    return subject[:m.start()] + subject[m.end():]
def handleMatch(self, match: Match, data: str):
    """Markdown inline-processor hook: wrap matched inline math in a
    <code> element whose class marks it for later math-escape handling.

    Returns the element plus the start/end span it replaces.
    """
    elm = etree.Element("code")
    elm.set("class", "--markdown-math-escape")
    # Re-wrap the captured math body in \( ... \) delimiters before encoding.
    elm.text = _encode(r"\(" + match.group(1) + r"\)")
    return elm, match.start(0), match.end(0)
def _dimension_inside_time(dimension: Match, time: Match) -> bool: is_sub_match = False if dimension.start() >= time.start() and dimension.end() <= time.end(): is_sub_match = True return is_sub_match
def scan_for_word(i: int, vowel_match: Match, vowel_occurences: Sequence[Match],
                  word_positions: List[WordPosition],
                  original_word_sequence: str) -> WordPosition:
    '''Scan for initial and final consonant, determine position of
    components in original word sequences.

    Appends the resulting WordPosition to word_positions (side effect) and
    returns it.
    '''
    vowel = vowel_match.group(0)
    pos_start_vowel = vowel_match.start(0)
    pos_end_vowel = vowel_match.end(0)
    word_pos = WordPosition(start=pos_start_vowel, end=pos_end_vowel,
                            start_vowel=pos_start_vowel, end_vowel=pos_end_vowel)
    final_consonant = None
    # Candidate initial consonants depend on the vowel (may be none).
    try:
        test_initial_consonants = sort_longer(
            POSSIBLE_INITIAL_CONSONANTS[vowel])
    except KeyError:
        test_initial_consonants = ()
    # The pool of text available for the initial consonant starts after the
    # previous word (or at the beginning of the sequence).
    try:
        prev_word = word_positions[i - 1]
        start_scan = prev_word.end
    except IndexError:
        start_scan = 0
    leading_source = original_word_sequence[start_scan:pos_start_vowel]
    # We determine final consonant first, because some initial consonants are valid or invalid subject to final one.
    # Look for final consonant
    try:
        test_final_consonants = sort_longer(POSSIBLE_FINAL_CONSONANTS[vowel])
    except KeyError:
        # This vowel doesn't need final consonant
        test_final_consonants = ()
    rest_seq = original_word_sequence[pos_end_vowel:]
    final_consonant = None
    # If rest_seq is empty, no need to scan for final consonant
    if rest_seq:
        for con in test_final_consonants:
            if con is None:
                logger.debug('This vowel "%s" can go without final consonant', vowel)
                word_pos.end = word_pos.end_vowel
                break
            if rest_seq.lower().startswith(con.lower()):
                # Determined final consonant of this word
                final_consonant = con
                word_pos.end = pos_end_vowel + len(final_consonant)
                if not leading_source:
                    logger.debug("No pool to find initial consonant")
                    break
                if test_initial_consonants:
                    try:
                        initial_consonant = find_initial_consonant(
                            vowel, final_consonant, leading_source,
                            test_initial_consonants)
                    except ConfusingState:
                        # Could not decide: tentatively take the whole pool
                        # and negotiate boundaries with the previous word.
                        word_pos.start = word_pos.start_vowel - len(
                            leading_source)
                        success = negotiate_expand_consonant(
                            word_pos, word_positions, original_word_sequence)
                        if not success:
                            continue
                        else:
                            break
                    except IllegalCombination:
                        logger.debug(
                            "Illegal combination. Test next possible final consonant."
                        )
                        continue
                    word_pos.start = pos_start_vowel - len(initial_consonant)
                break
    # Not found final consonant
    if not final_consonant and test_final_consonants and None not in test_final_consonants:
        logger.error(
            'This vowel "%s" needs a final consonant, but could not found in "%s".',
            vowel, rest_seq)
    # Even when final consonant is not needed, still need to find initial
    elif leading_source:
        try:
            initial_consonant = find_initial_consonant(
                vowel, None, leading_source, test_initial_consonants)
            word_pos.start = pos_start_vowel - len(initial_consonant)
        except ConfusingState:
            word_pos.start = word_pos.start_vowel - len(leading_source)
            negotiate_expand_consonant(word_pos, word_positions,
                                       original_word_sequence)
    elif None not in test_initial_consonants:
        # This vowel needs initial consonant
        logger.debug(
            'Vowel "%s" needs an initial consonant. Negotiate with precedence word.',
            vowel)
        negotiate_expand_consonant(word_pos, word_positions,
                                   original_word_sequence)
    else:
        logger.debug("Skip finding initial consonant for %s.", vowel_match)
    # Save position of this word
    word_positions.append(word_pos)
    return word_pos
def interpreted_or_phrase_ref(self, match: Match, lineno: int) -> DispatchResult:
    """Handle :role:`interpreted`, `interpreted`:role: or `phrase ref`_

    If interpreted, evoke ``self.interpreted``, or if phrase ref, evoke
    ``self.phrase_ref``. Returns the inline-dispatch 4-tuple.
    """
    end_pattern = self.patterns.interpreted_or_phrase_ref
    string = match.string
    matchstart = match.start("backquote")
    matchend = match.end("backquote")
    rolestart = match.start("role")
    role = match.group("role")
    position = ""
    if role:
        # Strip the surrounding colons from the role name.
        role = role[1:-1]
        position = "prefix"
    elif self.quoted_start(match):
        # "Quoted" start-string: plain text, no node.
        return (string[:matchend], [], string[matchend:], [])
    endmatch = end_pattern.search(string[matchend:])
    if endmatch and endmatch.start(1):  # 1 or more chars
        textend = matchend + endmatch.end()
        if endmatch.group("role"):
            if role:
                # A prefix role AND a suffix role were both given.
                msg = self.reporter.warning(
                    "Multiple roles in interpreted text (both "
                    "prefix and suffix present; only one allowed).",
                    line=lineno,
                )
                text = unescape(string[rolestart:textend], True)
                prb = self.problematic(text, text, msg)
                return string[:rolestart], [prb], string[textend:], [msg]
            role = endmatch.group("suffix")[1:-1]
            position = "suffix"
        escaped = endmatch.string[: endmatch.start(1)]
        rawsource = unescape(string[matchstart:textend], True)
        if rawsource[-1:] == "_":
            # Trailing underscore marks a phrase reference, not a role.
            if role:
                msg = self.reporter.warning(
                    "Mismatch: both interpreted text role %s and "
                    "reference suffix."
                    % position,
                    line=lineno,
                )
                text = unescape(string[rolestart:textend], True)
                prb = self.problematic(text, text, msg)
                return string[:rolestart], [prb], string[textend:], [msg]
            return self.phrase_ref(
                string[:matchstart],
                string[textend:],
                rawsource,
                escaped,
                unescape(escaped),
            )
        else:
            rawsource = unescape(string[rolestart:textend], True)
            nodelist, messages = self.interpreted(rawsource, escaped, role, lineno)
            return (string[:rolestart], nodelist, string[textend:], messages)
    # No end-string found: warn and emit a "problematic" node.
    msg = self.reporter.warning(
        "Inline interpreted text or phrase reference start-string "
        "without end-string.",
        line=lineno,
    )
    text = unescape(string[matchstart:matchend], True)
    prb = self.problematic(text, text, msg)
    return string[:matchstart], [prb], string[matchend:], [msg]
def _consume_match(self, match: Match, group: Union[int, str] = 0) -> str: if not match: return EMPTY self.advance(match.end(group) - 1 - match.start(group)) return match.group(group)
def process_macro(self, m: Match) -> None:
    """Handle a macro expansion reference: look up the macro, scan any
    arguments (preprocessing each recursively), detect recursion, and emit
    the rendered macro text."""
    # Flush the raw text that precedes the macro reference.
    self.emit_segment(m.start())
    identifier = m.group(m.lastindex + 1)
    self._scan_idx = m.end()
    if not self._conditional.is_active:
        # Inside a disabled conditional region: skip the expansion entirely.
        return
    macro_start_idx = m.start()
    if self._src_ref_override:
        macro_src_ref = self._src_ref_override
    else:
        macro_src_ref = SegmentedSourceRef(self._src_seg_map, m.start(), m.end() - 1)
    # Check if macro identifier is not one of the reserved directives
    # Preprocessor can end up here if the user did not provide the expected
    # args to a directive. The parser will instead fall back to thinking it
    # is a macro expansion
    reserved_macro_names = ("include", "ifdef", "ifndef", "elsif",
                            "define", "undef", "line")
    if identifier in reserved_macro_names:
        self.env.msg.fatal(
            "Preprocessor directive '`%s' is incomplete" % identifier,
            self.get_err_src_ref(m.start(), m.end() - 1))
    # Lookup macro identifier
    if identifier not in self._macro_defs:
        self.env.msg.fatal(
            "Macro '`%s' has not been defined" % identifier,
            self.get_err_src_ref(m.start(m.lastindex + 1), m.end(m.lastindex + 1) - 1))
    macro = self._macro_defs[identifier]
    # Scan for macro args if necessary
    if macro.args:
        # scan for args
        argv = self.macro_arg_scanner()
        # run each argv through the main scanner
        if self._conditional.is_active:
            for i, (arg_text, arg_src_ref) in enumerate(argv):
                # Nested preprocessor shares macro definitions and the
                # recursion-detection stack with this one.
                vpp = VerilogPreprocessor(self.env, arg_text, src_ref_override=arg_src_ref)
                vpp._macro_defs = self._macro_defs
                vpp._active_macro_stack = self._active_macro_stack
                argv[i], _ = vpp.preprocess()
    else:
        argv = []
    macro_end_idx = self._scan_idx - 1
    # Push current macro into active stack
    if identifier in self._active_macro_stack:
        self.env.msg.fatal(
            "Found recursive macro expansion when processing '`%s'" % identifier,
            self.get_err_src_ref(m.start(m.lastindex + 1), m.end(m.lastindex + 1) - 1))
    self._active_macro_stack.append(identifier)
    # Emit macro text
    text = macro.render_macro(self, argv, macro_src_ref)
    self._output_text_segments.append(text)
    # If this is the top-level preprocessing pass (and therefore it
    # has a src seg map), also create a source tracking segment.
    if self._src_seg_map:
        text_len = len(text)
        segment = segment_map.MacroSegment(
            self._current_output_idx, self._current_output_idx + text_len - 1,
            macro_start_idx, macro_end_idx, self._src_seg_map)
        self._output_seg_map.segments.append(segment)
        self._current_output_idx += text_len
    # Done processing this macro
    self._active_macro_stack.pop()
    self._seg_start_idx = self._scan_idx