def _end_event(event_type: Type[ScyllaDatabaseContinuousEvent], match: Match):
    """Close the most recent begun continuous event of *event_type* on *node*.

    Looks up BEGIN-period events in the registry (optionally narrowed by the
    ``shard`` regex group) and ends the most recent one.

    Raises:
        ContinuousEventRegistryException: if no matching begun event exists.
    """
    groups = match.groupdict()
    # An optional named group is present in groupdict() with value None when it
    # did not participate in the match; int(None) would raise TypeError.
    shard = int(groups["shard"]) if groups.get("shard") is not None else None
    event_filter = event_registry.get_registry_filter()
    event_filter \
        .filter_by_node(node=node) \
        .filter_by_type(event_type=event_type) \
        .filter_by_period(period_type=EventPeriod.BEGIN.value)
    if shard is not None:
        event_filter.filter_by_shard(shard)
    begun_events = event_filter.get_filtered()
    if not begun_events:
        # NOTE: original message concatenated to "...{event_type}with period";
        # a separating space was missing between the two literals.
        raise ContinuousEventRegistryException(
            "Did not find any events of type {event_type} "
            "with period type {period_type}.".format(
                event_type=event_type, period_type=EventPeriod.BEGIN.value))
    if len(begun_events) > 1:
        LOGGER.warning(
            "Found {event_count} events of type {event_type} with period {event_period}. "
            "Will apply the function to most recent event by default.".format(
                event_count=len(begun_events), event_type=event_type,
                event_period=EventPeriod.BEGIN.value))
    begun_events[-1].end_event()
def convert_entity(m: Match) -> str:
    """Convert one HTML entity match to its unicode character.

    Decimal/hex references and named entities are resolved to a codepoint;
    entities listed in ``keep`` are passed through untouched.  Unresolvable
    references are dropped or echoed back depending on ``remove_illegal``
    and whether the entity was properly terminated by a semicolon.
    """
    groups = m.groupdict()
    codepoint = None
    if groups.get("dec"):
        codepoint = int(groups["dec"], 10)
    elif groups.get("hex"):
        codepoint = int(groups["hex"], 16)
    elif groups.get("named"):
        name = groups["named"]
        if name.lower() in keep:
            return m.group(0)
        codepoint = name2codepoint.get(name) or name2codepoint.get(name.lower())
    if codepoint is not None:
        try:
            # Numeric character references in the 80-9F range are typically
            # interpreted by browsers as representing the characters mapped
            # to bytes 80-9F in the Windows-1252 encoding. For more info
            # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML
            if 0x80 <= codepoint <= 0x9F:
                return bytes((codepoint, )).decode("cp1252")
            return chr(codepoint)
        except ValueError:
            # covers UnicodeDecodeError from cp1252 and out-of-range chr()
            pass
    return "" if remove_illegal and groups.get("semicolon") else m.group(0)
def handle_basic(self, source: str, match: Match) -> Generator[List[Content], None, None]:
    """Handles ``basic`` links to imgur media.

    Args:
        source (str): The source url
        match (Match): The source match regex

    Yields:
        list[Content]: A list of various levels of quality content for \
            the same source url
    """
    data = self._get_data(match.groupdict()["id"])

    def build_content(url_type: str) -> Content:
        # One Content entry per available quality variant in the API payload.
        return Content(
            uid=f'{self.name}-{data["id"]}-{url_type}',
            source=source,
            fragments=[data[url_type]],
            extractor=self,
            extension=data[url_type].split(".")[-1],
            title=data.get("title"),
            description=data.get("description"),
            quality=self._quality_map.get(url_type, 0.0),
            uploaded_by=data.get("account_id"),
            uploaded_date=datetime.datetime.fromtimestamp(
                int(data.get("datetime"))),
            metadata=data,
        )

    yield [
        build_content(url_type)
        for url_type in self._content_urls
        if url_type in data
    ]
def inner(match: Match):
    """Resolve one cross-reference match to an HTML link, or echo it back."""
    groups = match.groupdict()
    identifier, title = groups["identifier"], groups["title"]
    # Only a title was captured: treat it as the identifier.
    if not identifier and title:
        identifier, title = title, identifier
    try:
        url = relative_url(from_url, url_map[identifier])
    except KeyError:
        # Unresolvable reference: record it for warning purposes, then
        # reproduce the original markdown unchanged.
        if " " in identifier or "/" in identifier:
            # invalid identifier, must not be a intended reference
            unintended.append(identifier)
        else:
            unmapped.append(identifier)
        return f"[{identifier}][]" if not title else f"[{title}][{identifier}]"
    # TODO: we could also use a config option to ignore some identifiers
    # and to map others to URLs, something like:
    # references:
    #   ignore:
    #   - "USERNAME:PASSWORD@"
    #   map:
    #     some-id: https://example.com
    return f'<a href="{url}">{title or identifier}</a>'
def handle_raw(self, source: str, match: Match) -> Generator[List[Content], None, None]:
    """Handles ``raw`` links to 4chan media.

    Args:
        source (str): The source url
        match (Match): The source match regex

    Yields:
        list[Content]: A list of various levels of quality content for \
            the same source url
    """
    matchdict = match.groupdict()
    # One Content per configured (post_type, url_path, quality) variant;
    # the raw url itself is the only fragment.
    yield [
        Content(
            uid=(f'{self.name}-{matchdict["board"]}-raw-{matchdict["id"]}'
                 f"{post_type}"),
            source=source,
            fragments=[source],
            extractor=self,
            title=None,
            description=None,
            quality=quality,
            uploaded_by=None,
            uploaded_date=None,
            metadata=None,
        )
        for (post_type, url_path, quality) in self._content_configs
    ]
def _add_event(event_type: Type[ScyllaDatabaseContinuousEvent], match: Match):
    """Create and begin a continuous event of *event_type* from a log match."""
    kwargs = match.groupdict()
    # An optional named group always appears in groupdict() (value None when it
    # did not participate in the match), so `"shard" in kwargs` is true even
    # without a shard value and int(None) would raise TypeError.
    if kwargs.get("shard") is not None:
        kwargs["shard"] = int(kwargs["shard"])
    new_event = event_type(node=node, **kwargs)
    new_event.begin_event()
def _extract_entity_attributes(self, match: Match) -> EntityAttributes:
    """Extract the entity attributes, i.e. type, value, etc., from the regex match."""
    # groupdict() builds a fresh dict on every call; fetch it once.
    groups = match.groupdict()
    entity_text = groups[GROUP_ENTITY_TEXT]
    if groups[GROUP_ENTITY_DICT]:
        return self._extract_entity_attributes_from_dict(entity_text, match)
    entity_type = groups[GROUP_ENTITY_TYPE]
    if groups[GROUP_ENTITY_VALUE]:
        entity_value = groups[GROUP_ENTITY_VALUE]
        # Explicit value alongside type/text marks the deprecated synonym syntax.
        self._deprecated_synonym_format_was_used = True
    else:
        entity_value = entity_text
    return EntityAttributes(entity_type, entity_value, entity_text, None, None)
def extract(self, match: Match) -> Link:
    """Build a Link from the markdown-style named groups of *match*.

    Missing groups collapse to empty strings.
    """
    groups = match.groupdict()

    def grab(key: str) -> str:
        # Normalize absent / non-participating groups to ''.
        return groups.get(key) or ''

    return Link(
        image=grab('md_is_image') or grab('md_alt_is_image'),
        text=grab('md_text'),
        target=grab('md_filename'),
        title=grab('md_title'),
        anchor=grab('md_anchor'),
    )
def _end_event(event_type: Type[ScyllaDatabaseContinuousEvent], match: Match):
    """End the most recent begun continuous event matching this node + groups."""
    details = match.groupdict()
    continuous_hash = event_type.get_continuous_hash_from_dict({
        'node': node,
        **details,
    })
    matching = event_registry.find_continuous_events_by_hash(continuous_hash)
    if matching:
        # Multiple begun events may share the hash; end the latest one.
        matching[-1].end_event()
def _getMatchingFirstHists(self, match: Match) -> List[Tuple[str]]: firstmatches = [] for tup in self.hits[0]: # does the tuple match in all spots where the pattern name matches, and # where the variable is significant (we would output a different plot)? # make sure we handle the 1-indexing of regex matches if all(tup[self.regexgroups[0][_[0]] - 1] == _[1] for _ in match.groupdict().items() if _[0] in self.outputnames): firstmatches.append(tup) return firstmatches
def transformation(match: Match, s=spec.decode(self.codec)):
    """Apply the meta-format spec to one regex match and label the result.

    Note: *s* is bound at definition time to the decoded spec string so the
    closure does not re-decode it per match.
    """
    # Named groups become meta-variables; positional groups become args.
    symb: dict = match.groupdict()
    args: list = [match.group(0), *match.groups()]
    used = set()
    # meta.format records into `used` which variables it consumed.
    item = meta.format(s, self.codec, args, symb, True, True, used)
    # Consumed variables must not leak into the output label set.
    for variable in used:
        symb.pop(variable, None)
    symb.update(offset=match.start())
    # Fill in global meta defaults without overriding match-derived values.
    for name, value in meta.items():
        symb.setdefault(name, value)
    return self.labelled(item, **symb)
def callback(match: Match) -> str:
    """Replace a user mention with ``prefix`` plus the display name.

    Falls back to the raw userid when the user is unknown; assumes the regex
    matched either the ``userid`` or the ``alias`` group.
    """
    fields = match.groupdict()
    if fields["userid"]:
        user = self.users.get(fields["userid"], None)
        username = fields["userid"] if user is None else user.name
    elif fields["alias"]:
        username = fields["alias"]
    return f"{prefix}{username}"
async def execute(self, evt: MessageEvent, match: Match) -> None:
    """Render the reply template for *match* and send it to the event's room.

    Positional capture groups are exposed to the template as "0", "1", ...
    and named groups under their own names (named groups win on collision).
    """
    extra_vars = {str(index): value for index, value in enumerate(match.groups())}
    extra_vars.update(match.groupdict())
    content = self.template.execute(evt=evt, rule_vars=self.variables,
                                    extra_vars=extra_vars)
    event_type = self.type or self.template.type
    await evt.client.send_message_event(evt.room_id, event_type, content)
def _convert_match(self, match: Match) -> 'RelativeDeltaParams': groups = match.groupdict() try: meridiem = groups.pop("meridiem").lower() except (KeyError, AttributeError): meridiem = None params = self._convert_groups(groups) if meridiem == "pm": params["hour"] += 12 elif meridiem == "am" and params["hour"] == 12: params["hour"] = 0 return params
def _extract_entity_attributes_from_dict(self, entity_text: Text,
                                         match: Match) -> EntityAttributes:
    """Extract the entity attributes from the dict format."""
    attributes = self._get_validated_dict(match.groupdict()[GROUP_ENTITY_DICT])
    # Value defaults to the raw entity text when the dict omits it.
    return EntityAttributes(
        attributes.get(ENTITY_ATTRIBUTE_TYPE),
        attributes.get(ENTITY_ATTRIBUTE_VALUE, entity_text),
        entity_text,
        attributes.get(ENTITY_ATTRIBUTE_GROUP),
        attributes.get(ENTITY_ATTRIBUTE_ROLE),
    )
def handle_thread(self, source: str, match: Match) -> Generator[List[Content], None, None]:
    """Handles ``thread`` links to 4chan media.

    Args:
        source (str): The source url
        match (Match): The source match regex

    Yields:
        list[Content]: A list of various levels of quality content for \
            the same source url
    """
    matchdict = match.groupdict()
    data = self._get_data(matchdict["board"], matchdict["id"])
    for post in data.get("posts", []):
        # Only posts with an md5 carry an attachment worth extracting.
        if "md5" in post:
            content_list = []
            for (post_type, url_path, quality,
                 extension_type) in self._content_configs:
                # build post_type depending on existing post_type
                post_type = (f"-{post_type}" if post_type else "")
                content_uid = (
                    f'{self.name}-{matchdict["board"]}-'
                    f'{matchdict["id"]}-{post["tim"]}{post_type}')
                # url_path is a format template resolved against the board
                # name and the full post dict.
                content_fragments = [
                    furl(self._img_base).add(path=url_path.format(
                        board=matchdict["board"], post=post)).url
                ]
                # Fall back to the post's own extension (".ext" -> "ext")
                # when the config does not pin one.
                content_extension = (extension_type if extension_type else
                                     post["ext"].split(".")[-1])
                content_description = None
                if "com" in post and len(post["com"]) > 0:
                    # Post comments are HTML; strip the markup for display.
                    content_description = HTML(html=post.get("com")).text
                content_list.append(
                    Content(
                        uid=content_uid,
                        source=source,
                        fragments=content_fragments,
                        extractor=self,
                        extension=content_extension,
                        title=post.get("filename"),
                        description=content_description,
                        quality=quality,
                        uploaded_by=post.get("name"),
                        uploaded_date=datetime.datetime.fromtimestamp(
                            int(post.get("time"))),
                        metadata=post,
                    ))
            yield content_list
def extract_entity_attributes(match: Match) -> EntityAttributes:
    """Extract the entity attributes, i.e. type, value, etc., from the regex match.

    Args:
        match: Regex match to extract the entity attributes from.

    Returns:
        EntityAttributes object.
    """
    # groupdict() builds a fresh dict on every call; fetch it once.
    groups = match.groupdict()
    entity_text = groups[GROUP_ENTITY_TEXT]
    if groups[GROUP_ENTITY_DICT]:
        return extract_entity_attributes_from_dict(entity_text, match)
    entity_type = groups[GROUP_ENTITY_TYPE]
    if groups[GROUP_ENTITY_VALUE]:
        entity_value = groups[GROUP_ENTITY_VALUE]
    else:
        entity_value = entity_text
    return EntityAttributes(entity_type, entity_value, entity_text, None, None)
def _get_numbered_group(match: typing.Match, name: str, start: int = 0) -> typing.Optional[str]: groups = match.groupdict() idx = start while True: key = f'{name}{idx}' if key not in groups: return None if groups[key] is not None: return groups[key] idx += 1 return None
def from_match(cls, match: Match, mail_dtime: datetime,
               ttype: 'TransType') -> 'Transaction':
    """Build a Transaction from a parsed notification mail.

    Args:
        match: Regex match whose optional groups 'nat', 'int' and 'rate'
            carry European-formatted amounts.
        mail_dtime: Timestamp of the source mail.
        ttype: The transaction type.

    Returns:
        A fully initialised Transaction.
    """
    groups_dict = match.groupdict()

    def to_float(raw: str) -> float:
        # European formatting: '.' thousands separator, ',' decimal point.
        return float(raw.replace('.', '').replace(',', '.'))

    kwargs = {
        'ttype': ttype,
        'mail_dtime': mail_dtime,
    }
    # An optional named group appears in groupdict() with value None when it
    # did not participate in the match; `'nat' in groups_dict` alone would
    # then crash on None.replace(...).
    if groups_dict.get('nat') is not None:
        kwargs['amount'] = to_float(groups_dict['nat'])
    if groups_dict.get('int') is not None:
        kwargs['int_amount'] = to_float(groups_dict['int'])
    if groups_dict.get('rate') is not None:
        kwargs['rate'] = to_float(groups_dict['rate'])
    # Preserve the original __new__/__init__ split (tolerates classes whose
    # __new__ takes no keyword arguments).
    transaction = cls.__new__(cls)
    transaction.__init__(**kwargs)
    return transaction
def format_struct(match: Match) -> str:
    """Expand one struct placeholder into its database projection string.

    Raises:
        InvalidStructureParameter: when the matched name is not a known
            column or the template references a missing group.
    """
    db_name = match.group('db_name')
    try:
        # datetime columns (other than the plain 'date' column) are stored as
        # ISO strings; truncate them to the date portion.
        is_truncated_datetime = (DATA_TYPES[db_name] is datetime
                                 and db_name != 'date')
        column = (f"SUBSTRING(c.{db_name}, 0, 10)"
                  if is_truncated_datetime else f"c.{db_name}")
        params = dict(match.groupdict(), db_name=f"({column} ?? null)")
        return template.substitute(**params)
    except KeyError:
        raise InvalidStructureParameter(name=match.group('db_name'),
                                        structure_format=template_format)
def get_named_group_index(match: typing.Match, name: str) -> typing.Optional[int]:
    """Get the index of the named group

    Args:
        match (Match): The regex match
        name (str): The group name

    Returns:
        int: The index of the group, or None if no such named group exists
    """
    # Pattern.groupindex maps group names directly to their 1-based indices.
    # The previous span-comparison heuristic returned the wrong index when
    # another group shared the same span (e.g. r"((?P<a>x))"), and misfired
    # on non-participating groups whose spans are all (-1, -1).
    return match.re.groupindex.get(name)
def get_named_group_at_index(match: typing.Match, idx: int) -> typing.Optional[str]:
    """Get the name of the group

    Args:
        match (Match): The regex match
        idx (int): The group index

    Returns:
        str: The group name, or None if the group at *idx* is unnamed
    """
    # Invert Pattern.groupindex (name -> 1-based index) instead of comparing
    # spans: the previous heuristic could return a named group's name for an
    # UNNAMED group that happened to cover the same span.
    for group_name, group_idx in match.re.groupindex.items():
        if group_idx == idx:
            return group_name
    return None
def _content(self, content: Content, match: Match) -> Tuple[Union[str, None], str, Any]: filename = None line_number = "0" obj = None self._logger.debug("content is showing") something = match.groupdict()["something"] if something: self._logger.debug("asked to open something") if something.startswith("{{"): self._logger.debug("something appears to be a template: %s", something) templated = templar(something, content.showing) if isinstance(templated, str): parts = templated.rsplit(":", 1) if os.path.isfile(parts[0]): filename = parts[0] line_number = parts[1:][0] if parts[1:] else line_number self._logger.debug( "template interaction in valid filename %s:%s", filename, line_number) else: self._logger.debug( "template not a valid filename, open showing") obj = templated else: self._logger.debug("template not a string, open showing") obj = templated else: parts = something.rsplit(":", 1) if os.path.isfile(parts[0]): filename = parts[0] line_number = parts[1:][0] if parts[1:] else line_number self._logger.debug( "something not a template, but is a valid filename %s:%s", filename, line_number, ) else: self._logger.debug("something just a plain string") obj = something else: self._logger.debug("something not provided") obj = content.showing return filename, line_number, obj
def get_named_group_index_dict(match: typing.Match) -> typing.Dict[int, str]:
    """Get the name/index map of the groups

    Args:
        match (Match): The regex match

    Returns:
        dict: A mapping of indices to names
    """
    # Pattern.groupindex already maps name -> 1-based index; invert it.
    # The previous span-comparison scan could assign a name to the wrong
    # index when two groups shared a span (e.g. nested groups).
    return {
        group_idx: group_name
        for group_name, group_idx in match.re.groupindex.items()
    }
def _end_event(event_type: Type[ScyllaDatabaseContinuousEvent], match: Match):
    """End the most recent begun continuous event matching this log line.

    Narrows the registry's BEGIN-period events by node, type and any of the
    optional regex groups (shard / table / compaction_process_id); publishes
    an ERROR framework event instead of raising when nothing matches.
    """
    kwargs = match.groupdict()
    event_filter = event_registry.get_registry_filter()
    # The filter mutates in place; the chain below narrows it step by step.
    event_filter \
        .filter_by_node(node=node) \
        .filter_by_type(event_type=event_type) \
        .filter_by_period(period_type=EventPeriod.BEGIN.value)
    # Optional groups are None when absent; truthiness check covers both
    # missing and empty captures.
    if kwargs.get("shard"):
        event_filter.filter_by_shard(int(kwargs["shard"]))
    if kwargs.get("table"):
        event_filter.filter_by_attr(base="CompactionEvent",
                                    table=kwargs["table"])
    if kwargs.get("compaction_process_id"):
        event_filter.filter_by_attr(
            base="CompactionEvent",
            compaction_process_id=kwargs["compaction_process_id"])
    begun_events = event_filter.get_filtered()
    if not begun_events:
        # Best-effort: report and continue rather than abort the log watcher.
        TestFrameworkEvent(
            source=event_type.__name__,
            message="Did not find any events of type {event_type}"
                    " with period type {period_type}, event data: {event_data}".
                    format(
                        event_type=event_type,
                        period_type=EventPeriod.BEGIN.value,
                        event_data=kwargs,
                    ),
            severity=Severity.ERROR).publish_or_dump()
        return
    if len(begun_events) > 1:
        LOGGER.debug(
            "Found %s events of type %s with period %s. "
            "Will apply the function to most recent event by default.",
            len(begun_events), event_type, EventPeriod.BEGIN.value)
    event = begun_events[-1]
    event.end_event()
def inner(match: Match):  # noqa: WPS430 (nested function, no other way than side-effecting the warnings)
    """Resolve one cross-reference match to an HTML link, or echo it back."""
    groups = match.groupdict()
    identifier, title = groups["identifier"], groups["title"]
    # Only a title was captured: treat it as the identifier.
    if not identifier and title:
        identifier, title = title, identifier
    try:
        url = relative_url(from_url, url_map[identifier])
    except KeyError:
        # Plausible-looking identifiers (no spaces or slashes) are recorded
        # as unmapped; the original markdown is reproduced either way.
        if " " not in identifier and "/" not in identifier:
            unmapped.append(identifier)
        return f"[{identifier}][]" if not title else f"[{title}][{identifier}]"
    return f'<a href="{url}">{title or identifier}</a>'
def get_named_group_index_list(match: typing.Match) -> typing.List[typing.Optional[str]]:
    """Get the names of the groups

    Args:
        match (Match): The regex match

    Returns:
        list: The names of the groups by index (index 0, the whole match,
            is always None)
    """
    group_names: typing.List[typing.Optional[str]] = [None] * (len(match.groups()) + 1)
    # Pattern.groupindex maps each name to its 1-based index — no span
    # comparison needed. The previous heuristic mislabelled unnamed groups
    # whose spans coincided with a named group's span.
    for group_name, group_idx in match.re.groupindex.items():
        group_names[group_idx] = group_name
    return group_names
def match_to_datetime(match: Match) -> datetime:
    """Convert a regex match of date/time groups to an aware UTC datetime.

    Reads the groups named in DATETIME_MATCH_GROUPS (dropping absent/empty
    ones), converts them to ints, and applies an optional ``offset_hours``
    correction.

    Returns:
        A timezone-aware datetime in UTC.
    """
    groups = match.groupdict()
    dt_kwargs = {
        name: groups.get(name)
        for name in DATETIME_MATCH_GROUPS
        if groups.get(name)
    }
    if "year" in dt_kwargs:
        # Two-digit years go through %y (e.g. "99" -> 1999, "22" -> 2022).
        # BUGFIX: the original compared the year *string* to the int 2
        # (`dt_kwargs["year"] == 2`), which was never true.
        if len(dt_kwargs["year"]) == 2:
            dt_kwargs["year"] = datetime.strptime(dt_kwargs["year"], "%y").year
        else:
            dt_kwargs["year"] = int(dt_kwargs["year"], base=10)
    for key, value in dt_kwargs.items():
        if isinstance(value, str):
            dt_kwargs[key] = int(value, base=10)
    dt = datetime(**dt_kwargs, tzinfo=UTC)
    offset = groups.get("offset_hours")
    if offset:
        # offset_hours is captured as a string; timedelta needs a number.
        dt -= timedelta(hours=int(offset))
    return dt
def extract_entity_attributes_from_dict(entity_text: Text,
                                        match: Match) -> EntityAttributes:
    """Extract entity attributes from dict format.

    Args:
        entity_text: Original entity text.
        match: Regex match.

    Returns:
        Extracted entity attributes.
    """
    parsed = get_validated_dict(match.groupdict()[GROUP_ENTITY_DICT])
    # Value defaults to the raw entity text when the dict omits it.
    return EntityAttributes(
        parsed.get(ENTITY_ATTRIBUTE_TYPE),
        parsed.get(ENTITY_ATTRIBUTE_VALUE, entity_text),
        entity_text,
        parsed.get(ENTITY_ATTRIBUTE_GROUP),
        parsed.get(ENTITY_ATTRIBUTE_ROLE),
    )
def extract(self, match: Match) -> Link:
    """Build a Link from wiki-style named groups, slugifying link and anchor.

    Raises:
        BrokenLink: when neither a link, text nor anchor was captured.
    """
    groups = match.groupdict()
    image = groups.get('wiki_is_image') or ''
    link = groups.get('wiki_link') or ''
    anchor = groups.get('wiki_anchor') or ''
    # Visible text falls back to the link target, then to the anchor.
    text = groups.get('wiki_text') or link or anchor
    if not (link or text or anchor):
        raise BrokenLink(
            f"Could not extract required field `wiki_link` from {match.group(0)}"
        )
    link = self._slugify(link)
    anchor = self._slugify(anchor) if anchor else anchor
    return Link(image=image, text=text, target=link, title=text, anchor=anchor)