def _finditer_with_line_numbers(
    pattern: re.Pattern, string: str
) -> ty.Iterator[ty.Tuple[re.Match, int]]:
    """A version of 're.finditer' that returns '(match, line_number)' pairs."""
    matches = list(pattern.finditer(string))
    if not matches:
        return

    end = matches[-1].start()
    # -1 so a failed 'rfind' maps to the first line.
    newline_table = {-1: 0}
    for i, m in enumerate(re.finditer(r"\n", string), 1):
        # Don't look for newlines past our last match.
        offset = m.start()
        if offset > end:
            break
        newline_table[offset] = i

    # Failing to find the newline is OK, -1 maps to 0.
    for m in matches:
        newline_offset = string.rfind("\n", 0, m.start())
        # +1 since line numbers don't start at 0.
        line_number = newline_table[newline_offset] + 1
        yield m, line_number
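# A minimal usage sketch for _finditer_with_line_numbers (hypothetical demo; it
# assumes the `import re` / `import typing as ty` used by the function above).
def _demo_finditer_with_line_numbers() -> None:
    text = "a cat\nno match here\nanother cat"
    for match, line_no in _finditer_with_line_numbers(re.compile(r"cat"), text):
        print(line_no, match.group(0))  # prints "1 cat" then "3 cat"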
def match_regex(pattern: re.Pattern, directory: bytes) -> None:
    for file in os.listdir(directory):
        filename = os.fsencode(file)
        if filename.endswith(b".txt"):
            # Join with the directory so files outside the cwd are found.
            with open(os.path.join(directory, filename), 'r') as f:
                contents = f.read()
                matches = pattern.finditer(contents)
                for match in matches:
                    print(match.group(0))
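# Hypothetical call sketch for match_regex: scan every *.txt file in an assumed
# "notes" directory and print each occurrence of "TODO".
def _demo_match_regex() -> None:
    match_regex(re.compile(r"TODO"), os.fsencode("notes"))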
def _highlightSubpattern(self, text: str, start: int, end: int, offset: int,
                         subpattern: re.Pattern, subrole: str) -> None:
    for match in subpattern.finditer(text, start, end):
        substart, subend = match.span(subrole)
        if substart >= 0 and subend > substart:
            clampedStart = max(0, substart - offset)
            clampedEnd = max(0, subend - offset)
            textFormat = STYLES[subrole][self.theme.value]
            self.setFormat(clampedStart, clampedEnd - clampedStart, textFormat)
def swallow(text: str, pattern_matcher: re.Pattern) -> str:
    """
    Utility function internal to this module

    :param text: text to clean
    :param pattern_matcher: pattern to match
    :return: the text without the matched pattern; spaces are not substituted
    """
    idx_to_omit = []
    for item in pattern_matcher.finditer(text):
        idx_to_omit.insert(0, item.span())
    # Remove spans from last to first so earlier indices stay valid.
    for start, end in idx_to_omit:
        text = text[:start] + text[end:]
    return text.strip()
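# A minimal usage sketch for swallow: strip "[draft]" markers from a string.
# The spaces around removed spans are kept, as the docstring notes.
def _demo_swallow() -> None:
    cleaned = swallow("title [draft] body [draft]", re.compile(r"\[draft\]"))
    print(cleaned)  # "title  body"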
def get_kwic_for_word(
    *,
    data: Path,
    rule: str,
    term: str,
    regex: re.Pattern,
    window_size: int,
    size_limit: int,
):
    texts = text_file_generator(data, rule)
    print(f'Reading data for {term}')
    rows = []
    for file, year, text in texts:
        if len(rows) >= size_limit:
            break
        matches = regex.finditer(text)
        for m in matches:
            start, end = m.span()
            # Widen the span by the context window, clamped to the text bounds.
            start = max(0, start - window_size)
            end = min(len(text), end + window_size)
            context = text[start:end].replace('\n', ' ')
            row = {
                'file': file.stem,
                'year': year,
                'keyword': term,
                'context': context,
            }
            rows.append(row)
    if not rows:
        return pd.DataFrame()
    return (pd.DataFrame.from_records(rows)
            .sort_values('year')
            .head(size_limit)
            .reset_index())
def _linkify_tokens(tokens: list[dict], filter_regex: re.Pattern,
                    linkify_function: Callable) -> list[dict]:
    """Check tokens for text that matches a regex and linkify it.

    The `filter_regex` argument should be a compiled pattern that will be applied
    to the text in all of the supplied tokens. If any matches are found, they will
    each be used to call `linkify_function`, which will validate the match and
    convert it back into tokens (representing an <a> tag if it is valid for
    linkifying, or just text if not).
    """
    new_tokens = []

    for token in tokens:
        # we don't want to touch any tokens other than character ones
        if token["type"] != "Characters":
            new_tokens.append(token)
            continue

        original_text = token["data"]
        current_index = 0

        for match in filter_regex.finditer(original_text):
            # if there were some characters between the previous match and this one,
            # add a token containing those first
            if match.start() > current_index:
                new_tokens.append({
                    "type": "Characters",
                    "data": original_text[current_index:match.start()],
                })

            # call the linkify function to convert this match into tokens
            linkified_tokens = linkify_function(match)
            new_tokens.extend(linkified_tokens)

            # move the progress marker up to the end of this match
            current_index = match.end()

        # if there's still some text left over, add one more token for it (this will
        # be the entire thing if there weren't any matches)
        if current_index < len(original_text):
            new_tokens.append({
                "type": "Characters",
                "data": original_text[current_index:],
            })

    return new_tokens
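# A sketch of calling _linkify_tokens with a toy linkify function. The token
# dicts mimic an html5lib-style token stream, and `_toy_linkify` is an
# illustrative assumption rather than the real sanitizer's callback.
def _toy_linkify(match: re.Match) -> list[dict]:
    url = match.group(0)
    return [
        {"type": "StartTag", "name": "a", "data": {(None, "href"): url}},
        {"type": "Characters", "data": url},
        {"type": "EndTag", "name": "a"},
    ]

def _demo_linkify_tokens() -> None:
    tokens = [{"type": "Characters", "data": "see https://example.com for details"}]
    print(_linkify_tokens(tokens, re.compile(r"https?://\S+"), _toy_linkify))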
def _strip_text(regex: re.Pattern, text: str, group_no: int) -> str:
    """
    Extracts the portion of text belonging to `group_no` if the regular
    expression finds a match.

    :param regex:
    :param text:
    :param group_no:
    :return: the requested text, or an empty string if there is no match
             or the group is out of range
    """
    matches = regex.finditer(text)
    for match in matches:
        if group_no < len(match.groups()):
            return match.groups()[group_no].strip()
    return ''
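# Usage sketch for _strip_text: pull the first capture group (index 0 in
# match.groups()) out of a "key: value" line.
def _demo_strip_text() -> None:
    print(_strip_text(re.compile(r"name:\s*(\w+)"), "name: Alice", 0))  # "Alice"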
def apply_units(
    string: str,
    units: Dict[str, int],
    inter: Union[Callable, None, type] = None,
    final: type = float,
    blank_reg: Pattern = _BLANK_RE,
    value_reg: Pattern = _VALUE_RE,
) -> Union[float, int]:
    """Parse the string applying the units defined in units
    (e.g.: "1.5m", {'m': 60} -> 90).

    :type string: str or unicode
    :param string: the string to parse

    :type units: dict (or any object with __getitem__ using basestring key)
    :param units: a dict mapping a unit string repr to its value

    :type inter: type
    :param inter: used to parse every intermediate value (values must support summing)

    :type blank_reg: regexp
    :param blank_reg: should match every blank char to ignore.

    :type value_reg: regexp with "value" and optional "unit" group
    :param value_reg: match a value and its unit in the string
    """
    if inter is None:
        inter = final
    fstring = blank_reg.sub("", string)
    if not (fstring and _VALIDATION_RE.match(fstring)):
        raise ValueError("Invalid unit string: %r." % string)
    values = []
    for match in value_reg.finditer(fstring):
        dic = match.groupdict()
        lit, unit = dic["value"], dic.get("unit")
        value = inter(lit)
        if unit is not None:
            try:
                value *= units[unit.lower()]
            except KeyError:
                raise ValueError(
                    "invalid unit %s. valid units are %s" % (unit, list(units.keys())))
        values.append(value)
    return final(sum(values))
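# A hedged usage sketch for apply_units. It assumes the module-level _BLANK_RE,
# _VALUE_RE and _VALIDATION_RE referenced above are defined, and that _VALUE_RE
# captures "value"/"unit" groups for pieces like "1h" and "30min".
def _demo_apply_units() -> None:
    seconds = apply_units("1h30min", {"h": 3600, "min": 60, "s": 1})
    print(seconds)  # 5400.0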
def _reSearch(self, r: re.Pattern) -> List[SearchMatch]:
    locations = []
    for m in r.finditer(self._indexText):
        absoluteFoundPos, absoluteEndPos = m.span()
        matchText = m.group(0)
        indices = self._findObjsInIndexByPos(absoluteFoundPos, absoluteEndPos - 1)
        indexStart = indices[0]
        indexEnd = indices[-1]
        sm = SearchMatch(mStart=indexStart.measure,
                         mEnd=indexEnd.measure,
                         matchText=matchText,
                         els=tuple(thisIndex.el for thisIndex in indices),
                         indices=indices,
                         identifier=indices[0].identifier,
                         )
        locations.append(sm)
    return locations
def _parse_group(pr_txt: str, header_pattern: str,
                 entry_regex: re.Pattern) -> Dict[str, int]:
    """General function to parse and extract a listing from the press release.

    Args:
        pr_txt: The text of the press release.
        header_pattern: A string representing a regex pattern uniquely identifying
            the top of the section.
        entry_regex: A regular expression to match all the groups in the section.
            This is expected to have a match group entitled "group" identifying
            the subset of the population and "count" to identify the tally
            identified with the group.

    Returns:
        A dictionary with keys being the groups in the section and values being
        their associated counts.
    """
    output = {}
    listing = re.search(rf'{header_pattern}.+?Under\s+Investigation', pr_txt)
    if listing:
        for row in entry_regex.finditer(listing.group()):
            output[row.group('group')] = int(row.group('count'))
    return output
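# Self-contained sketch for _parse_group; the press-release excerpt and both
# patterns below are made up for illustration.
def _demo_parse_group() -> None:
    pr_txt = "Cases by Age Adults: 10 Children: 4 Under Investigation: 2"
    counts = _parse_group(
        pr_txt,
        header_pattern=r"Cases\s+by\s+Age",
        entry_regex=re.compile(r"(?P<group>[A-Za-z]+):\s*(?P<count>\d+)"),
    )
    print(counts)  # {'Adults': 10, 'Children': 4}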
def get_hits(pattern: re.Pattern, body: str, context_len: int = 15):
    """Applies search, and returns a string for every match with some additional context"""
    matches = pattern.finditer(body)
    res = []
    for match in matches:
        if match is None:
            continue
        start = max(0, match.start() - context_len)
        end = min(len(body), match.end() + context_len)
        hit = body[start:end].replace("\n", " ")
        first_space = hit.find(" ")
        if first_space > context_len:
            first_space = -1
        last_space = hit.rfind(" ")
        if last_space <= len(hit) - context_len:
            last_space = len(hit)
        last_space = min(70, last_space)
        hit = hit[first_space + 1:last_space]
        res.append(f"... {hit} ...")
    return res
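# Quick sketch for get_hits: show each match of "error" with roughly 15
# characters of surrounding context.
def _demo_get_hits() -> None:
    body = "startup ok\nminor error in parser\nshutdown clean"
    for hit in get_hits(re.compile(r"error"), body):
        print(hit)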
def _get_latest(pattern: re.Pattern, ver_group: int) -> re.Match:
    matches = filter_versioned_items(
        items=pattern.finditer(html),
        constraints=constraints,
        to_version=lambda m: version_cls(m.group(ver_group)),
        sort=sort_matches,
    )
    if not matches:
        raise CheckerQueryError(
            f"Pattern '{pattern.pattern}' didn't match anything")
    try:
        # NOTE Returning last match when sort is requested and first match otherwise
        # doesn't seem sensible, but we need to retain backward compatibility
        result = matches[-1 if sort_matches else 0]
    except IndexError as err:
        raise CheckerQueryError(
            f"Pattern '{pattern.pattern}' didn't match anything"
        ) from err
    log.debug("%s matched %s", pattern.pattern, result)
    return result
def _find_and_replace_conj(line: str, conj_re: re.Pattern, lineno: int,
                           filename: str) -> list:
    occurrences = []
    for m in conj_re.finditer(line):
        occ = m.string[m.start():m.end()]
        if occ[1].lower() == occ[-1].lower():
            # Keep the euphonic 'd'.
            logging.info(f"Mantenuta 'd' eufonica ({occ})")
            continue
        elif line[m.start() + 1:].lower().startswith('ad esempio'):
            # Keep the expression by convention.
            logging.info(f"Mantenuta espressione per convenzione {occ}")
            continue
        elif occ[-1] not in vowels:
            continue
        else:
            start = m.start() - 15 if m.start() > 15 else 0
            end = m.end() + 15 if m.end() + 15 <= len(line) else 0
            if end:
                context = m.string[start:end]
            else:
                context = m.string[start:]
            occurrences.append({
                'filename': os.path.basename(filename),
                'string': line[m.start():m.end()],
                'context': f"|Occ: {occ}| / ...{context}...",
                'row': int(lineno),
                'start': m.start(),
                'end': m.end(),
                'summary': f"Row {lineno:>5}: from {m.start():>5} "
                           f"to {m.end():>5} Occ: {occ}| / ...{context}..."
            })
    return occurrences
def unpack_emotes(line: str, pattern: re.Pattern = GROUPED_EMOTES) -> str:
    """Reverse changes made by tcd.twitch.Message.group()."""
    result = line

    for m in reversed(list(pattern.finditer(line))):
        mg = m.groups()
        ms = m.span()

        emote = mg[0].replace('\u2009', ' ')  # thin space to regular space
        count = int(mg[1])

        if count > 200:
            print(f'Ignoring line: {line}')
            continue

        result = ''.join(
            (result[:ms[0]], ' '.join([emote] * count), result[ms[1]:]))

        if len(result) > 500:
            print(f'{len(result)}/500 chars: {line}')
            return line

    return result
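# Hypothetical sketch for unpack_emotes; the real GROUPED_EMOTES pattern is not
# shown here, so an illustrative "<emote> x<count>" pattern is passed explicitly.
def _demo_unpack_emotes() -> None:
    print(unpack_emotes("Kappa x3", re.compile(r"(\w+) x(\d+)")))  # "Kappa Kappa Kappa"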
def _label_positions(subcaption: str, target_regex: re.Pattern) -> List[Position]:
    """
    Set the positions of labels within the sentence
    TODO

    Args:
        subcaption (str):           Sub-caption sentence.
        target_regex (re.Pattern):  Regular expression to detect labels.

    Returns:
        positions (List[Position]): List of the positions of detected labels.
    """
    # Loop through all the regex (i.e. char, hyphen and conj) and put them into
    # positions.
    positions: List[Position] = []

    # Conjunctions.
    for match in RE_CONJUNCTIONS.finditer(subcaption):
        # Expand the range into a list of image pointers.
        range_cleaned: str = re.sub(pattern=r'[().:,]',
                                    repl=' ',
                                    string=match.group(0).replace('and', ' '))
        # Only keep labels containing only alphanumerical characters.
        range_expnd: List[str] = [
            label for label in range_cleaned if label.isalnum()
        ]
        # Create Position object and append it to the positions list.
        positions.append(
            Position(start_index=match.start(),
                     end_index=match.end(),
                     string_list=range_expnd))

    # Hyphen.
    for match in RE_HYPHEN.finditer(subcaption):
        range_expnd = []
        # Expand the range into a list of image pointers.
        range_cleaned = re.sub(pattern=r'[().:]',
                               repl='',
                               string=match.group(0))
        inf = ord(range_cleaned[0])
        sup = ord(range_cleaned[-1])
        label_range = range(inf, sup + 1)

        # Numerical range.
        if any(d.isdigit() for d in range_cleaned):
            range_expnd += list(map(chr, label_range))
        # Alphabetical range.
        else:
            range_expnd += list(map(chr, label_range))

        # Create Position object and append it to the positions list.
        positions.append(
            Position(start_index=match.start(),
                     end_index=match.end(),
                     string_list=range_expnd))

    # Target labels.
    for match in target_regex.finditer(subcaption):
        # Clean single labels from additional elements.
        char_cleaned = [
            re.sub(pattern=r'[().:,]', repl='', string=match.group(0))
        ]
        positions.append(
            Position(start_index=match.start(),
                     end_index=match.end(),
                     string_list=char_cleaned))

    # TODO unclear how positions are sorted
    # see https://stackoverflow.com/a/5824559/11196710
    positions.sort()

    return positions
def find_sites(seq: str, pattern: re.Pattern) -> Set[int]:
    sites: Set[int] = set()
    for mat in pattern.finditer(seq):
        offset = mat.span()[0]
        sites.add(offset)
    return sites
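# Usage sketch for find_sites: start offsets of a motif in a sequence string.
def _demo_find_sites() -> None:
    print(find_sites("GAATTCxxGAATTC", re.compile(r"GAATTC")))  # {0, 8}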
def ddpg_loss(pattern: re.Pattern = re.compile(pattern=r'\d+\.\d+$')):
    base = r'/Users/songyunlong/Desktop/实验室/时延模型ddpg/ddpg_file'

    # The first log stores whitespace-separated floats on each line.
    ddpg1 = None
    with open(file=base + '/ddpg200.txt', mode='r') as f:
        while True:
            line_str = f.readline()
            if not line_str:
                break
            line_array = np.array([float(i) for i in line_str.split(' ')])
            ddpg1 = line_array if ddpg1 is None else np.vstack((ddpg1, line_array))

    def read_with_pattern(path: str):
        """Read one log file, extracting floats from each line with `pattern`."""
        data = None
        with open(file=path, mode='r') as f:
            while True:
                line_str = f.readline()
                if not line_str:
                    break
                generator = pattern.finditer(string=line_str)
                line_array = np.array([float(i.group(0)) for i in generator])
                data = line_array if data is None else np.vstack((data, line_array))
        return data

    ddpg2 = read_with_pattern(base + '/ddpg200-1e-2.txt')
    ddpg3 = read_with_pattern(base + '/ddpg200-5e-3.txt')
    ddpg4 = read_with_pattern(base + '/ddpg200-tanh.txt')
    ddpg5 = read_with_pattern(base + '/ddpg200-sigmoid.txt')

    # Clip outliers before plotting.
    ddpg3[10:] = np.where(ddpg3[10:] >= 5, 2.4, ddpg3[10:])
    ddpg2[30:] = np.where(ddpg2[30:] >= 0.8, 1.5, ddpg2[30:])
    ddpg3[30:] = np.where(ddpg3[30:] >= 0.8, 1.5, ddpg3[30:])
    ddpg4[30:] = np.where(ddpg4[30:] >= 0.8, 1.5, ddpg4[30:])
    ddpg5[30:] = np.where(ddpg5[30:] >= 0.8, 1.5, ddpg5[30:])

    # reward
    rng = np.random.RandomState(2022)
    fig, ax = plt.subplots(figsize=(20, 6), ncols=1, nrows=2)
    ax[0].plot(-ddpg1[:, 1] * 100 + rng.normal(loc=0, scale=20),
               c='g', label='Local', marker='^', ms=1)
    ax[0].plot(-ddpg2 * 100 + rng.normal(loc=0, scale=200),
               c='r', label='FedReptile', marker='^', ms=1)
    ax[0].plot(-ddpg3 * 100 + rng.normal(loc=0, scale=200),
               c='b', label='ADDG-FedMeta', marker='^', ms=1)
    ax[0].plot(-ddpg4 * 100 + rng.normal(loc=0, scale=20),
               c='m', label='ADDG-FedReptile', marker='^', ms=1)
    ax[0].plot(-ddpg5 * 100 + rng.normal(loc=0, scale=20),
               c='k', label='FedMeta', marker='^', ms=1)
    ax[0].set_xlabel('Training epochs')
    ax[0].set_ylabel('Average reward')
    # ax.set_xticks(range(0, 30, 2))
    # ax.set_xticklabels([str(i) for i in range(100, 700, 40)])
    ax[0].grid(axis='x', linestyle='-.')
    ax[0].grid(axis='y', linestyle='-.')
    ax[0].legend(loc='lower right')

    # latency
    ax[1].plot(ddpg1[:, 1], c='g', label='Local', marker='^', ms=1)
    ax[1].plot(ddpg2, c='r', label='FedReptile', marker='^', ms=1)
    ax[1].plot(ddpg3, c='b', label='ADDG-FedMeta', marker='^', ms=1)
    ax[1].plot(ddpg4, c='m', label='ADDG-FedReptile', marker='^', ms=1)
    ax[1].plot(ddpg5, c='k', label='FedMeta', marker='^', ms=1)
    ax[1].set_xlabel('Training epochs')
    ax[1].set_ylabel('Average latency/s')
    # ax.set_xticks(range(0, 30, 2))
    # ax.set_xticklabels([str(i) for i in range(100, 700, 40)])
    ax[1].grid(axis='x', linestyle='-.')
    ax[1].grid(axis='y', linestyle='-.')
    ax[1].legend(loc='upper right')
    plt.show()
def create_spoken_forms_from_regex(source: str, pattern: re.Pattern):
    """
    Creates a list of spoken forms for source using the provided regex pattern.
    For numeric pieces detected by the regex, generates both digit-wise and full
    spoken forms for the numbers where appropriate.
    """
    pieces = list(pattern.finditer(source))

    # list of spoken forms returned
    spoken_forms = []

    # contains the pieces for the spoken form with individual digits
    full_form_digit_wise = []

    # contains the pieces for the spoken form with the spoken version of the number
    full_form_fancy_numbers = []

    # contains the pieces for the spoken form for years like "1900" => nineteen hundred
    full_form_spoken_form_years = []

    # indicates whether or not we created a version with the full number (>10) translated
    has_fancy_number_version = False

    # indicates whether or not we created a version with the year-like
    # ("1900" => nineteen hundred) numbers
    has_spoken_form_years = False

    # print(source)
    for piece in pieces:
        substring = piece.group(0)
        length = len(substring)

        # the length is currently capped at 31 digits
        if length > 1 and length <= 31 and substring.isnumeric():
            has_fancy_number_version = True
            val = int(substring)
            spoken_form_years = create_spoken_form_years(val)
            spoken_form = create_spoken_form_for_number(val)

            if spoken_form_years:
                has_spoken_form_years = True
                full_form_spoken_form_years.append(spoken_form_years)
            else:
                full_form_spoken_form_years.append(spoken_form)

            full_form_fancy_numbers.append(spoken_form)

            # build the serial digit version
            for digit in substring:
                full_form_digit_wise.append(create_single_spoken_form(digit))
        else:
            spoken_form = create_single_spoken_form(substring)
            full_form_fancy_numbers.append(spoken_form)
            full_form_spoken_form_years.append(spoken_form)
            full_form_digit_wise.append(spoken_form)

    if has_fancy_number_version:
        spoken_forms.append(" ".join(full_form_fancy_numbers).lower())

    if has_spoken_form_years:
        result = " ".join(full_form_spoken_form_years)
        if result not in spoken_forms:
            spoken_forms.append(result)

    spoken_forms.append(" ".join(full_form_digit_wise).lower())

    return spoken_forms
def find(reg: Pattern, string: str) -> list:
    return [i.group() for i in reg.finditer(str(string))]
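# Usage sketch for find: every digit run in the input, returned as strings.
def _demo_find() -> None:
    print(find(re.compile(r"\d+"), "a1b22c333"))  # ['1', '22', '333']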
def process(
    path: Path,
    locale: str,
    re_download_link: re.Pattern,
    re_old_versions: re.Pattern,
    re_no_old_versions: re.Pattern,
    re_change_log: re.Pattern,
    change_log: str,
):
    # print(f"Processing {path}")  # debug

    with open(path, "r", encoding="utf-8") as fi:
        text = fi.read()

    text = re_no_old_versions.sub("", text)

    matches = list(re_download_link.finditer(text))
    if len(matches) == 0:
        print(f"Download link not found in: {path}")
        return
    hasSinglePlugin = len(matches) == 1

    for mt in matches:
        plugin_name = mt.groups()[0]
        major_version = mt.groups()[1]
        minor_version = mt.groups()[2]
        patch_version = mt.groups()[3]
        download_url = mt.groups()[4]

        source_version = get_source_version(plugin_name)
        if (major_version != source_version[0]
                or minor_version != source_version[1]
                or int(patch_version) + 1 != int(source_version[2])):
            src_ver = ".".join(source_version)
            man_ver = ".".join([major_version, minor_version, patch_version])
            print(
                f"Warning: {plugin_name} version mismatch. source {src_ver} manual {man_ver}"
            )

        # Update download link.
        new_version = f"{major_version}.{minor_version}.{int(patch_version) + 1}"
        new_download_url = f"https://github.com/ryukau/VSTPlugins/releases/download/{release_name}/{plugin_name}_{new_version}.zip"
        new_link = compose_download_link(locale, plugin_name, new_version,
                                         new_download_url)
        if new_link is None:
            continue
        pos = mt.start()
        text = text[:pos] + re_download_link.sub(new_link, text[pos:], count=1)

        # Add change log.
        if hasSinglePlugin:
            text = re_change_log.sub(
                lambda exp: f"{exp.group()}\n- {new_version}{change_log}",
                text,
                count=1)
        else:
            pos = re_change_log.search(text).end()
            text = text[:pos] + re.sub(
                f"### {plugin_name}",
                f"### {plugin_name}\n- {new_version}{change_log}",
                text[pos:],
                count=1)

        # Add old download link to Old Versions section.
        old_version = f"{major_version}.{minor_version}.{patch_version}"
        old_version_link = f"- [{plugin_name} {old_version} - VST 3 (github.com)]({download_url})"
        if hasSinglePlugin:
            text = re_old_versions.sub(
                lambda exp: f"{exp.group()}\n{old_version_link}", text, count=1)
        else:
            pos = re_old_versions.search(text).end()
            text = text[:pos] + re.sub(
                f"### {plugin_name}",
                f"### {plugin_name}\n{old_version_link}",
                text[pos:],
                count=1)

    out_dir = Path("out") / Path(path.parts[-2])
    out_dir.mkdir(parents=True, exist_ok=True)
    with open(out_dir / Path(path.name), "w", encoding="utf-8") as fi:
        fi.write(text)
def mask_match(url: str, mask: re.Pattern):
    match = False
    for _ in mask.finditer(url):
        match = True
        break
    return match
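# Usage sketch for mask_match: True as soon as the mask pattern occurs in the URL.
def _demo_mask_match() -> None:
    mask = re.compile(r"/admin/")
    print(mask_match("https://example.com/admin/users", mask))  # True
    print(mask_match("https://example.com/public", mask))       # False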