Пример #1
0
def _finditer_with_line_numbers(
        pattern: re.Pattern,
        string: str) -> ty.Iterator[ty.Tuple[re.Match, int]]:
    """
    A version of 're.finditer' that returns '(match, line_number)' pairs.
    """

    matches = list(pattern.finditer(string))
    if not matches:
        return []

    end = matches[-1].start()
    # -1 so a failed 'rfind' maps to the first line.
    newline_table = {-1: 0}
    for i, m in enumerate(re.finditer(r"\n", string), 1):
        # don't find newlines past our last match
        offset = m.start()
        if offset > end:
            break
        newline_table[offset] = i

    # Failing to find the newline is OK, -1 maps to 0.
    for m in matches:
        newline_offset = string.rfind("\n", 0, m.start())
        line_number = (newline_table[newline_offset] + 1
                       )  # + 1 since line numbers doesnt start at 0
        yield m, line_number
Пример #2
0
def match_regex(pattern: re.Pattern, directory: bytes) -> None:
    """Print every match of `pattern` in the `.txt` files of `directory`.

    :param pattern: compiled regex applied to each file's full contents
    :param directory: directory to scan, given as a bytes path
    """
    for file in os.listdir(directory):
        filename = os.fsencode(file)
        if filename.endswith(b".txt"):
            # Join with the directory: the original opened the bare
            # filename, which only resolved when the CWD happened to be
            # `directory`.
            with open(os.path.join(directory, filename), 'r') as f:
                contents = f.read()
                for match in pattern.finditer(contents):
                    print(match.group(0))
Пример #3
0
 def _highlightSubpattern(self, text: str, start: int, end: int,
                          offset: int, subpattern: re.Pattern,
                          subrole: str) -> None:
     """Apply the style for `subrole` to every `subpattern` match in text.

     Matching is restricted to [start, end); each group span is shifted
     left by `offset` and clamped at 0 before formatting.
     NOTE(review): assumes STYLES[subrole] maps the current theme value to
     a format object accepted by self.setFormat -- confirm in the class.
     """
     for match in subpattern.finditer(text, start, end):
         # span of the group named `subrole`; (-1, -1) if it didn't
         # participate in this match
         substart, subend = match.span(subrole)
         if substart >= 0 and subend > substart:
             # shift by `offset`, never letting positions go negative
             clampedStart = max(0, substart - offset)
             clampedEnd = max(0, subend - offset)
             textFormat = STYLES[subrole][self.theme.value]
             self.setFormat(clampedStart, clampedEnd - clampedStart,
                            textFormat)
Пример #4
0
def swallow(text: str, pattern_matcher: re.Pattern) -> str:
    """
    Utility function internal to this module

    :param text: text to clean
    :param pattern_matcher: pattern to match
    :return: the text without the matched pattern; spaces are not substituted
    """
    # Collect every matched span first, then cut them out back-to-front so
    # earlier offsets remain valid while slicing.
    spans = [match.span() for match in pattern_matcher.finditer(text)]
    for start, end in reversed(spans):
        text = text[:start] + text[end:]
    return text.strip()
Пример #5
0
def get_kwic_for_word(
    *,
    data: Path,
    rule: str,
    term: str,
    regex: re.Pattern,
    window_size: int,
    size_limit: int,
):
    """Build a keyword-in-context (KWIC) table for `term`.

    :param data: root path handed to `text_file_generator`
    :param rule: file-selection rule for `text_file_generator`
    :param term: keyword recorded in the output rows
    :param regex: pattern whose matches anchor each context window
    :param window_size: characters of context on each side of a match
    :param size_limit: maximum number of rows returned
    :return: DataFrame with columns file/year/keyword/context, sorted by
        year and truncated to `size_limit`; empty DataFrame if no matches
    """
    texts = text_file_generator(data, rule)

    print(f'Reading data for {term}')

    rows = []

    for file, year, text in texts:
        # The limit is only checked between files, so the final file may
        # overshoot `size_limit`; `head()` below trims the excess.
        if len(rows) >= size_limit:
            break

        for m in regex.finditer(text):
            start, end = m.span()
            # Clamp the window to the text bounds. min(..., len(text))
            # keeps the final character; the previous `len(text) - 1`
            # clamp silently dropped it (slice ends are exclusive).
            start = max(0, start - window_size)
            end = min(end + window_size, len(text))

            context = text[start:end].replace('\n', ' ')

            rows.append({
                'file': file.stem,
                'year': year,
                'keyword': term,
                'context': context,
            })

    if not rows:
        return pd.DataFrame()

    return pd.DataFrame.from_records(rows).sort_values('year').head(
        size_limit).reset_index()
Пример #6
0
    def _linkify_tokens(tokens: list[dict], filter_regex: re.Pattern,
                        linkify_function: Callable) -> list[dict]:
        """Check tokens for text that matches a regex and linkify it.

        The `filter_regex` argument should be a compiled pattern that will be applied to
        the text in all of the supplied tokens. If any matches are found, they will each
        be used to call `linkify_function`, which will validate the match and convert it
        back into tokens (representing an <a> tag if it is valid for linkifying, or just
        text if not).
        """
        new_tokens = []

        for token in tokens:
            # we don't want to touch any tokens other than character ones
            if token["type"] != "Characters":
                new_tokens.append(token)
                continue

            original_text = token["data"]
            current_index = 0

            for match in filter_regex.finditer(original_text):
                # if there were some characters between the previous match and this one,
                # add a token containing those first
                if match.start() > current_index:
                    new_tokens.append({
                        "type":
                        "Characters",
                        "data":
                        original_text[current_index:match.start()],
                    })

                # call the linkify function to convert this match into tokens
                linkified_tokens = linkify_function(match)
                new_tokens.extend(linkified_tokens)

                # move the progress marker up to the end of this match
                current_index = match.end()

            # if there's still some text left over, add one more token for it (this will
            # be the entire thing if there weren't any matches)
            if current_index < len(original_text):
                new_tokens.append({
                    "type": "Characters",
                    "data": original_text[current_index:]
                })

        return new_tokens
Пример #7
0
def _strip_text(regex: re.Pattern, text: str, group_no: int) -> str:
    """
    Estrae la porzione di testo che appartiene al `group_no` se
    espressione regolare trova corrispondenza

    :param regex:
    :param text:
    :param group_no:
    :return: testo richiesto oppure stringa vuota se match non corrisposto
             o se gruppo non raggiunto
    """
    matches = regex.finditer(text)
    for i, match in enumerate(matches):
        if group_no < len(match.groups()):
            return match.groups()[group_no].strip()
    return ''
Пример #8
0
def apply_units(
    string: str,
    units: Dict[str, int],
    inter: Union[Callable, None, type] = None,
    final: type = float,
    blank_reg: Pattern = _BLANK_RE,
    value_reg: Pattern = _VALUE_RE,
) -> Union[float, int]:
    """Parse the string applying the units defined in units
    (e.g.: "1.5m",{'m',60} -> 80).

    :type string: str or unicode
    :param string: the string to parse

    :type units: dict (or any object with __getitem__ using basestring key)
    :param units: a dict mapping a unit string repr to its value

    :type inter: type
    :param inter: used to parse every intermediate value (need __sum__)

    :type blank_reg: regexp
    :param blank_reg: should match every blank char to ignore.

    :type value_reg: regexp with "value" and optional "unit" group
    :param value_reg: match a value and it's unit into the

    :raises ValueError: if the cleaned string fails validation or an
        unknown unit is used
    """
    if inter is None:
        inter = final
    # Use the *parameter*, not the module global: the original called
    # _BLANK_RE.sub directly, so a caller-supplied blank_reg was ignored.
    fstring = blank_reg.sub("", string)
    if not (fstring and _VALIDATION_RE.match(fstring)):
        raise ValueError("Invalid unit string: %r." % string)
    values = []
    for match in value_reg.finditer(fstring):
        groups = match.groupdict()
        value = inter(groups["value"])
        unit = groups.get("unit")
        # A value with no unit group is taken at face value.
        if unit is not None:
            try:
                value *= units[unit.lower()]
            except KeyError:
                raise ValueError("invalid unit %s. valid units are %s" %
                                 (unit, list(units.keys())))
        values.append(value)
    return final(sum(values))
Пример #9
0
    def _reSearch(self, r: re.Pattern) -> List[SearchMatch]:
        """Run `r` over the index text and wrap each hit in a SearchMatch."""
        found = []
        for m in r.finditer(self._indexText):
            startPos, endPos = m.span()

            # Map absolute character positions back to index entries; the
            # span end is exclusive, hence the -1.
            indices = self._findObjsInIndexByPos(startPos, endPos - 1)

            found.append(
                SearchMatch(mStart=indices[0].measure,
                            mEnd=indices[-1].measure,
                            matchText=m.group(0),
                            els=tuple(idx.el for idx in indices),
                            indices=indices,
                            identifier=indices[0].identifier,
                            ))
        return found
Пример #10
0
def _parse_group(pr_txt: str, header_pattern: str,
                 entry_regex: re.Pattern) -> Dict[str, int]:
    """General function to parse and extract a listing from the press release.
    Args:
        pr_txt: The text of the press release.
        header_pattern: A string representing a regex pattern uniquely
            identifying the top of the section.
        entry_regex: A regular expression to match all the groups in the
            section. This is expected to have a match group entitled "group"
            identifying the subset of the population and "count" to identify
            the tally identified with the group.
    Returns:
        A dictionary with keys being the groups in section and values being
        their associated counts.
    """
    output = {}
    listing = re.search(f'{header_pattern}.+?Under\s+Investigation', pr_txt)
    if listing:
        for row in entry_regex.finditer(listing.group()):
            output[row.group('group')] = int(row.group('count'))
    return output
Пример #11
0
def get_hits(pattern: re.Pattern, body: str, context_len: int = 15):
    """Applies search, and returns a string for every match with some
    additional context.

    Each hit is the matched text plus up to `context_len` characters on
    either side, trimmed to word boundaries, capped at 70 characters, and
    wrapped in "... ...".

    :param pattern: compiled regex to apply to `body`
    :param body: text to search
    :param context_len: characters of context on each side of a match
    :return: list of "... snippet ..." strings, one per match
    """
    res = []
    for match in pattern.finditer(body):
        # NOTE: finditer never yields None, so the original
        # `if match is None: continue` guard was dead code and is removed.
        start = max(0, match.start() - context_len)
        end = min(len(body), match.end() + context_len)
        hit = body[start:end].replace("\n", " ")
        # Drop a leading partial word, unless the first space lies beyond
        # the context prefix (then keep the whole head).
        first_space = hit.find(" ")
        if first_space > context_len:
            first_space = -1
        # Drop a trailing partial word, but keep everything when the last
        # space falls inside the context suffix; cap the snippet at 70.
        last_space = hit.rfind(" ")
        if last_space <= len(hit) - context_len:
            last_space = len(hit)
        last_space = min(70, last_space)
        hit = hit[first_space + 1:last_space]
        res.append(f"... {hit} ...")
    return res
        def _get_latest(pattern: re.Pattern, ver_group: int) -> re.Match:
            """Return the match for the newest version found in `html`.

            :param pattern: regex run over the enclosing scope's `html`
            :param ver_group: index of the match group holding the version
            :raises CheckerQueryError: when nothing matched the pattern
            NOTE(review): relies on closure variables from the enclosing
            function (html, constraints, version_cls, sort_matches, log).
            """
            matches = filter_versioned_items(
                items=pattern.finditer(html),
                constraints=constraints,
                to_version=lambda m: version_cls(m.group(ver_group)),
                sort=sort_matches,
            )
            # NOTE(review): this emptiness check duplicates the IndexError
            # handler below; presumably kept so both paths raise the same
            # error message -- confirm before simplifying.
            if not matches:
                raise CheckerQueryError(
                    f"Pattern '{pattern.pattern}' didn't match anything")

            try:
                # NOTE Returning last match when sort is requested and first match otherwise
                # doesn't seem sensible, but we need to retain backward compatibility
                result = matches[-1 if sort_matches else 0]
            except IndexError as err:
                raise CheckerQueryError(
                    f"Pattern '{pattern.pattern}' didn't match anything"
                ) from err

            log.debug("%s matched %s", pattern.pattern, result)
            return result
Пример #13
0
def _find_and_replace_conj(line: str, conj_re: re.Pattern, lineno: int,
                           filename: str) -> list:
    """Scan `line` for conjunction matches and report questionable ones.

    Returns a list of dicts describing every occurrence that is *not*
    excluded by the heuristics below (kept euphonic 'd', the fixed phrase
    'ad esempio', or a match not ending in a vowel).

    :param line: text line to scan
    :param conj_re: compiled pattern matching the conjunctions of interest
    :param lineno: 1-based line number, used for reporting only
    :param filename: source file path, reported as its basename
    NOTE(review): depends on a module-level `vowels` collection; the
    Italian log messages are runtime strings and are left untouched.
    """
    occurrences = []
    for m in conj_re.finditer(line):
        # the exact matched slice of the original string
        occ = m.string[m.start():m.end()]
        if occ[1].lower() == occ[-1].lower():
            # euphonic 'd' is considered correct here -- skip
            logging.info(f"Mantenuta 'd' eufonica ({occ})")
            continue
        elif line[m.start() + 1:].lower().startswith('ad esempio'):
            # conventional fixed expression -- skip
            logging.info(f"Mantenuta espressione per convenzione {occ}")
            continue
        elif occ[-1] not in vowels:
            # last matched character is not a vowel -- nothing to report
            continue
        else:
            # up to 15 characters of context either side; end == 0 doubles
            # as a "window runs past end of line" sentinel for the slice
            start = m.start() - 15 if m.start() > 15 else 0
            end = m.end() + 15 if m.end() + 15 <= len(line) else 0
            if end:
                context = m.string[start:end]
            else:
                context = m.string[start:]
            occurrences.append({
                'filename':
                os.path.basename(filename),
                'string':
                line[m.start():m.end()],
                'context':
                f"|Occ: {occ}| / ...{context}...",
                'row':
                int(lineno),
                'start':
                m.start(),
                'end':
                m.end(),
                'summary':
                f"Row {lineno:>5}: from {m.start():>5} "
                f"to {m.end():>5} Occ: {occ}| / ...{context}..."
            })
    return occurrences
Пример #14
0
def unpack_emotes(line: str, pattern: re.Pattern = GROUPED_EMOTES) -> str:
    """Reverse changes made by tcd.twitch.Message.group()."""
    result = line

    # Walk the matches right-to-left so earlier spans stay valid while we
    # splice expansions into `result`.
    for match in reversed(list(pattern.finditer(line))):
        groups = match.groups()
        span = match.span()

        emote = groups[0].replace(' ', ' ')  # thin space to regular space
        count = int(groups[1])

        # implausibly large repeat counts are left untouched
        if count > 200:
            print(f'Ignoring line: {line}')
            continue

        expanded = ' '.join([emote] * count)
        result = result[:span[0]] + expanded + result[span[1]:]

        # bail out entirely if the expansion would exceed the chat limit
        if len(result) > 500:
            print(f'{len(result)}/500 chars: {line}')
            return line

    return result
Пример #15
0
def _label_positions(subcaption: str,
                     target_regex: re.Pattern) -> List[Position]:
    """
    Set the positions of labels within the sentence
    TODO

    Args:
        subcaption (str):           Sub-caption sentence.
        target_regex (re.Pattern):  Regular expression to detect labels.

    Returns:
        positions (List[Position]): List of the positions of detected labels.
    """
    # Loop through all the regex (i.e. char, hyphen and conj) and put them into
    # positions.
    positions: List[Position] = []

    # Conjunctions.
    for match in RE_CONJUNCTIONS.finditer(subcaption):
        # Expand the range into a list of image pointers.
        range_cleaned: str = re.sub(pattern=r'[().:,]',
                                    repl=' ',
                                    string=match.group(0).replace('and', ' '))

        # Only keep labels containing only alphanumerical characters.
        # NOTE(review): this iterates the *characters* of `range_cleaned`,
        # not whitespace-separated labels, so multi-character labels would
        # be split apart -- confirm this is intended.
        range_expnd: List[str] = [
            label for label in range_cleaned if label.isalnum()
        ]

        # Create Position object and append it to the positions list.
        positions.append(
            Position(start_index=match.start(),
                     end_index=match.end(),
                     string_list=range_expnd))

    # Hyphen.
    for match in RE_HYPHEN.finditer(subcaption):
        range_expnd = []
        # Expand the range into a list of image pointers.
        range_cleaned = re.sub(pattern=r'[().:]',
                               repl='',
                               string=match.group(0))

        # Inclusive code-point range from the first to the last character,
        # e.g. "a-d" -> ['a', 'b', 'c', 'd'].
        inf = ord(range_cleaned[0])
        sup = ord(range_cleaned[-1])
        label_range = range(inf, sup + 1)

        # Numerical range.
        # NOTE(review): both branches below are byte-identical; the
        # numeric branch presumably should expand digit ranges differently
        # (e.g. via str/int, not chr) -- confirm intent before changing.
        if any(d.isdigit() for d in range_cleaned):
            range_expnd += list(map(chr, label_range))

        # Alphabetical range.
        else:
            range_expnd += list(map(chr, label_range))

        # Create Position object and append it to the positions list.
        positions.append(
            Position(start_index=match.start(),
                     end_index=match.end(),
                     string_list=range_expnd))

    # Target labels.
    for match in target_regex.finditer(subcaption):

        # Clean single labels from additional elements.
        char_cleaned = [
            re.sub(pattern=r'[().:,]', repl='', string=match.group(0))
        ]

        positions.append(
            Position(start_index=match.start(),
                     end_index=match.end(),
                     string_list=char_cleaned))

    # TODO unclear how positions are sorted
    # see https://stackoverflow.com/a/5824559/11196710
    positions.sort()

    return positions
Пример #16
0
def find_sites(seq: str, pattern: re.Pattern) -> Set[int]:
    """Return the set of start offsets of every `pattern` match in `seq`."""
    return {match.start() for match in pattern.finditer(seq)}
Пример #17
0
def ddpg_loss(pattern: re.Pattern = re.compile(pattern=r'\d+\.\d+$')):
    """Load five DDPG training logs and plot reward and latency curves.

    The five input files differ only in path and in how a line is parsed:
    the first holds space-separated floats, the rest are extracted with
    `pattern`. The four byte-identical read loops of the original were
    factored into the `_load_rows` helper.

    :param pattern: regex extracting the float value from a log line. The
        default is now a raw string; the original non-raw '\\d+\\.\\d+$'
        relied on invalid escape sequences (DeprecationWarning).
    NOTE(review): reads hard-coded absolute paths and calls plt.show();
    this is a one-off plotting routine with no return value.
    """

    def _load_rows(path, parse):
        # Read `path` line by line, parse each line into a float vector
        # and stack the vectors into a matrix.
        rows = None
        with open(file=path, mode='r') as f:
            while True:
                line_str = f.readline()
                if not line_str:
                    break
                line_array = np.array(parse(line_str))
                rows = line_array if rows is None else np.vstack(
                    (rows, line_array))
        return rows

    def _parse_with_pattern(line_str):
        # Extract every `pattern` match on the line as a float.
        return [float(i.group(0)) for i in pattern.finditer(string=line_str)]

    base = r'/Users/songyunlong/Desktop/实验室/时延模型ddpg/ddpg_file'
    ddpg1 = _load_rows(base + r'/ddpg200.txt',
                       lambda s: [float(i) for i in s.split(' ')])
    ddpg2 = _load_rows(base + r'/ddpg200-1e-2.txt', _parse_with_pattern)
    ddpg3 = _load_rows(base + r'/ddpg200-5e-3.txt', _parse_with_pattern)
    ddpg4 = _load_rows(base + r'/ddpg200-tanh.txt', _parse_with_pattern)
    ddpg5 = _load_rows(base + r'/ddpg200-sigmoid.txt', _parse_with_pattern)

    # Clip outlier spikes so the curves stay readable (ad-hoc thresholds).
    ddpg3[10:] = np.where(ddpg3[10:] >= 5, 2.4, ddpg3[10:])
    ddpg2[30:] = np.where(ddpg2[30:] >= 0.8, 1.5, ddpg2[30:])
    ddpg3[30:] = np.where(ddpg3[30:] >= 0.8, 1.5, ddpg3[30:])
    ddpg4[30:] = np.where(ddpg4[30:] >= 0.8, 1.5, ddpg4[30:])
    ddpg5[30:] = np.where(ddpg5[30:] >= 0.8, 1.5, ddpg5[30:])

    # reward (negated latency scaled by 100, with seeded noise for display)
    rng = np.random.RandomState(2022)
    fig, ax = plt.subplots(figsize=(20, 6), ncols=1, nrows=2)
    ax[0].plot(-ddpg1[:, 1] * 100 + rng.normal(loc=0, scale=20),
               c='g',
               label='Local',
               marker='^',
               ms=1)
    ax[0].plot(-ddpg2 * 100 + rng.normal(loc=0, scale=200),
               c='r',
               label='FedReptile',
               marker='^',
               ms=1)
    ax[0].plot(-ddpg3 * 100 + rng.normal(loc=0, scale=200),
               c='b',
               label='ADDG-FedMeta',
               marker='^',
               ms=1)
    ax[0].plot(-ddpg4 * 100 + rng.normal(loc=0, scale=20),
               c='m',
               label='ADDG-FedReptile',
               marker='^',
               ms=1)
    ax[0].plot(-ddpg5 * 100 + rng.normal(loc=0, scale=20),
               c='k',
               label='FedMeta',
               marker='^',
               ms=1)

    ax[0].set_xlabel('Training epochs')
    ax[0].set_ylabel('Average reward')
    # ax.set_xticks(range(0, 30, 2))
    # ax.set_xticklabels([str(i) for i in range(100, 700, 40)])
    ax[0].grid(axis='x', linestyle='-.')
    ax[0].grid(axis='y', linestyle='-.')
    ax[0].legend(loc='lower right')
    #
    # latency
    ax[1].plot(ddpg1[:, 1], c='g', label='Local', marker='^', ms=1)
    ax[1].plot(ddpg2, c='r', label='FedReptile', marker='^', ms=1)
    ax[1].plot(ddpg3, c='b', label='ADDG-FedMeta', marker='^', ms=1)
    ax[1].plot(ddpg4, c='m', label='ADDG-FedReptile', marker='^', ms=1)
    ax[1].plot(ddpg5, c='k', label='FedMeta', marker='^', ms=1)

    ax[1].set_xlabel('Training epochs')
    ax[1].set_ylabel('Average latency/s')
    # ax.set_xticks(range(0, 30, 2))
    # ax.set_xticklabels([str(i) for i in range(100, 700, 40)])
    ax[1].grid(axis='x', linestyle='-.')
    ax[1].grid(axis='y', linestyle='-.')
    ax[1].legend(loc='upper right')
    plt.show()
Пример #18
0
def create_spoken_forms_from_regex(source: str, pattern: re.Pattern):
    """
    Creates a list of spoken forms for source using the provided regex pattern.
    Numeric pieces are rendered digit-by-digit, as full spoken numbers, and --
    where applicable -- as year-like forms ("1900" => nineteen hundred).
    """
    digit_wise = []        # each digit spoken individually
    fancy_numbers = []     # full spoken numbers ("42" => forty two)
    year_forms = []        # year-like forms where available

    # set when at least one multi-digit number was translated
    saw_fancy_number = False
    # set when at least one piece produced a year-like form
    saw_year_form = False

    for piece in pattern.finditer(source):
        substring = piece.group(0)

        # multi-digit numbers are expanded; length is capped at 31 digits
        if substring.isnumeric() and 1 < len(substring) <= 31:
            saw_fancy_number = True
            value = int(substring)
            as_year = create_spoken_form_years(value)
            as_number = create_spoken_form_for_number(value)

            if as_year:
                saw_year_form = True
                year_forms.append(as_year)
            else:
                year_forms.append(as_number)

            fancy_numbers.append(as_number)

            # build the serial digit version
            for digit in substring:
                digit_wise.append(create_single_spoken_form(digit))
        else:
            spoken = create_single_spoken_form(substring)
            fancy_numbers.append(spoken)
            year_forms.append(spoken)
            digit_wise.append(spoken)

    spoken_forms = []
    if saw_fancy_number:
        spoken_forms.append(" ".join(fancy_numbers).lower())

    if saw_year_form:
        candidate = " ".join(year_forms)
        if candidate not in spoken_forms:
            spoken_forms.append(candidate)

    spoken_forms.append(" ".join(digit_wise).lower())

    return spoken_forms
Пример #19
0
def find(reg: Pattern, string: str) -> list:
    """Return every match of `reg` in `string` as a list of substrings.

    :param reg: compiled regular expression
    :param string: value to search; coerced with str() so non-string
        input also works
    :return: matched substrings in order of occurrence
    """
    # The original wrapped a list comprehension in a redundant list() call.
    return [m.group() for m in reg.finditer(str(string))]
Пример #20
0
def process(
    path: Path,
    locale: str,
    re_download_link: re.Pattern,
    re_old_versions: re.Pattern,
    re_no_old_versions: re.Pattern,
    re_change_log: re.Pattern,
    change_log: str,
):
    """Update one manual page for a plugin release.

    For each plugin download link found in `path`: bump the link to the
    next patch version, prepend the change-log entry, and move the old
    link into the "Old Versions" section. The result is written under
    out/<parent dir>/<filename>, not back to `path`.

    NOTE(review): relies on module-level `get_source_version`,
    `compose_download_link` and `release_name`.
    """
    # print(f"Processing {path}") # debug

    with open(path, "r", encoding="utf-8") as fi:
        text = fi.read()

    # Drop any "no old versions" placeholder before adding real entries.
    text = re_no_old_versions.sub("", text)

    matches = list(re_download_link.finditer(text))

    if len(matches) == 0:
        print(f"Download link not found in: {path}")
        return

    # Single-plugin pages use the page-level sections directly; multi-plugin
    # pages have per-plugin "### <name>" subsections.
    hasSinglePlugin = len(matches) == 1
    for mt in matches:
        plugin_name = mt.groups()[0]
        major_version = mt.groups()[1]
        minor_version = mt.groups()[2]
        patch_version = mt.groups()[3]
        download_url = mt.groups()[4]

        # Warn when the manual isn't exactly one patch behind the source.
        source_version = get_source_version(plugin_name)
        if (major_version != source_version[0]
                or minor_version != source_version[1]
                or int(patch_version) + 1 != int(source_version[2])):
            src_ver = ".".join(source_version)
            man_ver = ".".join([major_version, minor_version, patch_version])
            print(
                f"Warning: {plugin_name} version mismatch. source {src_ver} manual {man_ver}"
            )

        # Update download link. (typo fixed: was `new_downlaod_url`)
        new_version = f"{major_version}.{minor_version}.{int(patch_version) + 1}"
        new_download_url = f"https://github.com/ryukau/VSTPlugins/releases/download/{release_name}/{plugin_name}_{new_version}.zip"
        new_link = compose_download_link(locale, plugin_name, new_version,
                                         new_download_url)
        if new_link is None:
            continue

        # Replace only the first link at/after this match's position.
        # NOTE(review): `mt.start()` refers to the text *before* this
        # iteration's substitutions; offsets for later matches may be
        # stale once earlier edits change the length -- confirm.
        pos = mt.start()
        text = text[:pos] + re_download_link.sub(new_link, text[pos:], count=1)

        # Add change log.
        if hasSinglePlugin:
            text = re_change_log.sub(
                lambda exp: f"{exp.group()}\n- {new_version}{change_log}",
                text,
                count=1)
        else:
            pos = re_change_log.search(text).end()
            text = text[:pos] + re.sub(
                f"### {plugin_name}",
                f"### {plugin_name}\n- {new_version}{change_log}",
                text[pos:],
                count=1)

        # Add old download link to Old Versions section.
        old_version = f"{major_version}.{minor_version}.{patch_version}"
        old_version_link = f"- [{plugin_name} {old_version} - VST 3 (github.com)]({download_url})"

        if hasSinglePlugin:
            text = re_old_versions.sub(
                lambda exp: f"{exp.group()}\n{old_version_link}",
                text,
                count=1)
        else:
            pos = re_old_versions.search(text).end()
            text = text[:pos] + re.sub(
                f"### {plugin_name}",
                f"### {plugin_name}\n{old_version_link}",
                text[pos:],
                count=1)

    # Write to out/<parent dir>/<filename>; the output handle is named
    # `fo` so it no longer reuses the read-handle name `fi`.
    out_dir = Path("out") / Path(path.parts[-2])
    out_dir.mkdir(parents=True, exist_ok=True)
    with open(out_dir / Path(path.name), "w", encoding="utf-8") as fo:
        fo.write(text)
Пример #21
0
def mask_match(url: str, mask: re.Pattern):
    """Return True if `mask` matches anywhere in `url`, else False.

    Uses `Pattern.search` directly: the original iterated `finditer` and
    broke after one hit, which is an indirect way to ask "any match?".
    """
    return mask.search(url) is not None