Example #1
def guess_split(majiribun, reading):
    kanjis = []
    matchreg_greedy = ''
    matchreg_nongreedy = ''
    for char in majiribun:
        if kanji_re.match(char):
            kanjis.append(char)
            matchreg_greedy += r"(\p{Hiragana}+)"
            matchreg_nongreedy += r"(\p{Hiragana}+?)"
        else:
            matchreg_greedy += re.escape(char)
            matchreg_nongreedy += re.escape(char)

    m = re.match(matchreg_greedy + '$', reading)
    if m:
        yomis = m.groups()

        yomis_nongreedy = re.match(matchreg_nongreedy + '$', reading).groups()
        if yomis != yomis_nongreedy:
            # Ambiguous!
            return None
        d = {}
        for idx in range(len(kanjis)):
            d[kanjis[idx]] = yomis[idx]
        return d
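A minimal usage sketch; it assumes the third-party regex module (the stdlib re has no \p{Hiragana}) and a kanji_re pattern defined elsewhere, both supplied here:

import regex as re                        # stdlib re cannot handle \p{Hiragana}
kanji_re = re.compile(r'\p{Han}')         # assumed definition of kanji_re

print(guess_split("食べる", "たべる"))       # {'食': 'た'}
print(guess_split("飛び出す", "とびだす"))    # {'飛': 'と', '出': 'だ'}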
Example #2
def process_index_delete_in_links(indx, **kwargs):
    if indx.is_commentary():
        pattern = ur'^{} on '.format(re.escape(indx.title))
    else:
        commentators = text.IndexSet({"categories.0": "Commentary"}).distinct("title")
        pattern = ur"(^{} \d)|^({}) on {} \d".format(re.escape(indx.title), "|".join(commentators), re.escape(indx.title))
    LinkSet({"refs": {"$regex": pattern}}).delete()
Example #3
def dep_counts(name):
    ref_patterns = {
        'alone': r'^{} \d'.format(re.escape(name)),
        'commentor': r'{} on'.format(re.escape(name)),
        'commentee': r'on {} \d'.format(re.escape(name))
    }

    commentee_title_pattern = r'on {}'.format(re.escape(name))

    ret = {
        'version title exact match': text.VersionSet({"title": name}).count(),
        'version title match commentor': text.VersionSet({"title": {"$regex": ref_patterns["commentor"]}}).count(),
        'version title match commentee': text.VersionSet({"title": {"$regex": commentee_title_pattern}}).count(),
        'history title exact match': history.HistorySet({"title": name}).count(),
        'history title match commentor': history.HistorySet({"title": {"$regex": ref_patterns["commentor"]}}).count(),
        'history title match commentee': history.HistorySet({"title": {"$regex": commentee_title_pattern}}).count(),
    }

    for pname, pattern in ref_patterns.items():
        ret.update({
            'note match ' + pname: note.NoteSet({"ref": {"$regex": pattern}}).count(),
            'link match ' + pname: link.LinkSet({"refs": {"$regex": pattern}}).count(),
            'history refs match ' + pname: history.HistorySet({"ref": {"$regex": pattern}}).count(),
            'history new refs match ' + pname: history.HistorySet({"new.refs": {"$regex": pattern}}).count()
        })

    return ret
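The values in ref_patterns are plain strings handed to MongoDB's $regex; for a hypothetical title they come out as below (in Python 3.7+, re.escape leaves letters, digits, and spaces untouched):

import re
name = "Genesis"                               # hypothetical index title
print(r'^{} \d'.format(re.escape(name)))       # ^Genesis \d   -> "Genesis 1:2", ...
print(r'{} on'.format(re.escape(name)))        # Genesis on    -> titles starting "Genesis on ..."
print(r'on {} \d'.format(re.escape(name)))     # on Genesis \d -> "Rashi on Genesis 1", ...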
Example #4
    def __init__(self, charset: Union[Dict[str, Sequence[int]], Sequence[str], str]) -> None:
        """
        Builds a codec converting between graphemes/code points and integer
        label sequences.

        charset may either be a string, a list or a dict. In the first case
        each code point will be assigned a label, in the second case each
        string in the list will be assigned a label, and in the final case each
        key string will be mapped to the value sequence of integers. In the
        first two cases labels will be assigned automatically.

        As 0 is the blank label in a CTC output layer, output labels and input
        dictionaries are/should be 1-indexed.

        Args:
            charset (unicode, list, dict): Input character set.
        """
        if isinstance(charset, dict):
            self.c2l = charset
        else:
            self.c2l = {k: [v] for v, k in enumerate(sorted(charset), start=1)}
        # map integer labels to code points because regex only works with strings
        self.l2c = {}  # type: Dict[str, str]
        for k, v in self.c2l.items():
            self.l2c[''.join(chr(c) for c in v)] = k

        # sort prefixes for c2l regex
        self.c2l_regex = regex.compile(r'|'.join(regex.escape(x) for x in sorted(self.c2l.keys(), key=len, reverse=True)))
        # sort prefixes for l2c regex
        self.l2c_regex = regex.compile(r'|'.join(regex.escape(x) for x in sorted(self.l2c.keys(), key=len, reverse=True)))
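A quick sketch of the automatic label assignment the docstring describes, with labels starting at 1 because 0 is the CTC blank:

charset = "bca"                                   # hypothetical input character set
c2l = {k: [v] for v, k in enumerate(sorted(charset), start=1)}
print(c2l)                                        # {'a': [1], 'b': [2], 'c': [3]}
l2c = {''.join(chr(c) for c in v): k for k, v in c2l.items()}
print(l2c)                                        # {'\x01': 'a', '\x02': 'b', '\x03': 'c'}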
Example #5
    def __init__(self, keywords, fuzzy_min_len=None):
        """Initialize search
        """
        if fuzzy_min_len is None:
            fuzzy_min_len = []
        self.fuzzy_min_len = sorted(fuzzy_min_len)
        self.keywords = {}
        for i, k in keywords:
            k = k.strip().lower()
            if k not in self.keywords:
                self.keywords[k] = i
            else:
                print("ERROR: found duplicate keyword '{0}'".format(k))

        print("Number of unique keywords ID to be search: {0}"
              .format(len(self.keywords)))

        kw = []
        for k in self.keywords:
            d = self.get_allow_distance(k)
            if d:
                kw.append(r'(?:{0}){{e<={1}}}'.format(re.escape(k), d))
            else:
                kw.append(re.escape(k))

        re_str = '|'.join(kw)
        re_str = r'\b(?:{0})\b'.format(re_str)
        self.re_keywords = re.compile(re_str)
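The {e<=d} quantifier in the pattern above is fuzzy matching, a feature of the third-party regex module that the stdlib re does not accept; a minimal illustration:

import regex
# allow at most one edit (insertion/deletion/substitution) when matching "keyword"
m = regex.search(r'\b(?:{0}){{e<=1}}\b'.format(regex.escape("keyword")), "some keywrd here")
print(m.group())                                  # keywrd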
Example #6
def dep_counts(name):
    commentators = model.IndexSet({"categories.0": "Commentary"}).distinct("title")
    ref_patterns = {
        'alone': r'^{} \d'.format(re.escape(name)),
        'commentor': r'{} on'.format(re.escape(name)),
        'commentee': r'^({}) on {} \d'.format("|".join(commentators), re.escape(name))
    }

    commentee_title_pattern = r'^({}) on {} \d'.format("|".join(commentators), re.escape(name))

    ret = {
        'version title exact match': model.VersionSet({"title": name}).count(),
        'version title match commentor': model.VersionSet({"title": {"$regex": ref_patterns["commentor"]}}).count(),
        'version title match commentee': model.VersionSet({"title": {"$regex": commentee_title_pattern}}).count(),
        'history title exact match': model.HistorySet({"title": name}).count(),
        'history title match commentor': model.HistorySet({"title": {"$regex": ref_patterns["commentor"]}}).count(),
        'history title match commentee': model.HistorySet({"title": {"$regex": commentee_title_pattern}}).count(),
    }

    for pname, pattern in ref_patterns.items():
        ret.update({
            'note match ' + pname: model.NoteSet({"ref": {"$regex": pattern}}).count(),
            'link match ' + pname: model.LinkSet({"refs": {"$regex": pattern}}).count(),
            'history refs match ' + pname: model.HistorySet({"ref": {"$regex": pattern}}).count(),
            'history new refs match ' + pname: model.HistorySet({"new.refs": {"$regex": pattern}}).count()
        })

    return ret
Example #7
def process_index_title_change_in_history(indx, **kwargs):
    """
    Update all history entries which reference 'old' to 'new'.
    """
    if indx.is_commentary():
        pattern = r'{} on '.format(re.escape(kwargs["old"]))
        title_pattern = r'(^{}$)|({} on)'.format(re.escape(kwargs["old"]), re.escape(kwargs["old"]))
    else:
        commentators = text.IndexSet({"categories.0": "Commentary"}).distinct("title")
        pattern = r"(^{} \d)|(^({}) on {} \d)".format(re.escape(kwargs["old"]), "|".join(commentators), re.escape(kwargs["old"]))
        title_pattern = r'(^{}$)|(^({}) on {})'.format(re.escape(kwargs["old"]), "|".join(commentators), re.escape(kwargs["old"]))

    text_hist = HistorySet({"ref": {"$regex": pattern}})
    for h in text_hist:
        h.ref = h.ref.replace(kwargs["old"], kwargs["new"], 1)
        h.save()

    link_hist = HistorySet({"new.refs": {"$regex": pattern}})
    for h in link_hist:
        h.new["refs"] = [r.replace(kwargs["old"], kwargs["new"], 1) for r in h.new["refs"]]
        h.save()

    note_hist = HistorySet({"new.ref": {"$regex": pattern}})
    for h in note_hist:
        h.new["ref"] = h.new["ref"].replace(kwargs["old"], kwargs["new"], 1)
        h.save()

    title_hist = HistorySet({"title": {"$regex": title_pattern}})
    for h in title_hist:
        h.title = h.title.replace(kwargs["old"], kwargs["new"], 1)
        h.save()
Example #8
    def __init__(self, keywords, fuzzy_min_len=None):
        """Initialize search
        """
        if fuzzy_min_len is None:
            fuzzy_min_len = []
        self.fuzzy_min_len = sorted(fuzzy_min_len)
        self.keywords = {}
        for i, k in keywords:
            if i not in self.keywords:
                self.keywords[i] = [k.strip().lower()]
            else:
                self.keywords[i].append(k.strip().lower())

        print("Number of unique keywords ID to be search: {0}"
              .format(len(self.keywords)))

        self.re_keywords = dict()
        for i in self.keywords:
            kw = []
            for k in self.keywords[i]:
                d = self.get_allow_distance(k)
                if d:
                    kw.append(r'(?:{0}){{e<={1}}}'.format(re.escape(k), d))
                else:
                    kw.append(re.escape(k))
            re_str = '|'.join(kw)
            re_str = r'\b(?:{0})\b'.format(re_str)
            self.re_keywords[i] = re.compile(re_str, flags=re.I)
Example #9
def SigWritter(uniquename, target, uniquecount, targetname, evalue):
    targetdict = SeqIO.to_dict(SeqIO.parse(target, 'fasta'))
    copy(target, uniquename + '.' + str(uniquecount))
    handle = open(uniquename, 'a+')
    if os.path.getsize(uniquename) != 0:
        mm = mmap(handle.fileno(), 0, access=ACCESS_READ)
    else:
        mm = handle
    for idline in recorddict:
        pattern = r'([^N]{' + re.escape(str(minLength)) + r',})|([ATCG]{20,}[NATCG]{' \
                   + re.escape(str(minLength)) + r',900}[ATCG]{20,})'
        # Find a sequence of at least the target length
        regex = re.compile(pattern, re.IGNORECASE)
        # overlapped=True below requires the third-party regex module, not the stdlib re
        uniseq = regex.finditer(recorddict[idline].seq.tostring(), overlapped=True)
        for coor in uniseq:
            isunique = True
            sequence = targetdict[idline].seq[coor.start():coor.end()].tostring()
            handle.seek(0)

            for line in handle:
                if sequence in line:
                    isunique = False
            if isunique is True:
                uniquecount += 1
                print 'Found Sequence(s) at E-value: ' + str(evalue)
                handle.write('>usid%04i_%g_%s_%s\n' % (uniquecount, evalue, targetname, idline))
                handle.write(sequence + '\n')
            # else:
            #     global evaluehit
            #     evaluehit = False
    print 'Writing %i sequence(s) to file' % uniquecount
    handle.close()
    return uniquecount
Example #10
File: elements.py Project: vshesh/glue
def inline_one(start: str, end: str, nest=Nesting.FRAME, sub=None, display=Display.INLINE):
  """
  """
  patt = re.compile(Patterns.single_group.value.format(
    re.escape(start), re.escape(end)))
  return inline(patt, escape=[start[0], end[0]],
                nest=nest, display=display, sub=sub)
Example #11
def expand_parens(string, parens="()", include_spaces=False, substitute_string=''):
    output = []
    open_paren = re.escape(parens[0])
    close_paren = re.escape(parens[1])
    substitute_string = re.escape(substitute_string)
    in_string = re.sub(open_paren + substitute_string, parens[0], string)
    in_string = re.sub(substitute_string + close_paren, parens[1], in_string)

    if include_spaces:
        regex1 = regex2 = re.compile(r'(^.*)' + open_paren + r'(.+)' + close_paren + r'(.*$)')
    else:
        regex1 = re.compile(r'(^.*\S)' + open_paren + r'(\S+)' + close_paren + r'(.*$)')
        regex2 = re.compile(r'(^.*)' + open_paren + r'(\S+)' + close_paren + r'(\S.*$)')

    re_match1 = regex1.search(in_string)
    re_match2 = regex2.search(in_string)
    if re_match1:
        within = re_match1.group(1) + re_match1.group(2) + re_match1.group(3)
        without = re_match1.group(1) + re_match1.group(3)
    elif re_match2:
        within = re_match2.group(1) + re_match2.group(2) + re_match2.group(3)
        without = re_match2.group(1) + re_match2.group(3)
    else:
        return [string]

    output = [clean_str(without), clean_str(within)]

    return output
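A usage sketch, assuming clean_str (not shown in this example) merely normalizes whitespace:

clean_str = lambda s: ' '.join(s.split())         # assumed helper, not part of the original
print(expand_parens("colo(u)r"))                  # ['color', 'colour']
print(expand_parens("no parens here"))            # ['no parens here']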
Example #12
def process_index_title_change_in_history(indx, **kwargs):
    """
    Update all history entries which reference 'old' to 'new'.
    """
    print "Cascading History {} to {}".format(kwargs['old'], kwargs['new'])
    from sefaria.model.text import prepare_index_regex_for_dependency_process
    pattern = prepare_index_regex_for_dependency_process(indx)
    pattern = pattern.replace(re.escape(indx.title), re.escape(kwargs["old"]))
    title_pattern = ur'(^{}$)'.format(re.escape(kwargs["old"]))

    text_hist = HistorySet({"ref": {"$regex": pattern}})
    print "Cascading Text History {} to {}".format(kwargs['old'], kwargs['new'])
    for h in text_hist:
        h.ref = h.ref.replace(kwargs["old"], kwargs["new"], 1)
        h.save()

    link_hist = HistorySet({"new.refs": {"$regex": pattern}})
    print "Cascading Link History {} to {}".format(kwargs['old'], kwargs['new'])
    for h in link_hist:
        h.new["refs"] = [r.replace(kwargs["old"], kwargs["new"], 1) for r in h.new["refs"]]
        h.save()

    note_hist = HistorySet({"new.ref": {"$regex": pattern}})
    print "Cascading Note History {} to {}".format(kwargs['old'], kwargs['new'])
    for h in note_hist:
        h.new["ref"] = h.new["ref"].replace(kwargs["old"], kwargs["new"], 1)
        h.save()

    title_hist = HistorySet({"title": {"$regex": title_pattern}})
    print "Cascading Index History {} to {}".format(kwargs['old'], kwargs['new'])
    for h in title_hist:
        h.title = h.title.replace(kwargs["old"], kwargs["new"], 1)
        h.save()
Example #13
def _match_by_edit_distance(full_text, text_to_match):
    text_to_match = text_to_match.replace("-LRB-", "(").replace("-RRB-", ")")
    text_to_match = text_to_match.replace("-LCB-", "{").replace("-RCB-", "}")
    text_to_match = re.sub(r'\[\\\]\\\)\]$', ')', text_to_match)

    try:
        end_point = (text_to_match.index(" ") if " " in text_to_match else len(text_to_match))
        potential_matches = [full_text[m.start():(m.start() + len(text_to_match) + 1)] for m in 
                             re.finditer(re.escape(text_to_match[0:end_point]), full_text, re.U | re.I)]
    except:
        import sys

        print(full_text)
        print()
        print(text_to_match)
        sys.exit(1)
        
    if len(potential_matches) == 0:
        potential_matches = [full_text[m.start():(m.start() + len(text_to_match) + 1)] for m in 
                             re.finditer(re.escape(text_to_match[0]), full_text, re.U)]
    if len(potential_matches) == 0:
        text_to_match = text_to_match.replace("(", "[")
        potential_matches = [full_text[m.start():(m.start() + len(text_to_match) + 1)] for m in 
                             re.finditer(re.escape(text_to_match[0]), full_text, re.U)]

    potential_matches = [(p[0:p.rindex(text_to_match[-1])+1] 
                          if text_to_match[-1] in p and len(p) > len(text_to_match)
                          else p)
                         for p in potential_matches]

    if len(potential_matches) == 0:
        # No idea why this would ever happen, but it does
        return text_to_match

    match_with_lowest_edit_distance = ""
    lowest_edit_distance = -1
    for match in potential_matches:
        e_d = edit_distance(match, text_to_match)
        if lowest_edit_distance == -1 or e_d <= lowest_edit_distance:
            lowest_edit_distance = e_d
            match_with_lowest_edit_distance = match

    result = match_with_lowest_edit_distance.strip()
    if text_to_match[-1] in result:
        while result[-1] != text_to_match[-1]:
            result = result[0:-1]
    elif text_to_match[-1] == '"' and re.search(r'["”\u201d]', result):
        while result[-1] not in ['"', '”', "\u201d"]:
            result = result[0:-1]
    elif text_to_match[-1] not in [']', '}', ')'] and text_to_match[-2:] != "..":
        while result[-1] != text_to_match[-1]:
            result += full_text[full_text.index(result) + len(result)][-1]

    return result
Example #14
def process_index_title_change_in_links(indx, **kwargs):
    if indx.is_commentary():
        pattern = r'^{} on '.format(re.escape(kwargs["old"]))
    else:
        commentators = text.IndexSet({"categories.0": "Commentary"}).distinct("title")
        pattern = r"(^{} \d)|(^({}) on {} \d)".format(re.escape(kwargs["old"]), "|".join(commentators), re.escape(kwargs["old"]))
        #pattern = r'(^{} \d)|( on {} \d)'.format(re.escape(kwargs["old"]), re.escape(kwargs["old"]))
    links = LinkSet({"refs": {"$regex": pattern}})
    for l in links:
        l.refs = [r.replace(kwargs["old"], kwargs["new"], 1) if re.search(pattern, r) else r for r in l.refs]
        l.save()
Example #15
def process_index_title_change_in_notes(indx, **kwargs):
    print "Cascading Notes {} to {}".format(kwargs['old'], kwargs['new'])
    pattern = Ref(indx.title).regex()
    pattern = pattern.replace(re.escape(indx.title), re.escape(kwargs["old"]))
    notes = NoteSet({"ref": {"$regex": pattern}})
    for n in notes:
        try:
            n.ref = n.ref.replace(kwargs["old"], kwargs["new"], 1)
            n.save()
        except Exception:
            logger.warning("Deleting note that failed to save: {}".format(n.ref))
            n.delete()
Example #16
def process_index_title_change_in_links(indx, **kwargs):
    print "Cascading Links {} to {}".format(kwargs['old'], kwargs['new'])
    pattern = text.Ref(indx.title).regex()
    pattern = pattern.replace(re.escape(indx.title), re.escape(kwargs["old"]))
    links = LinkSet({"refs": {"$regex": pattern}})
    for l in links:
        l.refs = [r.replace(kwargs["old"], kwargs["new"], 1) if re.search(pattern, r) else r for r in l.refs]
        try:
            l.save()
        except InputError: #todo: this belongs in a better place - perhaps in abstract
            logger.warning("Deleting link that failed to save: {} - {}".format(l.refs[0], l.refs[1]))
            l.delete()
Example #17
def process_index_title_change_in_notes(indx, **kwargs):
    if indx.is_commentary():
        pattern = r'{} on '.format(re.escape(kwargs["old"]))
    else:
        commentators = IndexSet({"categories.0": "Commentary"}).distinct("title")
        pattern = r"(^{} \d)|(^({}) on {} \d)".format(re.escape(kwargs["old"]), "|".join(commentators), re.escape(kwargs["old"]))
    notes = NoteSet({"ref": {"$regex": pattern}})
    for n in notes:
        try:
            n.ref = n.ref.replace(kwargs["old"], kwargs["new"], 1)
            n.save()
        except Exception:
            pass #todo: log me, and wrap other handlers in try/catch
Example #18
def process_index_title_change_in_links(indx, **kwargs):
    print "Cascading Links {} to {}".format(kwargs['old'], kwargs['new'])
    patterns = [pattern.replace(re.escape(indx.title), re.escape(kwargs["old"]))
                for pattern in text.Ref(indx.title).regex(as_list=True)]
    queries = [{'refs': {'$regex': pattern}} for pattern in patterns]
    links = LinkSet({"$or": queries})
    for l in links:
        l.refs = [r.replace(kwargs["old"], kwargs["new"], 1) if re.search(u'|'.join(patterns), r) else r for r in l.refs]
        try:
            l.save()
        except InputError: #todo: this belongs in a better place - perhaps in abstract
            logger.warning("Deleting link that failed to save: {} - {}".format(l.refs[0], l.refs[1]))
            l.delete()
Example #19
 def __init__(self, leading_allow=None, trailing_allow=None):
     """
     :param list leading_allow: The leading punctuation characters to allow.
     :param list trailing_allow: The trailing punctuation characters to allow.
     """
     leading_pattern = "" if not leading_allow else r"[%s]*" % regex.escape("".join(leading_allow))
     trailing_pattern = "" if not trailing_allow else r"[%s]" % regex.escape("".join(trailing_allow))
     if trailing_pattern:
         super(OuterPunctuationFilter, self).__init__(
             r"%s[^\W_]+(?:$|.*[^\W_]%s*|%s*)" % (leading_pattern, trailing_pattern, trailing_pattern)
         )
     else:
         super(OuterPunctuationFilter, self).__init__(r"%s[^\W_](?:$|.*[^\W_])" % leading_pattern)
Example #20
def process_index_title_change_in_links(indx, **kwargs):
    if indx.is_commentary():
        pattern = r'^{} on '.format(re.escape(kwargs["old"]))
    else:
        commentators = text.IndexSet({"categories.0": "Commentary"}).distinct("title")
        pattern = ur"(^{} \d)|(^({}) on {} \d)".format(re.escape(kwargs["old"]), "|".join(commentators), re.escape(kwargs["old"]))
        #pattern = r'(^{} \d)|( on {} \d)'.format(re.escape(kwargs["old"]), re.escape(kwargs["old"]))
    links = LinkSet({"refs": {"$regex": pattern}})
    for l in links:
        l.refs = [r.replace(kwargs["old"], kwargs["new"], 1) if re.search(pattern, r) else r for r in l.refs]
        try:
            l.save()
        except InputError: #todo: this belongs in a better place - perhaps in abstract
            logger.warning("Deleting link that failed to save: {} {}".format(l.refs[0], l.refs[1]))
            l.delete()
Example #21
def match(pat=None, pat_args=None):
    """Generates a regular expression to match a dictionary entry.

    In pat, '$foo' matches the contents of pat_args['foo'] or nothing, if
    pat_args['foo'] exists. If it doesn't exist, it matches /.*?/. In
    pat_args['foo'], '$foo' matches /.*?/, and '$bar' is expanded as it would
    be in pat.

    Args:
        pat: (Optional) The format of the entry. Defaults to
            '$word$pron$pos$cl$de'.
        pat_args: (Optional) The expansions for variables. If pat is
            unspecified, defaults to {
                'pron': '/$pron/',
                'pos': ' - $pos',
                'cl': ' ($cl$subcl)',
                'subcl': '.$subcl',
                'de': ': $de'
            }, otherwise, defaults to {}

    Returns:
        A regular expression which will match a dictionary entry in the
        specified format. Fields mentioned in the pattern with '$' can be
        accessed as named capture groups of the match object.
    """
    if pat is None:
        pat = default_pat
        if pat_args is None:
            pat_args = default_pat_args
    pat_args = pat_args or {}
    args = {}
    for f in pat_args:
        args[f] = regex.escape(pat_args[f], True).replace(r'\$', '$')
        m = regex.search(var_matcher(f), args[f])
        while m is not None:
            sp = (m.start(1) - 1, m.end(1))
            args[f] = workers.slice_replace(args[f], sp, var_group(f))
            m = regex.search(var_matcher(f), args[f])
    pat = '^' + regex.escape(pat, True).replace(r'\$', '$') + '$'
    m = var_match.search(pat)
    while m is not None:
        sp = (m.start(1) - 1, m.end(1))
        f = m.group(1)
        pat = workers.slice_replace(pat, sp,
                                    '({})?'.format(args.get(f, var_group(f))))
        m = var_match.search(pat)
    pat = pat.replace(' ', r'\s+')
    return regex.compile(pat)
Example #22
def constructErrorRegex( Error, Context ):
    LEFTCONTEXT = 0
    RIGHTCONTEXT = 1

    LeftContext = regex.escape( Context[ LEFTCONTEXT ].lstrip( ) )

    RightContext = regex.escape( Context[ RIGHTCONTEXT ].rstrip( ) )

    minMaxLen = r"{" + str( int( len( Error ) * (1 / 2) ) ) + r"," + str( math.ceil( len( Error ) * 1.8 ) ) + r"}"

    fullPattern = r"(?:(?:" + LeftContext + r"){1s+1i+1d<=7})(?=." + minMaxLen + r"(?:(?:" + RightContext + "){" \
                                                                                                            "1s+1i+1d<=6}))" + \
                  r"(?P<errorMatch>(?:\w++[\-\']?\w*+)(?:(?=(?:" + RightContext + r"){1s+1i+1d<=6})|\W{1,2}))+?" + \
                  r"(?:(?:" + RightContext + "){1s+1i+2d<=6})"

    return regex.compile( fullPattern, regex.BESTMATCH | regex.V1 )
Example #23
    def _escape(self, match):
        '''Escape matched 'other' group value.'''
        groups = match.groupdict()
        if groups['other'] is not None:
            return _regex.escape(groups['other'])

        return groups['placeholder']
Example #24
def read_regex(path):
    path = ensure_path(path)
    with path.open() as file_:
        entries = file_.read().split('\n')
    expression = '|'.join(['^' + re.escape(piece)
                           for piece in entries if piece.strip()])
    return re.compile(expression)
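The expression it builds is just an anchored alternation of escaped prefixes; for hypothetical file contents:

import re
entries = ["Mr.", "Dr.", "St."]                   # hypothetical entries, one per line
expression = '|'.join('^' + re.escape(piece) for piece in entries if piece.strip())
print(expression)                                 # ^Mr\.|^Dr\.|^St\.
print(bool(re.compile(expression).match("Dr. Watson")))   # True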
Example #25
 def __init__(self, n, initial_count, true_words, feature_chart, hypothetical_phonotactics=False,
              score_combiner=lambda scores: reduce(mul, scores), subseq_counts=None, diphthongs=None):
     self._vowels = feature_chart.phones_for_features("+syllabic")
     self._vowel_re = re.compile("[" + ''.join([re.escape(vowel) for vowel in self._vowels if len(vowel) == 1]) + "]")
     self._diphthongs = set(diphthongs) if diphthongs is not None else set()
     super(SyllableNgramCue, self).__init__(n, initial_count, len(set(chain(*[self.syllabify(word) for word in true_words]))),
           hypothetical_phonotactics=hypothetical_phonotactics, score_combiner=score_combiner, subseq_counts=subseq_counts)
Example #26
 def get_pattern(cls):
     patterns = []
     for alternative in cls.alternatives:
         pattern = regex.escape(alternative)
         if regex.fullmatch(ur'\w', alternative[-1]):
             pattern += ur'\b'
         patterns.append(pattern)
Example #27
    def __init__(self, test_file, temp_dir, cache=False):
        self.tic = time.time()
        self.test_file = test_file
        self.temp_dir = temp_dir
        self.cache = cache

        self.failed = False
        self.lines = []
        clean_test_file_name = re.sub('^' + re.escape(test_root('data/')), '',
                                      test_file)
        self.say('{}', test_started(clean_test_file_name))
        self.say("Testing {}...", bold(clean_test_file_name))

        self.style = self._get_style()
        sh.mkdir('-p', fail_path(self.style))
        self.style_args = Test._get_style_options(self.style)
        if self.style_args:
            self.say("\tstyling: {}", shell_join(self.style_args))
        self.bib_args = Test._get_bib_options(test_file)
        if self.bib_args:
            self.say("\tbibliography: {}", self.bib_args)
        self.options = self.style_args + self.bib_args

        self.test_name = os.path.join(self.style, os.path.basename(test_file))
        self.test_out = os.path.join(self.temp_dir, self.test_name)
        self.test_err = self.test_out + '.err'
        _, ext = os.path.splitext(test_file)
        self.test_new = self.test_out + '.new.' + ext
Example #28
def add_spaces(text, exclude=None):
    if exclude:
        patt_exclude = regex.escape(exclude)
        patt_eng_cjk = regex.compile(u"([[%s]--%s])([%s])" % (CHAR_ENG_LEFT, patt_exclude, CHAR_CJK))
        patt_cjk_eng = regex.compile(u"([%s])([[%s]--%s])" % (CHAR_CJK, CHAR_ENG_RIGHT, patt_exclude))
    else:
        patt_eng_cjk = PATTERN_ENG_CJK
        patt_cjk_eng = PATTERN_CJK_ENG

    def add_space_func(index1, index2):
        def add_space(match):
            return u"%s %s" % (match.group(index1), match.group(index2))

        return add_space

    text = patt_cjk_eng.subn(add_space_func(1, 2), text)[0]
    text = patt_eng_cjk.subn(add_space_func(1, 2), text)[0]

    if not (exclude and '"' in exclude):
        # XXX"YYY"XXX -> XXX "YYY" XXX
        # where X and Y are CJK characters
        is_left_dquote = True
        is_left_squote = True
        out = StringIO.StringIO()
        for i in xrange(len(text)):
            prev_char = text[i - 1] if i > 0 else None
            cur_char = text[i]
            next_char = text[i + 1] if i < len(text) - 1 else None
            if cur_char == u'"':
                if is_left_dquote:
                    if _is_cjk(prev_char):
                        out.write(u' "')
                    else:
                        out.write(u'"')
                    is_left_dquote = False
                else:
                    if _is_cjk(next_char):
                        out.write(u'" ')
                    else:
                        out.write(u'"')
                    is_left_dquote = True
            elif cur_char == u"'":
                if is_left_squote:
                    if _is_cjk(prev_char):
                        out.write(u" '")
                    else:
                        out.write(u"'")
                    is_left_squote = False
                else:
                    if _is_cjk(next_char):
                        out.write(u"' ")
                    else:
                        out.write(u"'")
                    is_left_squote = True
            else:
                out.write(cur_char)
        text = out.getvalue()
        out.close()

    return text
Example #29
File: reports.py Project: thuvh/calibre
    def find_next_location(self, entry):
        from calibre.gui2.tweak_book.boss import get_boss
        boss = get_boss()
        if boss is None:
            return
        files = entry.usage
        current_editor_name = boss.currently_editing
        if current_editor_name not in files:
            current_editor_name = None
        else:
            idx = files.index(current_editor_name)
            before, after = files[:idx], files[idx+1:]
            files = [current_editor_name] + after + before + [current_editor_name]

        pat = regex.compile(regex.escape(entry.char))
        for file_name in files:
            from_cursor = False
            if file_name == current_editor_name:
                from_cursor = True
                current_editor_name = None
            ed = boss.edit_file_requested(file_name)
            if ed is None:
                return
            if ed.editor.find(pat, complete=not from_cursor):
                boss.show_editor(file_name)
                return True
        return False
Example #30
File: util.py Project: kunbud1989/spaCy
def compile_prefix_regex(entries):
    if '(' in entries:
        # Handle deprecated data
        expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
        return re.compile(expression)
    else:
        expression = '|'.join(['^' + piece for piece in entries if piece.strip()])
        return re.compile(expression)
Example #31
    def hightlight_keywords(self,
                            text,
                            keywords,
                            light_color='#ffea593d',
                            deep_color='#ffc107'):
        hightlighted_html = ''
        all_hightlights = []
        tokens = self.full_tokenize(text)
        for t in tokens:
            t['background_color'] = []

        for w in keywords:
            matches = regex.finditer(r'\b{}\b'.format(
                regex.escape(w, special_only=True)),
                                     text,
                                     flags=regex.IGNORECASE)
            all_hightlights.extend([{
                'start': m.start(),
                'end': m.end(),
                'text': m.group(),
            } for m in matches])

        all_hightlights = sorted(all_hightlights, key=lambda x: x['start'])

        for h in all_hightlights:
            for t in tokens:
                if (t['start'] >= h['start'] and t['end'] <= h['end']):
                    t['background_color'].append(light_color)

        for t in tokens:
            color_len = len(t['background_color'])
            if color_len == 0:
                hightlighted_html += t['text']
            elif color_len == 1:
                hightlighted_html += \
                    '<span style="background-color:{background_color};">{text}</span>'.format(
                    background_color=t['background_color'][0],
                    text=t['text']
                )
            else:
                hightlighted_html += \
                    '<span style="background-color:{background_color};">{text}</span>'.format(
                    background_color=deep_color,
                    text=t['text']
                )
        return hightlighted_html
Example #32
    def test_set_redirect_target(self):
        """Test set_redirect_target method."""
        # R1 redirects to R2 and R3 doesn't exist.
        site = self.get_site('en')
        p1 = pywikibot.Page(site, 'User:Legoktm/R2')
        p2 = pywikibot.Page(site, 'User:Legoktm/R1')
        p3 = pywikibot.Page(site, 'User:Legoktm/R3')

        text = p2.get(get_redirect=True)
        with self.assertRaisesRegex(
                IsNotRedirectPageError,
                r'{} is not a redirect page\.'.format(re.escape(str(p1)))):
            p1.set_redirect_target(p2)
        with self.assertRaisesRegex(NoPageError, NO_PAGE_RE):
            p3.set_redirect_target(p2)
        p2.set_redirect_target(p1, save=False)
        self.assertEqual(text, p2.get(get_redirect=True))
Example #33
File: text.py Project: osewadare/calibre
 def go_to_anchor(self, anchor):
     if anchor is TOP:
         c = self.textCursor()
         c.movePosition(c.Start)
         self.setTextCursor(c)
         return True
     base = r'''%%s\s*=\s*['"]{0,1}%s''' % regex.escape(anchor)
     raw = unicode(self.toPlainText())
     m = regex.search(base % 'id', raw)
     if m is None:
         m = regex.search(base % 'name', raw)
     if m is not None:
         c = self.textCursor()
         c.setPosition(m.start())
         self.setTextCursor(c)
         return True
     return False
Example #34
def find_sites_that_may_have_removed_linker(last_linker_activity_day=20):
    """
    Checks for each site whether there has been a webpage hit with the linker in the last `last_linker_activity_day` days
    Prints an alert for each site that doesn't meet this criterion
    """
    sites_to_delete = {}
    sites_to_keep = {}
    from datetime import datetime, timedelta
    last_active_threshold = datetime.today() - timedelta(
        days=last_linker_activity_day)
    webpages_without_websites = 0
    for data in get_website_cache():
        if data["is_whitelisted"]:  # we only care about whitelisted sites
            for domain in data['domains']:
                ws = WebPageSet({"url": {
                    "$regex": re.escape(domain)
                }},
                                limit=1,
                                sort=[['lastUpdated', -1]])
                keep = True
                if ws.count() == 0:
                    sites_to_delete[domain] = f"{domain} has no pages"
                    keep = False
                else:
                    webpage = ws[0]  # lastUpdated webpage for this domain
                    website = webpage.get_website()
                    if website:
                        website.linker_installed = webpage.lastUpdated > last_active_threshold
                        if not website.linker_installed:
                            keep = False
                            print(f"Alert! {domain} has removed the linker!")
                            sites_to_delete[
                                domain] = f"{domain} has {website.num_webpages} pages, but has not used the linker in {last_linker_activity_day} days. {webpage.url} is the oldest page."
                    else:
                        print(
                            "Alert! Can't find website {} corresponding to webpage {}"
                            .format(data["name"], webpage.url))
                        webpages_without_websites += 1
                        continue
                if keep:
                    assert domain not in sites_to_delete
                    sites_to_keep[domain] = True
    if webpages_without_websites > 0:
        print("Found {} webpages without websites".format(
            webpages_without_websites))
    return sites_to_delete
Example #35
    def check_path(self, url):
        self.logger.info(f'Checking for url {url}')
        blacklisted_paths = ['/', '/index.php', None, '']
        vulnsss = []

        places = ["true_vulns", "almost_true", "probable_vulns"]
        for place in places:
            vulnsss.extend(self.vulns[place])

        if url not in blacklisted_paths:
            for doc in self.collection.find(
                {"URI": {
                    '$regex': regex.escape(url)
                }}):
                vuln = doc.get('Vulnerability')
                if vuln not in vulnsss:
                    self.vulns["possible_vulns"].append(vuln)
Example #36
    def _update_processed_text(self, original_number_list):
        """
        Replaces each detected date with a tag generated from the entity_name used to initialize the object

        A final string with all dates replaced will be stored in object's tagged_text attribute
        A string with all dates removed will be stored in object's processed_text attribute

        Args:
            original_number_list (list): list of substrings of original text to be replaced with tag
                                       created from entity_name
        """
        for detected_text in original_number_list:
            _pattern = re.compile(self._SPAN_BOUNDARY_TEMPLATE.format(
                re.escape(detected_text)),
                                  flags=_re_flags)
            self.tagged_text = _pattern.sub(self.tag, self.tagged_text, 1)
            self.processed_text = _pattern.sub('', self.processed_text, 1)
Example #37
    def cleanData(self, text):
        clean = text.lower().encode('utf-8')
        clean = clean.replace(b"\n", b" ")
        clean = clean.replace(b"\t", b" ")
        clean = clean.replace(b"\b", b" ")
        clean = clean.replace(b"\r", b" ")

        # strip punctuation, then digits, then collapse and trim whitespace
        exclude = r.compile(
            b'[%s]' % re.escape(string.punctuation.encode('utf-8')))
        clean = b" ".join(exclude.sub(b'', token) for token in clean.split())
        clean = r.sub(rb"\d+", b" ", clean)
        clean = r.sub(rb"\s+", b" ", clean)
        clean = r.sub(rb"\s+$", b"", clean)

        return clean.decode('utf-8')
Example #38
def regexizeTagGlob(tag):
    '''
    Returns:
        a regular expression string with ** and * interpreted as tag globs

    Precondition:
        tag is a valid tagmatch

    Notes:
        A single asterisk will replace exactly one dot-delimited component of a tag
        A double asterisk will replace one or more of any character.

        The returned string does not contain a starting '^' or trailing '$'.
    '''
    return ReRegex.sub(
        lambda m: r'([^.]+?)' if m.group(1) is None else r'(.+)',
        regex.escape(tag))
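A sketch of the conversion, assuming ReRegex (not shown above) matches a single or double escaped asterisk, with the second asterisk captured in group 1:

import regex
ReRegex = regex.compile(r'\\\*(\\\*)?')           # assumed definition of ReRegex
print(regexizeTagGlob('foo.*.bar'))               # foo\.([^.]+?)\.bar
print(regexizeTagGlob('foo.**'))                  # foo\.(.+)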
Example #39
        def multiple_replace(string, rep_dict):
            """
            Replace multiple string patterns simultaneously.

            Args:
                string: The string to be replaced.
                rep_dict: Dictionary containing key and values as patterns that
                    should be replaced.

            Returns:
                On success: The string with all the patterns replaced.
                On failure: False.

            """
            pattern = re.compile(
                "|".join([re.escape(k) for k in rep_dict.keys()]), re.M)
            return pattern.sub(lambda x: rep_dict[x.group(0)], string)
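Because every key goes into one alternation and the substitution happens in a single pass, replacements cannot cascade into one another:

rep = {"cat": "dog", "dog": "cat"}
print(multiple_replace("cat chases dog", rep))    # dog chases cat (no re-replacement)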
Example #40
def _process_regex_dict(regex_dict, regex_escape=False, **kwargs):
    regex_pattern_list = []
    for key, val in regex_dict.items():
        new_val = list()
        for item in val:
            if regex_escape:
                new_val.append(regex.escape(item))
            else:
                new_val.append(item)
        # regex_dict[key] = new_val
        regex_pattern_list += new_val
    #end for
    emoticon_regex_pattern = '|'.join(regex_pattern_list)


    # return '(' + '|'.join(regex_pattern_list) + r')*[.?!]\s*'
    return '(((' + emoticon_regex_pattern + r')\s*)*([.!?]+\s+))|(((' + emoticon_regex_pattern + r')\s*)+([.!?]*\s+))'
Example #41
def check_urls(index, question, answer):
    global full_sentence_valid_url

    full_sentence_valid_url = False
    valid_url = False

    # Disabled
    if score_settings['incorrect_url_modifier_value'] is None:
        return 0

    # Find all urls in the sentence
    for url in re.finditer(
            'http(?:s?):(//([^/]*?)/(?:[^ ])*?(?=$|[' +
            re.escape(score_settings['url_delimiters']) + ']))?', answer):

        # Check if result is in cache already and return it
        if url_cache[url.group(0)][1] > time.time():
            if url_cache[url.group(0)][0] == 0:
                return score_settings['incorrect_url_modifier_value']

        # Url not in cache - check it
        else:

            # Send HEAD request and check HTTP response code
            try:
                request = requests.head(url.group(0))
                code = request.status_code
            except Exception as e:
                code = 0

            # Add to cache
            url_cache[url.group(0)] = [
                1 if code == 200 else 0,
                time.time() + 86400
            ]

            # If code is different than 200 - return modifier value
            if code != 200:
                return score_settings['incorrect_url_modifier_value']

        # Check if it's full sentence url
        valid_url = (len(url.group(0)) == len(answer))

    # Everything ok - set whether it was a full-sentence url and return 0
    full_sentence_valid_url = valid_url
    return 0
Example #42
def process_index_title_change_in_history(indx, **kwargs):
    """
    Update all history entries which reference 'old' to 'new'.
    """
    print "Cascading History {} to {}".format(kwargs['old'], kwargs['new'])
    if indx.is_commentary():
        pattern = ur'{} on '.format(re.escape(kwargs["old"]))
        title_pattern = ur'(^{}$)|({} on)'.format(re.escape(kwargs["old"]),
                                                  re.escape(kwargs["old"]))
    else:
        pattern = text.Ref(indx.title).base_text_and_commentary_regex()
        pattern = pattern.replace(re.escape(indx.title),
                                  re.escape(kwargs["old"]))
        commentators = text.library.get_commentary_version_titles_on_book(
            kwargs["old"], with_commentary2=True)
        title_pattern = ur'(^{}$)|(^({}) on {}$)'.format(
            re.escape(kwargs["old"]), "|".join(commentators),
            re.escape(kwargs["old"]))

    text_hist = HistorySet({"ref": {"$regex": pattern}})
    print "Cascading Text History {} to {}".format(kwargs['old'],
                                                   kwargs['new'])
    for h in text_hist:
        h.ref = h.ref.replace(kwargs["old"], kwargs["new"], 1)
        h.save()

    link_hist = HistorySet({"new.refs": {"$regex": pattern}})
    print "Cascading Link History {} to {}".format(kwargs['old'],
                                                   kwargs['new'])
    for h in link_hist:
        h.new["refs"] = [
            r.replace(kwargs["old"], kwargs["new"], 1) for r in h.new["refs"]
        ]
        h.save()

    note_hist = HistorySet({"new.ref": {"$regex": pattern}})
    print "Cascading Note History {} to {}".format(kwargs['old'],
                                                   kwargs['new'])
    for h in note_hist:
        h.new["ref"] = h.new["ref"].replace(kwargs["old"], kwargs["new"], 1)
        h.save()

    title_hist = HistorySet({"title": {"$regex": title_pattern}})
    print "Cascading Index History {} to {}".format(kwargs['old'],
                                                    kwargs['new'])
    for h in title_hist:
        h.title = h.title.replace(kwargs["old"], kwargs["new"], 1)
        h.save()
Example #43
    def full_regex(self, title, lang, anchored=True, compiled=True, **kwargs):
        """
        :return: Regex object. If for_js == True, returns the Regex string
        :param for_js: Defaults to False
        :param match_range: Defaults to False

        A call to `full_regex("Bereishit", "en", for_js=True)` returns the follow regex, expanded here for clarity :
        ```
        Bereishit                       # title
        [,.: \r\n]+                     # a separator (self.after_title_delimiter_re)
        (?:                             # Either:
            (?:                         # 1)
                (\d+)                   # Digits
                (                       # and maybe
                    [,.: \r\n]+         # a separator
                    (\d+)               # and more digits
                )?
            )
            |                           # Or:
            (?:                         # 2: The same
                [[({]                   # With beginning
                (\d+)
                (
                    [,.: \r\n]+
                    (\d+)
                )?
                [])}]                   # and ending brackets or parens or braces around the numeric portion
            )
        )
        (?=                             # and then either
            [.,;?! })<]                 # some kind of delimiting character coming after
            |                           # or
            $                           # the end of the string
        )
        ```
        Different address type / language combinations produce different internal regexes in the innermost portions of the above, where the comments say 'digits'.

        """
        reg = ur"^" if anchored else ""
        reg += regex.escape(title) + self.after_title_delimiter_re
        addr_regex = self.address_regex(lang, **kwargs)
        reg += ur'(?:(?:' + addr_regex + ur')|(?:[\[({]' + addr_regex + ur'[\])}]))'  # Match expressions with internal parenthesis around the address portion
        reg += ur"(?=\W|$)" if not kwargs.get(
            "for_js") else ur"(?=[.,:;?! })\]<]|$)"
        return regex.compile(reg, regex.VERBOSE) if compiled else reg
Example #44
def safe_replace(original, to_replace, replacement):
    """
    Utility that will replace the string except in the HTML tag attributes
    :param original: original string
    :param to_replace: string to replace
    :param replacement: replacement string
    :return: new string with the replacement done
    """
    def _replace(match):
        if match.group(1):
            return match.group(0)
        else:
            return replacement

    replace_regex = re.escape(to_replace.replace("\\", ""))
    in_attr = r'((?:<[^<>]*?"[^<>]*?){1}' + replace_regex + r'(?:[^<>]*?"[^<>]*?>){1})'
    regex = in_attr + r"|(" + replace_regex + r")"
    return re.sub(regex, _replace, original)
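A usage sketch showing that the occurrence inside the quoted attribute survives while the text node is replaced:

import re
html = '<a href="foo">foo</a>'
print(safe_replace(html, 'foo', 'bar'))           # <a href="foo">bar</a>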
Example #45
    def __convertterminalvalue(self, prefix, value, **kwargs):
        """Internal method converts "terminal values" to characters

            Parameters:
                prefix -> The prefix (like x for hex, b for binary, d for decimal)
                value -> The actual number (without the prefix and the %)
            Returns:
                char -> The character representation of the terminal value \
                    as string (escaped if needed)
        """
        if prefix == "b":
            char = chr(int(value, 2))             # binary digits -> code point
        if prefix == 'd':
            char = chr(int(value))                # decimal digits -> code point
        if prefix == 'x':
            char = binascii.unhexlify(value).decode()
        return regex.escape(char)
Example #46
    def clean_article(self, art):
        # remove unnecessary fields in the title like newspaper name etc.
        titles = art['title'].split('|')
        art['title'] = max(titles, key=len)

        # remove title from the body content
        # boilerpipe tends to include the title also in the body content
        pattern = regex.compile('({}){{e<=5}}'.format(
            regex.escape(art['title'])))
        match = pattern.search(art['body'])
        if match:
            end_idx = match.span()[1]
            art['body'] = art['body'][end_idx:]

        art['title'] = self._strip_txt(art['title'])
        art['body'] = self._strip_txt(art['body'])

        return art
Example #47
def text2sentences(path):
    '''
    Converts a raw text from path to tokenized sentences 
    '''
    concat_text = text_concat(path)
    tok_sentences = []
    # Split at each symbol that ends a sentence e.g. '.?!'
    sents = re.split(r'[.?!]', concat_text)

    # Get rid of punctuation
    punct = re.compile('[%s]' % re.escape(string.punctuation))
    # Tokenize
    for phrase in sents:
        no_punct = punct.sub('', phrase)
        tok_sentences.append(no_punct.lower().split())

    #return undersample_sentences(tok_sentences)
    return tok_sentences  #comment this line & uncomment the one above if we want to use the undersampling
Example #48
def xmatch(lookup_value, lookup_array, match_type=1):
    res = [Error.errors['#N/A']]
    t_id = _get_type_id(lookup_value)
    if match_type > 0:
        def check(j, x, val, r):
            if x <= val:
                r[0] = j
                return x == val and j > 1
            return j > 1

    elif match_type < 0:
        def check(j, x, val, r):
            if x < val:
                return True
            r[0] = j
            return v == val

    else:
        t_id = _get_type_id(lookup_value)
        if t_id == 1:
            def sub(m):
                return {'\\': '', '?': '.', '*': '.*'}[m.groups()[0]]

            match = regex.compile(r'^%s$' % regex.sub(
                r'(?<!\\\~)\\(?P<sub>[\*\?])|(?P<sub>\\)\~(?=\\[\*\?])', sub,
                regex.escape(lookup_value)
            ), regex.IGNORECASE).match
        else:
            match = lambda x: x == lookup_value

        # noinspection PyUnusedLocal
        def check(j, x, val, r):
            if match(x):
                r[0] = j

    convert = lambda x: x
    if t_id == 1:
        convert = lambda x: x.upper()

    lookup_value = convert(lookup_value)
    for i, v in _yield_vals(t_id, lookup_array):
        if check(i, convert(v), lookup_value, res):
            break
    return res[0]
Example #49
def merge_vocab(pair: Tuple[str, str],
                input_vocab: Dict[str, int]) -> Tuple[Dict[str, int], List]:
    """
    >>> pair = ('w', 'o')
    >>> input_vocab = {'b i r d @': 3, 'w o r d @': 7, 'w o g @': 13}
    >>> new_vocab, new_pairs = merge_vocab(pair, input_vocab)
    >>> new_vocab
    {'b i r d @': 3, 'wo r d @': 7, 'wo g @': 13}
    >>> new_pairs
    [(('wo', 'r'), 7), (('o', 'r'), -7), (('wo', 'g'), 13), (('o', 'g'), -13)]
    """
    output_vocab = {}
    concat_pair_with_space = ' '.join(pair)
    concat_pair_with_space_escaped = regex.escape(concat_pair_with_space)
    concat_pair = ''.join(pair)
    reg = regex.compile('(^|[^ ]+ )(' + concat_pair_with_space_escaped +
                        ')( [^ ]+|$)')
    added_pairs = []
    for word in input_vocab:
        word_occurences = input_vocab[word]
        match = reg.search(word)
        while match:
            # word changed
            if match.group(1) != '':
                subtoken_before = match.group(1)[:-1]
                added_pairs.append(
                    ((subtoken_before, concat_pair), word_occurences))
                if pair != (subtoken_before, pair[0]):
                    added_pairs.append(
                        ((subtoken_before, pair[0]), -word_occurences))
            if match.group(3) != '':
                subtoken_after = match.group(3)[1:]
                added_pairs.append(
                    ((concat_pair, subtoken_after), word_occurences))
                if pair != (pair[1], subtoken_after):
                    added_pairs.append(
                        ((pair[1], subtoken_after), -word_occurences))
            start, end = match.span(2)
            replacement = concat_pair
            word = word[:start] + replacement + word[end:]
            match = reg.search(word)
        output_vocab[word] = word_occurences
    return output_vocab, added_pairs
Example #50
 def __init__(self):
     self.window = 5
     self.entity_prior = {}
     self.me_prob = {}
     self.mention_cand = {}
     self.m_count = {}
     self.punc = re.compile('[%s]' % re.escape(string.punctuation))
     self.log_file = ''
     self.total_p = 0
     self.total_tp = 0
     self.doc_actual = 0
     self.mention_actual = 0
     self.total_cand_num = 0
     self.miss_senses = set()
     self.gamma = 0.1        # to smooth the pem
     self.is_local = False
     self.is_global = False
     self.is_prior = False
     self.input_path = ''
Example #51
def jump_to_location(loc):
    from calibre.gui2.tweak_book.boss import get_boss
    boss = get_boss()
    if boss is None:
        return
    name = loc.name
    editor = boss.edit_file_requested(name)
    if editor is None:
        return
    editor = editor.editor
    if loc.line_number is not None:
        block = editor.document().findBlockByNumber(loc.line_number - 1)  # blockNumber() is zero based
        if not block.isValid():
            return
        c = editor.textCursor()
        c.setPosition(block.position(), c.MoveAnchor)
        editor.setTextCursor(c)
        if loc.text_on_line is not None:
            editor.find(regex.compile(regex.escape(loc.text_on_line)))
Example #52
File: dep.py Project: solsword/quiche
    def decorate(function):
        global TARGET_GENERATORS

        tre = re.escape(output.format(iter=DC1, next=DC2))
        tre = tre.replace(DC1, r"(?P<iter>[0-9]+)")
        tre = tre.replace(DC2, r"(?P<next>[0-9]+)")

        def gen_target(name_match, stuff):
            inputs, params, function, flags = stuff

            try:
                ival = int(name_match.group("iter"))
            except IndexError:
                ival = None
            try:
                nval = int(name_match.group("next"))
            except IndexError:
                nval = None

            if ival is None and nval is not None:
                if nval <= 0:
                    ival = "start"
                else:
                    ival = nval - 1
            elif ival is not None and nval is None:
                nval = ival + 1
            elif ival is None or nval is None:
                ival = "start"
                nval = 0

            inputs = [inp.format(iter=ival, next=nval) for inp in inputs]
            params = [param.format(iter=ival, next=nval) for param in params]

            def wrapped(*args, **kwargs):
                nonlocal function, nval
                return function(nval, *args, **kwargs)

            wrapped.__name__ = function.__name__
            return inputs, params, wrapped, flags

        TARGET_GENERATORS[tre] = (gen_target, (inputs, params, function,
                                               flags))
        return function
Example #53
    def mask_by_term(self,
                     orig_source,
                     orig_target: Optional[str] = None,
                     prob=1.0):
        """
        Masks using dictionary entries.
        """

        source_masks = []
        source = orig_source
        target = orig_target
        for term, (translation, label) in self.terms.items():
            if term in source:
                pattern = r'\b{}\b'.format(re.escape(term))
                source, target, term_masks = self.get_label_masks(
                    label, pattern, translation, source, target, prob)
                source_masks.extend(term_masks)

        return source, target, source_masks
Example #54
def get_search_regex(state):
    raw = state['find']
    if state['mode'] != 'regex':
        raw = regex.escape(raw, special_only=True)
    flags = REGEX_FLAGS
    if not state['case_sensitive']:
        flags |= regex.IGNORECASE
    if state['mode'] == 'regex' and state['dot_all']:
        flags |= regex.DOTALL
    if state['direction'] == 'up':
        flags |= regex.REVERSE
    ans = regex_cache.get((flags, raw), None)
    if ans is None:
        try:
            ans = regex_cache[(flags, raw)] = regex.compile(raw, flags=flags)
        except regex.error as e:
            raise InvalidRegex(raw, e)

    return ans
Example #55
    def lookup(self, lemma: str) -> str:
        """Perform match of a lemma against headwords. If more than one match,
        then return the concatenated entries. For example:

        >>> from cltk.lexicon.lat import LatinLewisLexicon
        >>> lll = LatinLewisLexicon(interactive=False)
        >>> lll.lookup("clemens")[:50]
        'clēmēns entis (abl. -tī; rarely -te, L.), adj. wit'
        >>> all(word in lll.lookup("levis") for word in ["levis","lēvis"]) # Test for concatenated entries
        True
        >>> lll.lookup("omnia")
        ''
        >>> lll.lookup(".")
        ''
        >>> lll.lookup("123")
        ''
        >>> lll.lookup("175.")
        ''
        >>> lll.lookup("(") # Test for regex special character
        ''
        """
        if not self.entries:
            raise CLTKException(
                "No lexicon entries found in the .yaml file. This should never happen."
            )

        if regex.match(r"^[0-9\.\?,\:;\!\<\>\-]*$", lemma) is not None:
            return ""

        lemma = regex.escape(lemma.lower())

        keys = self.entries.keys()
        matches = [
            key for key in keys if regex.match(rf"^{lemma}[0-9]?$", key)
        ]
        n_matches = len(matches)
        if n_matches > 1:
            return "\n".join([self.entries[key] for key in matches])
        elif n_matches == 1:
            return self.entries[matches[0]]
        else:
            return ""
Example #56
    def __ior__(self, other):
        # TODO I think there is some crucial error in this OR construction,
        # related to the fact that in regex an additional alternative gets an
        # additional Count; however, such Counts, which end up empty because
        # another branch was used, should usually not appear in the output,
        # but just get omitted - more bookkeeping needed

        # TODO e.g. Delim.join_optional does not seem to work with pyparsing_regex
        # as the OR construction builds new Count(). There is something for this,
        # namely OR-construction with the same group-number; however, one would
        # have to indicate that here, which is not done in general - it seems
        # to be an implementation detail of pyparsing-regex, unfortunately...
        if isinstance(other, basestring):
            other = ParserElement(regex.escape(other))

        self.structure += other.structure
        self.pattern += "|" + other.pattern
        self.name += "|" + other.name
        self.group(pseudo=True, liftkeys=True, silent=True)
        return self
Example #57
    def search_paths(self):
        paths = [self.path]

        path_without_extensions = self.path
        extensions = self.extensions

        for ext in extensions:
            path_without_extensions = re.sub(re.escape(ext), '', path_without_extensions)

        if '/' not in path_without_extensions:
            paths.append(
                os.path.join(path_without_extensions, 'component.json'))

        if re.sub('|'.join([re.escape(ext) for ext in extensions]), '',
                  os.path.basename(self.path)) != 'index':
            paths.append(
                os.path.join(path_without_extensions,
                             "index%s" % ''.join(extensions)))

        return paths
Example #58
def remove_punctuation(text: str, marks=None) -> str:
    """
    Remove punctuation from ``text`` by replacing all instances of ``marks`` with whitespace.

    Args:
        text (str): raw text
        marks (str): If specified, remove only the characters in this string,
            e.g. ``marks=',;:'`` removes commas, semi-colons, and colons.
            Otherwise, all punctuation marks are removed.
    Returns:
        str: returns a ``str`` object containing normalized text.
    Note:
        When ``marks=None``, Python's built-in :meth:`str.translate()` is
        used to remove punctuation; otherwise, a regular expression is used
        instead. The former's performance is about 5-10x faster.
    """
    if marks:
        return re.sub('[{}]+'.format(re.escape(marks)), ' ', text, flags=re.UNICODE)

    return text.translate(PUNCTUATION_TRANSLATE_UNICODE)
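A quick check with marks given (the marks=None branch needs the PUNCTUATION_TRANSLATE_UNICODE table, which is not shown here):

import re
print(remove_punctuation("Hello, world; fine:", marks=',;:'))   # "Hello  world  fine "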
Example #59
 def __search_reference(self, string):
     string = regex.sub(r'({0})'.format(CiteDetector.year), ' \\1', string)
     elem = regex.split(r'[,\. ]+', string)
     elem = [ regex.escape(e) for e in elem if not CiteDetector.plural.search(e) ] 
     reg = regex.compile('\W.*?'.join(elem), regex.IGNORECASE)
     match = []
     for ref in self.references:
         m = reg.search(ref['text'])
         if m:
             match.append({
                 'cid'  : ref['id'],
                 'start': m.start(),
                 'total': len(m.group())
             })
     if len(match) > 1:
         if regex.search('et[,\. ]+al|others', string):
             match = sorted(match, key=lambda x:(x['start'],-x['total']))
         else:
             match = sorted(match, key=lambda x:(x['start'],x['total']))
     return match
Example #60
    def __init__(self, lang):
        self.lang = lang
        self.ntokb = NatLoader.load(lang)

        self.re_saint = r'({})\s'.format(
            '|'.join(self.reEscapeSet(self.getSaintVariants()))
        )  # Regex for all inflections of "Svatý" (or particular language variant)
        self.saint_abb = self.getSaintAbb().strip().rstrip('.')
        escaped_saint_abb = regex.escape(self.saint_abb)
        self.re_saint_abb_only = r'{}\s'.format(
            escaped_saint_abb)  # Regex for "Sv " for example in "Sv Jan"
        self.re_saint_abb_dot = r'{}\.\s?'.format(
            escaped_saint_abb
        )  # Regex for "Sv. " for example in "Sv. Jan" or "Sv." in "Sv.Jan"
        self.re_saint_abbs = r'({}|{})'.format(
            self.re_saint_abb_dot,
            self.re_saint_abb_only)  # common regex for both of previous 2
        self.persons = EntityLoader.load(module='persons',
                                         lang=lang,
                                         initiate='Persons')