def guess_split(majiribun, reading):
    kanjis = []
    matchreg_greedy = ''
    matchreg_nongreedy = ''
    for char in majiribun:
        if kanji_re.match(char):
            kanjis.append(char)
            # \p{Hiragana} requires the third-party `regex` module
            matchreg_greedy += r"(\p{Hiragana}+)"
            matchreg_nongreedy += r"(\p{Hiragana}+?)"
        else:
            matchreg_greedy += re.escape(char)
            matchreg_nongreedy += re.escape(char)
    m = re.match(matchreg_greedy + '$', reading)
    if m:
        yomis = m.groups()
        yomis_nongreedy = re.match(matchreg_nongreedy + '$', reading).groups()
        if yomis != yomis_nongreedy:
            # Ambiguous!
            return None
        d = {}
        for idx in range(0, len(kanjis)):
            d[kanjis[idx]] = yomis[idx]
        return d
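# Usage sketch for guess_split. The snippet above doesn't show its module
# globals, so these are assumptions: `re` must be the third-party regex module
# (stdlib re has no \p{Hiragana}) and kanji_re matches one CJK ideograph.
import regex as re
kanji_re = re.compile(r'\p{Han}')
print(guess_split(u'食べる', u'たべる'))  # {'食': 'た'}
print(guess_split(u'見る', u'みる'))      # {'見': 'み'}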
def process_index_delete_in_links(indx, **kwargs):
    if indx.is_commentary():
        pattern = ur'^{} on '.format(re.escape(indx.title))
    else:
        commentators = text.IndexSet({"categories.0": "Commentary"}).distinct("title")
        pattern = ur"(^{} \d)|^({}) on {} \d".format(re.escape(indx.title), "|".join(commentators), re.escape(indx.title))
    LinkSet({"refs": {"$regex": pattern}}).delete()
def dep_counts(name):
    ref_patterns = {
        'alone': r'^{} \d'.format(re.escape(name)),
        'commentor': r'{} on'.format(re.escape(name)),
        'commentee': r'on {} \d'.format(re.escape(name))
    }
    commentee_title_pattern = r'on {}'.format(re.escape(name))
    ret = {
        'version title exact match': text.VersionSet({"title": name}).count(),
        'version title match commentor': text.VersionSet({"title": {"$regex": ref_patterns["commentor"]}}).count(),
        'version title match commentee': text.VersionSet({"title": {"$regex": commentee_title_pattern}}).count(),
        'history title exact match': history.HistorySet({"title": name}).count(),
        'history title match commentor': history.HistorySet({"title": {"$regex": ref_patterns["commentor"]}}).count(),
        'history title match commentee': history.HistorySet({"title": {"$regex": commentee_title_pattern}}).count(),
    }
    for pname, pattern in ref_patterns.items():
        ret.update({
            'note match ' + pname: note.NoteSet({"ref": {"$regex": pattern}}).count(),
            'link match ' + pname: link.LinkSet({"refs": {"$regex": pattern}}).count(),
            'history refs match ' + pname: history.HistorySet({"ref": {"$regex": pattern}}).count(),
            'history new refs match ' + pname: history.HistorySet({"new.refs": {"$regex": pattern}}).count()
        })
    return ret
def __init__(self, charset: Union[Dict[str, Sequence[int]], Sequence[str], str]) -> None:
    """
    Builds a codec converting between graphemes/code points and integer
    label sequences.

    charset may either be a string, a list or a dict. In the first case each
    code point will be assigned a label, in the second case each string in
    the list will be assigned a label, and in the final case each key string
    will be mapped to the value sequence of integers. In the first two cases
    labels will be assigned automatically.

    As 0 is the blank label in a CTC output layer, output labels and input
    dictionaries are/should be 1-indexed.

    Args:
        charset (unicode, list, dict): Input character set.
    """
    if isinstance(charset, dict):
        self.c2l = charset
    else:
        self.c2l = {k: [v] for v, k in enumerate(sorted(charset), start=1)}
    # map integer labels to code points because regex only works with strings
    self.l2c = {}  # type: Dict[str, str]
    for k, v in self.c2l.items():
        self.l2c[''.join(chr(c) for c in v)] = k
    # sort prefixes for c2l regex
    self.c2l_regex = regex.compile(r'|'.join(regex.escape(x) for x in sorted(self.c2l.keys(), key=len, reverse=True)))
    # sort prefixes for l2c regex
    self.l2c_regex = regex.compile(r'|'.join(regex.escape(x) for x in sorted(self.l2c.keys(), key=len, reverse=True)))
def __init__(self, keywords, fuzzy_min_len=None):
    """Initialize search
    """
    if fuzzy_min_len is None:
        fuzzy_min_len = []
    self.fuzzy_min_len = sorted(fuzzy_min_len)
    self.keywords = {}
    for i, k in keywords:
        k = k.strip().lower()
        if k not in self.keywords:
            self.keywords[k] = i
        else:
            print("ERROR: found duplicate keyword '{0}'".format(k))
    print("Number of unique keyword IDs to search: {0}".format(len(self.keywords)))
    kw = []
    for k in self.keywords:
        d = self.get_allow_distance(k)
        if d:
            # {e<=d} fuzzy matching requires the third-party regex module
            kw.append(r'(?:{0}){{e<={1}}}'.format(re.escape(k), d))
        else:
            kw.append(re.escape(k))
    re_str = '|'.join(kw)
    re_str = r'\b(?:{0})\b'.format(re_str)
    self.re_keywords = re.compile(re_str)
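# The (?:...){e<=d} pieces built above are the third-party regex module's
# fuzzy-matching syntax, allowing up to d edits per keyword (so the `re`
# used here must be that module, not stdlib re). A standalone sketch of
# the pattern shape this __init__ produces:
import regex
pat = regex.compile(r'\b(?:(?:colour){e<=1}|cat)\b')
print(bool(pat.search('one color here')))  # True: "color" is one edit away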
def dep_counts(name):
    commentators = model.IndexSet({"categories.0": "Commentary"}).distinct("title")
    ref_patterns = {
        'alone': r'^{} \d'.format(re.escape(name)),
        'commentor': r'{} on'.format(re.escape(name)),
        'commentee': r'^({}) on {} \d'.format("|".join(commentators), re.escape(name))
    }
    commentee_title_pattern = r'^({}) on {} \d'.format("|".join(commentators), re.escape(name))
    ret = {
        'version title exact match': model.VersionSet({"title": name}).count(),
        'version title match commentor': model.VersionSet({"title": {"$regex": ref_patterns["commentor"]}}).count(),
        'version title match commentee': model.VersionSet({"title": {"$regex": commentee_title_pattern}}).count(),
        'history title exact match': model.HistorySet({"title": name}).count(),
        'history title match commentor': model.HistorySet({"title": {"$regex": ref_patterns["commentor"]}}).count(),
        'history title match commentee': model.HistorySet({"title": {"$regex": commentee_title_pattern}}).count(),
    }
    for pname, pattern in ref_patterns.items():
        ret.update({
            'note match ' + pname: model.NoteSet({"ref": {"$regex": pattern}}).count(),
            'link match ' + pname: model.LinkSet({"refs": {"$regex": pattern}}).count(),
            'history refs match ' + pname: model.HistorySet({"ref": {"$regex": pattern}}).count(),
            'history new refs match ' + pname: model.HistorySet({"new.refs": {"$regex": pattern}}).count()
        })
    return ret
def process_index_title_change_in_history(indx, **kwargs):
    """
    Update all history entries which reference 'old' to 'new'.
    """
    if indx.is_commentary():
        pattern = r'{} on '.format(re.escape(kwargs["old"]))
        title_pattern = r'(^{}$)|({} on)'.format(re.escape(kwargs["old"]), re.escape(kwargs["old"]))
    else:
        commentators = text.IndexSet({"categories.0": "Commentary"}).distinct("title")
        pattern = r"(^{} \d)|(^({}) on {} \d)".format(re.escape(kwargs["old"]), "|".join(commentators), re.escape(kwargs["old"]))
        title_pattern = r'(^{}$)|(^({}) on {})'.format(re.escape(kwargs["old"]), "|".join(commentators), re.escape(kwargs["old"]))
    text_hist = HistorySet({"ref": {"$regex": pattern}})
    for h in text_hist:
        h.ref = h.ref.replace(kwargs["old"], kwargs["new"], 1)
        h.save()
    link_hist = HistorySet({"new.refs": {"$regex": pattern}})
    for h in link_hist:
        h.new["refs"] = [r.replace(kwargs["old"], kwargs["new"], 1) for r in h.new["refs"]]
        h.save()
    note_hist = HistorySet({"new.ref": {"$regex": pattern}})
    for h in note_hist:
        h.new["ref"] = h.new["ref"].replace(kwargs["old"], kwargs["new"], 1)
        h.save()
    title_hist = HistorySet({"title": {"$regex": title_pattern}})
    for h in title_hist:
        h.title = h.title.replace(kwargs["old"], kwargs["new"], 1)
        h.save()
def __init__(self, keywords, fuzzy_min_len=None):
    """Initialize search
    """
    if fuzzy_min_len is None:
        fuzzy_min_len = []
    self.fuzzy_min_len = sorted(fuzzy_min_len)
    self.keywords = {}
    for i, k in keywords:
        if i not in self.keywords:
            self.keywords[i] = [k.strip().lower()]
        else:
            self.keywords[i].append(k.strip().lower())
    print("Number of unique keyword IDs to search: {0}".format(len(self.keywords)))
    self.re_keywords = dict()
    for i in self.keywords:
        kw = []
        for k in self.keywords[i]:
            d = self.get_allow_distance(k)
            if d:
                kw.append(r'(?:{0}){{e<={1}}}'.format(re.escape(k), d))
            else:
                kw.append(re.escape(k))
        re_str = '|'.join(kw)
        re_str = r'\b(?:{0})\b'.format(re_str)
        self.re_keywords[i] = re.compile(re_str, flags=re.I)
def SigWritter(uniquename, target, uniquecount, targetname, evalue):
    targetdict = SeqIO.to_dict(SeqIO.parse(target, 'fasta'))
    copy(target, uniquename + '.' + str(uniquecount))
    handle = open(uniquename, 'a+')
    if os.path.getsize(uniquename) != 0:
        mm = mmap(handle.fileno(), 0, access=ACCESS_READ)
    else:
        mm = handle
    for idline in recorddict:
        # Find a sequence of at least the target length
        pattern = r'([^N]{' + re.escape(str(minLength)) + r',})|([ATCG]{20,}[NATCG]{' \
            + re.escape(str(minLength)) + r',900}[ATCG]{20,})'
        # overlapped=True requires the third-party regex module
        regex = re.compile(pattern, re.IGNORECASE)
        uniseq = regex.finditer(recorddict[idline].seq.tostring(), overlapped=True)
        for coor in uniseq:
            isunique = True
            sequence = targetdict[idline].seq[coor.start():coor.end()].tostring()
            handle.seek(0)
            for line in handle:
                if sequence in line:
                    isunique = False
            if isunique is True:
                uniquecount += 1
                print 'Found Sequence(s) at E-value: ' + str(evalue)
                handle.write('>usid%04i_%g_%s_%s\n' % (uniquecount, evalue, targetname, idline))
                handle.write(sequence + '\n')
            # else:
            #     global evaluehit
            #     evaluehit = False
    print 'Writing %i sequence(s) to file' % uniquecount
    handle.close()
    return uniquecount
def inline_one(start: str, end: str, nest=Nesting.FRAME, sub=None, display=Display.INLINE):
    """Compile a pattern from a single start/end delimiter pair and wrap it as an inline element."""
    patt = re.compile(Patterns.single_group.value.format(re.escape(start), re.escape(end)))
    return inline(patt, escape=[start[0], end[0]], nest=nest, display=display, sub=sub)
def expand_parens(string, parens="()", include_spaces=False, substitute_string=''):
    output = []
    open_paren = re.escape(parens[0])
    close_paren = re.escape(parens[1])
    substitute_string = re.escape(substitute_string)
    in_string = re.sub(open_paren + substitute_string, parens[0], string)
    in_string = re.sub(substitute_string + close_paren, parens[1], in_string)
    if include_spaces:
        regex1 = regex2 = re.compile(r'(^.*)' + open_paren + r'(.+)' + close_paren + r'(.*$)')
    else:
        regex1 = re.compile(r'(^.*\S)' + open_paren + r'(\S+)' + close_paren + r'(.*$)')
        regex2 = re.compile(r'(^.*)' + open_paren + r'(\S+)' + close_paren + r'(\S.*$)')
    re_match1 = regex1.search(in_string)
    re_match2 = regex2.search(in_string)
    if re_match1:
        within = re_match1.group(1) + re_match1.group(2) + re_match1.group(3)
        without = re_match1.group(1) + re_match1.group(3)
    elif re_match2:
        within = re_match2.group(1) + re_match2.group(2) + re_match2.group(3)
        without = re_match2.group(1) + re_match2.group(3)
    else:
        return [string]
    output = [clean_str(without), clean_str(within)]
    return output
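# Usage sketch for expand_parens, assuming clean_str (not shown above) simply
# normalizes whitespace, and `re` is imported at module level:
import re
clean_str = lambda s: ' '.join(s.split())
print(expand_parens('dog(s)'))     # ['dog', 'dogs']
print(expand_parens('no parens'))  # ['no parens']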
def process_index_title_change_in_history(indx, **kwargs):
    """
    Update all history entries which reference 'old' to 'new'.
    """
    print "Cascading History {} to {}".format(kwargs['old'], kwargs['new'])
    from sefaria.model.text import prepare_index_regex_for_dependency_process
    pattern = prepare_index_regex_for_dependency_process(indx)
    pattern = pattern.replace(re.escape(indx.title), re.escape(kwargs["old"]))
    title_pattern = ur'(^{}$)'.format(re.escape(kwargs["old"]))
    text_hist = HistorySet({"ref": {"$regex": pattern}})
    print "Cascading Text History {} to {}".format(kwargs['old'], kwargs['new'])
    for h in text_hist:
        h.ref = h.ref.replace(kwargs["old"], kwargs["new"], 1)
        h.save()
    link_hist = HistorySet({"new.refs": {"$regex": pattern}})
    print "Cascading Link History {} to {}".format(kwargs['old'], kwargs['new'])
    for h in link_hist:
        h.new["refs"] = [r.replace(kwargs["old"], kwargs["new"], 1) for r in h.new["refs"]]
        h.save()
    note_hist = HistorySet({"new.ref": {"$regex": pattern}})
    print "Cascading Note History {} to {}".format(kwargs['old'], kwargs['new'])
    for h in note_hist:
        h.new["ref"] = h.new["ref"].replace(kwargs["old"], kwargs["new"], 1)
        h.save()
    title_hist = HistorySet({"title": {"$regex": title_pattern}})
    print "Cascading Index History {} to {}".format(kwargs['old'], kwargs['new'])
    for h in title_hist:
        h.title = h.title.replace(kwargs["old"], kwargs["new"], 1)
        h.save()
def _match_by_edit_distance(full_text, text_to_match):
    text_to_match = text_to_match.replace("-LRB-", "(").replace("-RRB-", ")")
    text_to_match = text_to_match.replace("-LCB-", "{").replace("-RCB-", "}")
    text_to_match = re.sub(r'\[\\\]\\\)\]$', ')', text_to_match)
    try:
        end_point = (text_to_match.index(" ") if " " in text_to_match else len(text_to_match))
        potential_matches = [full_text[m.start():(m.start() + len(text_to_match) + 1)]
                             for m in re.finditer(re.escape(text_to_match[0:end_point]), full_text, re.U | re.I)]
    except:
        import sys
        print(full_text)
        print()
        print(text_to_match)
        sys.exit(1)
    if len(potential_matches) == 0:
        potential_matches = [full_text[m.start():(m.start() + len(text_to_match) + 1)]
                             for m in re.finditer(re.escape(text_to_match[0]), full_text, re.U)]
    if len(potential_matches) == 0:
        text_to_match = text_to_match.replace("(", "[")
        potential_matches = [full_text[m.start():(m.start() + len(text_to_match) + 1)]
                             for m in re.finditer(re.escape(text_to_match[0]), full_text, re.U)]
    potential_matches = [(p[0:p.rindex(text_to_match[-1]) + 1]
                          if text_to_match[-1] in p and len(p) > len(text_to_match)
                          else p)
                         for p in potential_matches]
    if len(potential_matches) == 0:
        # No idea why this would ever happen, but it does
        return text_to_match
    match_with_lowest_edit_distance = ""
    lowest_edit_distance = -1
    for match in potential_matches:
        e_d = edit_distance(match, text_to_match)
        if lowest_edit_distance == -1 or e_d <= lowest_edit_distance:
            lowest_edit_distance = e_d
            match_with_lowest_edit_distance = match
    result = match_with_lowest_edit_distance.strip()
    if text_to_match[-1] in result:
        while result[-1] != text_to_match[-1]:
            result = result[0:-1]
    elif text_to_match[-1] == '"' and re.search(r'["”\u201d]', result):
        while result[-1] not in ['"', '”', "\u201d"]:
            result = result[0:-1]
    elif text_to_match[-1] not in [']', '}', ')'] and text_to_match[-2:] != "..":
        while result[-1] != text_to_match[-1]:
            result += full_text[full_text.index(result) + len(result)][-1]
    return result
def process_index_title_change_in_links(indx, **kwargs):
    if indx.is_commentary():
        pattern = r'^{} on '.format(re.escape(kwargs["old"]))
    else:
        commentators = text.IndexSet({"categories.0": "Commentary"}).distinct("title")
        pattern = r"(^{} \d)|(^({}) on {} \d)".format(re.escape(kwargs["old"]), "|".join(commentators), re.escape(kwargs["old"]))
        # pattern = r'(^{} \d)|( on {} \d)'.format(re.escape(kwargs["old"]), re.escape(kwargs["old"]))
    links = LinkSet({"refs": {"$regex": pattern}})
    for l in links:
        l.refs = [r.replace(kwargs["old"], kwargs["new"], 1) if re.search(pattern, r) else r for r in l.refs]
        l.save()
def process_index_title_change_in_notes(indx, **kwargs):
    print "Cascading Notes {} to {}".format(kwargs['old'], kwargs['new'])
    pattern = Ref(indx.title).regex()
    pattern = pattern.replace(re.escape(indx.title), re.escape(kwargs["old"]))
    notes = NoteSet({"ref": {"$regex": pattern}})
    for n in notes:
        try:
            n.ref = n.ref.replace(kwargs["old"], kwargs["new"], 1)
            n.save()
        except Exception:
            logger.warning("Deleting note that failed to save: {}".format(n.ref))
            n.delete()
def process_index_title_change_in_links(indx, **kwargs):
    print "Cascading Links {} to {}".format(kwargs['old'], kwargs['new'])
    pattern = text.Ref(indx.title).regex()
    pattern = pattern.replace(re.escape(indx.title), re.escape(kwargs["old"]))
    links = LinkSet({"refs": {"$regex": pattern}})
    for l in links:
        l.refs = [r.replace(kwargs["old"], kwargs["new"], 1) if re.search(pattern, r) else r for r in l.refs]
        try:
            l.save()
        except InputError:
            # todo: this belongs in a better place - perhaps in abstract
            logger.warning("Deleting link that failed to save: {} - {}".format(l.refs[0], l.refs[1]))
            l.delete()
def process_index_title_change_in_notes(indx, **kwargs):
    if indx.is_commentary():
        pattern = r'{} on '.format(re.escape(kwargs["old"]))
    else:
        commentators = IndexSet({"categories.0": "Commentary"}).distinct("title")
        pattern = r"(^{} \d)|(^({}) on {} \d)".format(re.escape(kwargs["old"]), "|".join(commentators), re.escape(kwargs["old"]))
    notes = NoteSet({"ref": {"$regex": pattern}})
    for n in notes:
        try:
            n.ref = n.ref.replace(kwargs["old"], kwargs["new"], 1)
            n.save()
        except Exception:
            pass  # todo: log me, and wrap other handlers in try/catch
def process_index_title_change_in_links(indx, **kwargs):
    print "Cascading Links {} to {}".format(kwargs['old'], kwargs['new'])
    patterns = [pattern.replace(re.escape(indx.title), re.escape(kwargs["old"]))
                for pattern in text.Ref(indx.title).regex(as_list=True)]
    queries = [{'refs': {'$regex': pattern}} for pattern in patterns]
    links = LinkSet({"$or": queries})
    for l in links:
        l.refs = [r.replace(kwargs["old"], kwargs["new"], 1) if re.search(u'|'.join(patterns), r) else r for r in l.refs]
        try:
            l.save()
        except InputError:
            # todo: this belongs in a better place - perhaps in abstract
            logger.warning("Deleting link that failed to save: {} - {}".format(l.refs[0], l.refs[1]))
            l.delete()
def __init__(self, leading_allow=None, trailing_allow=None):
    """
    :param list leading_allow: The leading punctuation characters to allow.
    :param list trailing_allow: The trailing punctuation characters to allow.
    """
    leading_pattern = "" if not leading_allow else r"[%s]*" % regex.escape("".join(leading_allow))
    trailing_pattern = "" if not trailing_allow else r"[%s]" % regex.escape("".join(trailing_allow))
    if trailing_pattern:
        super(OuterPunctuationFilter, self).__init__(
            r"%s[^\W_]+(?:$|.*[^\W_]%s*|%s*)" % (leading_pattern, trailing_pattern, trailing_pattern)
        )
    else:
        super(OuterPunctuationFilter, self).__init__(r"%s[^\W_](?:$|.*[^\W_])" % leading_pattern)
def process_index_title_change_in_links(indx, **kwargs):
    if indx.is_commentary():
        pattern = r'^{} on '.format(re.escape(kwargs["old"]))
    else:
        commentators = text.IndexSet({"categories.0": "Commentary"}).distinct("title")
        pattern = ur"(^{} \d)|(^({}) on {} \d)".format(re.escape(kwargs["old"]), "|".join(commentators), re.escape(kwargs["old"]))
        # pattern = r'(^{} \d)|( on {} \d)'.format(re.escape(kwargs["old"]), re.escape(kwargs["old"]))
    links = LinkSet({"refs": {"$regex": pattern}})
    for l in links:
        l.refs = [r.replace(kwargs["old"], kwargs["new"], 1) if re.search(pattern, r) else r for r in l.refs]
        try:
            l.save()
        except InputError:
            # todo: this belongs in a better place - perhaps in abstract
            logger.warning("Deleting link that failed to save: {} {}".format(l.refs[0], l.refs[1]))
            l.delete()
def match(pat=None, pat_args=None):
    """Generates a regular expression to match a dictionary entry.

    In pat, '$foo' matches the contents of pat_args['foo'] or nothing, if
    pat_args['foo'] exists. If it doesn't exist, it matches /.*?/.
    In pat_args['foo'], '$foo' matches /.*?/, and '$bar' is expanded as it
    would be in pat.

    Args:
        pat: (Optional) The format of the entry. Defaults to
            '$word$pron$pos$cl$de'.
        pat_args: (Optional) The expansions for variables. If pat is
            unspecified, defaults to
            {
                'pron': '/$pron/',
                'pos': ' - $pos',
                'cl': ' ($cl$subcl)',
                'subcl': '.$subcl',
                'de': ': $de'
            },
            otherwise, defaults to {}

    Returns:
        A regular expression which will match a dictionary entry in the
        specified format. Fields mentioned in the pattern with '$' can be
        accessed as named capture groups of the match object.
    """
    if pat is None:
        pat = default_pat
        if pat_args is None:
            pat_args = default_pat_args
    pat_args = pat_args or {}
    args = {}
    for f in pat_args:
        args[f] = regex.escape(pat_args[f], True).replace(r'\$', '$')
        m = regex.search(var_matcher(f), args[f])
        while m is not None:
            sp = (m.start(1) - 1, m.end(1))
            args[f] = workers.slice_replace(args[f], sp, var_group(f))
            m = regex.search(var_matcher(f), args[f])
    pat = '^' + regex.escape(pat, True).replace(r'\$', '$') + '$'
    m = var_match.search(pat)
    while m is not None:
        sp = (m.start(1) - 1, m.end(1))
        f = m.group(1)
        pat = workers.slice_replace(pat, sp, '({})?'.format(args.get(f, var_group(f))))
        m = var_match.search(pat)
    pat = pat.replace(' ', r'\s+')
    return regex.compile(pat)
def constructErrorRegex(Error, Context):
    LEFTCONTEXT = 0
    RIGHTCONTEXT = 1
    LeftContext = regex.escape(Context[LEFTCONTEXT].lstrip())
    RightContext = regex.escape(Context[RIGHTCONTEXT].rstrip())
    # Constrain the matched span to between half and 1.8x the error length
    minMaxLen = r"{" + str(int(len(Error) * (1 / 2))) + r"," + str(math.ceil(len(Error) * 1.8)) + r"}"
    # {1s+1i+1d<=N} is the regex module's fuzzy-matching syntax
    fullPattern = r"(?:(?:" + LeftContext + r"){1s+1i+1d<=7})(?=." + minMaxLen + \
                  r"(?:(?:" + RightContext + r"){1s+1i+1d<=6}))" + \
                  r"(?P<errorMatch>(?:\w++[\-\']?\w*+)(?:(?=(?:" + RightContext + r"){1s+1i+1d<=6})|\W{1,2}))+?" + \
                  r"(?:(?:" + RightContext + r"){1s+1i+2d<=6})"
    return regex.compile(fullPattern, regex.BESTMATCH | regex.V1)
def _escape(self, match):
    '''Escape matched 'other' group value.'''
    groups = match.groupdict()
    if groups['other'] is not None:
        return _regex.escape(groups['other'])
    return groups['placeholder']
def read_regex(path):
    path = ensure_path(path)
    with path.open() as file_:
        entries = file_.read().split('\n')
        expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
    return re.compile(expression)
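# read_regex produces an alternation of anchored literal prefixes, one per
# line of the file; an equivalent in-memory sketch (entries assumed here
# for illustration):
import re
entries = 'Dr.\nMr.'.split('\n')
pattern = re.compile('|'.join('^' + re.escape(p) for p in entries if p.strip()))
print(bool(pattern.match('Dr. Who')))  # True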
def __init__(self, n, initial_count, true_words, feature_chart, hypothetical_phonotactics=False,
             score_combiner=lambda scores: reduce(mul, scores), subseq_counts=None, diphthongs=None):
    self._vowels = feature_chart.phones_for_features("+syllabic")
    self._vowel_re = re.compile("[" + ''.join([re.escape(vowel) for vowel in self._vowels if len(vowel) == 1]) + "]")
    self._diphthongs = set(diphthongs) if diphthongs is not None else set()
    super(SyllableNgramCue, self).__init__(n, initial_count,
                                           len(set(chain(*[self.syllabify(word) for word in true_words]))),
                                           hypothetical_phonotactics=hypothetical_phonotactics,
                                           score_combiner=score_combiner,
                                           subseq_counts=subseq_counts)
def get_pattern(cls):
    patterns = []
    for alternative in cls.alternatives:
        pattern = regex.escape(alternative)
        if regex.fullmatch(ur'\w', alternative[-1]):
            pattern += ur'\b'
        patterns.append(pattern)
def __init__(self, test_file, temp_dir, cache=False):
    self.tic = time.time()
    self.test_file = test_file
    self.temp_dir = temp_dir
    self.cache = cache
    self.failed = False
    self.lines = []
    clean_test_file_name = re.sub('^' + re.escape(test_root('data/')), '', test_file)
    self.say('{}', test_started(clean_test_file_name))
    self.say("Testing {}...", bold(clean_test_file_name))
    self.style = self._get_style()
    sh.mkdir('-p', fail_path(self.style))
    self.style_args = Test._get_style_options(self.style)
    if self.style_args:
        self.say("\tstyling: {}", shell_join(self.style_args))
    self.bib_args = Test._get_bib_options(test_file)
    if self.bib_args:
        self.say("\tbibliography: {}", self.bib_args)
    self.options = self.style_args + self.bib_args
    self.test_name = os.path.join(self.style, os.path.basename(test_file))
    self.test_out = os.path.join(self.temp_dir, self.test_name)
    self.test_err = self.test_out + '.err'
    _, ext = os.path.splitext(test_file)
    self.test_new = self.test_out + '.new.' + ext
def add_spaces(text, exclude=None):
    if exclude:
        patt_exclude = regex.escape(exclude)
        patt_eng_cjk = regex.compile(u"([[%s]--%s])([%s])" % (CHAR_ENG_LEFT, patt_exclude, CHAR_CJK))
        patt_cjk_eng = regex.compile(u"([%s])([[%s]--%s])" % (CHAR_CJK, CHAR_ENG_RIGHT, patt_exclude))
    else:
        patt_eng_cjk = PATTERN_ENG_CJK
        patt_cjk_eng = PATTERN_CJK_ENG

    def add_space_func(index1, index2):
        def add_space(match):
            return u"%s %s" % (match.group(index1), match.group(index2))
        return add_space

    text = patt_cjk_eng.subn(add_space_func(1, 2), text)[0]
    text = patt_eng_cjk.subn(add_space_func(1, 2), text)[0]
    if not (exclude and '"' in exclude):
        # XXX"YYY"XXX -> XXX "YYY" XXX
        # where X and Y are CJK characters
        is_left_dquote = True
        is_left_squote = True
        out = StringIO.StringIO()
        for i in xrange(len(text)):
            prev_char = text[i - 1] if i > 0 else None
            cur_char = text[i]
            next_char = text[i + 1] if i < len(text) - 1 else None
            if cur_char == u'"':
                if is_left_dquote:
                    if _is_cjk(prev_char):
                        out.write(u' "')
                    else:
                        out.write(u'"')
                    is_left_dquote = False
                else:
                    if _is_cjk(next_char):
                        out.write(u'" ')
                    else:
                        out.write(u'"')
                    is_left_dquote = True
            elif cur_char == u"'":
                if is_left_squote:
                    if _is_cjk(prev_char):
                        out.write(u" '")
                    else:
                        out.write(u"'")
                    is_left_squote = False
                else:
                    if _is_cjk(next_char):
                        out.write(u"' ")
                    else:
                        out.write(u"'")
                    is_left_squote = True
            else:
                out.write(cur_char)
        text = out.getvalue()
        out.close()
    return text
def find_next_location(self, entry):
    from calibre.gui2.tweak_book.boss import get_boss
    boss = get_boss()
    if boss is None:
        return
    files = entry.usage
    current_editor_name = boss.currently_editing
    if current_editor_name not in files:
        current_editor_name = None
    else:
        idx = files.index(current_editor_name)
        before, after = files[:idx], files[idx+1:]
        files = [current_editor_name] + after + before + [current_editor_name]
    pat = regex.compile(regex.escape(entry.char))
    for file_name in files:
        from_cursor = False
        if file_name == current_editor_name:
            from_cursor = True
            current_editor_name = None
        ed = boss.edit_file_requested(file_name)
        if ed is None:
            return
        if ed.editor.find(pat, complete=not from_cursor):
            boss.show_editor(file_name)
            return True
    return False
def compile_prefix_regex(entries):
    if '(' in entries:
        # Handle deprecated data
        expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
        return re.compile(expression)
    else:
        expression = '|'.join(['^' + piece for piece in entries if piece.strip()])
        return re.compile(expression)
def hightlight_keywords(self, text, keywords, light_color='#ffea593d', deep_color='#ffc107'):
    hightlighted_html = ''
    all_hightlights = []
    tokens = self.full_tokenize(text)
    for t in tokens:
        t['background_color'] = []
    for w in keywords:
        matches = regex.finditer(r'\b{}\b'.format(regex.escape(w, special_only=True)),
                                 text, flags=regex.IGNORECASE)
        all_hightlights.extend([{
            'start': m.start(),
            'end': m.end(),
            'text': m.group(),
        } for m in matches])
    all_hightlights = sorted(all_hightlights, key=lambda x: x['start'])
    for h in all_hightlights:
        for t in tokens:
            if t['start'] >= h['start'] and t['end'] <= h['end']:
                t['background_color'].append(light_color)
    for t in tokens:
        color_len = len(t['background_color'])
        if color_len == 0:
            hightlighted_html += t['text']
        elif color_len == 1:
            hightlighted_html += '<span style="background-color:{background_color};">{text}</span>'.format(
                background_color=t['background_color'][0], text=t['text'])
        else:
            hightlighted_html += '<span style="background-color:{background_color};">{text}</span>'.format(
                background_color=deep_color, text=t['text'])
    return hightlighted_html
def test_set_redirect_target(self):
    """Test set_redirect_target method."""
    # R1 redirects to R2 and R3 doesn't exist.
    site = self.get_site('en')
    p1 = pywikibot.Page(site, 'User:Legoktm/R2')
    p2 = pywikibot.Page(site, 'User:Legoktm/R1')
    p3 = pywikibot.Page(site, 'User:Legoktm/R3')
    text = p2.get(get_redirect=True)
    with self.assertRaisesRegex(
            IsNotRedirectPageError,
            r'{} is not a redirect page\.'.format(re.escape(str(p1)))):
        p1.set_redirect_target(p2)
    with self.assertRaisesRegex(NoPageError, NO_PAGE_RE):
        p3.set_redirect_target(p2)
    p2.set_redirect_target(p1, save=False)
    self.assertEqual(text, p2.get(get_redirect=True))
def go_to_anchor(self, anchor):
    if anchor is TOP:
        c = self.textCursor()
        c.movePosition(c.Start)
        self.setTextCursor(c)
        return True
    base = r'''%%s\s*=\s*['"]{0,1}%s''' % regex.escape(anchor)
    raw = unicode(self.toPlainText())
    m = regex.search(base % 'id', raw)
    if m is None:
        m = regex.search(base % 'name', raw)
    if m is not None:
        c = self.textCursor()
        c.setPosition(m.start())
        self.setTextCursor(c)
        return True
    return False
def find_sites_that_may_have_removed_linker(last_linker_activity_day=20):
    """
    Checks for each site whether there has been a webpage hit with the linker
    in the last `last_linker_activity_day` days.
    Prints an alert for each site that doesn't meet this criterion.
    """
    from datetime import datetime, timedelta
    sites_to_delete = {}
    sites_to_keep = {}
    last_active_threshold = datetime.today() - timedelta(days=last_linker_activity_day)
    webpages_without_websites = 0
    for data in get_website_cache():
        if data["is_whitelisted"]:  # we only care about whitelisted sites
            for domain in data['domains']:
                ws = WebPageSet({"url": {"$regex": re.escape(domain)}}, limit=1, sort=[['lastUpdated', -1]])
                keep = True
                if ws.count() == 0:
                    sites_to_delete[domain] = f"{domain} has no pages"
                    keep = False
                else:
                    webpage = ws[0]  # most recently updated webpage for this domain
                    website = webpage.get_website()
                    if website:
                        website.linker_installed = webpage.lastUpdated > last_active_threshold
                        if not website.linker_installed:
                            keep = False
                            print(f"Alert! {domain} has removed the linker!")
                            sites_to_delete[domain] = (
                                f"{domain} has {website.num_webpages} pages, but has not used the linker "
                                f"in {last_linker_activity_day} days. {webpage.url} is the oldest page."
                            )
                    else:
                        print("Alert! Can't find website {} corresponding to webpage {}".format(data["name"], webpage.url))
                        webpages_without_websites += 1
                        continue
                if keep:
                    assert domain not in sites_to_delete
                    sites_to_keep[domain] = True
    if webpages_without_websites > 0:
        print("Found {} webpages without websites".format(webpages_without_websites))
    return sites_to_delete
def check_path(self, url):
    self.logger.info(f'Checking for url {url}')
    blacklisted_paths = ['/', '/index.php', None, '']
    vulnsss = []
    places = ["true_vulns", "almost_true", "probable_vulns"]
    for place in places:
        vulnsss.extend(self.vulns[place])
    if url not in blacklisted_paths:
        for doc in self.collection.find({"URI": {'$regex': regex.escape(url)}}):
            vuln = doc.get('Vulnerability')
            if vuln not in vulnsss:
                self.vulns["possible_vulns"].append(vuln)
def _update_processed_text(self, original_number_list):
    """
    Replaces each detected date with the tag generated from the entity_name
    used to initialize the object.

    A final string with all dates replaced is stored in the object's
    tagged_text attribute; a string with all dates removed is stored in the
    object's processed_text attribute.

    Args:
        original_number_list (list): list of substrings of original text to be
                                     replaced with the tag created from entity_name
    """
    for detected_text in original_number_list:
        _pattern = re.compile(self._SPAN_BOUNDARY_TEMPLATE.format(re.escape(detected_text)), flags=_re_flags)
        self.tagged_text = _pattern.sub(self.tag, self.tagged_text, 1)
        self.processed_text = _pattern.sub('', self.processed_text, 1)
def cleanData(self, text):
    clean = text.lower().encode('utf-8')
    clean = clean.replace(b"\n", b" ")
    clean = clean.replace(b"\t", b" ")
    clean = clean.replace(b"\b", b" ")
    clean = clean.replace(b"\r", b" ")
    # strip punctuation token by token
    exclude = r.compile(b'[%s]' % re.escape(string.punctuation.encode('utf-8')))
    clean = b" ".join([exclude.sub(b'', token) for token in clean.split()])
    clean = r.sub(rb"\d+", b" ", clean)   # drop digit runs
    clean = r.sub(rb"\s+", b" ", clean)   # collapse whitespace
    clean = r.sub(rb"\s+$", b"", clean)   # trim trailing whitespace
    return clean.decode('utf-8')
def regexizeTagGlob(tag):
    '''
    Returns:
        a regular expression string with ** and * interpreted as tag globs

    Precondition: tag is a valid tagmatch

    Notes:
        A single asterisk will replace exactly one dot-delimited component of a tag.
        A double asterisk will replace one or more of any character.

        The returned string does not contain a starting '^' or trailing '$'.
    '''
    return ReRegex.sub(
        lambda m: r'([^.]+?)' if m.group(1) is None else r'(.+)',
        regex.escape(tag))
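# Behavior sketch for regexizeTagGlob. ReRegex is defined elsewhere; the
# pattern below is an assumption that captures an escaped '**' in group 1
# and matches a single escaped '*' otherwise:
import regex
ReRegex = regex.compile(r'\\\*(\\\*)|\\\*')
print(regexizeTagGlob('foo.*.bar'))  # foo\.([^.]+?)\.bar
print(regexizeTagGlob('foo.**'))     # foo\.(.+)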
def multiple_replace(string, rep_dict):
    """
    Replace multiple string patterns simultaneously.

    Args:
        string: The string to be replaced.
        rep_dict: Dictionary containing keys and values as patterns that
            should be replaced.

    Returns:
        The string with all the patterns replaced.
    """
    pattern = re.compile("|".join([re.escape(k) for k in rep_dict.keys()]), re.M)
    return pattern.sub(lambda x: rep_dict[x.group(0)], string)
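# Usage sketch for multiple_replace: every pattern is substituted in a single
# pass, so one replacement's output is never re-scanned by another:
import re
print(multiple_replace('a < b & c', {'<': '&lt;', '&': '&amp;'}))  # a &lt; b &amp; c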
def _process_regex_dict(regex_dict, regex_escape=False, **kwargs):
    regex_pattern_list = []
    for key, val in regex_dict.items():
        new_val = list()
        for item in val:
            if regex_escape:
                new_val.append(regex.escape(item))
            else:
                new_val.append(item)
        # regex_dict[key] = new_val
        regex_pattern_list += new_val
    # end for
    emoticon_regex_pattern = '|'.join(regex_pattern_list)
    # return '(' + '|'.join(regex_pattern_list) + r')*[.?!]\s*'
    return '(((' + emoticon_regex_pattern + r')\s*)*([.!?]+\s+))|(((' + emoticon_regex_pattern + r')\s*)+([.!?]*\s+))'
def check_urls(index, question, answer):
    global full_sentence_valid_url
    full_sentence_valid_url = False
    valid_url = False
    # Disabled
    if score_settings['incorrect_url_modifier_value'] is None:
        return 0
    # Find all urls in sentence
    for url in re.finditer('http(?:s?):(//([^/]*?)/(?:[^ ])*?(?=$|[' + re.escape(score_settings['url_delimiters']) + ']))?', answer):
        # Check if result is in cache already and return it
        if url_cache[url.group(0)][1] > time.time():
            if url_cache[url.group(0)][0] == 0:
                return score_settings['incorrect_url_modifier_value']
        # Url not in cache - check it
        else:
            # Send HEAD request and check HTTP response code
            try:
                request = requests.head(url.group(0))
                code = request.status_code
            except Exception as e:
                code = 0
            # Add to cache
            url_cache[url.group(0)] = [1 if code == 200 else 0, time.time() + 86400]
            # If code is different than 200 - return modifier value
            if code != 200:
                return score_settings['incorrect_url_modifier_value']
        # Check if it's a full sentence url
        valid_url = (len(url.group(0)) == len(answer))
    # Everything ok, set if full sentence url and return 0
    full_sentence_valid_url = valid_url
    return 0
def process_index_title_change_in_history(indx, **kwargs):
    """
    Update all history entries which reference 'old' to 'new'.
    """
    print "Cascading History {} to {}".format(kwargs['old'], kwargs['new'])
    if indx.is_commentary():
        pattern = ur'{} on '.format(re.escape(kwargs["old"]))
        title_pattern = ur'(^{}$)|({} on)'.format(re.escape(kwargs["old"]), re.escape(kwargs["old"]))
    else:
        pattern = text.Ref(indx.title).base_text_and_commentary_regex()
        pattern = pattern.replace(re.escape(indx.title), re.escape(kwargs["old"]))
        commentators = text.library.get_commentary_version_titles_on_book(kwargs["old"], with_commentary2=True)
        title_pattern = ur'(^{}$)|(^({}) on {}$)'.format(re.escape(kwargs["old"]), "|".join(commentators), re.escape(kwargs["old"]))
    text_hist = HistorySet({"ref": {"$regex": pattern}})
    print "Cascading Text History {} to {}".format(kwargs['old'], kwargs['new'])
    for h in text_hist:
        h.ref = h.ref.replace(kwargs["old"], kwargs["new"], 1)
        h.save()
    link_hist = HistorySet({"new.refs": {"$regex": pattern}})
    print "Cascading Link History {} to {}".format(kwargs['old'], kwargs['new'])
    for h in link_hist:
        h.new["refs"] = [r.replace(kwargs["old"], kwargs["new"], 1) for r in h.new["refs"]]
        h.save()
    note_hist = HistorySet({"new.ref": {"$regex": pattern}})
    print "Cascading Note History {} to {}".format(kwargs['old'], kwargs['new'])
    for h in note_hist:
        h.new["ref"] = h.new["ref"].replace(kwargs["old"], kwargs["new"], 1)
        h.save()
    title_hist = HistorySet({"title": {"$regex": title_pattern}})
    print "Cascading Index History {} to {}".format(kwargs['old'], kwargs['new'])
    for h in title_hist:
        h.title = h.title.replace(kwargs["old"], kwargs["new"], 1)
        h.save()
def full_regex(self, title, lang, anchored=True, compiled=True, **kwargs):
    """
    :return: Regex object. If for_js == True, returns the Regex string
    :param for_js: Defaults to False
    :param match_range: Defaults to False

    A call to `full_regex("Bereishit", "en", for_js=True)` returns the following regex, expanded here for clarity:

    ```
    Bereishit                       # title
    [,.: \r\n]+                     # a separator (self.after_title_delimiter_re)
    (?:                             # Either:
        (?:                         # 1)
            (\d+)                   # Digits
            (                       # and maybe
                [,.: \r\n]+         # a separator
                (\d+)               # and more digits
            )?
        )
        |                           # Or:
        (?:                         # 2: The same
            [[({]                   # With beginning
                (\d+)
                (
                    [,.: \r\n]+
                    (\d+)
                )?
            [])}]                   # and ending brackets or parens or braces around the numeric portion
        )
    )
    (?=                             # and then either
        [.,;?! })<]                 # some kind of delimiting character coming after
        |                           # or
        $                           # the end of the string
    )
    ```

    Different address type / language combinations produce different internal
    regexes in the innermost portions of the above, where the comments say 'digits'.
    """
    reg = ur"^" if anchored else ""
    reg += regex.escape(title) + self.after_title_delimiter_re
    addr_regex = self.address_regex(lang, **kwargs)
    # Match expressions with internal parenthesis around the address portion
    reg += ur'(?:(?:' + addr_regex + ur')|(?:[\[({]' + addr_regex + ur'[\])}]))'
    reg += ur"(?=\W|$)" if not kwargs.get("for_js") else ur"(?=[.,:;?! })\]<]|$)"
    return regex.compile(reg, regex.VERBOSE) if compiled else reg
def safe_replace(original, to_replace, replacement):
    """
    Utility that will replace the string except in the HTML tag attributes

    :param original: original string
    :param to_replace: string to replace
    :param replacement: replacement string
    :return: new string with the replacement done
    """
    def _replace(match):
        if match.group(1):
            return match.group(0)
        else:
            return replacement

    replace_regex = re.escape(to_replace.replace("\\", ""))
    in_attr = r'((?:<[^<>]*?"[^<>]*?){1}' + replace_regex + r'(?:[^<>]*?"[^<>]*?>){1})'
    regex = in_attr + r"|(" + replace_regex + r")"
    return re.sub(regex, _replace, original)
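# Usage sketch for safe_replace: the first alternative captures occurrences
# inside quoted tag attributes (group 1), which _replace returns unchanged,
# so only text content is rewritten:
import re
print(safe_replace('<a href="foo">foo</a>', 'foo', 'bar'))  # <a href="foo">bar</a>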
def __convertterminalvalue(self, prefix, value, **kwargs):
    """Internal method converts "terminal values" to characters

    Parameters:
        prefix -> The prefix (like x for hex, b for binary, d for decimal)
        value -> The actual number (without the prefix and the %)

    Returns:
        char -> The character representation of the terminalvalue as string (escaped if needed)
    """
    if prefix == "b":
        # binary digits -> code point (the original referenced an undefined name here)
        char = chr(int(value, 2))
    if prefix == "d":
        # decimal digits -> code point (the original referenced an undefined name here)
        char = chr(int(value))
    if prefix == "x":
        char = binascii.unhexlify(value).decode()
    return regex.escape(char)
def clean_article(self, art):
    # remove unnecessary fields in the title like newspaper name etc.
    titles = art['title'].split('|')
    art['title'] = max(titles, key=len)
    # remove title from the body content;
    # boilerpipe tends to include the title also in the body content
    pattern = regex.compile('({}){{e<=5}}'.format(regex.escape(art['title'])))
    match = pattern.search(art['body'])
    if match:
        end_idx = match.span()[1]
        art['body'] = art['body'][end_idx:]
    art['title'] = self._strip_txt(art['title'])
    art['body'] = self._strip_txt(art['body'])
    return art
def text2sentences(path):
    '''
    Converts a raw text from path to tokenized sentences
    '''
    concat_text = text_concat(path)
    tok_sentences = []
    # Split at each symbol that ends a sentence, e.g. '.?!'
    sents = re.split(r'[.?!]', concat_text)
    # Get rid of punctuation
    punct = re.compile('[%s]' % re.escape(string.punctuation))
    # Tokenize
    for phrase in sents:
        no_punct = punct.sub('', phrase)
        tok_sentences.append(no_punct.lower().split())
    # return undersample_sentences(tok_sentences)
    return tok_sentences  # comment this line & uncomment the one above to use the undersampling
def xmatch(lookup_value, lookup_array, match_type=1):
    res = [Error.errors['#N/A']]
    t_id = _get_type_id(lookup_value)
    if match_type > 0:
        def check(j, x, val, r):
            if x <= val:
                r[0] = j
                return x == val and j > 1
            return j > 1
    elif match_type < 0:
        def check(j, x, val, r):
            if x < val:
                return True
            r[0] = j
            return x == val
    else:
        if t_id == 1:
            def sub(m):
                return {'\\': '', '?': '.', '*': '.*'}[m.groups()[0]]
            match = regex.compile(r'^%s$' % regex.sub(
                r'(?<!\\\~)\\(?P<sub>[\*\?])|(?P<sub>\\)\~(?=\\[\*\?])',
                sub,
                regex.escape(lookup_value)
            ), regex.IGNORECASE).match
        else:
            match = lambda x: x == lookup_value

        # noinspection PyUnusedLocal
        def check(j, x, val, r):
            if match(x):
                r[0] = j

    convert = lambda x: x
    if t_id == 1:
        convert = lambda x: x.upper()
    lookup_value = convert(lookup_value)
    for i, v in _yield_vals(t_id, lookup_array):
        if check(i, convert(v), lookup_value, res):
            break
    return res[0]
def merge_vocab(pair: Tuple[str, str], input_vocab: Dict[str, int]) -> Tuple[Dict[str, int], List]:
    """
    >>> pair = ('w', 'o')
    >>> input_vocab = {'b i r d @': 3, 'w o r d @': 7, 'w o g @': 13}
    >>> new_vocab, new_pairs = merge_vocab(pair, input_vocab)
    >>> new_vocab
    {'b i r d @': 3, 'wo r d @': 7, 'wo g @': 13}
    >>> new_pairs
    [(('wo', 'r'), 7), (('o', 'r'), -7), (('wo', 'g'), 13), (('o', 'g'), -13)]
    """
    output_vocab = {}
    concat_pair_with_space = ' '.join(pair)
    concat_pair_with_space_escaped = regex.escape(concat_pair_with_space)
    concat_pair = ''.join(pair)
    reg = regex.compile('(^|[^ ]+ )(' + concat_pair_with_space_escaped + ')( [^ ]+|$)')
    added_pairs = []
    for word in input_vocab:
        word_occurences = input_vocab[word]
        match = reg.search(word)
        while match:
            # word changed
            if match.group(1) != '':
                subtoken_before = match.group(1)[:-1]
                added_pairs.append(((subtoken_before, concat_pair), word_occurences))
                if pair != (subtoken_before, pair[0]):
                    added_pairs.append(((subtoken_before, pair[0]), -word_occurences))
            if match.group(3) != '':
                subtoken_after = match.group(3)[1:]
                added_pairs.append(((concat_pair, subtoken_after), word_occurences))
                if pair != (pair[1], subtoken_after):
                    added_pairs.append(((pair[1], subtoken_after), -word_occurences))
            start, end = match.span(2)
            replacement = concat_pair
            word = word[:start] + replacement + word[end:]
            match = reg.search(word)
        output_vocab[word] = word_occurences
    return output_vocab, added_pairs
def __init__(self):
    self.window = 5
    self.entity_prior = {}
    self.me_prob = {}
    self.mention_cand = {}
    self.m_count = {}
    self.punc = re.compile('[%s]' % re.escape(string.punctuation))
    self.log_file = ''
    self.total_p = 0
    self.total_tp = 0
    self.doc_actual = 0
    self.mention_actual = 0
    self.total_cand_num = 0
    self.miss_senses = set()
    self.gamma = 0.1  # to smooth the pem
    self.is_local = False
    self.is_global = False
    self.is_prior = False
    self.input_path = ''
def jump_to_location(loc):
    from calibre.gui2.tweak_book.boss import get_boss
    boss = get_boss()
    if boss is None:
        return
    name = loc.name
    editor = boss.edit_file_requested(name)
    if editor is None:
        return
    editor = editor.editor
    if loc.line_number is not None:
        block = editor.document().findBlockByNumber(loc.line_number - 1)  # blockNumber() is zero based
        if not block.isValid():
            return
        c = editor.textCursor()
        c.setPosition(block.position(), c.MoveAnchor)
        editor.setTextCursor(c)
    if loc.text_on_line is not None:
        editor.find(regex.compile(regex.escape(loc.text_on_line)))
def decorate(function):
    global TARGET_GENERATORS
    tre = re.escape(output.format(iter=DC1, next=DC2))
    tre = tre.replace(DC1, r"(?P<iter>[0-9]+)")
    tre = tre.replace(DC2, r"(?P<next>[0-9]+)")

    def gen_target(name_match, stuff):
        inputs, params, function, flags = stuff
        try:
            ival = int(name_match.group("iter"))
        except IndexError:
            ival = None
        try:
            nval = int(name_match.group("next"))
        except IndexError:
            nval = None
        if ival is None and nval is not None:
            if nval <= 0:
                ival = "start"
            else:
                ival = nval - 1
        elif ival is not None and nval is None:
            nval = ival + 1
        elif ival is None or nval is None:
            ival = "start"
            nval = 0
        inputs = [inp.format(iter=ival, next=nval) for inp in inputs]
        params = [param.format(iter=ival, next=nval) for param in params]

        def wrapped(*args, **kwargs):
            nonlocal function, nval
            return function(nval, *args, **kwargs)
        wrapped.__name__ = function.__name__
        return inputs, params, wrapped, flags

    TARGET_GENERATORS[tre] = (gen_target, (inputs, params, function, flags))
    return function
def mask_by_term(self, orig_source, orig_target: Optional[str] = None, prob=1.0):
    """
    Masks using dictionary entries.
    """
    source_masks = []
    source = orig_source
    target = orig_target
    for term, (translation, label) in self.terms.items():
        if term in source:
            pattern = r'\b{}\b'.format(re.escape(term))
            source, target, term_masks = self.get_label_masks(label, pattern, translation, source, target, prob)
            source_masks.extend(term_masks)
    return source, target, source_masks
def get_search_regex(state):
    raw = state['find']
    if state['mode'] != 'regex':
        raw = regex.escape(raw, special_only=True)
    flags = REGEX_FLAGS
    if not state['case_sensitive']:
        flags |= regex.IGNORECASE
    if state['mode'] == 'regex' and state['dot_all']:
        flags |= regex.DOTALL
    if state['direction'] == 'up':
        flags |= regex.REVERSE
    ans = regex_cache.get((flags, raw), None)
    if ans is None:
        try:
            ans = regex_cache[(flags, raw)] = regex.compile(raw, flags=flags)
        except regex.error as e:
            raise InvalidRegex(raw, e)
    return ans
def lookup(self, lemma: str) -> str:
    """Perform match of a lemma against headwords. If more than one match,
    then return the concatenated entries. For example:

    >>> from cltk.lexicon.lat import LatinLewisLexicon
    >>> lll = LatinLewisLexicon(interactive=False)
    >>> lll.lookup("clemens")[:50]
    'clēmēns entis (abl. -tī; rarely -te, L.), adj. wit'
    >>> all(word in lll.lookup("levis") for word in ["levis", "lēvis"])  # Test for concatenated entries
    True
    >>> lll.lookup("omnia")
    ''
    >>> lll.lookup(".")
    ''
    >>> lll.lookup("123")
    ''
    >>> lll.lookup("175.")
    ''
    >>> lll.lookup("(")  # Test for regex special character
    ''
    """
    if not self.entries:
        raise CLTKException(
            "No lexicon entries found in the .yaml file. This should never happen."
        )
    if regex.match(r"^[0-9\.\?,\:;\!\<\>\-]*$", lemma) is not None:
        return ""
    lemma = regex.escape(lemma.lower())
    keys = self.entries.keys()
    matches = [key for key in keys if regex.match(rf"^{lemma}[0-9]?$", key)]
    n_matches = len(matches)
    if n_matches > 1:
        return "\n".join([self.entries[key] for key in matches])
    elif n_matches == 1:
        return self.entries[matches[0]]
    else:
        return ""
def __ior__(self, other):
    # TODO I think there is some crucial error in this OR construction,
    # related to the fact that in regex an additional OR gets an additional Count;
    # however, such Counts that become empty because another branch was used
    # should usually not appear in the output, but just get omitted
    # - more bookkeeping needed
    # TODO e.g. Delim.join_optional does not seem to work with pyparsing_regex
    # as the OR construction builds new Count(). There is something for this,
    # namely OR-construction with the same group number, however one would have
    # to indicate this here, which is not done in general. It seems to be an
    # implementation detail of pyparsing-regex, unfortunately...
    if isinstance(other, basestring):
        other = ParserElement(regex.escape(other))
    self.structure += other.structure
    self.pattern += "|" + other.pattern
    self.name += "|" + other.name
    self.group(pseudo=True, liftkeys=True, silent=True)
    return self
def search_paths(self):
    paths = [self.path]
    path_without_extensions = self.path
    extensions = self.extensions
    for ext in extensions:
        path_without_extensions = re.sub(ext, '', path_without_extensions)
    if '/' not in path_without_extensions:
        paths.append(os.path.join(path_without_extensions, 'component.json'))
    if re.sub('|'.join([re.escape(ext) for ext in extensions]), '', os.path.basename(self.path)) != 'index':
        paths.append(os.path.join(path_without_extensions, "index%s" % ''.join(extensions)))
    return paths
def remove_punctuation(text: str, marks=None) -> str:
    """
    Remove punctuation from ``text`` by replacing all instances of ``marks``
    with whitespace.

    Args:
        text (str): raw text
        marks (str): If specified, remove only the characters in this string,
            e.g. ``marks=',;:'`` removes commas, semi-colons, and colons.
            Otherwise, all punctuation marks are removed.

    Returns:
        str: returns a ``str`` object containing normalized text.

    Note:
        When ``marks=None``, Python's built-in :meth:`str.translate()` is
        used to remove punctuation; otherwise, a regular expression is used
        instead. The former's performance is about 5-10x faster.
    """
    if marks:
        return re.sub('[{}]+'.format(re.escape(marks)), ' ', text, flags=re.UNICODE)
    return text.translate(PUNCTUATION_TRANSLATE_UNICODE)
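# Usage sketch for remove_punctuation (PUNCTUATION_TRANSLATE_UNICODE, defined
# elsewhere, is assumed to map each Unicode punctuation code point to a space):
import re
print(remove_punctuation('Hello, world!', marks=','))  # 'Hello  world!'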
def __search_reference(self, string):
    string = regex.sub(r'({0})'.format(CiteDetector.year), ' \\1', string)
    elem = regex.split(r'[,\. ]+', string)
    elem = [regex.escape(e) for e in elem if not CiteDetector.plural.search(e)]
    reg = regex.compile(r'\W.*?'.join(elem), regex.IGNORECASE)
    match = []
    for ref in self.references:
        m = reg.search(ref['text'])
        if m:
            match.append({
                'cid': ref['id'],
                'start': m.start(),
                'total': len(m.group())
            })
    if len(match) > 1:
        if regex.search(r'et[,\. ]+al|others', string):
            match = sorted(match, key=lambda x: (x['start'], -x['total']))
        else:
            match = sorted(match, key=lambda x: (x['start'], x['total']))
    return match
def __init__(self, lang):
    self.lang = lang
    self.ntokb = NatLoader.load(lang)
    # Regex for all inflections of "Svatý" (or the particular language variant)
    self.re_saint = r'({})\s'.format('|'.join(self.reEscapeSet(self.getSaintVariants())))
    self.saint_abb = self.getSaintAbb().strip().rstrip('.')
    escaped_saint_abb = regex.escape(self.saint_abb)
    # Regex for "Sv " for example in "Sv Jan"
    self.re_saint_abb_only = r'{}\s'.format(escaped_saint_abb)
    # Regex for "Sv. " for example in "Sv. Jan" or "Sv." in "Sv.Jan"
    self.re_saint_abb_dot = r'{}\.\s?'.format(escaped_saint_abb)
    # Common regex for both of the previous two
    self.re_saint_abbs = r'({}|{})'.format(self.re_saint_abb_dot, self.re_saint_abb_only)
    self.persons = EntityLoader.load(module='persons', lang=lang, initiate='Persons')