def __init__(self): """\ Constructor (pre-compile all needed regexes). """ # compile regexes self._currency_or_init_punct = Regex(r' ([\p{Sc}\(\[\{\¿\¡]+) ', flags=UNICODE) self._noprespace_punct = Regex(r' ([\,\.\?\!\:\;\\\%\}\]\)]+) ', flags=UNICODE) self._contract = Regex(r" (\p{Alpha}+) ' ?(ll|ve|re|[dsmt])(?= )", flags=UNICODE|IGNORECASE) self._fixes = Regex(r" (do|go[nt]|wan) (n't|ta|na)(?= )", flags=UNICODE|IGNORECASE) self._replace_table = {' i ':' I ', ' im ': ' I\'m ', ' dont ': ' don\'t '}
def match_begin_end_env(env='equation', get_content=True):
    '''Build a compiled Regex matching a LaTeX ``\\begin{env}...\\end{env}`` block.

    matchs \\begin{equation*} something ... \\end{equation}.
    One special option is env='anything': the environment name is then a
    non-greedy wildcard, so any \\begin{...} ... \\end{...} pair matches.
    If get_content is True, the environment body is captured in a named
    group "content".
    NOTE(review): with env='anything' the same wildcard pattern is inserted
    for both the \\begin and \\end names, so they are NOT forced to be the
    same environment -- confirm this is intended.
    '''
    er = Regex()
    er.add(r"\\begin{")
    er.add(er.zero_or_more(er.whitespace()))  # tolerate spaces inside braces
    if env == 'anything':
        env = er.non_greedy(er.zero_or_more(er.anything()))
    er.add(env)
    er.add(er.zero_or_more(er.whitespace()))
    er.add(er.zero_or_one(r'\*'))  # starred variants, e.g. equation*
    er.add(er.zero_or_more(er.whitespace()))
    er.add(r"}")
    if get_content:
        er.add(er.group_begin(name="content"))
    er.add(er.non_greedy(er.zero_or_more(er.anything())))  # environment body
    if get_content:
        er.add(er.group_end())
    er.add(r"\\end{")
    er.add(er.zero_or_more(er.whitespace()))
    er.add(env)
    er.add(er.zero_or_more(er.whitespace()))
    er.add(er.zero_or_one(r'\*'))
    er.add(er.zero_or_more(er.whitespace()))
    er.add(r"}")
    er.compile()
    return er
def __init__(self): """\ Constructor (pre-compile all needed regexes). """ # compile regexes self._currency_or_init_punct = Regex(r' ([\p{Sc}\(\[\{\¿\¡]+) ', flags=UNICODE) self._noprespace_punct = Regex(r' ([\,\.\?\!\:\;\\\%\}\]\)]+) ', flags=UNICODE) self._contract = Regex(r" (\p{Alpha}+) ' (ll|ve|re|[dsmt])(?= )", flags=UNICODE | IGNORECASE) self._dash_fixes = Regex( r" (\p{Alpha}+|£ [0-9]+) - (priced|star|friendly|(?:£ )?[0-9]+) ", flags=UNICODE | IGNORECASE) self._dash_fixes2 = Regex(r" (non) - ([\p{Alpha}-]+) ", flags=UNICODE | IGNORECASE)
def __init__(self, numH, strings):
    """Initialize the hypothesis space from the given example strings.

    numH    -- number of hypotheses to keep (stored in self.numH_)
    strings -- examples from which the base regex hypothesis is built
    """
    self.hSpace_ = list()            # hypothesis space; filled via addRegexes
    self.strings_ = strings
    self.baseH_ = Regex(strings)     # base hypothesis built from the examples
    self.baseHProb_ = self.likelihood(self.baseH_)
    self.numH_ = numH
    # seed the space with (copy of base hypothesis, its likelihood)
    self.addRegexes([(self.baseH_.copy(), self.baseHProb_)])
def __init__(self):
    """Pre-compile regexes for masking e-mail addresses, URLs and numbers."""
    self.__author__ = "Revo"
    self.__date__ = "2017-10-27"
    # email address:
    self.__email_addr = Regex(r'([\w\.-]+@[\w\.-]+)')
    # url address: scheme-ful URLs or bare domain names
    self.__url_addr = Regex(
        r'(?P<url>https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)|[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*))'
    )
    # Numbers: optional sign, decimal point/comma, scientific-notation tail
    self.__numbers = Regex(r'([+\-]?\d*[\.,]?\d+[\d\.,+\-eE]*)')
    # Replace with add one: numbered placeholder tokens, e.g. __NUM3__
    self.__addone = Regex(r'(__(NUM|EMAIL|URL)(\d+)__)')
    # double space to single
    self.__spaces = Regex(r'\s+', flags=UNICODE)
    self.line = 0  # line counter
def __init__(self, is_training=False):
    """Set up the sentence splitter: create the regex rules and, unless
    training, load the pre-trained classifier model from disk.

    is_training -- when True, skip loading the saved model (a new one is
                   expected to be trained instead)

    Exits the process with status -1 when the model cannot be loaded.
    """
    self.classifier = None
    self.feature_model = None
    self.regex_rule = Regex()
    if not is_training:
        self.classifier = utils.load(
            os.path.join('vnspliter/model', 'model.pkl'))
        if self.classifier is None:
            # Parenthesized print works on both Python 2 and 3
            # (the original `print "..."` statement is a SyntaxError on 3.x).
            print("Unable to load model!")
            exit(-1)
def match_env(env='section', get_content=True):
    '''Build a compiled Regex matching a one-argument LaTeX command,
    i.e. text in titles or captions: \\section{Chapter one}.

    If get_content is True, the braced argument is captured in a named
    group "content".
    NOTE(review): env is interpolated into the pattern unescaped, so any
    regex metacharacters in env would change the match -- confirm callers
    only pass plain command names.
    '''
    er = Regex()
    er.add(r"\\%s{" % env)
    if get_content:
        er.add(er.group_begin(name="content"))
    er.add(er.non_greedy(er.zero_or_more(er.anything())))  # command argument
    if get_content:
        er.add(er.group_end())
    er.add(r"}")
    er.compile()
    return er
def regex_to_fa(self):
    """Convert the regex typed in the input field into a DFA and register
    it in the automata list; on a parse error, show an error dialog and
    leave the current state untouched."""
    pattern = self.regex_input.text()
    try:
        self.fa = Regex(pattern).dfa
    except SyntaxError as err:
        self.show_error(err)
    else:
        # remember the source expression on the automaton, then list it
        self.fa.regex_str = pattern
        self.add_fa_to_list()
def __init__(self, codes, separator='@@', vocab=None, glossaries=None):
    """Load BPE merge operations from an already-open codes file.

    codes      -- open file-like object, one merge operation per line,
                  optionally starting with a '#version: x.y' header
    separator  -- marker appended to non-final subword units
    vocab      -- optional vocabulary restricting the output units
    glossaries -- accepted for API compatibility but currently ignored
                  (see the commented-out code below)
    """
    # check version information
    #codes = codecs.open(codes,"r", encoding='utf-8')
    firstline = codes.readline()
    if firstline.startswith('#version:'):
        # strip trailing ".0" components, e.g. "0.2.0" -> (0, 2)
        self.version = tuple([
            int(x) for x in
            re.sub(r'(\.0+)*$', '', firstline.split()[-1]).split(".")
        ])
    else:
        self.version = (0, 1)
        codes.seek(0)  # no header: the first line was a merge, re-read it
    self.bpe_codes = [tuple(item.split()) for item in codes]
    # some hacking to deal with duplicates (only consider first instance)
    # reversed() makes earlier occurrences overwrite later ones
    self.bpe_codes = dict([
        (code, i) for (i, code) in reversed(list(enumerate(self.bpe_codes)))
    ])
    # map a merged pair string back to its (left, right) parts
    self.bpe_codes_reverse = dict([(pair[0] + pair[1], pair)
                                   for pair, i in self.bpe_codes.items()])
    self.separator = separator
    self.vocab = vocab
    #self.glossaries = glossaries if glossaries else []
    self.glossaries = []
    # for i in xrange(30):
    #     self.glossaries.append("__URL"+str(i)+"__")
    #     #self.glossaries.append("__NUM"+str(i)+"__")
    #     self.glossaries.append("__EMAIL"+str(i)+"__")
    #
    self.cache = {}  # memoizes word -> segmentation
    # added by revo
    # email address:
    self.__email_addr = Regex(r'([\w\.-]+@[\w\.-]+)')
    # url address:
    self.__url_addr = Regex(
        r'(?P<url>https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)|[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*))'
    )
def save_regex(self):
    """Ask the user for a destination path and save the regex currently
    typed in the input field to that file; do nothing when the save
    dialog is cancelled."""
    regex = Regex(self.regex_input.text())
    path, _ = QFileDialog.getSaveFileName(self)
    if path:
        # context manager guarantees the handle is closed even if the
        # write raises (the original leaked it in that case)
        with open(path, 'w') as out_file:
            out_file.write(regex.regex_str)
def to_regex(self):
    """
    Returns a regex approximation

    Args:
        None
    Returns:
        str: A regex approximation
    """
    # local import -- presumably to avoid a circular dependency with the
    # regex module; TODO confirm
    from regex import Regex
    converter = Regex(self)
    return converter.get_regex()
def __init__(self, options={}): """\ Constructor (pre-compile all needed regexes). """ # load no-break prefixes for the given language self.__load_nobreaks(options.get('language'), options.get('nobreak_file')) # compile regexes self.__spaces = Regex(r'\s+') self.__space_at_end = Regex(r'(^|\n) ') self.__space_at_begin = Regex(r' ($|\n)') self.__non_period = Regex(r'([?!]|\.{2,}) +' + self.SENT_STARTER) self.__in_punct = Regex(r'([?!\.] *' + self.FINAL_PUNCT + r') +' + self.SENT_STARTER) self.__punct_follows = Regex(r'([?!\.]) +' + self.SENT_STARTER_PUNCT) self.__period = Regex(r'([\p{Alnum}\.\-]+)(' + self.FINAL_PUNCT + r')? *$') self.__ucase_acronym = Regex(r'\.[\p{Upper}\-]+$') self.__numbers = Regex(r'^\p{N}') self.__sent_starter = Regex(self.SENT_STARTER)
def __init__(self, alpha, path):
    """Load tagging rules from *path*.

    Each non-empty line has the form ``<pattern> => <tag>``; the pattern
    is compiled against the given alphabet and stored with its tag in
    ``self.rules`` as ``[compiled_regex, tag]`` pairs.
    """
    self.alpha = alpha
    self.rules = []
    with open(path, 'r') as rule_file:
        for raw_line in rule_file.readlines():
            stripped = raw_line.strip()
            if not stripped:
                continue  # skip blank lines
            parts = stripped.split('=>')
            compiled = Regex(parts[0].strip(), self.alpha)
            self.rules.append([compiled, parts[1].strip()])
def add_starred_from_converters(self, _from1, _to1, functional_object, converters):
    """Propagate a starred ("*" wildcard) rule against all known converters:
    for every converter whose source pattern matches _to1, add an edge from
    _to1 to that converter's wildcard-expanded target.

    Removed the unused local ``new_to`` computed by the original.
    NOTE(review): ``new_from`` is expanded from _to2 here while the sibling
    ``add_starred`` expands from _to1 -- confirm this asymmetry is intended.
    """
    other_things = [(f, t, functional_object2)
                    for f, t, functional_object2 in converters]
    for _from2, _to2, functional_object2 in flatten_optional_list_triple(other_things):
        if "*" in _to2:
            # turn the "*" wildcard into a capturing group and anchor it
            other_things_regex = Regex("^" + _from2.replace("*", r"(\w+)") + "$")
            m = other_things_regex.match(_to1)
            if m:
                # expand the wildcard with the captured fragment
                new_from = _to2.replace("*", m.group(1))
                self.add_edge(_to1, new_from, functional_object2)
def open_regex(self):
    """Let the user pick a file, read a regex from it and put it into the
    input field; show an error dialog when the contents do not parse.

    Fixes of the original: the file handle is now always closed (it could
    stay open when parsing raised SyntaxError), and a cancelled dialog
    returns immediately instead of falling through to ``file.close()``
    with ``file`` never having been opened.
    """
    path, _ = QFileDialog.getOpenFileName(self)
    if not path:
        return
    with open(path, 'r') as regex_file:
        string = regex_file.read()
    try:
        regex = Regex(string)
        self.regex_input.setText(regex.regex_str)
    except SyntaxError as e:
        self.show_error(e)
def setUp(self): print('Running ' + self._testMethodName) # árvore de strings para teste self.tree = Node('.') self.tree.left = Node('l') self.tree.right = Node('r') self.tree.right.left = Node('a') self.tree.right.right = Node('b') self.tree.left.left = Node('c') # 'ab' apenas self.ab = Regex('') self.ab.root = Node('.') self.ab.root.left = Node('a') self.ab.root.right = Node('b') self.ab.thread() # (ab | ac)* a self.abaca = Regex('') n = Node('|') n.left = Node('.') n.right = Node('.') n.left.left = Node('a') n.left.right = Node('b') n.right.left = Node('a') n.right.right = Node('c') r = Node('.') r.left = Node('+') r.right = Node('a') r.left.left = n self.abaca.root = r self.abaca.thread()
def tgen_postprocess(text):
    """Detokenize and clean up generated text: re-join hyphenated
    compounds, re-attach punctuation and contractions, then capitalize
    the first character. Returns '' for empty/whitespace-only input."""
    hyphen_compound = Regex(
        r" (\p{Alpha}+|£ [0-9]+) - (priced|star|friendly|(?:£ )?[0-9]+) ",
        flags=UNICODE | IGNORECASE)
    hyphen_non = Regex(r" (non) - ([\p{Alpha}-]+) ", flags=UNICODE | IGNORECASE)
    pre_glue = Regex(r' ([\p{Sc}\(\[\{\¿\¡]+) ', flags=UNICODE)
    post_glue = Regex(r' ([\,\.\?\!\:\;\\\%\}\]\)]+) ', flags=UNICODE)
    apostrophe = Regex(r" (\p{Alpha}+) ' (ll|ve|re|[dsmt])(?= )",
                       flags=UNICODE | IGNORECASE)
    # pad with spaces so the space-anchored patterns match at the edges
    padded = ' ' + text + ' '
    padded = hyphen_compound.sub(r' \1-\2 ', padded)
    padded = hyphen_non.sub(r' \1-\2 ', padded)
    padded = pre_glue.sub(r' \1', padded)   # no space after currency/openers
    padded = post_glue.sub(r'\1 ', padded)  # no space before closers
    padded = apostrophe.sub(r" \1'\2", padded)
    result = padded.strip()
    if not result:
        return ''
    # capitalize the sentence start
    return result[0].upper() + result[1:]
def __init__(self, db, app):
    """Wire the application to the database; populate it on first run.

    db  -- database handle exposing a ``.session`` (SQLAlchemy-style)
    app -- the owning application object
    (Console messages are intentionally kept in Polish.)
    """
    self.wnioski = Wnioski(db)
    self.db = db
    self.app = app
    self.regex = Regex()
    # empty table -> first run: load the data and extract the locations
    if self.db.session.query(TassDB).all() == []:
        print('baza pusta, wczytuje dane')
        self.inicjuj_baze()
        print('dane wczytane')
        print('wyciągam lokalizacje')
        self._czysc_lokalizacje()
        self.regexuj_lokalizacje()
        print('baza danych gotowa')
    else:
        # database already created earlier; delete serwer/TASS.db to rebuild
        print('baza została już wcześniej utworzona')
        print('aby ją wczytać ponownie usun plik bazy serwer/TASS.db')
def add_starred(self, _from1, _to1, functional_object, converters):
    """Register a conversion edge whose source may contain a "*" wildcard.

    When _from1 contains "*", every known converter target matching the
    wildcard pattern gets an edge to the correspondingly expanded _to1;
    each expansion is then propagated via add_starred_from_converters.
    """
    if _from1 is None:  # `is None` instead of `== None` (identity check)
        _from1 = OUT_OF_THE_BOX
    if "*" in _from1:
        other_things = [(f, t) for f, t, o in converters]
        # the "*" wildcard becomes a capturing group matching one word
        new_things_regex = Regex("^" + _from1.replace("*", r"(\w+)") + "$")
        for _from2, _to2 in flatten_optional_list_pair(other_things):
            m = new_things_regex.match(_to2)
            if m:
                new_from = _to1.replace("*", m.group(1))
                self.add_edge(_to2, new_from, functional_object)
                self.add_starred_from_converters(_to2, new_from,
                                                 functional_object, converters)
def __init__(self, options={}):
    """\
    Constructor (pre-compile all needed regexes).

    options may contain:
        'moses_deescape'   -- de-escape Moses special characters
        'language'         -- REQUIRED language code (KeyError if missing)
        'capitalize_sents' -- capitalize sentence beginnings
    """
    # process options
    self.moses_deescape = True if options.get('moses_deescape') else False
    self.language = options['language']
    #print "WTF,",self.language
    self.capitalize_sents = True if options.get(
        'capitalize_sents') else False
    # compile regexes
    # shuffix_space: currency / opening punctuation -> no space after it
    self.__currency_or_init_punct = Regex(r'^[\p{Sc}\(\[\{\¿\¡]+$')
    # prefix_space: punctuation (incl. CJK) that takes no space before it
    self.__noprespace_punct = Regex(
        r'^[\/\<\>\,\,\、\。\:\;\.\?\!\:\;\\\%\}\]\)\‰]+$')
    self.__cjk_chars = Regex(r'[\u1100-\u11FF\u2E80-\uA4CF\uA840-\uA87F' +
                             r'\uAC00-\uD7AF\uF900-\uFAFF\uFE30-\uFE4F' +
                             r'\uFF65-\uFFDC]')
    self.__final_punct = Regex(r'([\.!?])([\'\"\)\]\p{Pf}\%])*$')
    # language-specific regexes
    self.__fr_prespace_punct = Regex(r'^[\?\!\:\;\\\%]$')
    self.__contract = None
    # liangss add chinese numberic unit nospace process
    self.__nospace_chinese_numberic_unit = Regex(r'\d+[mMgGbB\%]*')
    self.special_chinese_symbol = Regex(r'[\,\%\‰]')
    # liangss add English date comma process
    # BUGFIX: month names must be a group alternation -- the original
    # character class `[January|...|December]+` matched arbitrary runs of
    # the letters and '|' characters, never whole month names.
    self.__add_english_date_comma = Regex(
        r'\d+\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)')
    # liangss chinese character detokenize
    #self.__noprespace_punct_chinese = Regex(r'^[\,\。\?\!\\\%\\]\)]+$')
    if self.language in self.CONTRACTIONS:
        self.__contract = Regex(self.CONTRACTIONS[self.language], IGNORECASE)
def replacement(line, symbol=".", repl="。"):
    """Replace occurrences of *symbol* in *line* with *repl* (defaults:
    ASCII period -> CJK full stop), except when both neighbouring
    characters are ASCII (decimals, abbreviations, ...) or when the
    symbol is part of a consecutive run.
    Returns the (possibly partially) rewritten line.
    """
    line = line.replace(". . .", ".")  # collapse spaced ellipsis
    # NOTE(review): this looks like a no-op ("." -> ".") -- the source
    # character may have been mangled in a past encoding step; verify.
    line = line.replace(u".", ".")
    # line = line.replace("...", ".")
    # capture one non-space char (plus spacing) on each side of the symbol
    dot = Regex(r'(\S\s*)\%s(\s*\S*)' % symbol)
    m = dot.findall(line)
    if m:
        # print "BEFORE:", line
        # print m
        for ele in m:
            b_char = ele[0].strip()  # character before the symbol
            a_char = ele[1].strip()  # character after the symbol
            # consecutive dot avoid
            if symbol != b_char and symbol != a_char:
                # both are digit or are letters: keep the ASCII symbol and
                # return the line as rewritten so far
                if is_ascii(b_char) and is_ascii(a_char):
                    return line
                line = line.replace(ele[0] + symbol + ele[1],
                                    ele[0] + repl + ele[1])
    # debug
    # if symbol == ',':
    #     print m
    #     print "AFTER:", line
    return line
def __init__(self, options={}): """\ Constructor (pre-compile all needed regexes). """ # process options self.moses_deescape = True if options.get('moses_deescape') else False self.language = options.get('language', 'english') self.is_capitalize = True if options.get('is_capitalize') else False self.is_true_case = True if options.get('is_true_case') else False #If the sentence is to be capitalized try loading the model if self.is_true_case: # get the models folder self.models_dir = options.get('models_dir', '.') # create the model file name model_file_name = self.models_dir + "/" + self.language + ".obj" # check that the model file exists if os.path.isfile(model_file_name): #Read the model file f = open(model_file_name, 'rb') self.uniDist = cPickle.load(f) self.backwardBiDist = cPickle.load(f) self.forwardBiDist = cPickle.load(f) self.trigramDist = cPickle.load(f) self.wordCasingLookup = cPickle.load(f) f.close() else: print "Unable to find the truecaser model for: ", self.language exit(1) # compile regexes self.__currency_or_init_punct = Regex(r'^[\p{Sc}\(\[\{\¿\¡]+$') self.__noprespace_punct = Regex(r'^[\,\.\?\!\:\;\\\%\}\]\)]+$') self.__cjk_chars = Regex(r'[\u1100-\u11FF\u2E80-\uA4CF\uA840-\uA87F' + r'\uAC00-\uD7AF\uF900-\uFAFF\uFE30-\uFE4F' + r'\uFF65-\uFFDC]') self.__final_punct = Regex(r'([\.!?])([\'\"\)\]\p{Pf}\%])*$') # language-specific regexes self.__fr_prespace_punct = Regex(r'^[\?\!\:\;\\\%]$') self.__contract = None if self.language in self.CONTRACTIONS: self.__contract = Regex(self.CONTRACTIONS[self.language], IGNORECASE)
def __init__(self, options={}):
    """\
    Pre-compile all detokenizer regexes and read the settings from
    *options* ('moses_deescape', 'language' -- default 'en',
    'capitalize_sents').
    """
    # option flags, normalized to plain booleans
    self.moses_deescape = bool(options.get('moses_deescape'))
    self.capitalize_sents = bool(options.get('capitalize_sents'))
    self.language = options.get('language', 'en')
    # tokens that glue to the following word (currency, openers) ...
    self.__currency_or_init_punct = Regex(r'^[\p{Sc}\(\[\{\¿\¡]+$')
    # ... and tokens that glue to the preceding word
    self.__noprespace_punct = Regex(r'^[\,\.\?\!\:\;\\\%\}\]\)]+$')
    # CJK character ranges: no spaces between such characters
    self.__cjk_chars = Regex(r'[\u1100-\u11FF\u2E80-\uA4CF\uA840-\uA87F'
                             r'\uAC00-\uD7AF\uF900-\uFAFF\uFE30-\uFE4F'
                             r'\uFF65-\uFFDC]')
    # sentence-final punctuation, possibly followed by closing marks
    self.__final_punct = Regex(r'([\.!?])([\'\"\)\]\p{Pf}\%])*$')
    # French puts a space before these
    self.__fr_prespace_punct = Regex(r'^[\?\!\:\;\\\%]$')
    # contraction pattern, only for languages that define one
    if self.language in self.CONTRACTIONS:
        self.__contract = Regex(self.CONTRACTIONS[self.language], IGNORECASE)
    else:
        self.__contract = None
def __init__(self):
    """Pre-compile regexes for masking and restoring e-mail addresses,
    URLs and numbers via placeholder tokens."""
    self.__author__ = "Revo"
    self.__date__ = "2017-12-28"
    #self.__date__ = "2017-10-24"
    # email address:
    self.__email_addr = Regex(r'([\w\.-]+@[\w\.-]+)')
    # url address: scheme-ful URLs or bare domain names
    self.__url_addr = Regex(
        r'(?P<url>https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)|[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*))'
    )
    #self.__date_list = ["a.m","p.m","A.M","P.M"]
    # Numbers: optional sign, decimal point/comma, scientific tail
    self.__numbers = Regex(r'([+\-]?\d*[\.,]?\d+[\d\.,+\-eE]*)')
    # Replace with add one: un-numbered placeholder tokens ...
    self.__addone = Regex(r'(__(NUM|EMAIL|URL)__)')
    # ... and numbered ones, e.g. __NUM3__
    self.__addone_search = Regex(r'(__(NUM|EMAIL|URL)(\d+)__)')
    # double space to single
    self.__spaces = Regex(r'\s+', flags=UNICODE)
    # self.__counter = dict({"URL": 0, "EMAIL": 0})
    # self.line = 0
def __init__(self, options={}): """\ Constructor (pre-compile all needed regexes). """ # process options self.lowercase = True if options.get('lowercase') else False self.vw_escape = True if options.get('vw_escape') else False # compile regexes self.__spaces = Regex(r'\s+', flags=UNICODE) self.__ascii_junk = Regex(r'[\000-\037]') self.__special_chars = \ Regex(r'(([^\p{IsAlnum}\s\.\,−\-])\2*)') # single quotes: all unicode quotes + prime self.__to_single_quotes = Regex(r'[`‛‚‘’‹›′]') # double quotes: all unicode chars incl. Chinese + double prime + ditto self.__to_double_quotes = Regex(r'(\'\'|``|[«»„‟“”″〃「」『』〝〞〟])') self.__no_numbers = Regex(r'([^\p{N}])([,.])([^\p{N}])') self.__pre_numbers = Regex(r'([^\p{N}])([,.])([\p{N}])') self.__post_numbers = Regex(r'([\p{N}])([,.])([^\p{N}])') # hyphen: separate every time but for unary minus self.__minus = Regex(r'([-−])') self.__pre_notnum = Regex(r'(-)([^\p{N}])') self.__post_num_or_nospace = Regex(r'(\p{N} *|[^ ])(-)')
def test_regex(self):
    """The comment-start symbol for Java should render as the regex /\\*."""
    symbol = Regex().start("java")
    # Raw string replaces the invalid "\*" escape in a plain literal --
    # a DeprecationWarning that became a SyntaxWarning in Python 3.12.
    # The byte content is identical to the original "/\*".
    self.assertEqual(r"/\*", str(symbol))
def __init__(self, options={}): """\ Constructor (pre-compile all needed regexes). """ # process options self.lowercase = True if options.get('lowercase') else False self.moses_escape = True if options.get('moses_escape') else False self.ts = options.get('num_t') if options.get('num_t') else 1 # compile regexes self.__spaces = Regex(r'\s+', flags=UNICODE) self.__ascii_junk = Regex(r'[\000-\037]') self.__special_chars = \ Regex(r'(([^\p{IsAlnum}\s\.\,−\-])\2*)') # email address: self.__email_addr = Regex(r'([\w\.-]+@[\w\.-]+)') # url address: self.__url_addr = Regex( r'(?P<url>https?://[a-zA-Z0-9:/\.?=!@$#&\*_()]+|www\.\w+\.[a-zA-Z0-9:/\.?=!@$#&\*_()]+|\w+\.\w+)' ) # NEED TO PROTECT THIS EMAIL ADDRESS, EXTRACT IT AND TEHN INSERT BACK # single quotes: all unicode quotes + prime self.__to_single_quotes = Regex(r'[`‛‚‘’‹›′]') # double quotes: all unicode chars incl. Chinese + double prime + ditto self.__to_double_quotes = Regex(r'(\'\'|``|[«»„‟“”″〃「」『』〝〞〟])') self.__no_numbers = Regex(r'([^\p{N}])([,.])([^\p{N}])') self.__pre_numbers = Regex(r'([^\p{N}])([,.])([\p{N}])') self.__post_numbers = Regex(r'([\p{N}])([,.])([^\p{N}])') # hyphen: separate every time but for unary minus self.__minus = Regex(r'([-−])') self.__pre_notnum = Regex(r'(-)([^\p{N}])') self.__post_num_or_nospace = Regex(r'(\p{N} *|[^ ])(-)')
def detokenize_line(line):
    """
    Detokenize the given text.
    adapted from: https://github.com/ufal/mtmonkey/blob/master/worker/src/util/detokenize.py

    Parameters:
        line: the line of text to detokenize
    Returns:
        str: the detokenized text

    NOTE(review): only the FIRST sentence-final punctuation position is
    recorded (`text_len_last_final_punct == 0` guard) and the result is
    truncated to it at the end -- a line without final punctuation
    therefore returns ''.  Preserved as-is.
    """
    # Pre-compile the patterns once; the original called Regex(...) on
    # every word inside the loop, recompiling per token.
    currency_re = Regex(CURRENCY_OR_INIT_PUNCT)
    noprespace_re = Regex(NOPRESPACE_PUNCT)
    contractions_re = Regex(CONTRACTIONS)
    final_punct_re = Regex(FINAL_PUNCT)
    # split text
    words = line.split(' ')
    # paste text back, omitting spaces where needed
    text = ''
    pre_spc = ' '
    quote_count = {'\'': 0, '"': 0, '`': 0}
    capitalize_next = True
    text_len_last_final_punct = 0
    for pos, word in enumerate(words):
        # no space after currency and initial punctuation
        if currency_re.match(word):
            text += pre_spc + word
            pre_spc = ''
        # no space before commas etc. (exclude some punctuation for French)
        elif noprespace_re.match(word):
            text += word
            pre_spc = ' '
        # contractions with comma or hyphen
        elif word in "'-–" and pos > 0 and pos < len(words) - 1 \
                and contractions_re.match(''.join(words[pos - 1:pos + 2])):
            text += word
            pre_spc = ''
        # handle quoting
        elif word in '\'"„“”‚‘’`':
            # detect opening and closing quotes by counting
            # the appropriate quote types
            quote_type = word
            if quote_type in '„“”':
                quote_type = '"'
            elif quote_type in '‚‘’':
                quote_type = '\''
            # special case: possessives in English ("Jones'" etc.)
            if text.endswith('s'):
                text += word
                pre_spc = ' '
            # really a quotation mark
            else:
                # opening quote
                if quote_count[quote_type] % 2 == 0:
                    text += pre_spc + word
                    pre_spc = ''
                # closing quote
                else:
                    text += word
                    pre_spc = ' '
                quote_count[quote_type] += 1
        # contractions where comma or hyphen is already joined to following letters
        elif word[0] in "'-–" and pos > 0 and pos < len(words) - 1 \
                and contractions_re.match(''.join(words[pos - 1:pos + 1])):
            text += word
            pre_spc = ' '
        elif word == "n't":
            text += word
            pre_spc = ' '
        # keep spaces around normal words
        else:
            if capitalize_next:
                capitalize_next = False
                if len(word) == 1:
                    word = word.upper()
                else:
                    word = word[0].upper() + word[1:]
            if word == 'i':
                word = word.upper()
            text += pre_spc + word
            pre_spc = ' '
        # record the first sentence-final punctuation position and request
        # capitalization of the next word (placement reconstructed from the
        # flattened source -- verify against the original indentation)
        if final_punct_re.match(word) and (text_len_last_final_punct == 0):
            capitalize_next = True
            text_len_last_final_punct = len(text)
    # strip leading/trailing space
    text = text.strip()
    text = text[:text_len_last_final_punct]
    return text
# -*- encoding=utf-8 -*- import codecs import sys from regex import Regex input = codecs.open(sys.argv[1], 'r', encoding='utf-8') end_symbols = ["。", "?"] quotes_symbols = ["“", "”"] hyps_re = Regex(r'^[—>]+') num = 0 for line in input: line = line.strip() # too damn short if len(line) < 3: continue # quotes are not match last_quote_count = line.count(quotes_symbols[1]) if line.count(quotes_symbols[0]) != last_quote_count and line[0] != quotes_symbols[0]: continue if line[0] == quotes_symbols[0] and last_quote_count == 0: line = line[1:] # first hyps line = hyps_re.sub('', line) # replcae by hard code term line = line.replace("(来源: )", "") # final strip line = line.strip() print(line) num += 1
def __init__(self, content, extension):
    """Keep the raw file content and look up the comment symbol that
    corresponds to the given file extension."""
    self.symbol = Regex().get(extension)
    self.content = content