示例#1
0
 def __init__(self):
     """Pre-compile the regexes and build the literal replacement table."""
     caseless = UNICODE | IGNORECASE

     # A run of currency symbols, opening brackets or inverted ?/! marks
     # that has a space on both sides.
     init_punct_pattern = r' ([\p{Sc}\(\[\{\¿\¡]+) '
     self._currency_or_init_punct = Regex(init_punct_pattern, flags=UNICODE)

     # A run of , . ? ! : ; \ % } ] ) that has a space on both sides.
     noprespace_pattern = r' ([\,\.\?\!\:\;\\\%\}\]\)]+) '
     self._noprespace_punct = Regex(noprespace_pattern, flags=UNICODE)

     # word ' (optional space) ll/ve/re/d/s/m/t, followed by a space.
     contract_pattern = r" (\p{Alpha}+) ' ?(ll|ve|re|[dsmt])(?= )"
     self._contract = Regex(contract_pattern, flags=caseless)

     # Split forms such as "do n't", "gon na", "wan ta", followed by a space.
     fixes_pattern = r" (do|go[nt]|wan) (n't|ta|na)(?= )"
     self._fixes = Regex(fixes_pattern, flags=caseless)

     # Literal, space-delimited substitutions.
     self._replace_table = {
         ' i ': ' I ',
         ' im ': ' I\'m ',
         ' dont ': ' don\'t ',
     }
示例#2
0
def match_begin_end_env(env='equation', get_content=True):
    r'''Build and compile a pattern for a LaTeX environment block.

    The pattern matches \begin{env} something ... \end{env}. Whitespace is
    allowed around the environment name inside the braces, and a trailing
    ``*`` is optional, independently on the \begin and on the \end side
    (so \begin{equation*} ... \end{equation} matches).

    Args:
        env: environment name, inserted into the pattern as-is (it is not
            escaped). The special value 'anything' substitutes a non-greedy
            match-anything fragment, so any \begin{...} ... \end{...} pair
            matches; the two names are matched independently of each other.
        get_content: when true, the text between the two tags is wrapped in
            a group named "content".

    Returns:
        The ``Regex`` builder after ``compile()`` has been called on it.
    '''
    # The docstring above is raw on purpose: "\b" would otherwise become a
    # backspace character and "\e" is an invalid escape sequence.
    er = Regex()

    # Opening tag: \begin{ env [*] }
    er.add(r"\\begin{")
    er.add(er.zero_or_more(er.whitespace()))
    if env == 'anything':
        env = er.non_greedy(er.zero_or_more(er.anything()))
    er.add(env)
    er.add(er.zero_or_more(er.whitespace()))
    er.add(er.zero_or_one(r'\*'))
    er.add(er.zero_or_more(er.whitespace()))
    er.add(r"}")

    # Body: shortest possible run, optionally captured as "content".
    if get_content:
        er.add(er.group_begin(name="content"))
    er.add(er.non_greedy(er.zero_or_more(er.anything())))
    if get_content:
        er.add(er.group_end())

    # Closing tag: \end{ env [*] }
    er.add(r"\\end{")
    er.add(er.zero_or_more(er.whitespace()))
    er.add(env)
    er.add(er.zero_or_more(er.whitespace()))
    er.add(er.zero_or_one(r'\*'))
    er.add(er.zero_or_more(er.whitespace()))
    er.add(r"}")

    er.compile()
    return er
示例#3
0
 def __init__(self):
     """Pre-compile the space-delimited patterns used by this object."""
     caseless = UNICODE | IGNORECASE

     # A run of currency symbols, opening brackets or inverted ?/! marks
     # that has a space on both sides.
     init_punct_pattern = r' ([\p{Sc}\(\[\{\¿\¡]+) '
     self._currency_or_init_punct = Regex(init_punct_pattern, flags=UNICODE)

     # A run of , . ? ! : ; \ % } ] ) that has a space on both sides.
     noprespace_pattern = r' ([\,\.\?\!\:\;\\\%\}\]\)]+) '
     self._noprespace_punct = Regex(noprespace_pattern, flags=UNICODE)

     # word ' ll/ve/re/d/s/m/t, followed by a space.
     contract_pattern = r" (\p{Alpha}+) ' (ll|ve|re|[dsmt])(?= )"
     self._contract = Regex(contract_pattern, flags=caseless)

     # "<word or £ amount> - <priced|star|friendly|amount>" with spaces
     # around the dash.
     dash_pattern = (
         r" (\p{Alpha}+|£ [0-9]+) - (priced|star|friendly|(?:£ )?[0-9]+) ")
     self._dash_fixes = Regex(dash_pattern, flags=caseless)

     # "non - <word>" with spaces around the dash.
     non_dash_pattern = r" (non) - ([\p{Alpha}-]+) "
     self._dash_fixes2 = Regex(non_dash_pattern, flags=caseless)
示例#4
0
 def __init__(self, numH, strings):
     """Create an empty hypothesis list and seed it with a base regex.

     The base regex is built from `strings`, scored with
     `self.likelihood`, and a copy of it is registered together with that
     score through `self.addRegexes`.
     """
     self.hSpace_ = []
     self.strings_ = strings

     base = Regex(strings)
     self.baseH_ = base
     base_prob = self.likelihood(base)
     self.baseHProb_ = base_prob

     self.numH_ = numH
     seed = (base.copy(), base_prob)
     self.addRegexes([seed])
示例#5
0
    def __init__(self):
        """Set the metadata attributes and compile the patterns used later."""
        self.__author__ = "Revo"
        self.__date__ = "2017-10-27"

        # E-mail address: word/dot/dash characters on both sides of "@".
        email_pattern = r'([\w\.-]+@[\w\.-]+)'
        self.__email_addr = Regex(email_pattern)

        # URL, with or without an http(s):// prefix; the whole match is
        # captured in the named group "url".
        url_pattern = r'(?P<url>https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)|[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*))'
        self.__url_addr = Regex(url_pattern)

        # Numbers: optional sign, digits, optional . or , separator.
        number_pattern = r'([+\-]?\d*[\.,]?\d+[\d\.,+\-eE]*)'
        self.__numbers = Regex(number_pattern)

        # Numbered placeholders such as __NUM3__, __EMAIL0__, __URL12__.
        placeholder_pattern = r'(__(NUM|EMAIL|URL)(\d+)__)'
        self.__addone = Regex(placeholder_pattern)

        # Any run of whitespace.
        self.__spaces = Regex(r'\s+', flags=UNICODE)

        self.line = 0
示例#6
0
 def __init__(self, is_training=False):
     """Initialise the splitter and, outside training, load the classifier.

     Args:
         is_training: when false, the classifier is loaded through
             `utils.load` from 'vnspliter/model/model.pkl'; when true,
             `self.classifier` is left as None.

     Raises:
         SystemExit: with code -1 when the model has to be loaded but
             `utils.load` returns None.
     """
     self.classifier = None
     self.feature_model = None
     self.regex_rule = Regex()
     if not is_training:
         self.classifier = utils.load(
             os.path.join('vnspliter/model', 'model.pkl'))
         if self.classifier is None:
             # A single-argument print call prints the same text under
             # Python 2 and Python 3.
             print("Unable to load model!")
             # The `exit` builtin is installed by the `site` module for
             # interactive use; raising SystemExit does not depend on it.
             raise SystemExit(-1)
示例#7
0
def match_env(env='section', get_content=True):
    r'''Build and compile a pattern for a one-argument LaTeX command.

    Matches text such as a title or caption, e.g. \section{Chapter one}.
    The braces' content is matched non-greedily, so the match ends at the
    first closing brace.

    Args:
        env: command name, interpolated into the pattern as-is (it is not
            escaped).
        get_content: when true, the text between the braces is wrapped in
            a group named "content".

    Returns:
        The ``Regex`` builder after ``compile()`` has been called on it.
    '''
    # The docstring above is raw on purpose: "\s" is an invalid escape
    # sequence in a normal string literal.
    er = Regex()
    er.add(r"\\%s{" % env)
    if get_content:
        er.add(er.group_begin(name="content"))
    er.add(er.non_greedy(er.zero_or_more(er.anything())))
    if get_content:
        er.add(er.group_end())
    er.add(r"}")
    er.compile()
    return er
示例#8
0
    def regex_to_fa(self):
        """Turn the text of the regex input field into an automaton.

        The text is parsed with `Regex` and its `dfa` attribute is stored in
        `self.fa`. A `SyntaxError` is reported through `self.show_error` and
        nothing else happens; otherwise the source text is attached to the
        automaton and `self.add_fa_to_list()` is called.
        """
        pattern_text = self.regex_input.text()

        try:
            self.fa = Regex(pattern_text).dfa
        except SyntaxError as err:
            self.show_error(err)
            return

        automaton = self.fa
        automaton.regex_str = pattern_text
        self.add_fa_to_list()
示例#9
0
    def __init__(self, codes, separator='@@', vocab=None, glossaries=None):
        """Read the BPE merge table from `codes` and set up helper state.

        `codes` must support `readline()`, `seek()` and line iteration.
        If its first line starts with '#version:', the version is parsed
        from the last whitespace-separated field (trailing ".0" components
        removed); otherwise the version is (0, 1) and the stream is rewound.

        `glossaries` is accepted but not used: `self.glossaries` is always
        an empty list.
        """
        # Version header (optional).
        header = codes.readline()
        if header.startswith('#version:'):
            raw_version = header.split()[-1]
            trimmed = re.sub(r'(\.0+)*$', '', raw_version)
            self.version = tuple(int(part) for part in trimmed.split("."))
        else:
            self.version = (0, 1)
            codes.seek(0)

        merges = [tuple(entry.split()) for entry in codes]

        # Walking the list backwards lets the first occurrence of a
        # duplicated entry overwrite the later ones, so it keeps its index.
        self.bpe_codes = {
            pair: rank for rank, pair in reversed(list(enumerate(merges)))
        }

        # Concatenated symbol -> the pair it was merged from.
        self.bpe_codes_reverse = {
            pair[0] + pair[1]: pair for pair in self.bpe_codes
        }

        self.separator = separator
        self.vocab = vocab
        self.glossaries = []
        self.cache = {}

        # E-mail address: word/dot/dash characters on both sides of "@".
        email_pattern = r'([\w\.-]+@[\w\.-]+)'
        self.__email_addr = Regex(email_pattern)

        # URL, with or without an http(s):// prefix; the whole match is
        # captured in the named group "url".
        url_pattern = r'(?P<url>https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)|[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*))'
        self.__url_addr = Regex(url_pattern)
示例#10
0
    def save_regex(self):
        """Ask for a destination file and write the current regex to it.

        The regex is built from the text of the input field before the
        dialog is shown. Nothing is written when the dialog returns an
        empty path.
        """
        regex = Regex(self.regex_input.text())

        path, _ = QFileDialog.getSaveFileName(self)
        if not path:
            return

        # The context manager closes the file even if the write fails.
        with open(path, 'w') as out_file:
            out_file.write(regex.regex_str)
示例#11
0
 def to_regex(self):
     """
     Build a regex approximation of this object.

     The object is wrapped in a ``regex.Regex`` converter and the result of
     its ``get_regex()`` call is returned.

     Returns:
         The value returned by ``Regex.get_regex()`` (documented by the
         original author as a ``str``).
     """
     # Imported locally, as in the original implementation.
     from regex import Regex
     return Regex(self).get_regex()
示例#12
0
 def __init__(self, options={}):
     """Load the no-break prefixes and pre-compile all needed regexes.

     Reads the 'language' and 'nobreak_file' entries of `options` (both
     optional) and passes them to the no-break prefix loader.
     """
     language = options.get('language')
     nobreak_file = options.get('nobreak_file')
     self.__load_nobreaks(language, nobreak_file)

     starter = self.SENT_STARTER
     final_punct = self.FINAL_PUNCT

     # Whitespace handling.
     self.__spaces = Regex(r'\s+')
     self.__space_at_end = Regex(r'(^|\n) ')
     self.__space_at_begin = Regex(r' ($|\n)')

     # ? ! or 2+ periods, spaces, then a sentence starter.
     self.__non_period = Regex(r'([?!]|\.{2,}) +' + starter)
     # Sentence-final mark followed by FINAL_PUNCT, spaces, then a starter.
     self.__in_punct = Regex(
         r'([?!\.] *' + final_punct + r') +' + starter)
     # Sentence-final mark, spaces, then SENT_STARTER_PUNCT.
     self.__punct_follows = Regex(
         r'([?!\.]) +' + self.SENT_STARTER_PUNCT)
     # Alphanumeric/period/dash run, optional FINAL_PUNCT, at end of text.
     self.__period = Regex(
         r'([\p{Alnum}\.\-]+)(' + final_punct + r')? *$')
     # Period followed by upper-case letters or dashes at end of text.
     self.__ucase_acronym = Regex(r'\.[\p{Upper}\-]+$')
     # Text starting with a numeric character.
     self.__numbers = Regex(r'^\p{N}')
     self.__sent_starter = Regex(starter)
示例#13
0
    def __init__(self, alpha, path):
        """Load "pattern => tag" rules from the file at `path`.

        Blank lines are skipped. Each remaining line is split on '=>'; the
        first field is compiled with `Regex(pattern, alpha)` and stored
        together with the stripped second field as `[regex, tag]` in
        `self.rules`.
        """
        self.alpha = alpha
        self.rules = []

        with open(path, 'r') as rule_file:
            for raw_line in rule_file:
                entry = raw_line.strip()
                if not entry:
                    continue
                fields = entry.split('=>')
                compiled = Regex(fields[0].strip(), self.alpha)
                self.rules.append([compiled, fields[1].strip()])
示例#14
0
    def add_starred_from_converters(self, _from1, _to1, functional_object, converters):
        """Add edges from `_to1` to every starred converter target it fits.

        For each converter whose target contains "*", the converter's source
        is turned into a regex ("*" becomes a word-character group). If
        `_to1` matches it, an edge is added from `_to1` to the target with
        "*" replaced by the captured text, using that converter's own
        functional object. `_from1` and `functional_object` are not used.
        """
        triples = [(src, dst, func) for src, dst, func in converters]

        for src_pattern, dst_pattern, func in flatten_optional_list_triple(triples):
            if "*" not in dst_pattern:
                continue

            matcher = Regex("^" + src_pattern.replace("*", r"(\w+)") + "$")
            hit = matcher.match(_to1)
            if not hit:
                continue

            concrete_target = dst_pattern.replace("*", hit.group(1))
            self.add_edge(_to1, concrete_target, func)
示例#15
0
 def open_regex(self):
     """Ask for a file, parse its content as a regex and show it.

     Nothing happens when the dialog returns an empty path. A
     `SyntaxError` raised while parsing or displaying the regex is
     reported through `self.show_error`.
     """
     path, _ = QFileDialog.getOpenFileName(self)
     if not path:
         return

     # Read inside a context manager so the file is closed on every path,
     # including the error path below.
     with open(path, 'r') as in_file:
         string = in_file.read()

     try:
         regex = Regex(string)
         self.regex_input.setText(regex.regex_str)
     except SyntaxError as e:
         self.show_error(e)
示例#16
0
    def setUp(self):
        """Build the node tree and the two hand-made regex trees for the tests."""
        print('Running ' + self._testMethodName)

        # Tree of strings used by the tests.
        tree = self.tree = Node('.')
        tree.left = Node('l')
        tree.right = Node('r')
        right_branch = tree.right
        right_branch.left = Node('a')
        right_branch.right = Node('b')
        tree.left.left = Node('c')

        # Just 'ab': a concatenation node over 'a' and 'b'.
        ab = self.ab = Regex('')
        ab.root = Node('.')
        ab.root.left = Node('a')
        ab.root.right = Node('b')
        ab.thread()

        # Tree the original comment describes as "(ab | ac)* a".
        # NOTE(review): the repetition node below is labelled '+', not '*';
        # confirm which one is intended.
        self.abaca = Regex('')

        union = Node('|')
        union.left = Node('.')
        union.right = Node('.')
        first_alt = union.left
        first_alt.left = Node('a')
        first_alt.right = Node('b')
        second_alt = union.right
        second_alt.left = Node('a')
        second_alt.right = Node('c')

        top = Node('.')
        top.left = Node('+')
        top.right = Node('a')
        top.left.left = union

        self.abaca.root = top
        self.abaca.thread()
def tgen_postprocess(text):
    """Re-attach punctuation, dashes and contractions in tokenized text.

    The text is padded with one space on each side, a fixed sequence of
    regex substitutions is applied, the result is stripped and its first
    character is upper-cased. An empty result is returned as ''.
    """
    caseless = UNICODE | IGNORECASE

    # (pattern, replacement) pairs, applied in this order.
    rewrites = (
        # "<word or £ amount> - <priced|star|friendly|amount>" -> joined
        (Regex(r" (\p{Alpha}+|£ [0-9]+) - (priced|star|friendly|(?:£ )?[0-9]+) ",
               flags=caseless),
         r' \1-\2 '),
        # "non - <word>" -> "non-<word>"
        (Regex(r" (non) - ([\p{Alpha}-]+) ", flags=caseless),
         r' \1-\2 '),
        # no space after currency symbols and opening punctuation
        (Regex(r' ([\p{Sc}\(\[\{\¿\¡]+) ', flags=UNICODE),
         r' \1'),
        # no space before commas, periods, closing brackets etc.
        (Regex(r' ([\,\.\?\!\:\;\\\%\}\]\)]+) ', flags=UNICODE),
         r'\1 '),
        # "word ' ll" -> "word'll" and similar contractions
        (Regex(r" (\p{Alpha}+) ' (ll|ve|re|[dsmt])(?= )", flags=caseless),
         r" \1'\2"),
    )

    result = ' ' + text + ' '
    for pattern, substitute in rewrites:
        result = pattern.sub(substitute, result)
    result = result.strip()

    if result:
        # capitalize
        return result[0].upper() + result[1:]
    return ''
示例#18
0
 def __init__(self, db, app):
     """Store the collaborators and populate the database if it is empty.

     When the `TassDB` query returns no rows, the database is initialised,
     locations are cleaned and processed; otherwise only a notice is
     printed. The printed progress messages are in Polish.
     """
     self.wnioski = Wnioski(db)
     self.db = db
     self.app = app
     self.regex = Regex()

     existing_rows = self.db.session.query(TassDB).all()
     if existing_rows != []:
         # "database was already created earlier" /
         # "to load it again, delete the database file"
         print('baza została już wcześniej utworzona')
         print('aby ją wczytać ponownie usun plik bazy serwer/TASS.db')
         return

     # "database empty, loading data"
     print('baza pusta, wczytuje dane')
     self.inicjuj_baze()
     # "data loaded" / "extracting locations"
     print('dane wczytane')
     print('wyciągam lokalizacje')
     self._czysc_lokalizacje()
     self.regexuj_lokalizacje()
     # "database ready"
     print('baza danych gotowa')
示例#19
0
    def add_starred(self, _from1, _to1, functional_object, converters):
        """Expand a starred converter against the known converter targets.

        A `_from1` of None is replaced by `OUT_OF_THE_BOX`. If `_from1`
        contains "*", it is turned into a regex ("*" becomes a
        word-character group) and tried against every target listed in
        `converters`. For each match an edge is added from that target to
        `_to1` with "*" replaced by the captured text, and
        `add_starred_from_converters` is called for the new pair.

        Args:
            _from1: source pattern, or None.
            _to1: target pattern; "*" is replaced by the captured text.
            functional_object: object attached to each edge that is added.
            converters: iterable of (from, to, functional_object) triples.
        """
        # Identity test: None is a singleton, and `==` could be overridden
        # by the operand's own __eq__.
        if _from1 is None:
            _from1 = OUT_OF_THE_BOX

        if "*" in _from1:

            other_things = [(f, t) for f, t, o in converters]
            new_things_regex = Regex("^" + _from1.replace("*", r"(\w+)") + "$")

            for _from2, _to2 in flatten_optional_list_pair(other_things):
                m = new_things_regex.match(_to2)
                if m:
                    new_from = _to1.replace("*", m.group(1))
                    self.add_edge(_to2, new_from, functional_object)
                    self.add_starred_from_converters(_to2, new_from, functional_object, converters)
示例#20
0
    def __init__(self, options={}):
        """\
        Constructor (pre-compile all needed regexes).

        Options read from `options`:
            'moses_deescape': truthy to enable Moses de-escaping.
            'language': required; a missing key raises KeyError.
            'capitalize_sents': truthy to capitalize sentences.
        """
        # process options
        self.moses_deescape = True if options.get('moses_deescape') else False
        self.language = options['language']
        self.capitalize_sents = True if options.get(
            'capitalize_sents') else False
        # compile regexes
        # tokens that take no space after them
        self.__currency_or_init_punct = Regex(r'^[\p{Sc}\(\[\{\¿\¡]+$')
        # tokens that take no space before them
        self.__noprespace_punct = Regex(
            r'^[\/\<\>\,\,\、\。\:\;\.\?\!\:\;\\\%\}\]\)\‰]+$')
        self.__cjk_chars = Regex(r'[\u1100-\u11FF\u2E80-\uA4CF\uA840-\uA87F'
                                 + r'\uAC00-\uD7AF\uF900-\uFAFF\uFE30-\uFE4F'
                                 + r'\uFF65-\uFFDC]')
        self.__final_punct = Regex(r'([\.!?])([\'\"\)\]\p{Pf}\%])*$')

        # language-specific regexes
        self.__fr_prespace_punct = Regex(r'^[\?\!\:\;\\\%]$')
        self.__contract = None

        # digits optionally followed by unit letters or a percent sign
        # (used for Chinese numeric units written without a space)
        self.__nospace_chinese_numberic_unit = Regex(r'\d+[mMgGbB\%]*')
        self.special_chinese_symbol = Regex(r'[\,\%\‰]')

        # Day number followed by an English month name. The month names
        # must be a non-capturing alternation: inside square brackets they
        # form a character class that matches any run of those letters
        # (and "|"), e.g. "5 cherry".
        self.__add_english_date_comma = Regex(
            r'\d+\s+(?:January|February|March|April|May|June|July|August'
            r'|September|October|November|December)')
        # disabled Chinese variant of the no-pre-space pattern:
        #self.__noprespace_punct_chinese = Regex(r'^[\,\。\?\!\\\%\\]\)]+$')
        if self.language in self.CONTRACTIONS:
            self.__contract = Regex(self.CONTRACTIONS[self.language],
                                    IGNORECASE)
示例#21
0
def replacement(line, symbol=".", repl="。"):
    """Replace occurrences of `symbol` in `line` with `repl`.

    Every occurrence of `symbol` is examined together with its neighbouring
    non-space text. An occurrence is skipped when either neighbour equals
    `symbol` (consecutive symbols). If both neighbours satisfy `is_ascii`,
    the line as modified so far is returned immediately. Otherwise that
    occurrence is replaced.
    """
    line = line.replace(". . .", ".")
    # NOTE(review): as written, both arguments are the same character, so
    # this call changes nothing; confirm whether one side was meant to be a
    # different (e.g. full-width) character.
    line = line.replace(u".", ".")

    # The backslash in front of %s escapes `symbol` inside the pattern.
    finder = Regex(r'(\S\s*)\%s(\s*\S*)' % symbol)

    for before, after in finder.findall(line):
        left_char = before.strip()
        right_char = after.strip()

        # consecutive symbols: leave untouched
        if symbol == left_char or symbol == right_char:
            continue

        # both neighbours are ASCII (digits or letters): stop here
        if is_ascii(left_char) and is_ascii(right_char):
            return line

        line = line.replace(before + symbol + after,
                            before + repl + after)
    return line
示例#22
0
 def __init__(self, options={}):
     """\
     Constructor (load the truecaser model if requested and pre-compile
     all needed regexes).

     Options read from `options` (never modified here):
         'moses_deescape': truthy to enable Moses de-escaping.
         'language': language name, default 'english'.
         'is_capitalize': truthy to enable capitalization.
         'is_true_case': truthy to load the truecaser model.
         'models_dir': directory of the model files, default '.'; only
             read when 'is_true_case' is set.

     Raises:
         SystemExit: with code 1 when truecasing is requested and the
             model file "<models_dir>/<language>.obj" does not exist.
     """
     # process options
     self.moses_deescape = True if options.get('moses_deescape') else False
     self.language = options.get('language', 'english')
     self.is_capitalize = True if options.get('is_capitalize') else False
     self.is_true_case = True if options.get('is_true_case') else False

     # If the sentence is to be true-cased, try loading the model
     if self.is_true_case:
         # get the models folder
         self.models_dir = options.get('models_dir', '.')
         # create the model file name
         model_file_name = self.models_dir + "/" + self.language + ".obj"
         # check that the model file exists
         if not os.path.isfile(model_file_name):
             # A single-argument print call prints the same text under
             # Python 2 and Python 3 (the two spaces reproduce the output
             # of the former two-item print statement).
             print("Unable to find the truecaser model for:  %s"
                   % (self.language,))
             # The `exit` builtin is installed by the `site` module for
             # interactive use; raising SystemExit does not depend on it.
             raise SystemExit(1)
         # NOTE: unpickling can execute arbitrary code; only load model
         # files from a trusted location.
         # The context manager closes the file even if a load fails.
         with open(model_file_name, 'rb') as f:
             # The five objects are read in the order they were dumped.
             self.uniDist = cPickle.load(f)
             self.backwardBiDist = cPickle.load(f)
             self.forwardBiDist = cPickle.load(f)
             self.trigramDist = cPickle.load(f)
             self.wordCasingLookup = cPickle.load(f)

     # compile regexes
     self.__currency_or_init_punct = Regex(r'^[\p{Sc}\(\[\{\¿\¡]+$')
     self.__noprespace_punct = Regex(r'^[\,\.\?\!\:\;\\\%\}\]\)]+$')
     self.__cjk_chars = Regex(r'[\u1100-\u11FF\u2E80-\uA4CF\uA840-\uA87F'
                              + r'\uAC00-\uD7AF\uF900-\uFAFF\uFE30-\uFE4F'
                              + r'\uFF65-\uFFDC]')
     self.__final_punct = Regex(r'([\.!?])([\'\"\)\]\p{Pf}\%])*$')
     # language-specific regexes
     self.__fr_prespace_punct = Regex(r'^[\?\!\:\;\\\%]$')
     self.__contract = None
     if self.language in self.CONTRACTIONS:
         self.__contract = Regex(self.CONTRACTIONS[self.language],
                                 IGNORECASE)
示例#23
0
 def __init__(self, options={}):
     """Read the options and pre-compile the detokenizer regexes.

     Options read: 'moses_deescape' (flag), 'language' (default 'en') and
     'capitalize_sents' (flag).
     """
     # options
     self.moses_deescape = bool(options.get('moses_deescape'))
     self.language = options.get('language', 'en')
     self.capitalize_sents = bool(options.get('capitalize_sents'))

     # token consisting only of currency symbols / opening punctuation
     init_punct_pattern = r'^[\p{Sc}\(\[\{\¿\¡]+$'
     self.__currency_or_init_punct = Regex(init_punct_pattern)

     # token consisting only of , . ? ! : ; \ % } ] )
     noprespace_pattern = r'^[\,\.\?\!\:\;\\\%\}\]\)]+$'
     self.__noprespace_punct = Regex(noprespace_pattern)

     # one character from the listed code-point ranges
     cjk_pattern = (r'[\u1100-\u11FF\u2E80-\uA4CF\uA840-\uA87F'
                    + r'\uAC00-\uD7AF\uF900-\uFAFF\uFE30-\uFE4F'
                    + r'\uFF65-\uFFDC]')
     self.__cjk_chars = Regex(cjk_pattern)

     # . ! or ? plus optional closing quotes/brackets at end of text
     final_pattern = r'([\.!?])([\'\"\)\]\p{Pf}\%])*$'
     self.__final_punct = Regex(final_pattern)

     # language-specific regexes
     self.__fr_prespace_punct = Regex(r'^[\?\!\:\;\\\%]$')
     self.__contract = None
     if self.language in self.CONTRACTIONS:
         contraction_pattern = self.CONTRACTIONS[self.language]
         self.__contract = Regex(contraction_pattern, IGNORECASE)
示例#24
0
 def __init__(self):
     """Set the metadata attributes, compile the patterns, reset counters."""
     self.__author__ = "Revo"
     self.__date__ = "2017-12-28"

     # E-mail address: word/dot/dash characters on both sides of "@".
     email_pattern = r'([\w\.-]+@[\w\.-]+)'
     self.__email_addr = Regex(email_pattern)

     # URL, with or without an http(s):// prefix; the whole match is
     # captured in the named group "url".
     url_pattern = r'(?P<url>https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)|[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*))'
     self.__url_addr = Regex(url_pattern)

     # Numbers: optional sign, digits, optional . or , separator.
     number_pattern = r'([+\-]?\d*[\.,]?\d+[\d\.,+\-eE]*)'
     self.__numbers = Regex(number_pattern)

     # Placeholders without an index (__NUM__) and with one (__NUM3__).
     bare_placeholder = r'(__(NUM|EMAIL|URL)__)'
     numbered_placeholder = r'(__(NUM|EMAIL|URL)(\d+)__)'
     self.__addone = Regex(bare_placeholder)
     self.__addone_search = Regex(numbered_placeholder)

     # Any run of whitespace.
     self.__spaces = Regex(r'\s+', flags=UNICODE)

     # Per-kind counters, starting at zero.
     self.__counter = {"URL": 0, "EMAIL": 0}

     self.line = 0
示例#25
0
 def __init__(self, options={}):
     """Read the options and pre-compile the tokenizer regexes.

     Options read: 'lowercase' and 'vw_escape' (both flags).
     """
     # options
     self.lowercase = bool(options.get('lowercase'))
     self.vw_escape = bool(options.get('vw_escape'))

     # whitespace and control characters
     self.__spaces = Regex(r'\s+', flags=UNICODE)
     self.__ascii_junk = Regex(r'[\000-\037]')

     # a run of one repeated character that is not alphanumeric,
     # whitespace, period, comma, minus or hyphen
     special_pattern = r'(([^\p{IsAlnum}\s\.\,−\-])\2*)'
     self.__special_chars = Regex(special_pattern)

     # single quotes: all unicode quotes + prime
     self.__to_single_quotes = Regex(r'[`‛‚‘’‹›′]')
     # double quotes: all unicode chars incl. Chinese + double prime + ditto
     self.__to_double_quotes = Regex(r'(\'\'|``|[«»„‟“”″〃「」『』〝〞〟])')

     # comma/period classified by whether its neighbours are numeric
     self.__no_numbers = Regex(r'([^\p{N}])([,.])([^\p{N}])')
     self.__pre_numbers = Regex(r'([^\p{N}])([,.])([\p{N}])')
     self.__post_numbers = Regex(r'([\p{N}])([,.])([^\p{N}])')

     # hyphen: separate every time but for unary minus
     self.__minus = Regex(r'([-−])')
     self.__pre_notnum = Regex(r'(-)([^\p{N}])')
     self.__post_num_or_nospace = Regex(r'(\p{N} *|[^ ])(-)')
示例#26
0
 def test_regex(self):
     """The start symbol for "java" must render as the text /\\*."""
     symbol = Regex().start("java")
     # Raw literal: "\*" is an invalid escape sequence in a normal string.
     # The value is unchanged (slash, backslash, asterisk).
     self.assertEqual(r"/\*", str(symbol))
示例#27
0
    def __init__(self, options={}):
        """Read the options and pre-compile the tokenizer regexes.

        Options read: 'lowercase' and 'moses_escape' (flags) and 'num_t'
        (stored in `self.ts`; 1 is used when it is missing or falsy).
        """
        # options
        self.lowercase = bool(options.get('lowercase'))
        self.moses_escape = bool(options.get('moses_escape'))
        self.ts = options.get('num_t') or 1

        # whitespace and control characters
        self.__spaces = Regex(r'\s+', flags=UNICODE)
        self.__ascii_junk = Regex(r'[\000-\037]')

        # a run of one repeated character that is not alphanumeric,
        # whitespace, period, comma, minus or hyphen
        special_pattern = r'(([^\p{IsAlnum}\s\.\,−\-])\2*)'
        self.__special_chars = Regex(special_pattern)

        # e-mail address: word/dot/dash characters on both sides of "@"
        email_pattern = r'([\w\.-]+@[\w\.-]+)'
        self.__email_addr = Regex(email_pattern)

        # URL: http(s)://..., www.<word>.<rest>, or <word>.<word>; the
        # whole match is captured in the named group "url"
        url_pattern = r'(?P<url>https?://[a-zA-Z0-9:/\.?=!@$#&\*_()]+|www\.\w+\.[a-zA-Z0-9:/\.?=!@$#&\*_()]+|\w+\.\w+)'
        self.__url_addr = Regex(url_pattern)
        # Original author's note: e-mail addresses need to be protected,
        # i.e. extracted first and inserted back afterwards.

        # single quotes: all unicode quotes + prime
        self.__to_single_quotes = Regex(r'[`‛‚‘’‹›′]')
        # double quotes: all unicode chars incl. Chinese + double prime + ditto
        self.__to_double_quotes = Regex(r'(\'\'|``|[«»„‟“”″〃「」『』〝〞〟])')

        # comma/period classified by whether its neighbours are numeric
        self.__no_numbers = Regex(r'([^\p{N}])([,.])([^\p{N}])')
        self.__pre_numbers = Regex(r'([^\p{N}])([,.])([\p{N}])')
        self.__post_numbers = Regex(r'([\p{N}])([,.])([^\p{N}])')

        # hyphen: separate every time but for unary minus
        self.__minus = Regex(r'([-−])')
        self.__pre_notnum = Regex(r'(-)([^\p{N}])')
        self.__post_num_or_nospace = Regex(r'(\p{N} *|[^ ])(-)')
示例#28
0
def detokenize_line(line):
    """
    Detokenize the given text.
    adapted from:
    https://github.com/ufal/mtmonkey/blob/master/worker/src/util/detokenize.py

    The line is split on single spaces and glued back together, leaving
    out the spaces that do not belong around punctuation, quotes and
    contractions. The first normal word, and the first normal word after
    the first sentence-final token, are capitalized; a lone "i" is
    upper-cased. The result is cut off right after the first token that
    matches FINAL_PUNCT.

    NOTE(review): when no token matches FINAL_PUNCT the cut-off position
    stays 0 and an empty string is returned; confirm that this is
    intended.

    Parameters:
    line: the line of text to detokenize

    Returns:
    str: the detokenized text
    """
    # Build each pattern once instead of on every loop iteration.
    currency_or_init_punct = Regex(CURRENCY_OR_INIT_PUNCT)
    noprespace_punct = Regex(NOPRESPACE_PUNCT)
    contractions = Regex(CONTRACTIONS)
    final_punct = Regex(FINAL_PUNCT)

    # split text
    words = line.split(' ')
    # paste text back, omitting spaces where needed
    text = ''
    pre_spc = ' '
    quote_count = {'\'': 0, '"': 0, '`': 0}
    capitalize_next = True
    text_len_last_final_punct = 0
    for pos, word in enumerate(words):
        # Repeated spaces produce empty tokens. An empty string is a
        # substring of every string, so it would pass the `word in '...'`
        # tests below, be counted as a quotation mark and flip the
        # opening/closing parity for the rest of the line.
        if not word:
            continue
        # no space after currency and initial punctuation
        if currency_or_init_punct.match(word):
            text += pre_spc + word
            pre_spc = ''
        # no space before commas etc. (exclude some punctuation for French)
        elif noprespace_punct.match(word):
            text += word
            pre_spc = ' '
        # contractions with comma or hyphen
        elif word in "'-–" and pos > 0 and pos < len(words) - 1 \
                and contractions.match(''.join(words[pos - 1:pos + 2])):
            text += word
            pre_spc = ''
        # handle quoting
        elif word in '\'"„“”‚‘’`':
            # detect opening and closing quotes by counting
            # the appropriate quote types
            quote_type = word
            if quote_type in '„“”':
                quote_type = '"'
            elif quote_type in '‚‘’':
                quote_type = '\''
            # special case: possessives in English ("Jones'" etc.)
            if text.endswith('s'):
                text += word
                pre_spc = ' '
            # really a quotation mark
            else:
                # opening quote
                if quote_count[quote_type] % 2 == 0:
                    text += pre_spc + word
                    pre_spc = ''
                # closing quote
                else:
                    text += word
                    pre_spc = ' '
                quote_count[quote_type] += 1
        # contractions where comma or hyphen is already joined to following letters
        elif word[0] in "'-–" and pos > 0 and pos < len(words) - 1 \
                and contractions.match(''.join(words[pos - 1:pos + 1])):
            text += word
            pre_spc = ' '
        elif word == "n't":
            text += word
            pre_spc = ' '
        # keep spaces around normal words
        else:
            if capitalize_next:
                capitalize_next = False
                word = word[0].upper() + word[1:]
            if word == 'i':
                word = word.upper()
            text += pre_spc + word
            pre_spc = ' '
        # remember where the first sentence-final token ends
        if final_punct.match(word) and (text_len_last_final_punct == 0):
            capitalize_next = True
            text_len_last_final_punct = len(text)
    # The recorded length refers to the unstripped text (which normally
    # starts with a space), so cut first and strip afterwards; stripping
    # first would shift the cut one character to the right.
    return text[:text_len_last_final_punct].strip()
示例#29
0
# -*- encoding=utf-8 -*-
import codecs
import sys
from regex import Regex

# Filter and clean the UTF-8 text file named by the first command-line
# argument, printing every line that is kept.
end_symbols = ["。", "?"]
quotes_symbols = ["“", "”"]
# leading run of dashes / ">" characters
hyps_re = Regex(r'^[—>]+')

num = 0  # number of lines printed
# The context manager closes the file when the loop ends or fails; the
# handle is not called `input` so the builtin is not shadowed.
with codecs.open(sys.argv[1], 'r', encoding='utf-8') as source:
    for line in source:
        line = line.strip()
        # too short to keep
        if len(line) < 3:
            continue
        # drop lines whose quotes do not match, unless the line starts
        # with an opening quote
        last_quote_count = line.count(quotes_symbols[1])
        if line.count(quotes_symbols[0]) != last_quote_count and line[0] != quotes_symbols[0]:
            continue
        # leading opening quote with no closing quote at all: remove it
        if line[0] == quotes_symbols[0] and last_quote_count == 0:
            line = line[1:]
        # remove the leading dash / ">" run
        line = hyps_re.sub('', line)
        # remove a hard-coded term
        line = line.replace("(来源: )", "")
        # final strip
        line = line.strip()
        print(line)
        num += 1
示例#30
0
 def __init__(self, content, extension):
     """Store `content` and look up the symbol for `extension`.

     The symbol is whatever `Regex().get(extension)` returns.
     """
     self.content = content
     lookup = Regex()
     self.symbol = lookup.get(extension)