Example #1
    def __init__(self, pattern, gaps=False, discard_empty=True,
                 flags=re.UNICODE | re.MULTILINE | re.DOTALL):
        """
        Construct a new tokenizer that splits strings using the given
        regular expression C{pattern}.  By default, C{pattern} will be
        used to find tokens; but if C{gaps} is set to C{True}, then
        C{pattern} will be used to find separators between tokens
        instead.

        @type pattern: C{str}
        @param pattern: The pattern used to build this tokenizer.
            This pattern may safely contain grouping parentheses.
        @type gaps: C{bool}
        @param gaps: True if this tokenizer's pattern should be used
            to find separators between tokens; False if this
            tokenizer's pattern should be used to find the tokens
            themselves.
        @type discard_empty: C{bool}
        @param discard_empty: True if any empty tokens (C{''})
            generated by the tokenizer should be discarded.  Empty
            tokens can only be generated if L{_gaps} is true.
        @type flags: C{int}
        @param flags: The regexp flags used to compile this
            tokenizer's pattern.  By default, the following flags are
            used: C{re.UNICODE | re.MULTILINE | re.DOTALL}.
        """
        # If they gave us a regexp object, extract the pattern.
        pattern = getattr(pattern, 'pattern', pattern)
        
        self._pattern = pattern
        """The pattern used to build this tokenizer."""
        
        self._gaps = gaps
        """True if this tokenizer's pattern should be used to find
        separators between tokens; False if this tokenizer's pattern
        should be used to find the tokens themselves."""

        self._discard_empty = discard_empty
        """True if any empty tokens (C{''}) generated by the tokenizer
        should be discarded.  Empty tokens can only be generated if
        L{_gaps} is true."""

        self._flags = flags
        """The flags used to compile this tokenizer's pattern."""
        
        self._regexp = None
        """The compiled regular expression used to tokenize texts."""
        
        # Remove grouping parentheses -- if the regexp contains any
        # grouping parentheses, then the behavior of re.findall and
        # re.split will change.
        nongrouping_pattern = convert_regexp_to_nongrouping(pattern)

        try: 
            self._regexp = re.compile(nongrouping_pattern, flags)
        except re.error as e:
            raise ValueError('Error in regular expression %r: %s' %
                             (pattern, e))
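A hedged illustration of the two modes the docstring describes, using plain re calls (no tokenizer object is assumed, since this snippet shows only the constructor):

import re

text = ',one,,two'

# gaps=False: the pattern matches the tokens themselves.
print(re.findall(r'\w+', text))        # ['one', 'two']

# gaps=True: the pattern matches the separators between tokens;
# splitting can yield empty tokens, which discard_empty=True drops.
raw = re.split(r',', text)
print(raw)                             # ['', 'one', '', 'two']
print([tok for tok in raw if tok])     # ['one', 'two']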
Example #2
    def __init__(self, pattern, gaps=False, flags=regex.UNICODE | regex.MULTILINE | regex.DOTALL):
        # If they gave us a regexp object, extract the pattern.
        pattern = getattr(pattern, 'pattern', pattern)

        self._pattern = pattern
        self._gaps = gaps
        self._flags = flags
        self._regexp = None

        # Remove grouping parentheses -- if the regexp contains any
        # grouping parentheses, then the behavior of re.findall and
        # re.split will change.
        nongrouping_pattern = convert_regexp_to_nongrouping(pattern)

        try:
            self._regexp = regex.compile(nongrouping_pattern, flags)
        except regex.error as e:
            raise ValueError('Error in regular expression {}: {}'.format(pattern, e))
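Why the nongrouping conversion matters, in a minimal sketch: with a capturing group in the pattern, re.findall returns the group's matches rather than the whole tokens, which is why the constructor rewrites (...) as (?:...) before compiling. This demonstration uses the stdlib re module directly, not the nltk.internals helper:

import re

text = 'ha haha hahaha'

# Capturing group: findall returns the last repetition of the group.
print(re.findall(r'(ha)+', text))      # ['ha', 'ha', 'ha']

# Non-capturing group: findall returns the full matches (the tokens).
print(re.findall(r'(?:ha)+', text))    # ['ha', 'haha', 'hahaha']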
Example #3
    def __init__(self, pattern, gaps=False, discard_empty=True, flags=re.UNICODE | re.MULTILINE | re.DOTALL):
        # If they gave us a regexp object, extract the pattern.
        pattern = getattr(pattern, "pattern", pattern)

        self._pattern = pattern
        self._gaps = gaps
        self._discard_empty = discard_empty
        self._flags = flags
        self._regexp = None

        # Remove grouping parentheses -- if the regexp contains any
        # grouping parentheses, then the behavior of re.findall and
        # re.split will change.
        nongrouping_pattern = convert_regexp_to_nongrouping(pattern)

        try:
            self._regexp = re.compile(nongrouping_pattern, flags)
        except re.error as e:
            raise ValueError("Error in regular expression %r: %s" % (pattern, e))
Example #4
    def __init__(self, pattern, gaps=False, discard_empty=True,
                 flags=re.UNICODE | re.MULTILINE | re.DOTALL):
        # If they gave us a regexp object, extract the pattern.
        pattern = getattr(pattern, 'pattern', pattern)

        self._pattern = pattern
        self._gaps = gaps
        self._discard_empty = discard_empty
        self._flags = flags
        self._regexp = None

        # Remove grouping parentheses -- if the regexp contains any
        # grouping parentheses, then the behavior of re.findall and
        # re.split will change.
        nongrouping_pattern = convert_regexp_to_nongrouping(pattern)

        try:
            self._regexp = re.compile(nongrouping_pattern, flags)
        except re.error as e:
            raise ValueError('Error in regular expression %r: %s' %
                             (pattern, e))
Example #5
CLEANUP_SUBS = list((re.compile(regexp), repl)
                    for regexp, repl in CLEANUP_SUBS)

TOKENIZER_PATTERNS = \
    r'''
    "[^"]+" %(KWS_OPERATORS)s %(WORD)s | # word in brackets plus operators
    "[^"]+"  |  # word in brackets

    '[^']+' %(KWS_OPERATORS)s %(WORD)s | # word in brackets plus operators
    '[^']+'  |  # word in brackets

    %(WORD)s %(KWS_OPERATORS)s %(WORD)s | # word op word
    %(WORD)s | # word
    \S+" # any other non-whitespace sequence
    ''' % locals()
TOKENIZER_PATTERNS = convert_regexp_to_nongrouping(TOKENIZER_PATTERNS)
TOKENIZER_PATTERNS = re.compile(TOKENIZER_PATTERNS, re.VERBOSE)
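
# Illustration only (hedged): WORD and KWS_OPERATORS come from locals() and
# are defined elsewhere in this module.  Assuming, say, WORD = r'[\w.]+' and
# KWS_OPERATORS = r'(?:<=|>=|=|<|>)', a query such as
#     '"number of events">=33 dataset'
# would tokenize as ['"number of events">=33', 'dataset']: the quoted phrase,
# operator, and value form a single token instead of splitting on whitespace.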


def cleanup_query(query):
    """
    Return the cleaned query by applying a number of transformation
    patterns that remove spaces and simplify the conditions

    >>> cleanup_query('number of events = 33')
    'number of events=33'

    >>> cleanup_query('number of events >    33')
    'number of events>33'

    >>> cleanup_query('more than 33 events')
Example #6
db = client.twitter_database
db_labeled_tweets = db.labeled_tweets

twts = db_labeled_tweets.find({u'text': re.compile(r'\d+')})

for twt in twts:
    print(twt[u'text'])
    print(decode_html_entities(twt[u'text']))
    print()

print(decode_html_entities('⾳'))
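# decode_html_entities is this script's own helper (its import is not shown).
# Hedged note: in Python 3 the standard library offers an equivalent:
#     from html import unescape
#     unescape('&hearts;')  # -> '♥'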

exit(0)


pattern_re = re.compile(convert_regexp_to_nongrouping(pattern), re.IGNORECASE | re.VERBOSE)

print(test)
print(pattern_re.findall(test))
print(pattern_re.sub('{URL}', test))

exit(0)

import twitter_text

extractor = twitter_text.extractor.Extractor(test)

print(extractor.extract_urls())

class Replacer(object):
Example #7
    def __init__(self, pos_tagging=False):
        self.pos_tagging = pos_tagging
        pattern = r'|'.join(REGEXES[k]['regex'].pattern for k in ('url', 'emoticons', 'mention', 'hashtag', 'words'))
        nongrouping_pattern = convert_regexp_to_nongrouping(pattern)
        self._regexp = re.compile(nongrouping_pattern, flags=re.UNICODE | re.MULTILINE | re.VERBOSE | re.IGNORECASE)
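A self-contained sketch of the same idiom with made-up sub-patterns (this REGEXES table is hypothetical, not the one referenced above; the sub-patterns here contain no capturing groups, so the nongrouping conversion would be a no-op):

import re

REGEXES = {
    'hashtag': {'regex': re.compile(r'#\w+')},
    'mention': {'regex': re.compile(r'@\w+')},
    'words':   {'regex': re.compile(r"\w+(?:['-]\w+)*")},
}

pattern = r'|'.join(REGEXES[k]['regex'].pattern for k in ('hashtag', 'mention', 'words'))
tokenizer = re.compile(pattern, flags=re.UNICODE | re.IGNORECASE)

print(tokenizer.findall('@louistiao says #yolo to you'))
# ['@louistiao', 'says', '#yolo', 'to', 'you']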
Example #8
    
    from pymongo import MongoClient
    client = MongoClient()
    db = client.twitter_database
    db_tweets = db.tweets

    twokenizer = TwitterTokenizer()

    print(twokenizer('#yolo this is a @louistiao SOMETHING test :) text http://example.com/test/foo_123.jpg'))
    print(twokenizer('big url: http://blah.com:8080/path/to/here?p=1&q=abc,def#posn2 #ahashtag http://t.co/FNkPfmii-'))

    from nltk.internals import convert_regexp_to_nongrouping

    print(REGEXEN['valid_url'].pattern.encode('utf-8'))

    print(re.compile(convert_regexp_to_nongrouping(REGEXEN['valid_url'].pattern)).findall('big http://blah.com:8080/path/to/here?p=1&q=abc,def#posn2 #ahashtag http://t.co/FNkPfmii-'))

    print(convert_regexp_to_nongrouping(REGEXEN['valid_url'].pattern).encode('utf-8'))

    print(REGEXEN['valid_tco_url'].pattern.encode('utf-8'))

    exit(0)
    for tweet in db_tweets.find(
            {
                u'text': 
                {
                    '$exists': True,
                    # '$regex': ':\)'
                }
            }
        ):