def __init__(self, pattern, gaps=False, discard_empty=True,
             flags=re.UNICODE | re.MULTILINE | re.DOTALL):
    """
    Construct a new tokenizer that splits strings using the given
    regular expression C{pattern}.  By default, C{pattern} will be
    used to find tokens; but if C{gaps} is set to C{True}, then
    C{pattern} will be used to find separators between tokens
    instead.

    @type pattern: C{str}
    @param pattern: The pattern used to build this tokenizer.  This
        pattern may safely contain grouping parentheses.
    @type gaps: C{bool}
    @param gaps: True if this tokenizer's pattern should be used to
        find separators between tokens; False if this tokenizer's
        pattern should be used to find the tokens themselves.
    @type discard_empty: C{bool}
    @param discard_empty: True if any empty tokens (C{''}) generated
        by the tokenizer should be discarded.  Empty tokens can only
        be generated if L{_gaps} is true.
    @type flags: C{int}
    @param flags: The regexp flags used to compile this tokenizer's
        pattern.  By default, the following flags are used:
        C{re.UNICODE | re.MULTILINE | re.DOTALL}.
    @raise ValueError: If C{pattern} is not a valid regular
        expression.
    """
    # If they gave us a regexp object, extract the pattern.
    pattern = getattr(pattern, 'pattern', pattern)

    # The pattern used to build this tokenizer.
    self._pattern = pattern
    # True if this tokenizer's pattern should be used to find
    # separators between tokens; False if it should be used to find
    # the tokens themselves.
    self._gaps = gaps
    # True if any empty tokens ('') generated by the tokenizer should
    # be discarded.  Empty tokens can only be generated if _gaps is
    # true.
    self._discard_empty = discard_empty
    # The flags used to compile this tokenizer's pattern.
    self._flags = flags
    # The compiled regular expression used to tokenize texts.
    self._regexp = None

    # Remove grouping parentheses -- if the regexp contains any
    # grouping parentheses, then the behavior of re.findall and
    # re.split will change.
    nongrouping_pattern = convert_regexp_to_nongrouping(pattern)
    try:
        self._regexp = re.compile(nongrouping_pattern, flags)
    except re.error as e:
        # 'except X as e' replaces the Python-2-only 'except X, e'
        # form; the 'as' spelling is valid on Python 2.6+ and 3.x.
        raise ValueError('Error in regular expression %r: %s'
                         % (pattern, e))
def __init__(self, pattern, gaps=False,
             flags=regex.UNICODE | regex.MULTILINE | regex.DOTALL):
    """
    Construct a new tokenizer that splits strings using the given
    regular expression ``pattern``.

    :param pattern: The pattern used to build this tokenizer, or a
        compiled regexp object (its ``pattern`` attribute is used).
        The pattern may safely contain grouping parentheses.
    :param gaps: True if this tokenizer's pattern should be used to
        find separators between tokens; False if it should be used
        to find the tokens themselves.
    :param flags: The flags used to compile this tokenizer's pattern.
        Defaults to ``regex.UNICODE | regex.MULTILINE | regex.DOTALL``.
    :raises ValueError: If ``pattern`` is not a valid regular
        expression.
    """
    # If they gave us a regexp object, extract the pattern.
    pattern = getattr(pattern, 'pattern', pattern)

    self._pattern = pattern  # pattern used to build this tokenizer
    self._gaps = gaps        # pattern matches separators, not tokens?
    self._flags = flags      # flags used to compile the pattern
    self._regexp = None      # the compiled regular expression

    # Remove grouping parentheses -- if the regexp contains any
    # grouping parentheses, then the behavior of re.findall and
    # re.split will change.
    nongrouping_pattern = convert_regexp_to_nongrouping(pattern)
    try:
        self._regexp = regex.compile(nongrouping_pattern, flags)
    except regex.error as e:
        # Python-2-only 'except regex.error, e' replaced with the
        # 'as' form (valid on Python 2.6+ and Python 3).
        raise ValueError('Error in regular expression {}: {}'.format(pattern, e))
def __init__(self, pattern, gaps=False, discard_empty=True, flags=re.UNICODE | re.MULTILINE | re.DOTALL):
    """Build a tokenizer around ``pattern``.

    ``pattern`` may be either a regexp string or a compiled regexp
    object (its ``pattern`` attribute is used).  Before compilation
    the pattern is rewritten to contain no grouping parentheses,
    because grouping parentheses change the behavior of
    ``re.findall`` and ``re.split``.

    Raises ``ValueError`` when ``pattern`` is not a valid regular
    expression.
    """
    # Accept a pre-compiled regexp object as well as a plain string.
    pattern = getattr(pattern, "pattern", pattern)

    self._pattern = pattern
    self._gaps = gaps
    self._discard_empty = discard_empty
    self._flags = flags
    self._regexp = None

    # Grouping parentheses alter re.findall/re.split semantics, so
    # strip them before compiling.
    stripped = convert_regexp_to_nongrouping(pattern)
    try:
        compiled = re.compile(stripped, flags)
    except re.error as e:
        raise ValueError("Error in regular expression %r: %s" % (pattern, e))
    self._regexp = compiled
def __init__(self, pattern, gaps=False, discard_empty=True, flags=re.UNICODE | re.MULTILINE | re.DOTALL):
    """Initialize the tokenizer from a regexp ``pattern``.

    A compiled regexp object is also accepted; its ``pattern``
    attribute is extracted.  Grouping parentheses are removed from
    the pattern before compilation, since they change what
    ``re.findall`` and ``re.split`` return.  A ``ValueError`` is
    raised when the resulting pattern does not compile.
    """
    # Normalise: a compiled regexp carries its source in .pattern.
    if hasattr(pattern, 'pattern'):
        pattern = pattern.pattern

    self._pattern, self._gaps = pattern, gaps
    self._discard_empty, self._flags = discard_empty, flags
    self._regexp = None

    # Produce a grouping-free equivalent of the pattern, then compile
    # it with the requested flags.
    cleaned = convert_regexp_to_nongrouping(pattern)
    try:
        self._regexp = re.compile(cleaned, flags)
    except re.error as e:
        raise ValueError('Error in regular expression %r: %s' % (pattern, e))
# -- Ad-hoc smoke test of TwitterTokenizer and the url regexps.
# Python 2 script code: prints results to stdout, then exits.
twokenizer = TwitterTokenizer()
print twokenizer(
    '#yolo this is a @louistiao SOMETHING test :) text http://example.com/test/foo_123.jpg'
)
print twokenizer(
    'big url: http://blah.com:8080/path/to/here?p=1&q=abc,def#posn2 #ahashtag http://t.co/FNkPfmii-'
)

from nltk.internals import convert_regexp_to_nongrouping

# Show the raw url pattern, what the grouping-free version of it
# matches, and the grouping-free pattern text itself.
print REGEXEN['valid_url'].pattern.encode('utf-8')
print re.compile(
    convert_regexp_to_nongrouping(REGEXEN['valid_url'].pattern)
).findall(
    'big http://blah.com:8080/path/to/here?p=1&q=abc,def#posn2 #ahashtag http://t.co/FNkPfmii-'
)
print convert_regexp_to_nongrouping(
    REGEXEN['valid_url'].pattern).encode('utf-8')
print REGEXEN['valid_tco_url'].pattern.encode('utf-8')
exit(0)

# NOTE(review): everything below exit(0) is dead code; the query and
# the rest of the for-statement are truncated in this chunk and not
# visible here.
for tweet in db_tweets.find({
    u'text': {
        '$exists': True,
        # '$regex': ':\)' }
# Pre-compile the cleanup substitutions once at import time.
# CLEANUP_SUBS is assumed to be defined above as
# (regexp-string, replacement) pairs -- TODO confirm against the
# earlier part of the file.
CLEANUP_SUBS = list((re.compile(regexp), repl)
                    for regexp, repl in CLEANUP_SUBS)

# Verbose regexp used to split a query string into tokens.
# %(KWS_OPERATORS)s and %(WORD)s are interpolated from names in the
# enclosing scope via `% locals()`.
# NOTE(review): the trailing '"' after \S+ in the last alternative
# looks suspicious -- confirm it is intentional.
TOKENIZER_PATTERNS = \
    r'''
    "[^"]+" %(KWS_OPERATORS)s %(WORD)s  | # word in brackets plus operators
    "[^"]+"                             | # word in brackets
    '[^']+' %(KWS_OPERATORS)s %(WORD)s  | # word in brackets plus operators
    '[^']+'                             | # word in brackets
    %(WORD)s %(KWS_OPERATORS)s %(WORD)s | # word op word
    %(WORD)s                            | # word
    \S+"                                  # any other non-whitespace sequence
    ''' % locals()
# Strip grouping parentheses (they change re.findall/re.split
# behavior), then compile in VERBOSE mode so the whitespace and
# comments inside the pattern above are ignored.
TOKENIZER_PATTERNS = convert_regexp_to_nongrouping(TOKENIZER_PATTERNS)
TOKENIZER_PATTERNS = re.compile(TOKENIZER_PATTERNS, re.VERBOSE)

# NOTE(review): cleanup_query is truncated here; its docstring and
# body continue beyond this chunk.
def cleanup_query(query):
    """
    Returns cleaned query by applying a number of transformation
    patterns that removes spaces and simplifies the conditions

    >>> cleanup_query('number of events = 33')
    'number of events=33'
    >>> cleanup_query('number of events > 33')
    'number of events>33'
    >>> cleanup_query('more than 33 events')
# -- Python 2 script code: inspect labeled tweets containing digits
# and demo html-entity decoding / url extraction.
db = client.twitter_database
db_labeled_tweets = db.labeled_tweets

# All labeled tweets whose text contains at least one digit.
twts = db_labeled_tweets.find({u'text': re.compile(r'\d+')})
for twt in twts:
    print twt[u'text']
    print decode_html_entities(twt[u'text'])
    print
print decode_html_entities('⾳')
exit(0)

# NOTE(review): everything below exit(0) is dead experimentation
# code; `pattern` and `test` are presumably defined earlier in the
# file -- confirm.
pattern_re = re.compile(convert_regexp_to_nongrouping(pattern),
                        re.IGNORECASE | re.VERBOSE)
print test
print pattern_re.findall(test)
print pattern_re.sub('{URL}', test)
exit(0)

import twitter_text
extractor = twitter_text.extractor.Extractor(test)
print extractor.extract_urls()

class Replacer(object):
def __init__(self, pos_tagging=False): self.pos_tagging = pos_tagging pattern = ur'|'.join(REGEXES[k]['regex'].pattern for k in ('url', 'emoticons', 'mention', 'hashtag', 'words')) nongrouping_pattern = convert_regexp_to_nongrouping(pattern) self._regexp = re.compile(nongrouping_pattern, flags=re.UNICODE | re.MULTILINE | re.VERBOSE | re.IGNORECASE)
# Pre-compile the cleanup substitutions once at import time.
# CLEANUP_SUBS is assumed to be defined above as
# (regexp-string, replacement) pairs -- TODO confirm against the
# earlier part of the file.
CLEANUP_SUBS = list(
    (re.compile(regexp), repl) for regexp, repl in CLEANUP_SUBS)

# Verbose regexp used to split a query string into tokens.
# %(KWS_OPERATORS)s and %(WORD)s are interpolated from names in the
# enclosing scope via `% locals()`.
# NOTE(review): the trailing '"' after \S+ in the last alternative
# looks suspicious -- confirm it is intentional.
TOKENIZER_PATTERNS = \
    r'''
    "[^"]+" %(KWS_OPERATORS)s %(WORD)s  | # word in brackets plus operators
    "[^"]+"                             | # word in brackets
    '[^']+' %(KWS_OPERATORS)s %(WORD)s  | # word in brackets plus operators
    '[^']+'                             | # word in brackets
    %(WORD)s %(KWS_OPERATORS)s %(WORD)s | # word op word
    %(WORD)s                            | # word
    \S+"                                  # any other non-whitespace sequence
    ''' % locals()
# Strip grouping parentheses (they change re.findall/re.split
# behavior), then compile in VERBOSE mode so the whitespace and
# comments inside the pattern above are ignored.
TOKENIZER_PATTERNS = convert_regexp_to_nongrouping(TOKENIZER_PATTERNS)
TOKENIZER_PATTERNS = re.compile(TOKENIZER_PATTERNS, re.VERBOSE)

# NOTE(review): cleanup_query is truncated here; its docstring and
# body continue beyond this chunk.
def cleanup_query(query):
    """
    Returns cleaned query by applying a number of transformation
    patterns that removes spaces and simplifies the conditions

    >>> cleanup_query('number of events = 33')
    'number of events=33'
    >>> cleanup_query('number of events > 33')
    'number of events>33'
    >>> cleanup_query('more than 33 events')
from pymongo import MongoClient

# -- Python 2 script code: connect to the local twitter database and
# smoke-test TwitterTokenizer / the url regexps.
client = MongoClient()
db = client.twitter_database
db_tweets = db.tweets

twokenizer = TwitterTokenizer()
print twokenizer('#yolo this is a @louistiao SOMETHING test :) text http://example.com/test/foo_123.jpg')
print twokenizer('big url: http://blah.com:8080/path/to/here?p=1&q=abc,def#posn2 #ahashtag http://t.co/FNkPfmii-')

from nltk.internals import convert_regexp_to_nongrouping

# Show the raw url pattern, what the grouping-free version of it
# matches, and the grouping-free pattern text itself.
print REGEXEN['valid_url'].pattern.encode('utf-8')
print re.compile(convert_regexp_to_nongrouping(REGEXEN['valid_url'].pattern)).findall('big http://blah.com:8080/path/to/here?p=1&q=abc,def#posn2 #ahashtag http://t.co/FNkPfmii-')
print convert_regexp_to_nongrouping(REGEXEN['valid_url'].pattern).encode('utf-8')
print REGEXEN['valid_tco_url'].pattern.encode('utf-8')
exit(0)

# NOTE(review): dead code after exit(0); the body of this loop is
# truncated in this chunk and not visible here.
for tweet in db_tweets.find(
    {
        u'text': {
            '$exists': True,
            # '$regex': ':\)'
        }
    }
):