Example #1
    def __init__(self, num_tweets=False, debug=False, time=False):

        self.collection = db.twictionary_models_tweets
        self.tres = db.twictionary_models_tres

        opts = dict(fields=dict(date=0, tid=0, user=0), timeout=False)
        if num_tweets:
            opts['limit'] = num_tweets

        if debug or time:
            opts['limit'] = 5000

        self.tweets = self.collection.find(**opts)
        self.persister = Persister(debug=debug)
        self.debug = debug
        self.time = time
        self.tokenizer = TwitterTokenizer()

        self.run()
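
The fields= and timeout= keyword arguments in this constructor belong to the pymongo 2.x find() API; pymongo 3.x renamed them to projection= and no_cursor_timeout=. A minimal sketch of the same query under pymongo 3.x (the database name is an assumption, since the excerpt never shows how db is created):

# Sketch only, not from the source: pymongo 3.x spelling of the find() call above.
from pymongo import MongoClient

db = MongoClient().twictionary                       # assumed database name
opts = dict(projection=dict(date=0, tid=0, user=0),  # fields=        -> projection=
            no_cursor_timeout=True)                  # timeout=False  -> no_cursor_timeout=True
opts['limit'] = 5000                                 # same cap the code uses for debug/timing runs
tweets = db.twictionary_models_tweets.find(**opts)
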
Example #2
class Pairer(object):

    def __init__(self, num_tweets=False, debug=False, time=False):

        self.collection = db.twictionary_models_tweets
        self.tres = db.twictionary_models_tres

        opts = dict(fields=dict(date=0, tid=0, user=0), timeout=False)
        if num_tweets:
            opts['limit'] = num_tweets

        if debug or time:
            opts['limit'] = 5000

        self.tweets = self.collection.find(**opts)
        self.persister = Persister(debug=debug)
        self.debug = debug
        self.time = time
        self.tokenizer = TwitterTokenizer()

        self.run()

    def run(self):
        total = 0
        start_time = time.time()
        end_time = time.time()
        if self.time:
            start_time = time.time()

        for t in self.tweets:
            if total % 100000 == 0:
                end_time = time.time()
                print "processed %s documents!" % total
                print "these documents took %s seconds to process" % (end_time - start_time)
                start_time = time.time()

            self.pair(t)
            total += 1

        if self.time:
            end_time = time.time()
            print "Run time: %s" % (end_time - start_time)
            self.tres.remove({})

    def pair(self, tweet):
        text = tweet['text']
        id = tweet['_id']
        tokenized = self.tokenizer.tokenize(text)
        length = len(tokenized)
        for i in range(length):
            token = tokenized[i]
            if not self.tokenizer.is_bad_token(token):
                previous = None
                next = None
                if i != 0:
                    previous = tokenized[i-1]
                    if self.tokenizer.is_bad_token(previous):
                        previous = None

                if i < (length - 1):
                    next = tokenized[i+1]
                    if self.tokenizer.is_bad_token(next):
                        next = None

                self.persister.add(main=token, previous=previous, next=next, id=id)
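
pair() walks every token of a tweet and stores it together with its immediate neighbours, recording None for a neighbour that is missing or that the tokenizer flags as bad. A self-contained sketch of that pairing logic, with a stand-in is_bad_token in place of TwitterTokenizer's real check:

# Illustration only: the token list and is_bad_token below are stand-ins.
def is_bad_token(token):
    # assumption: tokens with no alphanumeric characters count as "bad"
    return not any(ch.isalnum() for ch in token)

def pair_tokens(tokens):
    pairs = []
    for i, token in enumerate(tokens):
        if is_bad_token(token):
            continue
        previous = tokens[i - 1] if i > 0 else None
        nxt = tokens[i + 1] if i < len(tokens) - 1 else None
        # as in Pairer.pair(), a bad neighbour is recorded as None
        if previous is not None and is_bad_token(previous):
            previous = None
        if nxt is not None and is_bad_token(nxt):
            nxt = None
        pairs.append((previous, token, nxt))
    return pairs

print(pair_tokens(['good', 'morning', '!', 'twitter']))
# [(None, 'good', 'morning'), ('good', 'morning', None), (None, 'twitter', None)]
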
Example #3
class Pairer(object):
    def __init__(self, num_tweets=False, debug=False, time=False):

        self.collection = db.twictionary_models_tweets
        self.tres = db.twictionary_models_tres

        opts = dict(fields=dict(date=0, tid=0, user=0), timeout=False)
        if num_tweets:
            opts['limit'] = num_tweets

        if debug or time:
            opts['limit'] = 5000

        self.tweets = self.collection.find(**opts)
        self.persister = Persister(debug=debug)
        self.debug = debug
        self.time = time
        self.tokenizer = TwitterTokenizer()

        self.run()

    def run(self):
        total = 0
        start_time = time.time()
        end_time = time.time()
        if self.time:
            start_time = time.time()

        for t in self.tweets:
            if total % 100000 == 0:
                end_time = time.time()
                print "processed %s documents!" % total
                print "these documents took %s seconds to process" % (
                    end_time - start_time)
                start_time = time.time()

            self.pair(t)
            total += 1

        if self.time:
            end_time = time.time()
            print "Run time: %s" % (end_time - start_time)
            self.tres.remove({})

    def pair(self, tweet):
        text = tweet['text']
        id = tweet['_id']
        tokenized = self.tokenizer.tokenize(text)
        length = len(tokenized)
        for i in range(length):
            token = tokenized[i]
            if not self.tokenizer.is_bad_token(token):
                previous = None
                next = None
                if i != 0:
                    previous = tokenized[i - 1]
                    if self.tokenizer.is_bad_token(previous):
                        previous = None

                if i < (length - 1):
                    next = tokenized[i + 1]
                    if self.tokenizer.is_bad_token(next):
                        next = None

                self.persister.add(main=token,
                                   previous=previous,
                                   next=next,
                                   id=id)
Example #4
    def normalize_mentions(self, s, repl_func='@MENTION'):
        return MENTION_REGEX.sub(repl_func, s)

    def normalize_repeated_chars(self, s):
        return REPEATED_CHAR_REGEX.sub(r'\1\1\1', s)


if __name__ == '__main__':

    print reduce(lambda x, y: '{0}({1})'.format(y, x), ['f', 'g', 'h'], 'x')  # prints h(g(f(x)))

    preprocessor = TwitterTextPreprocessor()

    from tokenizer import TwitterTokenizer

    tok = TwitterTokenizer()

    from pymongo import MongoClient

    client = MongoClient()
    db = client.twitter_database
    db_labeled_tweets = db.labeled_tweets

    for tweet in db_labeled_tweets.find({u'text': {'$exists': True}}):
        text = tweet.get(u'text')
        print tweet.get(u'_id')
        print text
        # print decode_html_entities(text)
        # print normalize_urls(text)
        # print normalize_repeated_chars(text)
        # print normalize_mentions(text)
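
The two normalizers at the top of this example rely on module-level regexes (MENTION_REGEX and REPEATED_CHAR_REGEX) that the excerpt never defines. A runnable sketch with assumed patterns, which may differ from the real ones:

import re

MENTION_REGEX = re.compile(r'@\w+')             # assumed: @username mentions
REPEATED_CHAR_REGEX = re.compile(r'(.)\1{3,}')  # assumed: runs of 4+ identical characters

def normalize_mentions(s, repl='@MENTION'):
    return MENTION_REGEX.sub(repl, s)

def normalize_repeated_chars(s):
    # \1\1\1 collapses a long run down to exactly three copies of the character
    return REPEATED_CHAR_REGEX.sub(r'\1\1\1', s)

print(normalize_mentions('thanks @alice!'))       # thanks @MENTION!
print(normalize_repeated_chars('sooooo gooood'))  # sooo goood
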