Example #1
 def process_twitter_feed(self, listtweets, feedtype, query=None, pagecount=0):
     if not listtweets:
         returnD(False)
     if query:
         if not isinstance(listtweets, dict):
             returnD(False)
         nexturl = ""
         if 'max_id_str' in listtweets['search_metadata']:
             nexturl = listtweets['search_metadata']['max_id_str']
         elif 'next_results' in listtweets['search_metadata']:
             nexturl = self.re_max_id.sub(r'\1', listtweets['search_metadata']['next_results'])
         res = {'nexturl': nexturl}
         listtweets = listtweets['statuses']
     elif not isinstance(listtweets, list):
         returnD(False)
     feed = []
     for tweet in listtweets:
         if not isinstance(tweet, dict):
             continue
         tw = {'created_at': tweet['created_at'], 'title': unescape_html(tweet['text']), 'link': tweet['url']}
         tw = grab_extra_meta(tweet, tw)
         feed.append(tw)
     if query:
         res['tweets'] = feed
         processed = yield self.process_tweets(res, 'search', query=query, pagecount=pagecount)
     else:
         processed = yield self.process_tweets(feed, 'my%s' % feedtype)
     returnD(processed)
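
A note on shared plumbing: all six snippets come from the same Twisted-based code base and lean on helpers defined elsewhere. Below is a minimal sketch of the assumed definitions, inferred from the call sites only (hypothetical names and bodies, not the project's actual code):

    import re
    # returnD is presumably an alias for Twisted's returnValue; the methods
    # above are generator coroutines decorated with @inlineCallbacks.
    from twisted.internet.defer import inlineCallbacks, returnValue as returnD

    # Reduce a next_results query string such as
    # "?max_id=123456789&q=foo&include_entities=1" to the bare id digits, so
    # that re_max_id.sub(r'\1', next_results) yields "123456789".
    re_max_id = re.compile(r'.*max_id=(\d+).*')
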
Example #2
 def process_twitter_feed(self, listtweets, feedtype, query=None, pagecount=0):
     if not listtweets:
         returnD(False)
     if query:
         if not isinstance(listtweets, dict):
             returnD(False)
         nexturl = ""
         if 'max_id_str' in listtweets['search_metadata']:
             nexturl = listtweets['search_metadata']['max_id_str']
         elif 'next_results' in listtweets['search_metadata']:
             nexturl = self.re_max_id.sub(r'\1', listtweets['search_metadata']['next_results'])
         res = {'nexturl': nexturl}
         listtweets = listtweets['statuses']
     elif not isinstance(listtweets, list):
         returnD(False)
     feed = []
     for tweet in listtweets:
         if not isinstance(tweet, dict):
             continue
         if 'entities' in tweet:
             entities = []
             for entitype in ['media', 'urls']:
                 if entitype in tweet['entities']:
                     entities += tweet['entities'][entitype]
             for entity in entities:
                 try:
                     if 'expanded_url' in entity and 'url' in entity and entity['expanded_url'] and entity['url'] not in self.fact.cache_urls and len(entity['expanded_url']) < 250:
                         cleaned, self.fact.cache_urls = clean_url(entity['expanded_url'].encode('utf-8'), entity['url'].encode('utf-8'), self.fact.cache_urls)
                         _, self.fact.cache_urls = yield clean_redir_urls(cleaned.decode('utf-8'), self.fact.cache_urls)
                 except Exception as e:
                     self.log(e, error=True)
         if "retweeted_status" in tweet and tweet['retweeted_status']['id_str'] != tweet['id_str']:
             text = "RT @%s: %s" % (tweet['retweeted_status']['user']['screen_name'], tweet['retweeted_status']['text'])
         else:
             text = tweet['text']
         tw = {'created_at': tweet['created_at'], 'title': unescape_html(text), 'link': "https://twitter.com/%s/status/%s" % (tweet['user']['screen_name'], tweet['id_str'])}
         tw = grab_extra_meta(tweet, tw)
         feed.append(tw)
     if query:
         res['tweets'] = feed
         processed = yield self.process_tweets(res, 'search', query=query, pagecount=pagecount)
     else:
         processed = yield self.process_tweets(feed, 'my%s' % feedtype)
     returnD(processed)
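
Example #2 additionally walks each tweet's entities (media and urls) and resolves shortened links through a shared URL cache. The two URL helpers are external; below are pass-through stubs matching their call sites (hypothetical sketches, while the real versions do actual normalization and redirect-following):

    from twisted.internet import defer

    def clean_url(expanded, short, cache):
        # Normalize the expanded URL, record the short -> expanded mapping,
        # and hand back the (possibly rewritten) URL plus the updated cache.
        cache[short] = expanded
        return expanded, cache

    def clean_redir_urls(text, cache):
        # Asynchronously resolve shortened/redirecting URLs found in text;
        # this stub returns an already-fired Deferred with the text unchanged.
        return defer.succeed((text, cache))
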
Example #3
 def process_twitter_feed(self, listtweets, feedtype, query=None, pagecount=0):
     if not listtweets:
         return None
     if query:
         if not isinstance(listtweets, dict):
             return None
         nexturl = ""
         if 'next_results' in listtweets['search_metadata']:
             nexturl = self.re_max_id.sub(r'\1', listtweets['search_metadata']['next_results'])
         res = {'nexturl': nexturl}
         listtweets = listtweets['statuses']
     feed = []
     for tweet in listtweets:
         if "retweeted_status" in tweet and tweet['retweeted_status']['id_str'] != tweet['id_str']:
             text = "RT @%s: %s" % (tweet['retweeted_status']['user']['screen_name'], tweet['retweeted_status']['text'])
         else:
             text = tweet['text']
         tw = {'created_at': tweet['created_at'], 'title': unescape_html(text), 'link': "http://twitter.com/%s/statuses/%s" % (tweet['user']['screen_name'], tweet['id_str'])}
         tw = grab_extra_meta(tweet, tw)
         feed.append(tw)
     if query:
         res['tweets'] = feed
         return self.process_tweets(res, 'search', query=query, pagecount=pagecount)
     return self.process_tweets(feed, 'my%s' % feedtype)
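
Example #3 is the plain-Deferred variant of the same method: instead of yielding inside an @inlineCallbacks generator and finishing with returnD(...), it returns the Deferred from process_tweets directly and lets the caller chain on it. Schematically (illustrative only):

    from twisted.internet.defer import inlineCallbacks, returnValue as returnD

    @inlineCallbacks
    def coroutine_style(self, feed, feedtype):    # Examples #1 and #2
        processed = yield self.process_tweets(feed, 'my%s' % feedtype)
        returnD(processed)

    def plain_style(self, feed, feedtype):        # Example #3
        return self.process_tweets(feed, 'my%s' % feedtype)
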
Example #4
    def process_tweets(self, feed, source, query=None, pagecount=0):
        # handle tweets from icerocket or topsy fake rss
        nexturl = ""
        try:
            elements = feed.entries
        except:
            # handle tweets from Twitter API
            if isinstance(feed, list) and len(feed):
                elements = feed
            elif isinstance(feed, dict) and "nexturl" in feed:
                nexturl = feed["nexturl"]
                elements = feed["tweets"]
            else:
                returnD(False)
        if query:
            source = "%s https://api.twitter.com/api/1.1/search/tweets.json?q=%s" % (source, query)
        ids = []
        hashs = []
        tweets = []
        fresh = True
        for i in elements:
            try:
                time_tweet = time.mktime(i.get('published_parsed', '')) - 4*60*60
            except:
                if i.get('created_at', '') == "now":
                    time_tweet = time.time()
                else:
                    time_tweet = time.mktime(time.strptime(i.get('created_at', ''), '%a %b %d %H:%M:%S +0000 %Y')) + 2*60*60
            date = datetime.fromtimestamp(time_tweet)
            if datetime.today() - date > timedelta(hours=config.BACK_HOURS):
                fresh = False
                break
            tweet, self.fact.cache_urls = yield clean_redir_urls(i.get('title', '').replace('\n', ' '), self.fact.cache_urls)
            link = i.get('link', '')
            res = re_tweet_url.search(link)
            if res:
                user = res.group(1)
                tid = long(res.group(2))
                ids.append(tid)
                tw = {'_id': "%s:%s" % (self.fact.channel, tid), 'channel': self.fact.channel, 'id': tid, 'user': user.lower(), 'screenname': user, 'message': tweet, 'uniq_rt_hash': uniq_rt_hash(tweet), 'link': link, 'date': date, 'timestamp': datetime.today(), 'source': source}
                tw = grab_extra_meta(i, tw)
                tweets.append(tw)
        # Delay displaying to avoid duplicates from the stream
        if source != "mystream" and not self.fact.tweets_search_page:
            yield deferredSleep()
        existings = yield self.fact.db['tweets'].find({'channel': self.fact.channel, 'id': {'$in': ids}}, fields=['_id'], filter=sortdesc('id'))
        existing = [t['_id'] for t in existings]
        news = [t for t in tweets if t['_id'] not in existing]
        if not news:
            returnD(False)
        good = []
        news.sort(key=itemgetter('id'))
        if fresh and not source.startswith("my") and len(news) > len(elements) / 2:
            if query and nexturl and pagecount < 3*self.fact.back_pages_limit:
                deferToThreadPool(reactor, self.threadpool, reactor.callLater, 15, self.start_twitter_search, [query], max_id=nexturl, pagecount=pagecount+1)
            elif not query and nexturl and "p=%d" % (self.fact.back_pages_limit+1) not in nexturl and "page=%s" % (2*self.fact.back_pages_limit) not in nexturl:
                deferToThreadPool(reactor, self.threadpool, reactor.callLater, 41, self.start, nexturl)
            elif not query and not nexturl and int(source[-1:]) <= self.fact.back_pages_limit:
                deferToThreadPool(reactor, self.threadpool, reactor.callLater, 41, self.start, next_page(source))
        if self.fact.displayRT:
            good = news
        else:
            hashs = [t['uniq_rt_hash'] for t in news if t['uniq_rt_hash'] not in hashs]
            existings = yield self.fact.db['tweets'].find({'channel': self.fact.channel, 'uniq_rt_hash': {'$in': hashs}}, fields=['uniq_rt_hash'], filter=sortdesc('id'))
            existing = [t['uniq_rt_hash'] for t in existings]

            for t in news:
                if self.fact.twuser == t['user'] or t['uniq_rt_hash'] not in existing or (self.fact.displayMyRT and "@%s" % self.fact.twuser in t['message'].lower()):
                    existing.append(t['uniq_rt_hash'])
                    good.append(t)
        if config.DEBUG:
            nb_rts_str = ""
            nb_rts = len(news) - len(good)
            if nb_rts:
                nb_rts_str = " (%s RTs filtered)" % nb_rts
            self.log("Displaying %s tweets%s" % (len(good), nb_rts_str), hint=True)
        if self.fact.status != "closed":
            for t in good:
                msg = "%s: %s" % (t['screenname'].encode('utf-8'), self.format_tweet(t))
                self.fact.ircclient._send_message(msg, self.fact.channel)
        for t in news:
            yield self.fact.db['tweets'].save(t, safe=True)
        returnD(True)
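
Examples #4-#6 depend on two more helpers whose definitions are not shown. The following is a guess at their shape, inferred purely from how they are called above (hypothetical code):

    import re

    # search() must expose the screen name as group(1) and the numeric id as
    # group(2); the links built in Examples #2/#3 use both /status/ and /statuses/.
    re_tweet_url = re.compile(r'twitter\.com/(\w+)/status(?:es)?/(\d+)', re.I)

    def next_page(source):
        # int(source[-1:]) above implies the source label ends in its page
        # number, so bumping the final digit yields the next page's label.
        return source[:-1] + str(int(source[-1:]) + 1)
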
Example #5
    def process_tweets(self, feed, source, query=None, pagecount=0):
        # handle tweets from icerocket or topsy fake rss
        nexturl = ""
        try:
            elements = feed.entries
        except:
            # handle tweets from Twitter API
            if isinstance(feed, list) and len(feed):
                elements = feed
            elif isinstance(feed, dict) and "nexturl" in feed:
                nexturl = feed["nexturl"]
                elements = feed["tweets"]
            else:
                returnD(False)
        if query:
            source = "%s https://api.twitter.com/api/1.1/search/tweets.json?q=%s" % (
                source, query)
        ids = []
        hashs = []
        tweets = []
        fresh = True
        for i in elements:
            try:
                date = datetime.fromtimestamp(
                    time.mktime(i.get('published_parsed', '')) - 4 * 60 * 60)
            except:
                if i.get('created_at', '') == "now":
                    date = datetime.now()
                else:
                    #date = datetime.strptime(i.get('created_at', ''), '%a %b %d %H:%M:%S +0000 %Y') + timedelta(hours=2)
                    date = parse_date(i.get('created_at', ''))
            if datetime.today() - date > timedelta(hours=config.BACK_HOURS):
                fresh = False
                break
            tweet, self.fact.cache_urls = yield clean_redir_urls(
                i.get('title', '').replace('\n', ' '), self.fact.cache_urls)
            link = i.get('link', '')
            res = re_tweet_url.search(link)
            if res:
                user = res.group(1)
                tid = long(res.group(2))
                ids.append(tid)
                tw = {
                    '_id': "%s:%s" % (self.fact.channel, tid),
                    'channel': self.fact.channel,
                    'id': tid,
                    'user': user.lower(),
                    'screenname': user,
                    'message': tweet,
                    'uniq_rt_hash': uniq_rt_hash(tweet),
                    'link': link,
                    'date': date,
                    'timestamp': datetime.today(),
                    'source': source
                }
                tw = grab_extra_meta(i, tw)
                tweets.append(tw)
        # Delay displaying to avoid duplicates from the stream
        if source != "mystream" and not self.fact.tweets_search_page:
            yield deferredSleep()
        existings = yield self.fact.db['tweets'].find(
            {
                'channel': self.fact.channel,
                'id': {
                    '$in': ids
                }
            },
            fields=['_id'],
            filter=sortdesc('id'))
        existing = [t['_id'] for t in existings]
        news = [t for t in tweets if t['_id'] not in existing]
        if not news:
            returnD(False)
        good = []
        news.sort(key=itemgetter('id'))
        if fresh and not source.startswith(
                "my") and len(news) > len(elements) / 2:
            if query and nexturl and pagecount < 3 * self.fact.back_pages_limit:
                deferToThreadPool(reactor,
                                  self.threadpool,
                                  reactor.callLater,
                                  15,
                                  self.start_twitter_search, [query],
                                  max_id=nexturl,
                                  pagecount=pagecount + 1)
            elif not query and nexturl and "p=%d" % (
                    self.fact.back_pages_limit +
                    1) not in nexturl and "page=%s" % (
                        2 * self.fact.back_pages_limit) not in nexturl:
                deferToThreadPool(reactor, self.threadpool, reactor.callLater,
                                  41, self.start_web, nexturl)
            elif not query and not nexturl and int(
                    source[-1:]) <= self.fact.back_pages_limit:
                deferToThreadPool(reactor, self.threadpool, reactor.callLater,
                                  41, self.start_web, next_page(source))
        if self.fact.displayRT:
            good = news
        else:
            hashs = [
                t['uniq_rt_hash'] for t in news
                if t['uniq_rt_hash'] not in hashs
            ]
            existings = yield self.fact.db['tweets'].find(
                {
                    'channel': self.fact.channel,
                    'uniq_rt_hash': {
                        '$in': hashs
                    }
                },
                fields=['uniq_rt_hash'],
                filter=sortdesc('id'))
            existing = [t['uniq_rt_hash'] for t in existings]

            for t in news:
                if self.fact.twuser == t['user'] or t[
                        'uniq_rt_hash'] not in existing or (
                            self.fact.displayMyRT and "@%s" % self.fact.twuser
                            in t['message'].lower()):
                    existing.append(t['uniq_rt_hash'])
                    good.append(t)
        if config.DEBUG:
            nb_rts_str = ""
            nb_rts = len(news) - len(good)
            if nb_rts:
                nb_rts_str = " (%s RTs filtered)" % nb_rts
            self.log("Displaying %s tweets%s" % (len(good), nb_rts_str),
                     hint=True)
        if self.fact.status != "closed":
            for t in good:
                msg = "%s: %s" % (t['screenname'].encode('utf-8'),
                                  self.format_tweet(t))
                self.fact.ircclient._send_message(msg, self.fact.channel)
        for t in news:
            yield self.fact.db['tweets'].save(t, safe=True)
        returnD(True)
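
Example #5 swaps the manual strptime arithmetic of Example #4 for a parse_date helper. Its definition is not shown; a plausible sketch, matching the commented-out strptime line above:

    from datetime import datetime, timedelta

    def parse_date(value):
        # Parse Twitter's created_at format, e.g. "Mon Sep 24 03:35:21 +0000 2012",
        # applying the same +2h UTC-to-local shift as the commented-out line.
        return datetime.strptime(value, '%a %b %d %H:%M:%S +0000 %Y') + timedelta(hours=2)
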
Example #6
 def process_tweets(self, feed, source, query=None, pagecount=0):
     # handle tweets from icerocket or topsy fake rss
     nexturl = ""
     try:
         elements = feed.entries
     except:
         # handle tweets from Twitter API
         if isinstance(feed, list) and len(feed):
             elements = feed
         elif isinstance(feed, dict) and "nexturl" in feed:
             nexturl = feed["nexturl"]
             elements = feed["tweets"]
         else:
             defer.returnValue(None)
     if query:
         source = "%s https://api.twitter.com/api/1.1/search/tweets.json?q=%s" % (source, query)
     ids = []
     hashs = []
     tweets = []
     fresh = True
     for i in elements:
         try:
             time_tweet = time.mktime(i.get('published_parsed', '')) - 4*60*60
         except:
             if i.get('created_at', '') == "now":
                 time_tweet = time.time()
             else:
                 time_tweet = time.mktime(time.strptime(i.get('created_at', ''), '%a %b %d %H:%M:%S +0000 %Y')) + 2*60*60
         date = datetime.fromtimestamp(time_tweet)
         if datetime.today() - date > timedelta(hours=config.BACK_HOURS):
             fresh = False
             break
         tweet, self.fact.cache_urls = yield clean_redir_urls(i.get('title', '').replace('\n', ' '), self.fact.cache_urls, pool=self.threadpool)
         tweet = tweet.replace('&#126;', '~')
         link = i.get('link', '')
         res = re_tweet_url.search(link)
         if res:
             user = res.group(1)
             tid = long(res.group(2))
             ids.append(tid)
             tw = {'_id': "%s:%s" % (self.fact.channel, tid), 'channel': self.fact.channel, 'id': tid, 'user': user.lower(), 'screenname': user, 'message': tweet, 'uniq_rt_hash': uniq_rt_hash(tweet), 'link': link, 'date': date, 'timestamp': datetime.today(), 'source': source}
             tw = grab_extra_meta(i, tw)
             tweets.append(tw)
     existing = [t['_id'] for t in self.db['tweets'].find({'channel': self.fact.channel, 'id': {'$in': ids}}, fields=['_id'], sort=[('id', pymongo.DESCENDING)])]
     news = [t for t in tweets if t['_id'] not in existing]
     if news:
         good = 0
         news.reverse()
         if fresh and not source.startswith("my") and len(news) > len(elements) / 2:
             if query and nexturl and pagecount < self.fact.back_pages_limit:
                 yield self.start_twitter_search([query], max_id=nexturl, pagecount=pagecount+1)
             elif not query and nexturl and "p=%d" % (self.fact.back_pages_limit+1) not in nexturl and "page=%s" % (2*self.fact.back_pages_limit) not in nexturl:
                 reactor.callFromThread(reactor.callLater, 41, self.start, nexturl)
             elif not query and not nexturl and int(source[-1:]) <= self.fact.back_pages_limit:
                 reactor.callFromThread(reactor.callLater, 41, self.start, next_page(source))
         if not self.fact.displayRT:
             hashs = [t['uniq_rt_hash'] for t in news if t['uniq_rt_hash'] not in hashs]
             existing = [t['uniq_rt_hash'] for t in self.db['tweets'].find({'channel': self.fact.channel, 'uniq_rt_hash': {'$in': hashs}}, fields=['uniq_rt_hash'], sort=[('id', pymongo.DESCENDING)])]
             for t in news:
                 if t['uniq_rt_hash'] not in existing:
                     existing.append(t['uniq_rt_hash'])
                     self.displayTweet(t)
                     good += 1
         else:
             for t in news:
                 self.displayTweet(t)
             good = len(news)
         if config.DEBUG:
             nb_rts_str = ""
             nb_rts = len(news) - good
             if nb_rts:
                 nb_rts_str = " (%s RTs filtered)" % nb_rts
             self.log("Displaying %s tweets%s" % (good, nb_rts_str), self.fact.database, hint=True)
         self.db['tweets'].insert(news, continue_on_error=True, safe=True)
     defer.returnValue(None)
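
Every variant deduplicates retweets via uniq_rt_hash, which must map all retweets of the same text to one key. A rough guess at such a helper (hypothetical; the real normalization rules may differ):

    import re
    from hashlib import md5

    def uniq_rt_hash(text):
        # Strip "RT @user:" prefixes and @mentions, keep only word characters,
        # and hash the remainder so retweets of one tweet collapse to one key.
        text = re.sub(r'RT\s+@\w+:?\s*', '', text, flags=re.I)
        text = re.sub(r'\W+', '', text, flags=re.UNICODE)
        return md5(text.encode('utf-8')).hexdigest()
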