Пример #1
0
class VespaWrite:
    """Item pipeline that writes crawled articles to Vespa.

    Each item is sent to the ``newsarticle`` schema as an update with
    ``create=True``; the document id is the SHA-256 hex digest of the
    item's URL.
    """

    # (vespa field name, item key, required) in the order fields are sent.
    # Required keys raise KeyError when absent; optional keys are skipped.
    _FIELD_SPEC = (
        ('url', 'url', True),
        ('bodytext', 'bodytext', True),
        ('firstpubtime', 'firstpubtime', True),
        ('modtime', 'modtime', False),
        ('wordcount', 'wordcount', True),
        ('headline', 'headline', True),
        ('sentiment', 'sentiment', True),
        ('abstract', 'summary', False),  # item "summary" is stored as "abstract"
        ('keywords', 'keywords', False),
        ('bylines', 'bylines', False),
        ('section', 'section', False),
        ('source', 'source', True),
        ('twitter_retweet_count', 'twitter_retweet_count', False),
        ('twitter_favourite_count', 'twitter_favourite_count', False),
    )

    def open_spider(self, spider):
        """Create the Vespa client used by :meth:`process_item`."""
        self.vespa = Vespa(url="http://vespa-search", port=8080)

    def process_item(self, item, spider):
        """Send ``item`` to Vespa and return it.

        Args:
            item: Mapping holding the article fields listed in
                ``_FIELD_SPEC``.
            spider: Unused; accepted for the pipeline interface.

        Returns:
            ``item`` on success, or ``None`` when a required key is missing
            or a ``TypeError`` occurs while building/sending the update.
        """
        try:
            vespa_fields = {}
            for vespa_name, item_key, required in self._FIELD_SPEC:
                if required or item_key in item:
                    vespa_fields[vespa_name] = item[item_key]

            self.vespa.update_data(
                schema="newsarticle",
                data_id=hashlib.sha256(item['url'].encode()).hexdigest(),
                fields=vespa_fields,
                create=True,
            )
            return item
        except (KeyError, TypeError):
            # Lazy %-formatting: concatenating str + item raised a TypeError
            # inside this handler and masked the original problem.
            logger.debug("error: %s", item, exc_info=True)
            # NOTE(review): returning None keeps the original behaviour; if
            # this is a Scrapy pipeline, raising DropItem may be preferable.
            return None
Пример #2
0
class TwitterInserter:
    """Copy tweet engagement counts onto matching articles in Vespa.

    ``run`` scans recent tweets of a fixed list of accounts, resolves the
    first link of each tweet, looks the URL up in the ``newsarticle``
    schema and, when found, updates the document with the tweet's
    favourite/retweet counts and a link to the tweet.
    """

    # Accounts whose timelines are scanned by run().
    _USER_IDS = (
        'abcnews', 'GuardianAus', 'smh', 'iTnews_au', 'theage',
        'canberratimes', 'zdnetaustralia', 'newscomauHQ',
        'westaustralian', 'SBSNews', 'australian', 'crikey_news',
        '9NewsAUS', 'BBCNewsAus',
    )
    # Tweets linking to these sites are skipped.
    _SKIPPED_PREFIXES = ("https://twitter.com", "https://www.reddit.com")
    # Link shorteners that get_url() resolves to their target.
    _SHORTENER_PREFIXES = ("https://trib.al", "https://bit.ly",
                           "https://bbc.in")
    # Upper bound on chained shortener hops, so shorteners pointing at each
    # other cannot recurse/loop forever.
    _MAX_SHORTENER_HOPS = 10

    def run(self):
        """Fetch tweets for every configured account and update Vespa.

        Twitter credentials are read from the ``twitter-secrets`` secret in
        the ``default`` namespace. Errors raised while processing one
        account are printed and the next account is tried.
        """
        try:
            config.load_incluster_config()
        except Exception:
            # In-cluster config was not usable; fall back to the local kube
            # config. ``Exception`` (not a bare except) lets
            # KeyboardInterrupt/SystemExit propagate.
            config.load_kube_config()
        v1 = client.CoreV1Api()
        twitter_secrets = v1.read_namespaced_secret(name='twitter-secrets',
                                                    namespace='default').data
        api_key = base64.b64decode(twitter_secrets["api-key"]).decode('utf-8')
        api_secret = base64.b64decode(
            twitter_secrets["api-secret"]).decode('utf-8')
        self.vespa = Vespa(url="http://vespa-search", port=8080)
        auth = tweepy.AppAuthHandler(api_key, api_secret)
        self.api = tweepy.API(auth)
        updated = 0
        for userid in self._USER_IDS:
            try:
                for status in tweepy.Cursor(self.api.user_timeline,
                                            id=userid,
                                            include_entities=True,
                                            tweet_mode="extended").items(60):
                    urls = status.entities['urls']
                    if not urls:
                        continue
                    url = self.get_url(urls[0]['expanded_url'])
                    if url.startswith(self._SKIPPED_PREFIXES):
                        continue
                    article = self.get_article(url)
                    # NOTE: URLs with no matching article are skipped; an
                    # insert_document() fallback that asked the spider to
                    # crawl the URL is currently disabled.
                    if article:
                        self.update_document(article, status)
                        updated += 1
            except Exception as e:
                print("exception! {}".format(e))
                continue
        print("Completed run, updated {} tweets".format(updated))

    def get_url(self, url):
        """Resolve known shortener links and strip the query string.

        Args:
            url: URL taken from a tweet.

        Returns:
            The URL with everything from the first ``?`` removed. Links on
            ``zd.net``, ``trib.al``, ``bit.ly`` and ``bbc.in`` are opened
            first and replaced by the URL they resolve to, repeating (up to
            ``_MAX_SHORTENER_HOPS`` times) while the result is itself a
            shortener link.
        """
        for _ in range(self._MAX_SHORTENER_HOPS):
            is_shortener = (re.match(r'https?://zd\.net', url)
                            or url.startswith(self._SHORTENER_PREFIXES))
            if not is_shortener:
                break
            # Close the HTTP response once the final URL has been read.
            with urlopen(url) as response:
                url = response.geturl()
        return url.split('?')[0]

    def update_document(self, article, status):
        """Write the tweet's counts and link onto the article document.

        Args:
            article: Vespa hit; ``article['fields']['url']`` identifies the
                document.
            status: Tweet providing ``favorite_count``, ``retweet_count``,
                ``id`` and ``user.screen_name``.
        """
        article_url = article['fields']['url']
        vespa_fields = {
            'twitter_favourite_count': status.favorite_count,
            'twitter_retweet_count': status.retweet_count,
            'twitter_link': 'https://twitter.com/{}/status/{}'.format(
                status.user.screen_name, status.id),
        }
        response = self.vespa.update_data(
            schema="newsarticle",
            data_id=hashlib.sha256(article_url.encode()).hexdigest(),
            fields=vespa_fields)
        print("Updated {} with {} {}: {}".format(article_url,
                                                 status.favorite_count,
                                                 status.retweet_count,
                                                 response))

    def get_article(self, url):
        """Return the first Vespa hit whose ``url`` matches, else ``None``.

        Args:
            url: Article URL to look up in the ``newsarticle`` source.
        """
        body = {
            'yql': 'select url from sources newsarticle where userQuery();',
            'query': "url:{}".format(url),
            'hits': 1,
        }
        hits = self.vespa.query(body=body).hits
        if len(hits) > 0:
            return hits[0]
        return None
Пример #3
0
class TwitterInserter:
    """Copy tweet engagement counts onto matching articles in Vespa.

    ``run`` scans recent tweets of a fixed list of news accounts, looks the
    first linked URL of each tweet up in the ``newsarticle`` schema and,
    when found, updates the document's favourite/retweet counts.
    """

    # NOTE(review): these look like placeholders — presumably replaced with
    # real credentials at deploy time; confirm.
    api_key = "TWITTER_API_KEY"
    api_secret = "TWITTER_API_SECRET"

    # Accounts whose timelines are scanned by run().
    _USER_IDS = (
        'abcnews', 'GuardianAus', 'smh', 'iTnews_au', 'theage',
        'canberratimes', 'zdnetaustralia', 'newscomauHQ',
        'westaustralian',
    )
    # Link shorteners that are resolved to their target before lookup.
    _SHORTENER_PREFIXES = ("https://zd.net", "https://bit.ly")
    # (URL prefix, Twitter account) pairs used by get_twitter_user().
    _TWITTER_USER_BY_PREFIX = (
        ("https://www.abc.net.au", "abcnews"),
        ("https://www.theguardian.com/", "GuardianAus"),
        ("https://www.smh.com.au", "smh"),
        ("https://www.itnews.com.au", "iTnews_au"),
        ("https://www.theage.com.au", "theage"),
        ("https://www.canberratimes.com.au", "canberratimes"),
        ("https://www.zdnet.com", "zdnetaustralia"),
        ("https://www.news.com.au", "newscomauHQ"),
        ("https://thewest.com.au", "westaustralian"),
    )

    def run(self):
        """Fetch tweets for every configured account and update Vespa.

        Errors raised while processing one account are logged and the next
        account is tried.
        """
        self.vespa = Vespa(url="http://vespa-search", port=8080)
        auth = tweepy.AppAuthHandler(self.api_key, self.api_secret)
        self.api = tweepy.API(auth)
        updated = 0
        for userid in self._USER_IDS:
            try:
                for status in tweepy.Cursor(self.api.user_timeline,
                                            id=userid,
                                            include_entities=True).items(60):
                    urls = status.entities['urls']
                    if not urls:
                        continue
                    url = urls[0]['expanded_url'].split('?')[0]
                    if url.startswith("https://twitter.com"):
                        continue
                    if url.startswith(self._SHORTENER_PREFIXES):
                        # Close the response once the final URL is known,
                        # and strip the query string again: the resolved
                        # URL may carry tracking parameters that would make
                        # the lookup below miss.
                        with urlopen(url) as response:
                            url = response.geturl().split('?')[0]
                    article = self.get_article(url)
                    if article:
                        self.update_document(article, status)
                        updated += 1
            except Exception as e:
                logger.error(e)
        print("Completed run, updated {} tweets".format(updated))

    def update_document(self, article, status):
        """Write the tweet's favourite/retweet counts onto the article.

        Args:
            article: Vespa hit; ``article['fields']['url']`` identifies the
                document.
            status: Tweet providing ``favorite_count`` and
                ``retweet_count``.
        """
        vespa_fields = {
            'twitter_favourite_count': status.favorite_count,
            'twitter_retweet_count': status.retweet_count,
        }
        self.vespa.update_data(
            schema="newsarticle",
            data_id=hashlib.sha256(
                article['fields']['url'].encode()).hexdigest(),
            fields=vespa_fields)

    def get_article(self, url):
        """Return the first Vespa hit whose ``url`` matches, else ``None``.

        Args:
            url: Article URL to look up in the ``newsarticle`` source.
        """
        body = {
            'yql': 'select url from sources newsarticle where userQuery();',
            'query': "url:{}".format(url),
            'hits': 1,
        }
        hits = self.vespa.query(body=body).hits
        if len(hits) > 0:
            return hits[0]
        return None

    def get_twitter_user(self, url):
        """Map an article URL to the Twitter account of its publisher.

        Args:
            url: Article URL.

        Returns:
            The account name whose site prefix ``url`` starts with, or
            ``None`` when no known prefix matches.
        """
        for prefix, account in self._TWITTER_USER_BY_PREFIX:
            if url.startswith(prefix):
                return account
        return None