def fetch_posts(self, query: dict, start_date: datetime, end_date: datetime) -> list:
    """Fetch tweets from archive.org that match the given query for the given date range.

    Arguments:
    query - search terms, sent as the 'q' parameter to the archive.org search endpoint
    start_date - inclusive start of the date range
    end_date - inclusive end of the date range (extended by one day below to make it inclusive)

    Returns a list of meta_tweet dicts with the CSV fields plus a 'tweet_id' key;
    rows without a url are skipped. Raises McPostsArchiveTwitterDataException on a
    failed HTTP fetch.
    """
    ua = UserAgent()
    ua.set_max_size(100 * 1024 * 1024)
    ua.set_timeout(90)
    ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

    # make end_date inclusive by pushing it to the start of the next day
    end_date = end_date + datetime.timedelta(days=1)

    start_arg = start_date.strftime('%Y-%m-%d')
    end_arg = end_date.strftime('%Y-%m-%d')

    enc_query = urlencode({'q': query, 'date_from': start_arg, 'date_to': end_arg})

    url = "https://searchtweets.archivelab.org/export?" + enc_query

    log.debug("archive.org url: " + url)

    response = ua.get(url)

    if not response.is_success():
        raise McPostsArchiveTwitterDataException("error fetching posts: " + response.decoded_content())

    decoded_content = response.decoded_content()

    # sometimes we get null characters, which choke the csv module
    decoded_content = decoded_content.replace('\x00', '')

    meta_tweets = []

    # skip the header row
    lines = decoded_content.splitlines()[1:]

    # hoisted out of the loop: the field list is the same for every row
    fields = 'user_name user_screen_name lang text timestamp_ms url'.split(' ')

    for row in csv.reader(lines, delimiter="\t"):
        # pad short rows with empty strings so every expected field is present
        meta_tweet = {field: (row[i] if i < len(row) else '') for i, field in enumerate(fields)}

        if 'url' not in meta_tweet or meta_tweet['url'] == '':
            log.warning("meta_tweet '%s' does not have a url" % str(row))
            continue

        meta_tweet['tweet_id'] = get_tweet_id_from_url(meta_tweet['url'])

        meta_tweets.append(meta_tweet)

    add_tweets_to_meta_tweets(meta_tweets)

    return meta_tweets
def fetch_posts_from_api(self, query: str, start_date: datetime, end_date: datetime) -> list:
    """Fetch day of tweets from crimson hexagon and twitter.

    Pages through the brandwatch API via _fetch_posts_from_api_single_page and
    returns a list of post dicts for meta tweets that have a matching 'tweet' entry.

    Raises McPostsBWTwitterDataException on a malformed API response and
    McPostsBWTwitterQueryException when a result url is not a tweet url.
    """
    meta_tweets = []
    next_cursor = None
    while True:
        data = self._fetch_posts_from_api_single_page(query, start_date, end_date, next_cursor)

        # validate the page before using it; the original code checked for
        # 'results' only after the loop (so only on the last page), and a page
        # actually missing 'results' raised a bare KeyError below instead
        if 'results' not in data:
            raise McPostsBWTwitterDataException("Unknown response status: " + str(data))

        meta_tweets = meta_tweets + data['results']

        if 'nextCursor' in data:
            next_cursor = data['nextCursor']
        else:
            break

    for mt in meta_tweets:
        try:
            mt['tweet_id'] = get_tweet_id_from_url(mt['url'])
        except McTwitterUrlException:
            raise McPostsBWTwitterQueryException("""
                Unable to parse tweet url %s. Make sure brandwatch query only includes twitter as a source.
                """ % mt['url'])

    add_tweets_to_meta_tweets(meta_tweets)

    posts = []
    for mt in meta_tweets:
        log.debug("mt: %d" % mt['tweet_id'])
        if 'tweet' in mt:
            # normalize the twitter timestamp to an iso format date
            publish_date = dateutil.parser.parse(mt['tweet']['created_at']).isoformat()

            post = {
                'post_id': str(mt['tweet_id']),
                'data': mt,
                'content': mt['tweet']['text'],
                'publish_date': publish_date,
                'author': mt['tweet']['user']['screen_name'],
                'channel': mt['tweet']['user']['screen_name'],
                'url': mt['url']
            }

            posts.append(post)

    return posts
def fetch_posts_from_api(
    self,
    query: str,
    start_date: datetime,
    end_date: datetime,
    sample: Optional[int] = None,
    page_size: Optional[int] = None,
) -> list:
    """Fetch day of tweets from crimson hexagon and twitter.

    sample and page_size are accepted for interface compatibility but are not
    supported by this fetcher and must be None.

    Raises McPostsCHTwitterDataException on a malformed API response.
    """
    # reject unsupported options up front; the original asserted only after
    # _get_content_from_api had already paid for the full network fetch
    assert sample is None, "Sampling is not implemented."
    assert page_size is None, "Page size limiting is not supported."

    decoded_content = self._get_content_from_api(query, start_date, end_date)

    data = dict(decode_json(decoded_content))

    if 'status' not in data or not data['status'] == 'success':
        raise McPostsCHTwitterDataException("Unknown response status: " + str(data))

    meta_tweets = data['posts']

    for mt in meta_tweets:
        mt['tweet_id'] = get_tweet_id_from_url(mt['url'])

    add_tweets_to_meta_tweets(meta_tweets)

    posts = []
    for mt in meta_tweets:
        log.debug("mt: %d" % mt['tweet_id'])
        if 'tweet' in mt:
            # normalize the twitter timestamp to an iso format date
            publish_date = dateutil.parser.parse(mt['tweet']['created_at']).isoformat()

            post = {
                'post_id': str(mt['tweet_id']),
                'data': mt,
                'content': mt['tweet']['text'],
                'publish_date': publish_date,
                'author': mt['tweet']['user']['screen_name'],
                'channel': mt['tweet']['user']['screen_name'],
                'url': mt['url']
            }

            posts.append(post)

    return posts
def fetch_posts_from_api(
    self,
    query: str,
    start_date: datetime,
    end_date: datetime,
    sample: Optional[int] = None,
    page_size: Optional[int] = None,
) -> list:
    """Fetch day of tweets.

    Pages through the brandwatch API, stopping when no nextCursor is returned or
    when at least `sample` meta tweets have been collected. page_size defaults
    to 5000 results per page.

    Raises McPostsBWTwitterDataException on a malformed API response and
    McPostsBWTwitterQueryException when a result url is not a tweet url.
    """
    if page_size is None:
        page_size = 5000

    meta_tweets = []
    next_cursor = None
    while True:
        data = self._fetch_posts_from_api_single_page(
            query=query,
            start_date=start_date,
            end_date=end_date,
            next_cursor=next_cursor,
            page_size=page_size,
        )

        # validate the page before using it; the original code checked for
        # 'results' only after the loop (so only on the last page), and a page
        # actually missing 'results' raised a bare KeyError below instead
        if 'results' not in data:
            raise McPostsBWTwitterDataException("Unknown response status: " + str(data))

        meta_tweets = meta_tweets + data['results']

        log.debug(f"Sample: {sample}; meta_tweets: {len(meta_tweets)}")

        if 'nextCursor' not in data or (sample is not None and len(meta_tweets) >= sample):
            break
        else:
            next_cursor = data['nextCursor']

    for mt in meta_tweets:
        try:
            mt['tweet_id'] = get_tweet_id_from_url(mt['url'])
        except McTwitterUrlException:
            raise McPostsBWTwitterQueryException(
                """
                Unable to parse tweet url %s. Make sure brandwatch query only includes twitter as a source.
                """ % mt['url'])

    add_tweets_to_meta_tweets(meta_tweets)

    posts = []
    for mt in meta_tweets:
        log.debug("mt: %d" % mt['tweet_id'])
        if 'tweet' in mt:
            # normalize the twitter timestamp to an iso format date
            publish_date = dateutil.parser.parse(mt['tweet']['created_at']).isoformat()

            post = {
                'post_id': str(mt['tweet_id']),
                'data': mt,
                'content': mt['tweet']['text'],
                'publish_date': publish_date,
                'author': mt['tweet']['user']['screen_name'],
                'channel': mt['tweet']['user']['screen_name'],
                'url': mt['url']
            }

            posts.append(post)

    return posts
def fetch_posts(self, query: str, start_date: datetime, end_date: datetime) -> list:
    """Fetch day of tweets from crimson hexagon.

    Arguments:
    query - the crimson hexagon monitor id, as a string of digits
    start_date - inclusive start of the date range
    end_date - inclusive end of the date range (extended by one day below)

    Raises McPostsCHTwitterDataException on a failed HTTP fetch or a malformed
    API response.
    """
    ch_monitor_id = int(query)

    log.debug("crimson_hexagon_twitter.fetch_posts")

    ua = UserAgent()
    ua.set_max_size(100 * 1024 * 1024)
    ua.set_timeout(90)
    ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

    config = TopicsMineConfig()
    api_key = config.crimson_hexagon_api_key()

    # make end_date inclusive by pushing it to the start of the next day
    end_date = end_date + datetime.timedelta(days=1)

    start_arg = start_date.strftime('%Y-%m-%d')
    end_arg = end_date.strftime('%Y-%m-%d')

    url = (
        "https://api.crimsonhexagon.com/api/monitor/posts?auth=%s&id=%d&start=%s&end=%s&extendLimit=true"
        % (api_key, ch_monitor_id, start_arg, end_arg))

    log.debug("crimson hexagon url: " + url)

    response = ua.get(url)

    if not response.is_success():
        raise McPostsCHTwitterDataException("error fetching posts: " + response.decoded_content())

    decoded_content = response.decoded_content()

    data = dict(decode_json(decoded_content))

    if 'status' not in data or not data['status'] == 'success':
        raise McPostsCHTwitterDataException("Unknown response status: " + str(data))

    meta_tweets = data['posts']

    for mt in meta_tweets:
        mt['tweet_id'] = get_tweet_id_from_url(mt['url'])

    add_tweets_to_meta_tweets(meta_tweets)

    posts = []
    for mt in meta_tweets:
        # per-tweet trace belongs at debug level, matching the sibling fetchers
        # (this was log.warning, which spammed the logs for every tweet)
        log.debug("mt: %d" % mt['tweet_id'])
        if 'tweet' in mt:
            post = {
                # stringify the id for consistency with the other fetchers,
                # which all emit 'post_id': str(mt['tweet_id'])
                'post_id': str(mt['tweet_id']),
                'data': mt,
                'content': mt['tweet']['text'],
                # NOTE(review): sibling fetchers normalize created_at to
                # isoformat via dateutil; left as the raw twitter timestamp
                # here to preserve existing downstream behavior — confirm
                'publish_date': mt['tweet']['created_at'],
                'author': mt['tweet']['user']['screen_name'],
                'channel': mt['tweet']['user']['screen_name'],
                'url': mt['url']
            }

            posts.append(post)

    return posts