Example #1
def query_single_page(query, lang, pos, retry=50, from_user=False):
    """
    Returns tweets from the given URL.

    :param query: The query parameter of the query url
    :param lang: The language parameter of the query url
    :param pos: The query url parameter that determines where to start looking
    :param retry: Number of retries if something goes wrong.
    :param from_user: Whether the query targets a single user's timeline.
    :return: The list of tweets, the pos argument for getting the next page.
    """
    url = get_query_url(query, lang, pos, from_user)

    try:
        response = requests.get(url, headers=HEADER)
        if pos is None:  # html response
            html = response.text or ""
            json_resp = None
        else:
            html = ""
            try:
                json_resp = json.loads(response.text)
                html = json_resp["items_html"] or ""
            except ValueError as e:
                logger.exception(
                    'Failed to parse JSON "{}" while requesting "{}"'.format(
                        e, url))

        tweets = list(Tweet.from_html(html))

        if not tweets:
            if json_resp:
                pos = urllib.parse.quote(json_resp["min_position"])
            else:
                pos = None
            return [], pos

        if json_resp:
            return tweets, urllib.parse.quote(json_resp["min_position"])
        if from_user:
            return tweets, tweets[-1].id
        return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)

    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))
    except json.decoder.JSONDecodeError as e:
        logger.exception(
            'Failed to parse JSON "{}" while requesting "{}".'.format(e, url))

    if retry > 0:
        logger.info("Retrying... (Attempts left: {})".format(retry))
        return query_single_page(query, lang, pos, retry - 1, from_user)

    logger.error("Giving up.")
    return [], None
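
Note that query_single_page returns both a batch of tweets and a cursor (pos) for the next page, so a caller drives pagination in a loop. A minimal driver sketch, assuming the Example #1 definition above (the collect_tweets name and the limit handling are hypothetical, not part of the original project):

def collect_tweets(query, lang, limit=200):
    # Start with pos=None: the first response is plain HTML; later
    # responses are JSON with an 'items_html' payload and a cursor.
    tweets, pos = [], None
    while len(tweets) < limit:
        batch, pos = query_single_page(query, lang, pos)
        if not batch:
            break  # page yielded no tweets
        tweets.extend(batch)
        if pos is None:
            break  # no cursor for a next page
    return tweets[:limit]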
Example #2
    def query_single_page(self, url, html_response=True, retry=10):
        """
        Returns tweets from the given URL.

        :param url: The URL to get the tweets from
        :param html_response: False, if the HTML is embedded in a JSON
        :param retry: Number of retries if something goes wrong.
        :return: The list of tweets, the pos argument for getting the next page.
        """
        headers = {'User-Agent': random.choice(self.HEADERS_LIST)}

        try:
            if self.proxy is None:
                response = requests.get(url, headers=headers)
            else:
                response = requests.get(url,
                                        proxies=self.proxy,
                                        headers=headers)

            if html_response:
                html = response.text or ''
            else:
                json_resp = json.loads(response.text)
                html = json_resp['items_html'] or ''

            tweets = list(Tweet.from_html(html))

            if not tweets:
                return [], None

            if not html_response:
                return tweets, json_resp['min_position']

            return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)
        except requests.exceptions.HTTPError as e:
            self.logger.exception('HTTPError {} while requesting "{}"'.format(
                e, url))
        except requests.exceptions.ConnectionError as e:
            self.logger.exception(
                'ConnectionError {} while requesting "{}"'.format(e, url))
        except requests.exceptions.Timeout as e:
            self.logger.exception('TimeOut {} while requesting "{}"'.format(
                e, url))
        except json.decoder.JSONDecodeError as e:
            self.logger.exception(
                'Failed to parse JSON "{}" while requesting "{}".'.format(
                    e, url))
        except ValueError as e:
            self.logger.exception(
                'Failed to parse JSON "{}" while requesting "{}"'.format(
                    e, url))

        if retry > 0:
            self.logger.info("Retrying... (Attempts left: {})".format(retry))
            return self.query_single_page(url, html_response, retry - 1)

        self.logger.error("Giving up.")
        return [], None
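
For context, Example #2 reads self.proxy, self.HEADERS_LIST and self.logger from its enclosing class, which the listing does not show. A minimal sketch of such a class, with a hypothetical name and sample values (only the attributes the method uses are included):

import logging

class TweetScraper:
    # Sample User-Agent strings; a real list would be longer.
    HEADERS_LIST = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'Mozilla/5.0 (X11; Linux x86_64)',
    ]

    def __init__(self, proxy=None):
        # Proxy mapping in the requests format,
        # e.g. {'http': 'http://10.0.0.1:8080'}, or None to go direct.
        self.proxy = proxy
        self.logger = logging.getLogger(__name__)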
Example #3
def query_single_page(url, html_response=True, retry=10, from_user=False):
    """
    Returns tweets from the given URL.

    :param url: The URL to get the tweets from
    :param html_response: False, if the HTML is embedded in a JSON
    :param retry: Number of retries if something goes wrong.
    :param from_user: Whether the URL targets a single user's timeline.
    :return: The list of tweets, the pos argument for getting the next page.
    """

    try:
        response = requests.get(url, headers=HEADER)
        if html_response:
            html = response.text or ''
        else:
            html = ''
            try:
                json_resp = json.loads(response.text)
                html = json_resp['items_html'] or ''
            except ValueError as e:
                logger.exception(
                    'Failed to parse JSON "{}" while requesting "{}"'.format(
                        e, url))

        tweets = list(Tweet.from_html(html))

        if not tweets:
            return [], None

        if not html_response:
            return tweets, urllib.parse.quote(json_resp['min_position'])

        if from_user:
            return tweets, tweets[-1].id
        else:
            return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)

    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))
    except json.decoder.JSONDecodeError as e:
        logger.exception(
            'Failed to parse JSON "{}" while requesting "{}".'.format(e, url))

    if retry > 0:
        logger.info('Retrying... (Attempts left: {})'.format(retry))
        return query_single_page(url, html_response, retry - 1, from_user)

    logger.error('Giving up.')
    return [], None
Example #4
def query_single_page(url, html_response=True, retry=10):
    """
    Returns tweets from the given URL.

    :param url: The URL to get the tweets from
    :param html_response: False, if the HTML is embedded in a JSON
    :param retry: Number of retries if something goes wrong.
    :return: The list of tweets, the pos argument for getting the next page.
    """
    headers = {'User-Agent': random.choice(HEADERS_LIST)}

    try:
        response = requests.get(url, headers=headers)
        if html_response:
            # The response body is the HTML itself.
            html = response.text
        else:
            # The HTML is embedded in a JSON payload under 'items_html'.
            json_resp = response.json()
            html = json_resp['items_html']

        tweets = list(Tweet.from_html(html))

        if not tweets:
            # No tweets parsed: nothing to return, no next-page position.
            return [], None

        if not html_response:
            # 'min_position' is the cursor for requesting the next page.
            return tweets, json_resp['min_position']

        return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)
    except requests.exceptions.HTTPError as e:
        logging.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logging.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logging.exception('TimeOut {} while requesting "{}"'.format(e, url))
    except json.decoder.JSONDecodeError as e:
        logging.exception(
            'Failed to parse JSON "{}" while requesting "{}".'.format(e, url))

    if retry > 0:
        logging.info("Retrying... (Attempts left: {})".format(retry))
        return query_single_page(url, html_response, retry - 1)

    logging.error("Quitting.")
    #errors when logging
    return [], None
Example #5
def query_single_page(url, html_response=True, retry=10):
    """
    Returns tweets from the given URL.

    :param url: The URL to get the tweets from
    :param html_response: False, if the HTML is embedded in a JSON
    :param retry: Number of retries if something goes wrong.
    :return: The list of tweets, the pos argument for getting the next page.
    """
    headers = {'User-Agent': random.choice(HEADERS_LIST)}

    try:
        response = requests.get(url, headers=headers)
        if html_response:
            html = response.text
        else:
            json_resp = response.json()
            html = json_resp['items_html']

        tweets = list(Tweet.from_html(html))

        if not tweets:
            return [], None

        if not html_response:
            return tweets, json_resp['min_position']

        return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)
    except requests.exceptions.HTTPError as e:
        logging.exception('HTTPError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.ConnectionError as e:
        logging.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logging.exception('TimeOut {} while requesting "{}"'.format(
            e, url))
    except json.decoder.JSONDecodeError as e:
        logging.exception(
            'Failed to parse JSON "{}" while requesting "{}".'.format(e, url))
    if retry > 0:
        logging.info("Retrying...")
        return query_single_page(url, html_response, retry - 1)

    logging.error("Giving up.")
    return [], None
Example #6
def query_single_page(url, html_response=True, retry=3):
    """
    Returns tweets from the given URL.

    :param url: The URL to get the tweets from
    :param html_response: False, if the HTML is embedded in a JSON
    :param retry: Number of retries if something goes wrong.
    :return: The list of tweets, the pos argument for getting the next page.
    """
    headers = {'User-Agent': random.choice(HEADERS_LIST)}
    req = urllib.request.Request(url, headers=headers)
    try:
        response = urllib.request.urlopen(req).read().decode()
        if html_response:
            html = response
        else:
            json_resp = json.loads(response)
            html = json_resp['items_html']

        tweets = list(Tweet.from_html(html))
        if not tweets:
            return [], None

        if not html_response:
            return tweets, json_resp['min_position']

        return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)
    except urllib.error.HTTPError as e:
        logging.exception('HTTPError {} while requesting "{}"'.format(
            e.code, url))
    except urllib.error.URLError as e:
        logging.exception('URLError {} while requesting "{}"'.format(
            e.reason, url))

    if retry > 0:
        logging.info("Retrying...")
        return query_single_page(url, html_response, retry - 1)

    logging.error("Giving up.")
    return [], None
Example #7
def query_single_page(query, lang, pos, retry=50, from_user=False, timeout=60):
    """
    Returns tweets from the given URL.

    :param query: The query parameter of the query url
    :param lang: The language parameter of the query url
    :param pos: The query url parameter that determines where to start looking
    :param retry: Number of retries if something goes wrong.
    :param from_user: Whether the query targets a single user's timeline.
    :param timeout: Timeout in seconds for the HTTP request.
    :return: The list of tweets, the pos argument for getting the next page.
    """
    url = get_query_url(query, lang, pos, from_user)
    logger.info('Scraping tweets from {}'.format(url))

    try:
        proxy = next(proxy_pool)
        logger.info('Using proxy {}'.format(proxy))
        response = requests.get(url,
                                headers=HEADER,
                                proxies={"http": proxy},
                                timeout=timeout)
        if pos is None:  # html response
            html = response.text or ''
            json_resp = None
        else:
            html = ''
            try:
                json_resp = json.loads(response.text)
                html = json_resp['items_html'] or ''
            except ValueError as e:
                logger.exception(
                    'Failed to parse JSON "{}" while requesting "{}"'.format(
                        e, url))

        tweets = list(Tweet.from_html(html))

        if not tweets:
            try:
                if json_resp:
                    pos = json_resp['min_position']
                    has_more_items = json_resp['has_more_items']
                    if not has_more_items:
                        logger.info("Twitter returned : 'has_more_items' ")
                        return [], None
                else:
                    pos = None
            except KeyError:
                # Response lacked 'min_position' / 'has_more_items'.
                pass
            if retry > 0:
                logger.info('Retrying... (Attempts left: {})'.format(retry))
                return query_single_page(query, lang, pos, retry - 1,
                                         from_user)
            else:
                return [], pos

        if json_resp:
            return tweets, urllib.parse.quote(json_resp['min_position'])
        if from_user:
            return tweets, tweets[-1].tweet_id
        return tweets, "TWEET-{}-{}".format(tweets[-1].tweet_id,
                                            tweets[0].tweet_id)

    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))
    except json.decoder.JSONDecodeError as e:
        logger.exception(
            'Failed to parse JSON "{}" while requesting "{}".'.format(e, url))

    if retry > 0:
        logger.info('Retrying... (Attempts left: {})'.format(retry))
        return query_single_page(query, lang, pos, retry - 1, from_user)

    logger.error('Giving up.')
    return [], None
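
Example #7 additionally assumes a proxy_pool iterator that yields one proxy address per request. A minimal way to build such a pool from a plain list of addresses is itertools.cycle; the PROXIES values below are placeholders:

import itertools

# Round-robin pool: next(proxy_pool) cycles through the list forever.
PROXIES = ['10.0.0.1:8080', '10.0.0.2:8080']  # placeholder addresses
proxy_pool = itertools.cycle(PROXIES)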
Example #8
    def from_soup(cls, soup):
        try:
            sideBar = soup.find('div', 'ProfileHeaderCard')
        except:
            sideBar = ""

        try:
            username = sideBar.find('span', 'username').get_text()
        except:
            username = ""

        try:
            topBar = soup.find('ul', 'ProfileNav-list')
        except:
            topBar = ""

        try:
            location = sideBar.find(
                'div', 'ProfileHeaderCard-location').get_text().strip() or 0
        except:
            location = 0

        try:
            has_avatar = 0 if 'default_profile_images' in soup.find(
                'img', 'ProfileAvatar-image')['src'] else 1
        except:
            has_avatar = ""

        try:
            joined = sideBar.find('span',
                                  'ProfileHeaderCard-joinDateText')['title']
            created = datetime.strptime(
                joined, "%I:%M %p - %d %b %Y").strftime("%Y-%m-%d")

        except Exception as e:
            print(str(e))
            created = 0

        try:
            soup.find('div', 'ProfileCanopy-headerBg').find('img')['src']
            has_background = 1
        except:
            has_background = 0

        try:
            heading = soup.find('h2', 'ProtectedTimeline-heading')
            protected = 0 if heading is None else 1
        except:
            protected = 0

        tweets = soup.find_all('div', 'tweet')
        all_tweets = []

        for tweet in tweets:

            if " Retweeted" not in tweet.get_text():
                all_tweets.append(Tweet.from_soup(tweet))

        try:
            isVerified = 0 if sideBar.find('span',
                                           'Icon--verified') is None else 1
        except:
            isVerified = 0

        try:
            total_tweets = topBar.find('li', 'ProfileNav-item--tweets').find(
                'span', 'ProfileNav-value')['data-count'] or 0
        except:
            total_tweets = 0

        try:
            total_following = topBar.find(
                'li', 'ProfileNav-item--following').find(
                    'span', 'ProfileNav-value')['data-count'] or 0
        except:
            total_following = 0

        try:
            total_followers = topBar.find(
                'li', 'ProfileNav-item--followers').find(
                    'span', 'ProfileNav-value')['data-count'] or 0
        except:
            total_followers = 0

        try:
            total_likes = topBar.find('li', 'ProfileNav-item--favorites').find(
                'span', 'ProfileNav-value')['data-count'] or 0
        except:
            total_likes = 0

        return cls(username=username,
                   location=location,
                   has_location=0 if location == 0 else 1,
                   created=created,
                   is_verified=isVerified,
                   total_tweets=total_tweets,
                   total_following=total_following,
                   total_followers=total_followers,
                   total_likes=total_likes,
                   has_avatar=has_avatar,
                   has_background=has_background,
                   is_protected=protected,
                   profile_modified=1 if has_background == 1 or has_avatar == 1
                   or location != 0 else 0,
                   tweets=all_tweets)
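
Every field in from_soup follows the same pattern: try a lookup, fall back to a default on any parsing error. That pattern can be expressed once with a small helper instead of a try/except per field; a sketch, with the hypothetical name safe_extract:

def safe_extract(getter, default):
    # Run a zero-argument lookup; return 'default' on any parsing error.
    try:
        return getter()
    except Exception:
        return default

# Usage, mirroring the 'username' lookup above:
# username = safe_extract(
#     lambda: sideBar.find('span', 'username').get_text(), '')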
Example #9
    def __from_soup(self, tweet_div):
        # user name & id
        screen_name = tweet_div["data-screen-name"].strip('@')
        username = tweet_div["data-name"]
        user_id = tweet_div["data-user-id"]

        # tweet basic data
        tweet_id = tweet_div["data-tweet-id"]  # equal to 'data-item-id'
        tweet_url = tweet_div["data-permalink-path"]
        timestamp_epochs = int(
            tweet_div.find('span', '_timestamp')['data-time'])
        timestamp = datetime.datetime.utcfromtimestamp(timestamp_epochs)

        # tweet text
        soup_html = tweet_div \
            .find('div', 'js-tweet-text-container') \
            .find('p', 'tweet-text')
        text_html = str(soup_html) or ""
        text = soup_html.text or ""
        links = [
            atag.get('data-expanded-url', atag['href'])
            for atag in soup_html.find_all('a', class_='twitter-timeline-link')
            if 'pic.twitter' not in atag.text  # eliminate picture
        ]
        hashtags = [tag.strip('#') for tag in re.findall(r'#\w+', text)]

        # tweet media
        # --- imgs
        soup_imgs = tweet_div.find_all('div', 'AdaptiveMedia-photoContainer')
        img_urls = [img['data-image-url']
                    for img in soup_imgs] if soup_imgs else []

        # --- videos
        video_div = tweet_div.find('div', 'PlayableMedia-container')
        video_url = video_div.find(
            'div')['data-playable-media-url'] if video_div else ''
        has_media = bool(img_urls or video_url)

        # update 'links': eliminate 'video_url' from 'links' for duplicate
        links = list(filter(lambda x: x != video_url, links))

        # tweet actions numbers
        action_div = tweet_div.find('div', 'ProfileTweet-actionCountList')

        # --- likes
        likes = int(
            action_div.find('span', 'ProfileTweet-action--favorite').find(
                'span', 'ProfileTweet-actionCount')['data-tweet-stat-count']
            or '0')
        # --- RT
        retweets = int(
            action_div.find('span', 'ProfileTweet-action--retweet').find(
                'span', 'ProfileTweet-actionCount')['data-tweet-stat-count']
            or '0')
        # --- replies
        replies = int(
            action_div.find(
                'span', 'ProfileTweet-action--reply u-hiddenVisually').find(
                    'span',
                    'ProfileTweet-actionCount')['data-tweet-stat-count']
            or '0')
        is_replied = replies > 0

        # detail of reply to others
        # - reply to others
        parent_tweet_id = tweet_div['data-conversation-id']  # parent tweet

        if tweet_id == parent_tweet_id:
            is_reply_to = False
            parent_tweet_id = ''
            reply_to_users = []
        else:
            is_reply_to = True
            soup_reply_to_users = \
                tweet_div.find('div', 'ReplyingToContextBelowAuthor') \
                .find_all('a')
            reply_to_users = [{
                'screen_name': user.text.strip('@'),
                'user_id': user['data-user-id']
            } for user in soup_reply_to_users]

        return self.__tweet_line(
            Tweet(screen_name, username, user_id, tweet_id, tweet_url,
                  timestamp, timestamp_epochs, text, text_html, links,
                  hashtags, has_media, img_urls, video_url, likes, retweets,
                  replies, is_replied, is_reply_to, parent_tweet_id,
                  reply_to_users))
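
__from_soup expects tweet_div to be a BeautifulSoup tag for a single div.tweet element carrying the data-* attributes it reads. Assuming the items_html payload seen in the earlier examples, such tags can be produced as follows (the parser backend is an assumption; 'lxml' also works if installed):

from bs4 import BeautifulSoup

def iter_tweet_divs(items_html):
    # Each 'div.tweet' carries the data-* attributes read by __from_soup.
    soup = BeautifulSoup(items_html, 'html.parser')
    return soup.find_all('div', 'tweet')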
Example #10
def query_single_page(query, lang, pos, retry=50, from_user=False):
    """
    Returns tweets from the given URL.

    :param query: The query parameter of the query url
    :param lang: The language parameter of the query url
    :param pos: The query url parameter that determines where to start looking
    :param retry: Number of retries if something goes wrong.
    :param from_user: Whether the query targets a single user's timeline.
    :return: The list of tweets, the pos argument for getting the next page.
    """
    url = get_query_url(query, lang, pos, from_user)

    try:
        response = requests.get(url, headers=HEADER)
        if pos is None:  # html response
            html = response.text or ''
            json_resp = None
        else:
            html = ''
            try:
                json_resp = json.loads(response.text)
                html = json_resp['items_html'] or ''
            except ValueError as e:
                logger.exception(
                    'Failed to parse JSON "{}" while requesting "{}"'.format(
                        e, url))

        tweets = list(Tweet.from_html(html))

        if not tweets:
            if json_resp:
                pos = json_resp['min_position']
            else:
                pos = None
            if retry > 0:
                return query_single_page(query, lang, pos, retry - 1, from_user)
            else:
                return [], pos

        if json_resp:
            return tweets, urllib.parse.quote(json_resp['min_position'])
        if from_user:
            return tweets, tweets[-1].id
        return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)

    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(
            e, url))
    except json.decoder.JSONDecodeError as e:
        logger.exception('Failed to parse JSON "{}" while requesting "{}".'.format(
            e, url))

    if retry > 0:
        logger.info('Retrying... (Attempts left: {})'.format(retry))
        return query_single_page(query, lang, pos, retry - 1, from_user)

    logger.error('Giving up.')
    return [], None