Example #1
class TwitterDownloader:

    def __init__(self, path: PathGenerator, proxies: dict = None, retry=RETRY,
                 logger=None, session: Session = None):
        self.path = path
        self.logger = logger if logger is not None else Log.create_logger('TwitterSpider', './twitter.log')
        self.session = Session(proxies=proxies, retry=retry) if session is None else session

    def _get(self, url):
        r = self.session.get(url=url)
        return r.content

    def _save(self, content, path):
        if os.path.exists(path):
            self.logger.warning('File %s exists.', path)
            return False
        with open(path, 'wb') as f:
            f.write(content)
        return True

    def download(self, tweet: Tweet):
        user = tweet.user
        for media in tweet.medias:
            # def path(self, file_name, media_type, media_id, media_url, user_id, user_name, screen_name)
            path = self.path.path(file_name=media.file_name, media_type=media.type, media_id=media.id,
                                  media_url=media.url, user_id=user.id, user_name=user.name, screen_name=user.nickname)
            self._save(self._get(media.url), path)
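
A minimal usage sketch, assuming a PathGenerator such as the StoreByUserName helper from the Weibo example fits the path(...) interface used above; the proxy address and the source of the Tweet objects are purely illustrative.

downloader = TwitterDownloader(path=StoreByUserName('./download'),
                               proxies={'https': 'http://127.0.0.1:1080'})  # hypothetical proxy
for tweet in tweets:  # e.g. the Tweet objects yielded by TwitterSpider.crawl_timeline below
    downloader.download(tweet)
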
Example #2
class DoubanSpider:

    def __init__(self, session: Session = None):
        self.session = Session(retry=5, timeout=10) if session is None else session
        # urllib3 >= 1.25.1 decodes Brotli ('br') responses itself; older versions need manual decompression.
        self.ENABLE_BROTLI = version.parse(urllib3.__version__) < version.parse('1.25.1')

    def list(self, tags: List[str] = None, sort: str = 'U', start: int = 0, limit: int = 100000):
        """
        Return the list of URLs.
        :param sort: U - 近期热门, T - 标记最多, S - 评分最高, R - 最新上映
        :param tags: All the tags showed on the page
        :param start: start offset
        :param limit: limit to end
        :return:
        """
        url = 'https://movie.douban.com/j/new_search_subjects'
        while start < limit:
            params = {
                'sort': sort,
                'range': '0, 10',
                'tags': ','.join(tags) if tags is not None else '',
                'start': start
            }
            text = self._get(url, params=params, headers=HEADERS['api'])
            data = json.loads(text)['data']
            for item in data:
                yield item['url']
            time.sleep(2)

    def _get(self, url, **kwargs):
        r = self.session.get(url, **kwargs)
        if r.headers.get('Content-Encoding') == 'br' and self.ENABLE_BROTLI:
            return brotli.decompress(r.content).decode('utf-8')
        else:
            return r.text

    def access_brief(self, url):
        """
        Crawl the brief page
        :param url:
        :return:
        """
        text = self._get(url, headers=HEADERS['page'])
        soup = Soup(text, 'lxml')
        content = soup.find('div', id='content')
        selector = Selector(text)
        return content, selector

    def access_celebrity(self, movie_id):
        pass

    def access_comment(self, movie_id, start=0, sort='new_score', status='P'):
        pass

    def access_review(self, movie_id, start=0):
        pass

    def access_full_text(self, url):
        pass
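
A sketch of how the list generator above might be consumed; the tag is illustrative, limit=100 merely caps the start offset, and access_brief returns the soup/selector pair shown above for further parsing.

spider = DoubanSpider()
for url in spider.list(tags=['电影'], sort='S', limit=100):  # '电影' (movie) is an illustrative tag
    content, selector = spider.access_brief(url)
    # parse `content` (BeautifulSoup tag) or `selector` (XPath selector) here
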
Example #3
class WeiboSpider:

    def __init__(self, db: Database = None,
                 path: PathGenerator = None,
                 session: Session = None,
                 auth: Auth = None):
        self.db = MongoDB('weibo', primary_key='id') if db is None else db
        self.path = StoreByUserName('./download') if path is None else path
        self.session = Session(timeout=10, retry=5) \
            if session is None else session

        if auth is None:
            auth = Auth()
        self.token = auth.token.token
        self.client = Client()

    def list(self, page=1):
        items = []
        running = True
        while running:
            data = self.client.favorites.get(access_token=self.token, page=page)
            if len(data.favorites) <= 0:
                break
            for item in data.favorites:
                # Stop paging once a status that is already stored in the database is reached.
                if item.status.id not in self.db:
                    items.append(item.status)
                else:
                    running = False
                    break
            page += 1
        items.reverse()
        return items

    def download(self, status):
        if 'deleted' not in status:
            user = status.user.name
            for item in status.pic_urls:
                url = item.thumbnail_pic.replace('thumbnail', 'large')
                path = self.path.generate(user_name=user,
                                          media_type=MediaType.image)
                r = self.session.get(url)
                with open(path, 'wb') as f:
                    f.write(r.content)
        self.db.add(status.id)
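
A possible driver loop for the spider above; it relies on the default MongoDB and StoreByUserName helpers wired in __init__ and on an Auth object that can produce a token.

spider = WeiboSpider()
for status in spider.list():   # list() reverses the items, so older favourites come first
    spider.download(status)
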
Example #4
class CboooSpider:
    def __init__(self):
        self.session = Session()
        self.logger = Log.create_logger('spider')

    def get_id(self, redis: RedisSet):
        start_page = config.Config.spider.start_page
        params = {
            'area': config.Config.spider.area,
            'type': 0,
            'year': 0,
            'initial': '全部',
            'pIndex': start_page
        }

        res = self.session.get(url=url('/Mdata/getMdata_movie'), params=params)
        data = json.loads(res.text)
        self.logger.info('Total: {0} pages, {1} items'.format(
            data['tPage'], data['tCount']))
        end_page = data['tPage']
        for item in data['pData']:
            redis.add(item['ID'])
        self.logger.info('Page {}'.format(start_page))
        time.sleep(10)
        for i in range(start_page + 1, end_page + 1):
            params['pIndex'] = i
            res = self.session.get(url=url('/Mdata/getMdata_movie'),
                                   params=params)
            data = json.loads(res.text)
            for item in data['pData']:
                redis.add(item['ID'])
            self.logger.info('Page {}'.format(i))
            time.sleep(10)

    def start_crawl(self, extractor: Extractor, redis: RedisSet,
                    mongo: MongoDB):
        while not redis.empty():
            movie_id = redis.pop()
            self.logger.info('Movie ID: {}'.format(movie_id))
            try:
                info = self._crawl(movie_id, extractor)
                if info is not None:
                    if mongo.count({'id': movie_id}) <= 0:
                        mongo.insert(info)
                    else:
                        self.logger.info(
                            'Duplicate record {}'.format(movie_id))
                else:
                    self.logger.warning('Useless record {}'.format(movie_id))
            except NetworkException as e:
                self.logger.error(e)
                redis.add(movie_id)
            time.sleep(10)

    def _crawl(self, movie_id, extractor: Extractor):
        retry = MAX_RETRY
        while retry:
            try:
                res = self.session.get(url=url('/m/{}'.format(movie_id)))
                info = extractor.extract_info(res.text)
                if info is None:
                    return None
                res = self.session.get(url=url(
                    '/Mdata/getMovieEventAll?movieid={}'.format(movie_id)))
                info['event'] = extractor.extract_events(res.text)
                info['id'] = movie_id
                return info
            except (NetworkException, AttributeError) as e:
                self.logger.error(str(e))
                retry -= 1
                if retry <= 0:
                    raise RetryLimitExceededException(movie_id) from e
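
A sketch of how the two phases above might be chained; RedisSet, Extractor and MongoDB are the project's own helpers, and the constructor arguments shown here are assumptions.

spider = CboooSpider()
id_queue = RedisSet('cbooo:ids')                # hypothetical key name
spider.get_id(id_queue)                         # phase 1: collect movie IDs into Redis
spider.start_crawl(Extractor(), id_queue,
                   MongoDB('cbooo', primary_key='id'))  # phase 2: crawl details into MongoDB
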
Example #5
class TwitterSpider:

    def __init__(self, token: str, proxies: dict = None, delay=DELAY, retry=RETRY,
                 logger=None, session: Session = None):
        self.base_url = 'https://api.twitter.com/1.1/'
        self.headers = {'Authorization': token}
        self.logger = logger if logger is not None else Log.create_logger('TwitterSpider', './twitter.log')
        self.delay = delay
        self.session = Session(retry=retry, proxies=proxies) if session is None else session

    def crawl_timeline(self, screen_name: str = None, user_id: str = None,
                       include_retweets: bool = True, exclude_replies: bool = True,
                       start_id=None, since_id=None, delay: float = None) -> Iterable[Tweet]:
        """

        :param screen_name:
        :param user_id:
        :param include_retweets:
        :param exclude_replies:
        :param start_id:
        :param since_id:
        :param delay:
        :return:
        """
        if delay is None:
            delay = self.delay

        self.logger.info('Crawling timeline: %s', locals())

        tweets = self.timeline(screen_name=screen_name, user_id=user_id, include_rts=include_retweets,
                               exclude_replies=exclude_replies, max_id=start_id, since_id=since_id)
        if len(tweets) <= 0:
            return
        tweet_id = start_id
        for tweet in tweets:
            tweet_id = tweet['id']
            yield Tweet(tweet)

        while len(tweets) > 0:
            sleep(delay)
            tweets = self.timeline(screen_name=screen_name, user_id=user_id, include_rts=include_retweets,
                                   exclude_replies=exclude_replies, max_id=tweet_id - 1, since_id=since_id)
            for tweet in tweets:
                tweet_id = tweet['id']
                yield Tweet(tweet)

    def crawl_likes(self, screen_name: str = None, user_id: str = None,
                    start_id=None, since_id=None, delay: float = None) -> Iterable[Tweet]:
        if delay is None:
            delay = self.delay

        self.logger.info('Crawling likes: %s', locals())

        tweets = self.likes(screen_name=screen_name, user_id=user_id, max_id=start_id, since_id=since_id)
        if len(tweets) <= 0:
            return
        tweet_id = start_id
        for tweet in tweets:
            tweet_id = tweet['id']
            yield Tweet(tweet)

        while len(tweets) > 0:
            sleep(delay)
            tweets = self.likes(screen_name=screen_name, user_id=user_id, max_id=tweet_id - 1, since_id=since_id)
            for tweet in tweets:
                tweet_id = tweet['id']
                yield Tweet(tweet)

    def crawl_following(self, screen_name: str = None, user_id: str = None,
                        include_retweets: bool = True, exclude_replies: bool = True,
                        checkpoint: Checkpoint = None, delay: float = None) -> Iterable[Tweet]:
        if delay is None:
            delay = self.delay
        cursor = checkpoint.cursor if checkpoint is not None else None
        start = checkpoint is None or checkpoint.start

        self.logger.info('Crawling following: %s', locals())

        users = self.following(screen_name=screen_name, user_id=user_id, cursor=cursor)

        for user in users['users']:
            if not start:
                if checkpoint.user_id is None or user['id'] == checkpoint.user_id:
                    start = True
                    sleep(delay)
                    for tweet in self.crawl_timeline(user_id=user['id'], include_retweets=include_retweets,
                                                     exclude_replies=exclude_replies, start_id=checkpoint.tweet_id,
                                                     delay=delay):
                        yield tweet
                else:
                    continue
            else:
                sleep(delay)
                for tweet in self.crawl_timeline(user_id=user['id'], include_retweets=include_retweets,
                                                 exclude_replies=exclude_replies, delay=delay):
                    yield tweet
        cursor = users['next_cursor']

        while len(users['users']) > 0:
            sleep(delay)
            users = self.following(screen_name=screen_name, user_id=user_id, cursor=cursor)
            for user in users['users']:
                sleep(delay)
                for tweet in self.crawl_timeline(user_id=user['id'], include_retweets=include_retweets,
                                                 exclude_replies=exclude_replies, delay=delay):
                    yield tweet
            cursor = users['next_cursor']

    def _get(self, url, params):
        """
        Access API with requests and return the result with the format of json.
        """
        r = self.session.get(url=url, params=params, headers=self.headers)
        return json.loads(r.text)

    def _url(self, url):
        return urlparse.urljoin(self.base_url, url)

    def timeline(self, user_id: str = None, screen_name: str = None, count: int = 200,
                 exclude_replies: bool = None, include_rts: bool = None, trim_user: bool = None,
                 since_id=None, max_id=None):
        """
        Returns a collection of the most recent Tweets posted by the user indicated
        by the screen_name or user_id parameters.

        User timelines belonging to protected users may only be requested when the
        authenticated user either "owns" the timeline or is an approved follower of the owner.

        The timeline returned is the equivalent of the one seen as a user's profile on Twitter.

        This method can only return up to 3,200 of a user's most recent Tweets. Native retweets
        of other statuses by the user are included in this total, regardless of whether
        include_rts is set to false when requesting this resource.

        Response formats: JSON
        Requires authentication? Yes
        Rate limited? Yes
        Requests / 15-min window (user auth): 900
        Requests / 15-min window (app auth): 1500
        Requests / 24-hour window: 100,000

        Check https://developer.twitter.com/en/docs/tweets/timelines/api-reference/get-statuses-user_timeline
        for more information.

        :param user_id: The ID of the user for whom to return results.
        :param screen_name: The screen name of the user for whom to return results.
        :param since_id: Returns results with an ID greater than (that is, more recent than) the specified ID.
                    There are limits to the number of Tweets that can be accessed through the API.
                    If the limit of Tweets has occurred since the since_id, the since_id will be forced
                    to the oldest ID available.
        :param count: Specifies the number of Tweets to try and retrieve, up to a maximum of 200
                per distinct request. The value of count is best thought of as a limit to the number
                of Tweets to return because suspended or deleted content is removed after the count
                has been applied. We include retweets in the count, even if include_rts is not supplied.
                It is recommended you always send include_rts=1 when using this API method.
        :param max_id: Returns results with an ID less than (that is, older than) or equal to the specified ID.
        :param trim_user: When set to either true , t or 1 , each Tweet returned in a timeline will
                        include a user object including only the status author's numerical ID.
                        Omit this parameter to receive the complete user object.
        :param exclude_replies: This parameter will prevent replies from appearing in the returned timeline.
                                Using exclude_replies with the count parameter will mean you will receive up-to
                                count tweets — this is because the count parameter retrieves that many Tweets before
                                filtering out retweets and replies.
        :param include_rts: When set to false , the timeline will strip any native retweets
                            (though they will still count toward both the maximal length of the timeline
                            and the slice selected by the count parameter).
                            Note: If you're using the trim_user parameter in conjunction with include_rts,
                            the retweets will still contain a full user object.
        :return: List of tweet objects.
        """
        params = locals()
        del (params['self'])
        self.logger.info('Get timeline: %s', params)
        if user_id is None and screen_name is None:
            raise ValueError('User ID or username is required.')
        return self._get(self._url('statuses/user_timeline.json'), params)

    def user(self, user_id: str = None, screen_name: str = None, include_entitles: bool = None):
        """
        Returns a variety of information about the user specified by the required user_id
        or screen_name parameter. The author's most recent Tweet will be returned inline when possible.

        You must be following a protected user to be able to see their most recent Tweet.
        If you don't follow a protected user, the user's Tweet will be removed.
        A Tweet will not always be returned in the current_status field.

        Check https://developer.twitter.com/en/docs/accounts-and-users/follow-search-get-users/api-reference/get-users-show
        for more information.

        Response formats: JSON
        Requires authentication? Yes
        Rate limited? Yes
        Requests / 15-min window (user auth): 900
        Requests / 15-min window (app auth): 900

        :param user_id: The ID of the user for whom to return results.
                        Either an id or screen_name is required for this method.
        :param screen_name: The screen name of the user for whom to return results.
                            Either a id or screen_name is required for this method.
        :param include_entitles: The entities node will not be included when set to false.
        :return: User-object, see https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/user-object .
        """
        params = locals()
        del (params['self'])
        self.logger.info('Get user: %s', params)
        if user_id is None and screen_name is None:
            raise ValueError('User ID or username is required')
        return self._get(self._url('users/show.json'), params)

    def followers(self, user_id: str = None, screen_name: str = None, cursor=None,
                  count: int = 200, skip_status: bool = None, include_user_entitles: bool = None):
        """
        Returns a cursored collection of user objects for users following the specified user.

        At this time, results are ordered with the most recent following first — however, this ordering
        is subject to unannounced change and eventual consistency issues. Results are given in groups
        of 20 users and multiple "pages" of results can be navigated through using the next_cursor value
        in subsequent requests. See Using cursors to navigate collections for more information.

        Response formats: JSON
        Requires authentication? Yes
        Rate limited? Yes
        Requests / 15-min window (user auth): 15
        Requests / 15-min window (app auth): 15

        Check https://developer.twitter.com/en/docs/accounts-and-users/follow-search-get-users/api-reference/get-followers-list
        for more information.

        :param user_id: The ID of the user for whom to return results.
        :param screen_name: The screen name of the user for whom to return results.
        :param cursor: Causes the results to be broken into pages. If no cursor is provided,
                        a value of -1 will be assumed, which is the first "page."
                        The response from the API will include a previous_cursor and next_cursor to allow
                        paging back and forth. See Using cursors to navigate collections for more information.
        :param count: The number of users to return per page, up to a maximum of 200.
        :param skip_status: When set to either true, t or 1 statuses will not be included in the returned user objects.
        :param include_user_entitles: The user object entities node will not be included when set to false.
        :return: {
                    "users": [
                          {user-object},
                          {user-object},
                          {user-object}
                    ],
                    "previous_cursor": 0,
                    "previous_cursor_str": "0",
                    "next_cursor": 1333504313713126852,
                    "next_cursor_str": "1333504313713126852"
                }
                For user-object, see https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/user-object .
        """
        params = locals()
        del (params['self'])
        self.logger.info('Get followers: %s', params)
        if user_id is None and screen_name is None:
            raise ValueError('User ID or username is required')
        return self._get(self._url('followers/list.json'), params)

    def follower_ids(self, user_id: str = None, screen_name: str = None, cursor=None,
                     count: int = 200, skip_status: bool = None, include_user_entitles: bool = None):
        """
        Returns a cursored collection of user IDs for every user following the specified user.

        At this time, results are ordered with the most recent following first — however, this ordering
        is subject to unannounced change and eventual consistency issues. Results are given in groups
        of 20 users and multiple "pages" of results can be navigated through using the next_cursor value
        in subsequent requests. See Using cursors to navigate collections for more information.

        This method is especially powerful when used in conjunction with GET users / lookup,
        a method that allows you to convert user IDs into full user objects in bulk.

        Response formats: JSON
        Requires authentication? Yes
        Rate limited? Yes
        Requests / 15-min window (user auth): 15
        Requests / 15-min window (app auth): 15

        Check https://developer.twitter.com/en/docs/accounts-and-users/follow-search-get-users/api-reference/get-followers-ids
        for more information.

        :param user_id: The ID of the user for whom to return results.
        :param screen_name: The screen name of the user for whom to return results.
        :param cursor: Causes the results to be broken into pages. If no cursor is provided,
                        a value of -1 will be assumed, which is the first "page."
                        The response from the API will include a previous_cursor and next_cursor to allow
                        paging back and forth. See Using cursors to navigate collections for more information.
        :param count: The number of users to return per page, up to a maximum of 200.
        :param skip_status: When set to either true, t or 1 statuses will not be included in the returned user objects.
        :param include_user_entitles: The user object entities node will not be included when set to false.
        :return: {
                    "ids": [],
                    "previous_cursor": 0,
                    "previous_cursor_str": "0",
                    "next_cursor": 1333504313713126852,
                    "next_cursor_str": "1333504313713126852"
                }
        """
        params = locals()
        del (params['self'])
        self.logger.info('Get follower IDs: %s', params)
        if user_id is None and screen_name is None:
            raise ValueError('User ID or username is required')
        return self._get(self._url('followers/ids.json'), params)

    def following(self, user_id: str = None, screen_name: str = None, cursor=None,
                  count: int = 200, stringify_ids: bool = None):
        """
        Returns a cursored collection of user objects for every user the specified user is following
        (otherwise known as their "friends").

        At this time, results are ordered with the most recent following first — however, this
        ordering is subject to unannounced change and eventual consistency issues. Results are
        given in groups of 20 users and multiple "pages" of results can be navigated through using
        the next_cursor value in subsequent requests. See Using cursors to navigate collections
        for more information.

        Response formats: JSON
        Requires authentication? Yes
        Rate limited? Yes
        Requests / 15-min window (user auth): 15
        Requests / 15-min window (app auth): 15

        Check https://developer.twitter.com/en/docs/accounts-and-users/follow-search-get-users/api-reference/get-friends-list
        for more information.

        :param user_id: The ID of the user for whom to return results.
        :param screen_name: The screen name of the user for whom to return results.
        :param cursor: Causes the results to be broken into pages. If no cursor is provided,
                        a value of -1 will be assumed, which is the first "page." The response
                        from the API will include a previous_cursor and next_cursor to allow
                        paging back and forth. See Using cursors to navigate collections for
                        more information.
        :param count: The number of users to return per page, up to a maximum of 200. Defaults to 20.
        :param stringify_ids: when set to true, IDs are returned as strings instead of integers.
        :return: {
                    "users": [
                          {user-object},
                          {user-object},
                          {user-object}
                    ],
                    "previous_cursor": 0,
                    "previous_cursor_str": "0",
                    "next_cursor": 1333504313713126852,
                    "next_cursor_str": "1333504313713126852"
                }
                For user-object, see https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/user-object .
        """
        params = locals()
        del (params['self'])
        self.logger.info('Get followings: %s', params)
        if user_id is None and screen_name is None:
            raise ValueError('User ID or username is required')
        return self._get(self._url('friends/list.json'), params)

    def following_ids(self, user_id: str = None, screen_name: str = None, cursor=None,
                      count: int = 200, stringify_ids: bool = None):
        """
        Returns a cursored collection of user IDs for every user the specified user is following
        (otherwise known as their "friends").

        At this time, results are ordered with the most recent following first — however, this
        ordering is subject to unannounced change and eventual consistency issues. Results are
        given in groups of 20 users and multiple "pages" of results can be navigated through using
        the next_cursor value in subsequent requests. See Using cursors to navigate collections
        for more information.

        This method is especially powerful when used in conjunction with GET users / lookup,
        a method that allows you to convert user IDs into full user objects in bulk.

        Check https://developer.twitter.com/en/docs/accounts-and-users/follow-search-get-users/api-reference/get-friends-ids
        for more information.

        Response formats: JSON
        Requires authentication? Yes
        Rate limited? Yes
        Requests / 15-min window (user auth): 15
        Requests / 15-min window (app auth): 15

        :param user_id: The ID of the user for whom to return results.
        :param screen_name: The screen name of the user for whom to return results.
        :param cursor: Causes the results to be broken into pages. If no cursor is provided,
                        a value of -1 will be assumed, which is the first "page." The response
                        from the API will include a previous_cursor and next_cursor to allow
                        paging back and forth. See Using cursors to navigate collections for
                        more information.
        :param count: The number of users to return per page, up to a maximum of 200. Defaults to 20.
        :param stringify_ids: when set to true, IDs are returned as strings instead of integers.
        :return: {
                    "ids": [],
                    "previous_cursor": 0,
                    "previous_cursor_str": "0",
                    "next_cursor": 1333504313713126852,
                    "next_cursor_str": "1333504313713126852"
                }
        """
        params = locals()
        del (params['self'])
        self.logger.info('Get following IDs: %s', params)
        if user_id is None and screen_name is None:
            raise ValueError('User ID or username is required')
        return self._get(self._url('friends/ids.json'), params)

    def likes(self, user_id: str = None, screen_name: str = None, count: int = 200,
              since_id=None, max_id=None, include_entitles: bool = None):
        """
        Note: favorites are now known as likes.

        Returns the 20 most recent Tweets liked by the authenticating or specified user.

        Response formats: JSON
        Requires authentication? Yes
        Rate limited? Yes
        Requests / 15-min window (user auth): 75
        Requests / 15-min window (app auth): 75

        Check https://developer.twitter.com/en/docs/tweets/post-and-engage/api-reference/get-favorites-list
        for more information.

        :param user_id: The ID of the user for whom to return results.
        :param screen_name: The screen name of the user for whom to return results.
        :param count: Specifies the number of records to retrieve. Must be less than or equal to 200; defaults to 20.
                      The value of count is best thought of as a limit to the number of Tweets to return because
                      suspended or deleted content is removed after the count has been applied.
        :param since_id: Returns results with an ID greater than (that is, more recent than) the specified ID.
                         There are limits to the number of Tweets which can be accessed through the API. If the
                         limit of Tweets has occurred since the since_id, the since_id will be forced to the
                         oldest ID available.
        :param max_id: Returns results with an ID less than (that is, older than) or equal to the specified ID.
        :param include_entitles: The entities node will be omitted when set to false.
        :return: List of tweet objects.
        """
        params = locals()
        del (params['self'])
        self.logger.info('Get likes: %s', params)
        if user_id is None and screen_name is None:
            raise ValueError('User ID or username is required')
        return self._get(self._url('favorites/list.json'), params)

    def tweet(self, tweet_id: str, trim_user: bool = None,
              include_my_retweet: bool = None, include_entitles: bool = None,
              include_ext_alt_text: bool = None, include_card_uri: bool = None):
        """
        Returns a single Tweet, specified by the id parameter.
        The Tweet's author will also be embedded within the Tweet.

        See GET statuses / lookup for getting Tweets in bulk (up to 100 per call).
        See also Embedded Timelines, Embedded Tweets, and GET statuses/oembed for tools
        to render Tweets according to Display Requirements.

        About Geo

        If there is no geotag for a status, then there will be an empty <geo></geo> or "geo" : {}.
        This can only be populated if the user has used the Geotagging API to send a statuses/update.

        The JSON response mostly uses conventions laid out in GeoJSON. The coordinates that
        Twitter renders are reversed from the GeoJSON specification (GeoJSON specifies a
        longitude then a latitude, whereas Twitter represents it as a latitude then a longitude),
        eg: "geo": { "type":"Point", "coordinates":[37.78029, -122.39697] }

        Response formats: JSON
        Requires authentication? Yes
        Rate limited? Yes
        Requests / 15-min window (user auth): 900
        Requests / 15-min window (app auth): 900

        :param tweet_id: The numerical ID of the desired Tweet.
        :param trim_user: When set to either true , t or 1 , each Tweet returned in a timeline will
                          include a user object including only the status author's numerical ID.
                          Omit this parameter to receive the complete user object.
        :param include_my_retweet: When set to either true , t or 1 , any Tweets returned that have
                                   been retweeted by the authenticating user will include an additional
                                   current_user_retweet node, containing the ID of the source status
                                   for the retweet.
        :param include_entitles: The entities node will not be included when set to false.
        :param include_ext_alt_text: If alt text has been added to any attached media entities, this
                                     parameter will return an ext_alt_text value in the top-level key
                                     for the media entity. If no value has been set, this will be
                                     returned as null.
        :param include_card_uri: When set to either true , t or 1 , the retrieved Tweet will include
                                 a card_uri attribute when there is an ads card attached to the Tweet
                                 and when that card was attached using the card_uri value.
        :return: The tweet object.
        """
        params = locals()
        del (params['self'])
        self.logger.info('Get tweet: %s', params)
        if tweet_id is None:
            raise ValueError('Tweet ID is required')
        return self._get(self._url('statuses/show.json'), params)
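
A sketch that combines the spider with the TwitterDownloader from the first example; the bearer token, screen name and download path are placeholders, not values from the source.

spider = TwitterSpider(token='Bearer <your token>')
downloader = TwitterDownloader(path=StoreByUserName('./download'))
for tweet in spider.crawl_timeline(screen_name='example', delay=1.0):
    downloader.download(tweet)   # saves every media item attached to the tweet
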
Example #6
class IMDBSpider:
    def __init__(self):
        self.base_url = 'https://www.imdb.com/'
        self.session = Session(retry=5)
        self.pattern = {
            'title_and_year': re.compile(r'(.+?)\s+\(([0-9]+)\)'),
            'number': re.compile(r'[\d,]+'),
            'space': re.compile(r'\s+')
        }
        self.log = Log.create_logger('IMDBSpider', './imdb.log')

    def __url(self, url):
        return urlparse.urljoin(self.base_url, url)

    def __text(self, text, repl=' '):
        return self.pattern['space'].sub(repl, text).strip()

    @staticmethod
    def __str2int(string: str):
        return int(string.replace(',', ''))

    @staticmethod
    def __percent2float(string: str):
        return float(string.strip('%')) / 100

    def top250(self, start_from=None):
        r = self.session.get(
            urlparse.urljoin(self.base_url, 'chart/top?ref_=nv_mv_250'))
        s = Soup(r.text, 'lxml')
        table = s.find('table')
        tbody = table.find('tbody')
        tr_list = tbody.find_all('tr')
        start = start_from is None
        for tr in tr_list:
            title_col = tr.find('td', class_='titleColumn')
            a = title_col.find('a')
            title = a.text
            link = a.attrs['href']
            if not start:
                if link == start_from:
                    start = True
            if start:
                yield title, link

    def crawl(self, link):
        (title, year, rating, num_rating, short_summary, metascore, review_user,
         review_critic, num_awards, num_video, num_image, story_line, tag_line,
         mpaa, genre, details, num_review) = self._main(link)
        summary, synopsis = self._plot(link)
        keywords = self._keyword(link)
        awards = self._awards(link)
        casts = self._cast(link)
        spec = self._tech_spec(link)
        trivia = self._trivia(link)
        quotes = self._quotes(link)
        goofs = self._goofs(link)
        connections = self._connections(link)
        faq = self._faq(link)
        rating_detail = self._user_rating(link)
        companies = self._company_credits(link)
        return {
            key: value
            for key, value in locals().items()
            if key[0] != '_' and key != 'self'
        }

    def _main(self, link):
        r = self.session.get(self.__url(link))
        s = Soup(r.text, 'lxml')

        # Title and year
        title_wrapper = s.find('div', class_='title_wrapper')
        h1 = title_wrapper.find('h1')
        title_and_year = h1.text
        res = self.pattern['title_and_year'].findall(title_and_year)
        title = res[0][0]
        year = self.__str2int(res[0][1])

        # ratings
        rating_wrapper = s.find('div', class_='ratings_wrapper')
        rating_value = rating_wrapper.find('div', class_='ratingValue')
        strong = rating_value.find('strong')
        span = strong.find('span')
        rating = float(self.__text(span.text))
        a = rating_value.find_next('a')
        num_rating = self.__str2int(self.__text(a.text))

        # Short summary
        plot_summary_wrapper = s.find('div', class_='plot_summary_wrapper')
        summary_text = plot_summary_wrapper.find('div', class_='summary_text')
        short_summary = summary_text.text.strip() if summary_text else None

        # metascore
        title_review_bar = plot_summary_wrapper.find('div',
                                                     class_='titleReviewBar')
        if title_review_bar is not None:
            metascore_div = title_review_bar.find('div',
                                                  class_='metacriticScore')
            if metascore_div is not None:
                span = metascore_div.find('span')
                metascore = self.__str2int(span.text)
            else:
                metascore = None
        else:
            metascore = None

        # num of review user and critic
        if title_review_bar is not None:
            reviews_div = title_review_bar.find(
                'div', class_='titleReviewBarItem titleReviewbarItemBorder')
            a_list = reviews_div.find_all('a')
            review_user, review_critic = [
                self.__str2int(self.pattern['number'].search(a.text).group())
                for a in a_list
            ]
        else:
            review_user, review_critic = None, None

        # num of awards
        title_awards_ranks = s.find('div', id='titleAwardsRanks')
        num_awards = []
        strong = title_awards_ranks.find('strong')
        if strong is not None:
            num_awards.append(strong.text.strip())
        span_list = title_awards_ranks.find_all('span', class_='awards-blurb')
        for span in span_list:
            num_awards.append(self.__text(span.text, ' '))

        # num of videos
        title_video_strip = s.find('div', id='titleVideoStrip')
        if title_video_strip is not None:
            see_more = title_video_strip.find(
                'div', class_='combined-see-more see-more')
            a = see_more.find('a')
            num_video = int(self.pattern['number'].search(a.text).group())
        else:
            num_video = 0

        # num of images
        title_image_strip = s.find('div', id='titleImageStrip')
        if title_image_strip is not None:
            see_more = title_image_strip.find(
                'div', class_='combined-see-more see-more')
            if see_more is not None:
                a = see_more.find_all('a')[1]
                num_image = int(self.pattern['number'].search(a.text).group())
            else:
                num_image = 0
        else:
            num_image = 0

        # short story line
        title_story_line = s.find('div', id='titleStoryLine')
        div = title_story_line.find('div', class_='inline canwrap')
        span = div.find('span')
        story_line = span.text.strip()

        # tagline and mpaa
        txt_block_list = title_story_line.find_all('div', class_='txt-block')
        tag_line_div = txt_block_list[0]
        tag_line = self.__text(tag_line_div.contents[2])
        mpaa_div = txt_block_list[1]
        span = mpaa_div.find('span')
        mpaa = self.__text(span.text, ' ')

        # genre
        see_more_list = title_story_line.find_all(
            'div', class_='see-more inline canwrap')
        genre_div = see_more_list[1]
        genre = list(self.__text(a.text, '') for a in genre_div.find_all('a'))

        # details
        title_details = s.find('div', id='titleDetails')
        details = {}
        for txt_block in title_details.find_all('div', class_='txt-block'):
            h4 = txt_block.find('h4', class_='inline')
            if h4 is not None:
                text = self.__text(txt_block.text)
                if text.find('See more') > 0:
                    text = text[text.find(':') + 1:text.find('See more')].strip()
                else:
                    text = text[text.find(':') + 1:].strip()
                details[self.__text(h4.text)] = text

        title_user_review = s.find('div', id='titleUserReviewsTeaser')
        div = title_user_review.find('div', class_='yn')
        num_review = self.__str2int(self.pattern['number'].search(
            div.find_next('a').find_next('a').find_next('a').text).group())

        return (title, year, rating, num_rating, short_summary, metascore, review_user,
                review_critic, num_awards, num_video, num_image, story_line, tag_line,
                mpaa, genre, details, num_review)

    def _plot(self, link):
        r = self.session.get(urlparse.urljoin(self.__url(link), 'plotsummary'))
        s = Soup(r.text, 'lxml')

        # summary
        h4 = s.find('h4', id='summaries')
        ul = h4.find_next('ul')
        summary = [li.find('p').text for li in ul.find_all('li')]

        # synopsis
        h4 = s.find('h4', id='synopsis')
        ul = h4.find_next('ul')
        synopsis = [li.text for li in ul.find_all('li')]

        return summary, synopsis

    def _keyword(self, link):
        r = self.session.get(urlparse.urljoin(self.__url(link), 'keywords'))
        s = Soup(r.text, 'lxml')
        div = s.find('div', id='keywords_content')
        table = div.find('table')
        tbody = table.find('tbody')
        keywords = []
        for td in tbody.find_all('td'):
            if td is not None:
                a = td.find('a')
                if a is not None:
                    keywords.append(a.text)
        return keywords

    def _awards(self, link):
        r = self.session.get(urlparse.urljoin(self.__url(link), 'awards'))
        s = Soup(r.text, 'lxml')
        main = s.find('div', id='main')
        awards = []
        for h3 in main.find_all('h3')[1:]:
            title = self.__text(h3.next)
            year = self.__text(h3.find('a').text)
            awards.append({'title': title, 'year': year})
        return awards

    def _cast(self, link):
        r = self.session.get(urlparse.urljoin(self.__url(link), 'fullcredits'))
        s = Soup(r.text, 'lxml')
        div = s.find('div', id='fullcredits_content')
        casts = {}
        for h4 in div.find_all('h4'):
            category = self.__text(h4.text)
            table = h4.find_next('table')
            cast_list = []
            if 'class' in table.attrs and 'cast_list' in table.attrs['class']:
                for tr in table.find_all('tr'):
                    if 'class' in tr.attrs:
                        a = tr.find_next('td').find_next('td').find('a')
                        name = self.__text(a.text)
                        td = tr.find('td', class_='character')
                        credit = self.__text(td.text)
                        cast_list.append({'name': name, 'credit': credit})
            else:
                tbody = table.find('tbody')
                for tr in tbody.find_all('tr'):
                    td = tr.find('td', class_='name')
                    name = self.__text(td.text) if td is not None else None
                    td = tr.find('td', class_='credit')
                    credit = self.__text(td.text) if td is not None else None
                    cast_list.append({'name': name, 'credit': credit})
            casts[category] = cast_list
        return casts

    def _tech_spec(self, link):
        r = self.session.get(urlparse.urljoin(self.__url(link), 'technical'))
        s = Soup(r.text, 'lxml')
        div = s.find('div', id='technical_content')
        table = div.find('table')
        tbody = table.find('tbody')
        spec = {}
        for tr in tbody.find_all('tr'):
            td = tr.find('td')
            label = self.__text(td.text)
            td = td.find_next('td')
            value = self.__text(td.text)
            spec[label] = value
        return spec

    def _trivia(self, link):
        r = self.session.get(urlparse.urljoin(self.__url(link), 'trivia'))
        s = Soup(r.text, 'lxml')
        div = s.find('div', id='trivia_content')
        trivia = []
        for text_list in div.find_all('div', class_='list'):
            for soda_text in text_list.find_all('div', class_='sodatext'):
                trivia.append(self.__text(soda_text.text))
        return trivia

    def _quotes(self, link):
        r = self.session.get(urlparse.urljoin(self.__url(link), 'quotes'))
        s = Soup(r.text, 'lxml')
        quote = []
        div = s.find('div', id='quotes_content')
        for quote_list in div.find_all('div', class_='list'):
            for soda_text in quote_list.find_all('div', class_='sodatext'):
                quote.append(self.__text(soda_text.text))
        return quote

    def _goofs(self, link):
        r = self.session.get(urlparse.urljoin(self.__url(link), 'goofs'))
        s = Soup(r.text, 'lxml')
        goofs = []
        div = s.find('div', id='goofs_content')
        for soda_text in div.find_all('div', class_='sodatext'):
            goofs.append(self.__text(soda_text.text))
        return goofs

    def _connections(self, link):
        r = self.session.get(
            urlparse.urljoin(self.__url(link), 'movieconnections'))
        s = Soup(r.text, 'lxml')
        div = s.find('div', id='connections_content')
        connections = {}
        category = None
        for soda_list in div.find_all('div', class_='list'):
            if soda_list.attrs.get('id') == 'no_content':
                return None
            for soda in soda_list.find_all('div', class_='soda'):
                last = soda.find_previous()
                if last.name == 'h4':
                    category = self.__text(last.text)
                    if category not in connections:
                        connections[category] = []
                connections[category].append(self.__text(soda.text))
        return connections

    def _faq(self, link):
        r = self.session.get(urlparse.urljoin(self.__url(link), 'faq'))
        s = Soup(r.text, 'lxml')
        div = s.find('div', id='main')
        faq = []
        IDs = ['faq-no-spoilers', 'faq-spoilers']
        for ID in IDs:
            section = div.find('section', id=ID)
            ul = section.find('ul')
            for li in ul.find_all('li'):
                question_div = li.find('div', class_='faq-question-text')
                if question_div is None:
                    continue
                question = self.__text(question_div.text)
                answer_div = li.find('div', class_='ipl-hideable-container')
                if answer_div is None:
                    continue
                p = answer_div.find('p')
                answer = self.__text(p.text)
                faq.append({'question': question, 'answer': answer})
        return faq

    def _user_rating(self, link):
        r = self.session.get(urlparse.urljoin(self.__url(link), 'ratings'))
        s = Soup(r.text, 'lxml')
        div = s.find('div', id='main')
        table = div.find('table')
        rating = {str(i): {} for i in range(1, 11)}
        for idx, div in enumerate(table.find_all('div', class_='topAligned')):
            rating[str(10 - idx)]['percent'] = self.__percent2float(
                self.__text(div.text))
        for idx, div in enumerate(
                table.find_all('div', class_='leftAligned')[1:]):
            rating[str(10 - idx)]['count'] = self.__str2int(
                self.__text(div.text))
        return rating

    def _company_credits(self, link):
        r = self.session.get(
            urlparse.urljoin(self.__url(link), 'companycredits'))
        s = Soup(r.text, 'lxml')
        div = s.find('div', id='company_credits_content')
        companies = {}
        for h4 in div.find_all('h4', class_='dataHeaderWithBorder'):
            category = self.__text(h4.text)
            credit = []
            for li in h4.find_next('ul').find_all('li'):
                credit.append(self.__text(li.text))
            companies[category] = credit
        return companies
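
A plausible way to drive the scraper above: walk the Top 250 chart and crawl every sub-page of each title; pass start_from with the last processed link to resume an interrupted run.

spider = IMDBSpider()
for title, link in spider.top250():
    record = spider.crawl(link)        # dict with title, year, rating, casts, trivia, ...
    print(title, record['rating'])
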
Example #7
class JuheSpider:

    def __init__(self):
        self.user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        self.session = Session(retry=10, timeout=20)

    def crawl_api(self, total_page=6, start_from=None):
        url = 'https://www.juhe.cn/docs/index/page/{0}'
        start = start_from is None
        for page in range(total_page + 1):
            r = self.session.get(url.format(page))
            s = Soup(r.text, 'lxml')
            ul = s.find('ul', class_='api-list-ul')
            for li in ul.find_all('li', class_='api-list-li'):
                a = li.find('a')
                link = a['href']
                if not start:
                    if link == start_from:
                        start = True
                    continue
                # h2 = a.find('h2', class_='api-name')
                # title = h2.text
                # div = li.find('div', class_='api-price')
                # price = div.text if div is not None else None
                # p = li.find('p', class_='api-marks')
                # desc = p.text if p is not None else None
                yield link, self._crawl_api_item(link)

    def _crawl_api_item(self, link):

        url = 'https://www.juhe.cn{0}'.format(link)
        num = link.split('/')[-1]
        r = self.session.get(url)
        s = Soup(r.text, 'lxml')
        ul = s.find('ul', class_='api-pp')
        temp = []
        for li in ul.find_all('li'):
            div = li.find('div')
            temp.append(div.text)
        api_item = {
            'ID': temp[0],
            'API请求次数': temp[1],
            '收藏量': temp[2]
        }

        api_infos = s.find('div', class_='api-infos')
        h1 = api_infos.find('h1')
        api_item['标题'] = h1.text if h1 is not None else None
        tags = api_infos.find_all('span')
        api_item['标签列表'] = list(span.text for span in tags) if tags is not None else None

        api_des_info = api_infos.find('p', class_='api-des-info')
        desc = api_des_info.text
        api_item['简介'] = desc
        product_content = s.find('div', class_='product-content')
        if product_content is not None:
            product_aways = product_content.find('div', class_='product-aways')
            api_item['功能介绍'] = product_aways.text

        api_url_list = []
        doc_api_area = s.find('div', id='docs-api-area')
        ul = doc_api_area.find('ul')
        li_list = ul.find_all('li')
        for li in li_list:
            a = li.find('a')
            api_title = a.text
            api_url = a['href']
            api_url_list.append({'title': api_title, 'url': api_url + '/' + num})

        price_url = 'https://www.juhe.cn/docs/api/packages/{0}'.format(num)
        r = self.session.get(price_url)
        result = json.loads(r.text)
        html = result['result']['html']
        s = Soup(html, 'lxml')
        ul = s.find('ul')
        li_list = ul.find_all('li')
        prices = []
        for li in li_list:
            price = li['data-price'] if 'data-price' in li.attrs else None
            tag = re.sub(r'\s+', '', li.text)
            if price is not None or tag is not None:
                prices.append({'price': price, 'tag': tag})
        api_item['价格'] = prices

        api_list = []
        for api_url in api_url_list:
            api = {
                'title': api_url['title']
            }
            api_url = 'https://www.juhe.cn{0}'.format(api_url['url'])
            r = self.session.get(api_url)
            result = json.loads(r.text)['result']
            s = Soup(result['html'], 'lxml')
            div_list = s.find_all('div', class_='simpleline')
            for i in range(len(div_list) - 1):
                div = div_list[i]
                tag_and_content = div.text.split(':', 1)
                if len(tag_and_content) > 1:
                    api[tag_and_content[0]] = tag_and_content[1].strip().strip('\n')

            idx = 0
            div_list = s.find_all('div', class_='simpleTable')

            div = div_list[idx]
            p = div.find('p')
            tag = p.text.strip(':')
            if tag == '请求参数说明':
                params = []
                table = div.find('table')
                tr_list = table.find_all('tr')
                header_tr = tr_list[0]
                headers = []
                for th in header_tr.find_all('th')[1:]:
                    headers.append(th.text)
                for tr in tr_list[1:]:
                    param = {}
                    for index, td in enumerate(tr.find_all('td')[1:]):
                        param[headers[index]] = td.text
                    params.append(param)
                api[tag] = params

                idx = min(idx + 1, len(div_list) - 1)

            codes = []
            div = div_list[idx]
            p = div.find('p')
            tag = p.text.strip(':')

            if tag == '请求代码示例':
                for code in div.find_all('code'):
                    language = code['class'][0]
                    content = code.text
                    codes.append({'language': language, 'code': content})
                api[tag] = codes

                idx = min(idx + 1, len(div_list) - 1)

            return_param = []
            div = div_list[idx]
            p = div.find('p')
            tag = p.text.strip(':')
            if tag == '返回参数说明':
                table = div.find('table')
                tr_list = table.find_all('tr')
                header_tr = tr_list[0]
                headers = []
                for th in header_tr.find_all('th')[1:]:
                    headers.append(th.text)
                for tr in tr_list[1:]:
                    param = {}
                    for index, td in enumerate(tr.find_all('td')[1:]):
                        param[headers[index]] = td.text
                    return_param.append(param)
                api[tag] = return_param

                idx = min(idx + 1, len(div_list) - 1)

            div = div_list[idx]
            p = div.find('p')
            tag = p.text.strip(':')
            if tag == 'JSON返回示例':
                return_example = div.find('code').text
                api[tag] = return_example

            api_list.append(api)
        api_item['API'] = api_list

        error_code_url = 'https://www.juhe.cn/docs/api/errorCode/{}'.format(num)
        r = self.session.get(error_code_url)
        result = json.loads(r.text)
        api_item['错误码'] = result['result'] if 'result' in result else None

        return api_item

    def crawl_data(self, start_from=None):
        url = 'https://www.juhe.cn/market'
        start = start_from is None
        r = self.session.get(url)
        s = Soup(r.text, 'lxml')
        ul = s.find('ul', class_='api-list-ul')
        for li in ul.find_all('li', class_='api-list-li'):
            a = li.find('a')
            link = a['href']
            if not start:
                if link == start_from:
                    start = True
                continue
            # h2 = a.find('h2', class_='api-name')
            # title = h2.text
            yield link, self._crawl_data_item(link)

    def _crawl_data_item(self, link):
        r = self.session.get(link)
        s = Soup(r.text, 'lxml')
        block_main_info = s.find('div', class_='block-main-info')

        data_item = {'标题': block_main_info.find('h1').text,
                     '简介': block_main_info.find('p', class_='block-main-desc').text}

        block_main_detail = block_main_info.find('div', class_='block-main-detail')
        dl_list = block_main_detail.find_all('dl')
        for dl in dl_list:
            dt = dl.find('dt')
            dd = dl.find('dd')
            data_item[dt.text] = dd.text

        ul = s.find('ul', class_='block-main-list')
        temp = []
        for li in ul.find_all('li'):
            span = li.find('span')
            temp.append(span.text)
        data_item.update({
            'ID': temp[0],
            '下载次数': temp[1],
            '收藏量': temp[2]
        })

        table = s.find('table', class_='block-table')
        tr_list = table.find_all('tr')
        header_tr = tr_list[0]
        headers = []
        for td in header_tr.find_all('td')[1:]:
            headers.append(td.text)
        data_list = []
        for tr in tr_list[1:]:
            data = {}
            for idx, td in enumerate(tr.find_all('td')[1:]):
                data[headers[idx]] = td.text
            data_list.append(data)
        data_item['数据列表'] = data_list
        return data_item
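
A minimal usage sketch for the juhe.cn spider above. The class name JuheSpider is hypothetical (the actual name is defined at the top of this example and may differ); the sketch only assumes that crawl_data yields (link, data_item) pairs as shown in the code above.

import json

# Hypothetical class name; substitute the spider class defined earlier in this example.
spider = JuheSpider()
for link, data_item in spider.crawl_data(start_from=None):
    # Each yielded item is a dict keyed by the Chinese field names parsed above.
    print(link)
    print(json.dumps(data_item, ensure_ascii=False, indent=2))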
示例#8
0
class GBDEXSpider:

    def __init__(self):
        self.session = Session(timeout=10)

    def crawl_api(self):
        param = {
            'parentIndustryId': '',
            'industryId': '',
            'cityId': '',
            'areaId': '',
            'dataType': 0,
            'sort': 'updateTime',
            'order': 'desc',
            'keyword': '',
            'page': 1,
            'limit': 1000
        }
        url = 'http://trade.gbdex.com/trade.web/accurateData/filterDataByAjax'
        r = self.session.post(url, params=param)
        json_data = json.loads(r.text)
        link_list = json_data.get('rows', [])
        print('GBDEX - API: {0} items total in list'.format(len(link_list)))
        tqdm_list = tqdm(link_list)
        for link in tqdm_list:
            tqdm_list.set_description('Crawling: {}'.format(link['id']))
            try:
                yield self._crawl_api_item(link['id'])
            except Timeout:
                continue

    def _crawl_api_item(self, product_id):
        url = 'http://trade.gbdex.com/trade.web/api/apishow'
        param = {'productID': product_id}
        r = self.session.get(url, params=param)
        s = Soup(r.text, 'lxml')
        api = dict()
        (api['标题'], api['简介'], api['价格'], api['数据类型'],
         api['覆盖区域'], api['点击量'], api['收藏量']) = self._parse_brief(s)
        api['API'] = self._parse_list(s)
        api['代码示例'] = self._parse_code_demo(s)
        api['id'] = product_id
        return api

    @staticmethod
    def _parse_brief(s):
        detail_info_r = s.find('div', class_='detailInfo-R')
        datum_title = detail_info_r.find('h3', class_='datumTitle')
        title = datum_title.text if datum_title is not None else None
        data_content_hidden = detail_info_r.find('p', id='dataContentHidden')
        brief = data_content_hidden.text if data_content_hidden is not None else None
        product_price = detail_info_r.find('span', id='productPrice')
        price = re.sub(r'\s+', '', product_price.text) if product_price is not None else None
        product_profile = detail_info_r.find('ul', class_='product-profile clearfix')
        if product_profile is not None:
            li_list = product_profile.find_all('li')
            data_type = li_list[1].text.split(':')[1]
            area = li_list[2].text.split(':')[1]
        else:
            combo_set = detail_info_r.find('div', class_='combo-set comboLeft')
            p_list = combo_set.find_all('p')
            data_type = p_list[0].text.split(':')[1]
            area = p_list[2].text.split(':')[1]
        detail_info_l = s.find('div', class_='detailInfo-l')
        target = detail_info_l.find('div', class_='target')
        span_list = target.find_all('span')
        click_count = int(re.sub(r'\s+', '', span_list[0].text))
        fav_count = int(re.sub(r'\s+', '', span_list[2].text))

        return title, brief, price, data_type, area, click_count, fav_count

    @staticmethod
    def _parse_list(s):
        api_title = []
        div = s.find('div', id='definite0')
        section = div.find('section', class_='clearfix base-boder')
        base_info = section.find('div', class_='baseInfo')
        ul = base_info.find(id='navUl')
        for li in ul.find_all('li'):
            api_title.append(li['title'])

        api_list = []

        for idx, page in enumerate(s.find_all('div', class_='baseContent fl')):
            api = {}
            table_list = page.find_all('table')
            if len(table_list) <= 0:
                continue
            table = table_list[0]
            tr_list = table.find_all('tr')
            values = []
            for tr in tr_list:
                td = tr.find_all('td')[1]
                values.append(re.sub(r'\s+', '', td.text))
            (api['接口地址'], api['请求类型'], api['返回数据格式'],
             api['数据总量'], api['是否收费']) = values[0], values[1], values[2], values[3], values[4]
            table = table_list[1]
            t_body = table.find('tbody')
            tr_list = t_body.find_all('tr')
            api['输入字段'] = []
            for tr in tr_list:
                td_list = tr.find_all('td')
                input_data = {'描述': td_list[0].text, '参数名称': td_list[1].text, '数据类型': td_list[2].text,
                              '默认值': td_list[3].text, '是否必填': td_list[4].text}
                api['输入字段'].append(input_data)

            table = table_list[2]
            t_body = table.find('tbody')
            tr_list = t_body.find_all('tr')
            api['返回字段'] = []
            for tr in tr_list:
                td_list = tr.find_all('td')
                return_data = {'描述': td_list[0].text, '参数名称': td_list[1].text, '数据类型': td_list[2].text,
                               '默认值': td_list[3].text, '是否必填': td_list[4].text}
                api['返回字段'].append(return_data)

            api['API名称'] = api_title[idx]
            api_list.append(api)

        return api_list

    @staticmethod
    def _parse_code_demo(s):
        code_demo = {}
        div = s.find('div', id='definite2')
        ul = div.find('ul', id='tab-dm')
        if ul is None:
            return None
        li_list = ul.find_all('li')
        if not li_list:
            return None
        code_type_list = list(li.text for li in li_list)
        sample_div = div.find('div', class_='definiteContent sample')
        if sample_div is None:
            return None
        code_span_list = sample_div.find_all('span')
        if not code_span_list:
            return None
        if len(code_type_list) != len(code_span_list):
            print('Count of type ({0}) is not equal to count of code span ({1})'.format(len(code_type_list),
                                                                                        len(code_span_list)))
        for code_type, code in zip(code_type_list, code_span_list):
            code_demo[code_type] = code.text
        return code_demo

    def crawl_data_file(self):
        param = {
            'parentIndustryId': '',
            'industryId': '',
            'cityId': '',
            'areaId': '',
            'dataType': 1,
            'sort': 'updateTime',
            'order': 'desc',
            'keyword': '',
            'page': 1,
            'limit': 3000
        }
        url = 'http://trade.gbdex.com/trade.web/accurateData/filterDataByAjax'
        r = self.session.post(url, params=param)
        json_data = json.loads(r.text)
        link_list = json_data.get('rows', [])
        print('GBDEX - Data File: {0} items total in list'.format(len(link_list)))
        tqdm_list = tqdm(link_list)
        for link in tqdm_list:
            tqdm_list.set_description('Crawling: {}'.format(link['id']))
            try:
                yield self._crawl_data_file_item(link['id'])
            except Timeout:
                continue

    def _crawl_data_file_item(self, product_id):
        url = 'http://trade.gbdex.com/trade.web/dataReport/reportPayForProduct'
        param = {'productID': product_id}
        r = self.session.get(url, params=param)
        s = Soup(r.text, 'lxml')
        item = dict()
        (item['标题'], item['简介'], item['价格'], item['数据类型'],
         item['覆盖区域'], item['点击量'], item['收藏量']) = self._parse_brief(s)
        item['详细信息'] = self._parse_file_info(s)
        item['ID'] = product_id
        return item

    @staticmethod
    def _parse_file_info(s):
        table = s.find('table')
        info = {}
        for tr in table.find_all('tr'):
            td_list = tr.find_all('td')
            info[td_list[0].text] = re.sub(r'\s+', '', td_list[1].text)
        return info
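
A short usage sketch for GBDEXSpider. It assumes both generators can be iterated directly, that network access to trade.gbdex.com is available, and that the keys shown above ('id', 'ID', '标题') exist on every yielded dict; persistence is intentionally left out.

spider = GBDEXSpider()

# API products (dataType=0): each item is the dict built by _crawl_api_item.
for api_item in spider.crawl_api():
    print(api_item['id'], api_item['标题'])

# Data-file products (dataType=1): each item is the dict built by _crawl_data_file_item.
for file_item in spider.crawl_data_file():
    print(file_item['ID'], file_item['标题'])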