示例#1
0
class ArtistSpider(scrapy.Spider):
    """Crawl the SoundCloud profile page of every id listed in ``target.txt``.

    For each target id the spider requests ``https://soundcloud.com/<id>``,
    reads the user record embedded in the last ``<script>`` tag of the page,
    stores it with ``self.dbhandler.insert_user`` and then downloads the
    avatar and banner images, which are uploaded via
    ``self.gcphandler.upload_file`` and recorded on the user row.
    """

    name = 'artist'
    start_urls = ['https://soundcloud.com/']

    def __init__(self, *args, **kwargs):
        """Load configuration, target ids and the DB / storage handlers.

        ``*args`` / ``**kwargs`` are forwarded to ``scrapy.Spider`` so that
        spider arguments (``scrapy crawl artist -a key=value``) keep working
        and the base-class initialisation is not skipped.
        """
        super().__init__(*args, **kwargs)
        self.config = util.load_config()
        util.register_gcp_credential(self.config)
        self.target_ids = util.load_target_ids('target.txt')
        self.dbhandler = DBHandler(self.config)
        self.gcphandler = GCPHandler(self.config)

    def parse(self, response):
        """Schedule one profile-page request per configured target id."""
        for target_id in self.target_ids:
            yield scrapy.Request(f'https://soundcloud.com/{target_id}',
                                 callback=self.parse_userpage)

    def parse_userpage(self, response):
        """Extract the user record from a profile page and store it.

        Yields follow-up requests for the avatar and the banner image when
        the record provides a link for them.
        """
        soup = BeautifulSoup(response.body, 'lxml')
        # The user record is the tail of the last <script> tag: everything
        # after the final '"data":[' marker, minus the closing JS/HTML.
        script = str(soup.find_all('script')[-1])
        payload = script.split('"data":[')[-1].replace(']}]);</script>', '')
        user_info = json.loads(payload)

        avatar_url = user_info['avatar_url']
        # Swap the size token in the avatar URL to request the 500x500 image.
        profile_link = (avatar_url.replace('large', 't500x500')
                        if avatar_url else '')

        # 'visuals' may be absent, null, or hold an empty list; treat all of
        # those as "no banner" instead of raising KeyError / IndexError.
        visuals = user_info.get('visuals')
        visual_entries = visuals.get('visuals') if visuals else None
        banner_link = (visual_entries[0]['visual_url'] or ''
                       if visual_entries else '')

        user_json = {
            "user_id": user_info['permalink'],
            "user_sid": user_info['id'],
            "user_name": user_info['username'] or '',
            "user_full_name": user_info['full_name'] or '',
            "user_description": user_info['description'] or '',
            "user_country": user_info['country_code'] or '',
            "user_city": user_info['city'] or '',
            "user_type": 0,
        }
        self.dbhandler.insert_user(user_json)

        if profile_link:
            yield scrapy.Request(profile_link,
                                 callback=self.parse_profile_img,
                                 meta={'user_json': user_json})
        if banner_link:
            yield scrapy.Request(banner_link,
                                 callback=self.parse_banner_img,
                                 meta={'user_json': user_json})

    @staticmethod
    def _jpeg_ready(image):
        """Return *image* in a mode the JPEG writer accepts.

        Images with an alpha channel or a palette (typical for PNG / GIF
        uploads) cannot be written as JPEG, so they are converted to RGB.
        """
        if image.mode not in ('RGB', 'L'):
            return image.convert('RGB')
        return image

    @staticmethod
    def _remove_files(*paths):
        """Delete every path in *paths* that exists."""
        for path in paths:
            if os.path.exists(path):
                os.remove(path)

    def parse_profile_img(self, response):
        """Store the downloaded avatar and a 128x128 thumbnail of it.

        Both files are written to ``./tmp``, uploaded, recorded on the user
        row via ``update_user_profile`` and removed again, even when the
        upload or the DB update raises.
        """
        user_json = response.meta['user_json']
        user_id = user_json["user_id"]
        org_path = f'./tmp/{user_id}_profile_500x500.jpg'
        thumbnail_path = f'./tmp/{user_id}_profile_128x128.jpg'

        os.makedirs('./tmp', exist_ok=True)
        try:
            image = self._jpeg_ready(Image.open(io.BytesIO(response.body)))
            image.save(org_path)
            image.resize((128, 128)).save(thumbnail_path)

            # upload_file's return value is stored as the image URL
            # (presumably a public URL; see GCPHandler).
            profile_url = self.gcphandler.upload_file(
                org_path, f'users/profiles/org/{user_id}.jpg')
            profile_thumbnail_url = self.gcphandler.upload_file(
                thumbnail_path, f'users/profiles/thumbnail/{user_id}.jpg')

            user_json["user_profile_org"] = profile_url
            user_json["user_profile_thumbnail"] = profile_thumbnail_url
            self.dbhandler.update_user_profile(user_json)
        finally:
            self._remove_files(org_path, thumbnail_path)

    def parse_banner_img(self, response):
        """Store the downloaded banner image.

        The file is written to ``./tmp``, uploaded, recorded on the user row
        via ``update_user_banner`` and removed again, even on failure.
        """
        user_json = response.meta['user_json']
        user_id = user_json["user_id"]
        banner_path = f'./tmp/{user_id}_banner.jpg'

        os.makedirs('./tmp', exist_ok=True)
        try:
            image = self._jpeg_ready(Image.open(io.BytesIO(response.body)))
            image.save(banner_path)
            banner_url = self.gcphandler.upload_file(
                banner_path, f'users/banners/{user_id}.jpg')
            user_json["user_banner"] = banner_url
            self.dbhandler.update_user_banner(user_json)
        finally:
            self._remove_files(banner_path)
示例#2
0
class CommentsSpider(scrapy.Spider):
    """Crawl the comments of every stored track of the configured targets.

    For each target id in ``target.txt`` the track ids are read with
    ``self.dbhandler.select_track_ids`` and the SoundCloud ``api-v2``
    comments endpoint is paged through (20 comments per page). Every comment
    and its author are stored through ``self.dbhandler``; author avatars are
    downloaded and uploaded via ``self.gcphandler.upload_file``.
    """

    name = 'comment'
    start_urls = ['https://soundcloud.com/']

    # Highest zero-based page index that is processed per track, i.e. at
    # most MAX_PAGE_INDEX + 1 pages of comments are stored.
    MAX_PAGE_INDEX = 5

    def __init__(self, *args, **kwargs):
        """Load configuration, target ids and the DB / storage handlers.

        ``*args`` / ``**kwargs`` are forwarded to ``scrapy.Spider`` so that
        spider arguments (``scrapy crawl comment -a key=value``) keep working
        and the base-class initialisation is not skipped.
        """
        super().__init__(*args, **kwargs)
        self.config = util.load_config()
        util.register_gcp_credential(self.config)
        self.target_ids = util.load_target_ids('target.txt')
        self.dbhandler = DBHandler(self.config)
        self.gcphandler = GCPHandler(self.config)
        # NOTE: this silences every warning process-wide, not only those
        # raised by this spider.
        warnings.filterwarnings(action='ignore')

    def parse(self, response):
        """Schedule the first comments page for every track of every target."""
        for target_id in self.target_ids:
            for track_info in self.dbhandler.select_track_ids(target_id):
                # The track id is the first column of each returned row.
                track_id = track_info[0]
                url = (
                    f'https://api-v2.soundcloud.com/tracks/{track_id}/comments'
                    f'?filter_replies=0&threaded=1'
                    f'&client_id={self.config["CLIENT_ID"]}'
                    f'&offset=0&limit=20'
                    f'&app_version=1595511948&app_locale=en'
                )
                yield scrapy.Request(
                    url,
                    callback=self.parse_comment,
                    meta={'cnt': 0, 'user_id': target_id,
                          'track_id': track_id},
                )

    def parse_comment(self, response):
        """Store one page of comments and their authors, then follow paging.

        ``response.meta`` carries ``cnt`` (zero-based page index),
        ``user_id`` (the target the track belongs to) and ``track_id``.
        Yields avatar requests for the comment authors and, while the page
        limit is not reached, a request for the next page.
        """
        page_index = response.meta['cnt']
        if page_index > self.MAX_PAGE_INDEX:
            return
        user_id = response.meta['user_id']
        track_id = response.meta['track_id']

        payload = json.loads(response.body)
        collections = payload['collection']
        next_href = payload['next_href']

        comments = []
        for collection in collections:
            commenter = collection['user']
            comments.append({
                "created_at": self.trim_created_at(collection['created_at']),
                "comment_user_id": user_id,
                "comment_uploader_id": commenter['permalink'],
                "comment_track_id": track_id,
                "comment_body": collection['body'] or '',
            })

            avatar_url = commenter['avatar_url']
            # Swap the size token in the avatar URL to request 500x500.
            profile_link = (avatar_url.replace('large', 't500x500')
                            if avatar_url else '')
            user_json = {
                "user_id": commenter['permalink'],
                "user_sid": commenter['id'],
                "user_name": commenter['username'] or '',
                "user_full_name": commenter['full_name'] or '',
                "user_description": '',
                "user_country": commenter['country_code'] or '',
                "user_city": commenter['city'] or '',
                "user_type": 1,
            }
            self.dbhandler.insert_user(user_json)
            if profile_link:
                yield scrapy.Request(profile_link,
                                     callback=self.parse_profile_img,
                                     meta={'user_json': user_json})

        self.dbhandler.insert_comments(comments)

        # Only schedule a page that will actually be processed; a request
        # beyond the limit would be downloaded just to be thrown away.
        if next_href and page_index < self.MAX_PAGE_INDEX:
            yield scrapy.Request(
                url=next_href,
                callback=self.parse_comment,
                meta={'cnt': page_index + 1, 'user_id': user_id,
                      'track_id': track_id},
            )

    @staticmethod
    def _jpeg_ready(image):
        """Return *image* in a mode the JPEG writer accepts.

        Images with an alpha channel or a palette (typical for PNG / GIF
        uploads) cannot be written as JPEG, so they are converted to RGB.
        """
        if image.mode not in ('RGB', 'L'):
            return image.convert('RGB')
        return image

    @staticmethod
    def _remove_files(*paths):
        """Delete every path in *paths* that exists."""
        for path in paths:
            if os.path.exists(path):
                os.remove(path)

    def parse_profile_img(self, response):
        """Store the downloaded avatar and a 128x128 thumbnail of it.

        Both files are written to ``./tmp``, uploaded, recorded on the user
        row via ``update_user_profile`` and removed again, even when the
        upload or the DB update raises.
        """
        user_json = response.meta['user_json']
        user_id = user_json["user_id"]
        org_path = f'./tmp/{user_id}_profile_500x500.jpg'
        thumbnail_path = f'./tmp/{user_id}_profile_128x128.jpg'

        os.makedirs('./tmp', exist_ok=True)
        try:
            image = self._jpeg_ready(Image.open(io.BytesIO(response.body)))
            image.save(org_path)
            image.resize((128, 128)).save(thumbnail_path)

            # upload_file's return value is stored as the image URL
            # (presumably a public URL; see GCPHandler).
            profile_url = self.gcphandler.upload_file(
                org_path, f'users/profiles/org/{user_id}.jpg')
            profile_thumbnail_url = self.gcphandler.upload_file(
                thumbnail_path, f'users/profiles/thumbnail/{user_id}.jpg')

            user_json["user_profile_org"] = profile_url
            user_json["user_profile_thumbnail"] = profile_thumbnail_url
            self.dbhandler.update_user_profile(user_json)
        finally:
            self._remove_files(org_path, thumbnail_path)

    def trim_created_at(self, created_at_str):
        """Turn an ISO-style timestamp into ``YYYY-MM-DD HH:MM:SS`` form.

        Every ``T`` is replaced by a space and every ``Z`` is removed, e.g.
        ``'2020-07-23T12:34:56Z'`` becomes ``'2020-07-23 12:34:56'``.
        """
        return created_at_str.replace('T', ' ').replace('Z', '')