예제 #1
0
 def edge_followed_by(self, response: HtmlResponse, username, user_id,
                      config_path, config_hash, variables):
     """Handle one page of a paginated user-relation JSON response.

     The payload is read from ``data -> user -> <config_path>``. When its
     ``page_info`` reports another page, the ``end_cursor`` is stored in
     ``variables['after']`` (the caller's dict is updated in place) and a
     follow-up request to this same callback is yielded first. After that,
     one ``InstagramItem`` is yielded per entry in ``edges``.

     Args:
         response: Response whose ``text`` is the JSON payload.
         username: Passed through unchanged to the next page's callback.
         user_id: Stored on every item as ``user_id``.
         config_path: Key under ``data.user`` holding the relation data;
             also stored on every item as ``utype``.
         config_hash: Value placed in the ``query_hash`` URL parameter.
         variables: Query variables that are URL-encoded into the request.

     Yields:
         At most one follow-up request, then the items of this page.
     """
     payload = json.loads(response.text)
     relation_data = payload.get('data').get('user').get(config_path)
     paging = relation_data.get('page_info')

     if paging.get('has_next_page'):
         # The cursor tells the endpoint where the next page begins.
         variables['after'] = paging['end_cursor']
         next_url = f'{self.graphql_url}query_hash={config_hash}&{urlencode(variables)}'
         next_kwargs = {
             'username': username,
             'user_id': user_id,
             'config_path': deepcopy(config_path),
             'config_hash': deepcopy(config_hash),
             'variables': deepcopy(variables),
         }
         yield response.follow(next_url,
                               callback=self.edge_followed_by,
                               cb_kwargs=next_kwargs)

     for edge in relation_data.get('edges'):
         node = edge['node']
         yield InstagramItem(
             user_id=user_id,
             utype=config_path,
             uid=node['id'],
             username=node['username'],
             photo=node['profile_pic_url'],
         )
예제 #2
0
    def subscribers_parse(self, response, user_id, username, variables):
        """Handle one page of the ``edge_followed_by`` list of a user.

        If ``page_info['has_next_page']`` is truthy, ``variables['after']``
        is set to the page's ``end_cursor`` (the caller's dict is updated in
        place) and a request for the next page, handled by this same method,
        is yielded before any item. Every edge of the current page is then
        turned into an ``InstagramItem`` with ``subs_type='subscriber'``.

        Args:
            response: Response whose ``text`` is the JSON payload.
            user_id: Stored on every item as ``source_id``.
            username: Stored on every item as ``source_name``.
            variables: Query variables that are URL-encoded into the request.

        Yields:
            At most one follow-up request, then the items of this page.
        """
        payload = json.loads(response.text)
        followed_by = payload.get('data').get('user').get('edge_followed_by')
        paging = followed_by.get('page_info')

        if paging['has_next_page']:
            variables['after'] = paging['end_cursor']
            next_url = f'{self.graphql_url}query_hash={self.subscribers_hash}&{urlencode(variables)}'
            next_kwargs = {
                'user_id': user_id,
                'username': username,
                # Copy so later in-place cursor updates cannot leak between
                # scheduled requests.
                'variables': deepcopy(variables),
            }
            yield response.follow(next_url,
                                  callback=self.subscribers_parse,
                                  cb_kwargs=next_kwargs)

        for edge in followed_by.get('edges'):
            node = edge['node']
            yield InstagramItem(
                source_id=user_id,
                source_name=username,
                user_id=node['id'],
                user_name=node['username'],
                user_fullname=node['full_name'],
                photo_url=node['profile_pic_url'],
                subs_type='subscriber')
    def followers_parse(self, response, user_id, variables):
        """Handle one page of the ``edge_followed_by`` list of a user.

        If ``page_info['has_next_page']`` is truthy, ``variables['after']``
        is set to the page's ``end_cursor`` (the caller's dict is updated in
        place) and a request for the next page, handled by this same method,
        is yielded before any item. Every edge of the current page is then
        turned into an ``InstagramItem`` with ``status='followers'``.

        Args:
            response: Response whose ``text`` is the JSON payload.
            user_id: Stored on every item as ``followers_of``.
            variables: Query variables that are URL-encoded into the request.

        Yields:
            At most one follow-up request, then the items of this page.
        """
        payload = json.loads(response.text)
        followed_by = payload.get('data').get('user').get('edge_followed_by')
        paging = followed_by.get('page_info')

        if paging['has_next_page']:
            variables['after'] = paging['end_cursor']
            next_url = f'{self.graphql_link}query_hash={self.hash_followers}&{urlencode(variables)}'
            # Copy so later in-place cursor updates cannot leak between
            # scheduled requests.
            next_kwargs = {'user_id': user_id,
                           'variables': deepcopy(variables)}
            yield response.follow(
                next_url,
                callback=self.followers_parse,
                cb_kwargs=next_kwargs,
            )

        for edge in followed_by.get('edges'):
            node = edge['node']
            yield InstagramItem(
                followers_of=user_id,
                name=self.parser_user,
                id=node['id'],
                username=node['username'],
                fullname=node['full_name'],
                profile_pic=node['profile_pic_url'],
                status='followers'
            )
 def user_relations_parse(self, response: HtmlResponse, parsed_username, parsed_user_id, relation, variables):
     """Handle one page of a user's relation list.

     ``self.user_relations[relation]`` supplies the JSON group name to read
     under ``data -> user`` and the ``hash`` used as ``query_hash``. When the
     page reports a next page, the cursor is stored in ``variables['after']``
     (the caller's dict is updated in place) and a request handled by this
     same method is yielded first. Then one ``InstagramItem`` is yielded per
     edge of the current page.

     Args:
         response: Response whose ``text`` is the JSON payload.
         parsed_username: Stored on every item; forwarded via ``cb_kwargs``.
         parsed_user_id: Stored on every item; forwarded via ``cb_kwargs``.
         relation: Key into ``self.user_relations``; stored on every item.
         variables: Query variables that are URL-encoded into the request.

     Yields:
         At most one follow-up request, then the items of this page.
     """
     # Parse the response; the extra arguments arrive through cb_kwargs.
     payload = json.loads(response.text)
     user_data = payload['data']['user']
     relation_cfg = self.user_relations[relation]
     group = user_data[relation_cfg['json_group_name']]
     paging = group['page_info']

     # When another page exists, remember its cursor and schedule this same
     # method for it.
     if paging.get('has_next_page'):
         variables['after'] = paging['end_cursor']
         next_url = f'{self.graphql_url}query_hash={relation_cfg["hash"]}&{urlencode(variables)}'
         next_kwargs = {
             'parsed_username': parsed_username,
             'parsed_user_id': parsed_user_id,
             'relation': relation,  # the relation type is forwarded as well
             'variables': deepcopy(variables),
         }
         yield response.follow(
             next_url,
             callback=self.user_relations_parse,
             cb_kwargs=next_kwargs,
         )

     # Turn every user of the current page into an item.
     for edge in group['edges']:
         node = edge['node']
         yield InstagramItem(
             parsed_username=parsed_username,
             parsed_user_id=parsed_user_id,
             relation=relation,
             relation_user_id=node['id'],
             relation_username=node['username'],
             relation_user_pic=node['profile_pic_url'],
             relation_user_all_data=node,
         )
예제 #5
0
    def parse_items(self, url):
        """Yield an item for each post link found on a profile page.

        ``url`` must match ``https://www.instagram.com/<user>/``; otherwise
        nothing is yielded. The page is opened with ``self.driver``, the
        post count is read from the first ``g47SY`` element and capped at
        ``self.max_count``, and the page is scrolled repeatedly while new
        post links are collected. An item is yielded only when the post's
        timestamp is greater than ``self.loaded.get(user, '')``.

        Scrolling stops when the post budget is used up, when the first
        link is reported by ``self.is_already_scraped``, or when more than
        three scroll attempts have been made and no new link appeared.

        Any exception is reported on stdout and ends the generator; it is
        never propagated to the caller.

        Args:
            url: Profile URL to scrape.

        Yields:
            ``InstagramItem`` objects with ``user``, ``link``,
            ``posted_at`` and ``score`` set.
        """
        try:
            matched = re.match(r'(https://www.instagram.com/)(\w+)(/)', url)
            if matched:
                user = matched.groups()[1]
                self.driver.get(url)
                num_posts = int(
                    self.driver.find_elements_by_class_name("g47SY")
                    [0].text.replace(",", ""))
                num_posts = min(num_posts, self.max_count)
                loaded_links = set()
                tries = 1
                while num_posts > 0:
                    links = self.driver.find_elements_by_xpath(
                        ".//div[@class='v1Nh3 kIKUG  _bz0w']/a")
                    # check for first post, if already scraped
                    if self.is_already_scraped(links[0], user):
                        break
                    # Keep only links not handled in a previous pass.
                    links = set(links) - loaded_links
                    if not links:
                        if tries > 3:
                            break
                        tries += 1

                    loaded_links.update(links)
                    for link_obj in links:
                        link = link_obj.get_attribute('href')
                        matched = re.match(r'https://www.instagram.com/p/\w+/',
                                           link)
                        if matched:
                            num_posts -= 1
                            posted_at = self.get_posted_at_time(link_obj)
                            if posted_at > self.loaded.get(user, ''):
                                item = InstagramItem()
                                item['user'] = user
                                item['link'] = matched.group()
                                item["posted_at"] = posted_at
                                item['score'] = 10
                                yield item

                    scroll_to_end(self.driver)
                    # Wait longer after each pass that found nothing new.
                    time.sleep(tries)
        # The messages below use a single formatted string so the same text
        # is printed whether print is a statement (Python 2) or a function.
        except StaleElementReferenceException:
            print("StaleElementReferenceException. url:: %s" % (url,))
        except NoSuchElementException:
            print("NoSuchElementException. url:: %s" % (url,))
        except Exception:
            print("Exception. url:: %s" % (url,))
    def parse_photo(self, response):
        """Scrape a single photo page and yield one ``InstagramItem``.

        The page is opened with ``self.driver`` and the button matching
        ``//button[@class="_l086v _ifrvy"]`` is clicked until looking it up
        or clicking it raises. The rendered page source is then wrapped in a
        ``TextResponse`` and queried with XPath.

        Args:
            response: Response whose ``url`` identifies the photo page.

        Yields:
            One item with ``href``, ``username``, ``username_href``,
            ``location``, ``location_href``, ``likes``, ``time`` and
            ``comments`` set. ``comments`` maps the stringified list of
            ``a/@title`` values of each ``li`` to that ``li``'s span texts.
        """
        self.driver.get(response.url)
        # Find the 'Load More' button in the comments section and keep
        # clicking it. The loop ends through the exception raised once the
        # button can no longer be found or clicked. Only Exception is caught
        # so KeyboardInterrupt/SystemExit still stop the process.
        try:
            while True:
                self.driver.find_element_by_xpath(
                    '//button[@class="_l086v _ifrvy"]').click()
        except Exception:
            pass
        # All comments have been loaded; build a response from the rendered
        # page source.
        response1 = TextResponse(url=response.url,
                                 body=self.driver.page_source,
                                 encoding='utf-8')

        li_class = response1.xpath("//li[@class='_nk46a']")

        data = InstagramItem()
        data['href'] = response1.url
        data['username'] = response1.xpath(
            ".//header/div//a[1]/@title").extract()
        data['username_href'] = response1.xpath(
            ".//header//div/a[1]/@href").extract()
        data['location'] = response1.xpath(
            ".//header//div//a[2]/@title").extract()
        data['location_href'] = response1.xpath(
            ".//header//div//a[2]/@href").extract()
        data['likes'] = response1.xpath(
            ".//span[@class='_tf9x3']/span/text()").extract()
        data['time'] = response1.xpath(
            ".//a[@class='_rmo1e']/time/@datetime").extract()
        data['comments'] = defaultdict()
        for i in li_class:
            # Best effort: a comment that cannot be extracted is skipped.
            try:
                data['comments'][str(
                    i.xpath(".//a/@title").extract())] = i.xpath(
                        ".//span//text()").extract()
            except Exception:
                pass
        yield data
예제 #7
0
    def realRealParse(self, response):
        """Yield an item per hashtag media edge, then request the next page.

        The media list is first looked up in the ``window._sharedData``
        script of an HTML page. If that fails for any reason, the response
        body is parsed as JSON and the list is read from
        ``data -> hashtag -> edge_hashtag_to_media``.

        A follow-up request (handled by this same method) is yielded only
        when ``page_info['has_next_page']`` is truthy.

        Args:
            response: The tag page or a JSON query response.

        Yields:
            ``InstagramItem`` objects for every edge, followed by at most
            one request for the next page.
        """
        try:
            scripts = response.selector.xpath(
                '//script[contains(., "window._sharedData")]/text()').extract(
                )
            shared = scripts[0].replace("window._sharedData = ", "")
            # Drop the last character of the script text before parsing
            # (presumably the ';' that ends the JS statement).
            locations = json.loads(shared[:-1])
            jsonResult = locations['entry_data']['TagPage'][0]['graphql'][
                'hashtag']['edge_hashtag_to_media']
        except Exception:
            # Not the HTML variant: treat the body as a JSON response.
            locations = json.loads(response.body_as_unicode())
            jsonResult = locations['data']['hashtag']['edge_hashtag_to_media']

        for edge in jsonResult['edges']:
            node = edge['node']
            item = InstagramItem()
            try:
                item['text'] = node['edge_media_to_caption']['edges'][
                    0]['node']['text']
            except (KeyError, IndexError, TypeError):
                # No caption available for this post.
                item['text'] = ''

            timestamp = node['taken_at_timestamp']
            item['date'] = dt.fromtimestamp(timestamp).strftime(
                '%Y-%m-%d %H:%M:%S')

            shortcode = node['shortcode']
            item['shortcode'] = shortcode
            item['each_url'] = 'https://www.instagram.com/graphql/query/?query_hash=477b65a610463740ccdb83135b2014db&variables={"shortcode":"' + shortcode + '","child_comment_count":3,"fetch_comment_count":40,"parent_comment_count":24,"has_threaded_comments":true}'

            # Stored as a number.
            item['like_count'] = node['edge_liked_by']['count']

            if node["is_video"]:
                item["video_view_count"] = node["video_view_count"]
                item['explain'] = 'Video'
            else:
                item["video_view_count"] = -1
                item['explain'] = node['accessibility_caption']

            yield item

        page_info = jsonResult['page_info']
        # has_next_page is a boolean, so it must be tested for truth; an
        # "is not None" test would also pass for False and keep requesting
        # pages after the last one.
        if page_info['has_next_page']:
            params = {
                'tag_name': self.keyword,
                'first': self.first,
                'after': page_info['end_cursor']
            }

            testUrl = 'https://www.instagram.com/graphql/query/?query_hash={}&variables={}'.format(
                self.hashKey, urllib.parse.quote_plus(str(params)))
            # Turn the quoted repr of the dict into compact JSON-like text:
            # remove the '+' that quote_plus produced for spaces and turn
            # the encoded single quotes into double quotes.
            # NOTE(review): this breaks if a value contains a space or a
            # quote character - confirm that keywords never do.
            testUrl = testUrl.replace('+', '')
            testUrl = testUrl.replace('%27', '"')

            yield response.follow(
                url=testUrl,
                callback=self.realRealParse,
                meta={
                    'queryKey': self.hashKey
                })
예제 #8
0
    def realRealRealParse(self, response):
        """Build one ``InstagramItem`` from a single-post JSON response.

        The JSON is read from the text of the ``pre`` element of the
        response and the post is taken from ``data -> shortcode_media``.
        The whole payload is also printed to stdout after three separator
        lines.

        Args:
            response: Response whose ``pre`` element contains the JSON.

        Yields:
            One item with ``text``, ``date``, ``shortcode``, ``each_url``,
            ``like_count``, ``location``, ``address_json`` and
            ``mediaList`` set. ``mediaList`` has one dict per child of a
            ``GraphSidecar`` post, or a single dict for any other post.
        """

        def describe_media(node):
            # Summarise one media node (the post itself or a sidecar child).
            is_video = node['is_video']
            entry = {'is_video': is_video, 'media_url': node['display_url']}
            if is_video:
                entry["video_view_count"] = node["video_view_count"]
                entry['explain'] = 'Video'
            else:
                entry["video_view_count"] = -1
                entry['explain'] = node['accessibility_caption']
            return entry

        print("*" * 100)
        print("*" * 100)
        print("*" * 100)
        # Parse once and reuse the result for both the dump and the item.
        payload = json.loads(response.css("pre::text").get())
        print(payload)
        post = payload['data']['shortcode_media']

        item = InstagramItem()
        try:
            item['text'] = post['edge_media_to_caption']['edges'][0]['node'][
                'text']
        except (KeyError, IndexError, TypeError):
            # No caption available for this post.
            item['text'] = ''

        timestamp = post['taken_at_timestamp']
        item['date'] = dt.fromtimestamp(timestamp).strftime(
            '%Y-%m-%d %H:%M:%S')

        item['shortcode'] = post['shortcode']
        item['each_url'] = response.url

        item['like_count'] = post['edge_media_preview_like']['count']

        location = post['location']
        if location:
            item['location'] = {location['id']: location['name']}
            item['address_json'] = location['address_json']
        else:
            item['location'] = ''
            item['address_json'] = ''

        if post['__typename'] == 'GraphSidecar':
            mediaList = [
                describe_media(edge['node'])
                for edge in post['edge_sidecar_to_children']['edges']
            ]
        else:
            mediaList = [describe_media(post)]
        item['mediaList'] = mediaList

        yield item