def edge_followed_by(self, response: HtmlResponse, username, user_id, config_path, config_hash, variables):
    """Parse one page of a user's relation list (followers/following) and paginate.

    Yields a follow-up request while `has_next_page` is set, then one
    InstagramItem per user edge on the current page.
    """
    payload = json.loads(response.text)
    section = payload.get('data').get('user').get(config_path)
    paging = section.get('page_info')
    if paging.get('has_next_page'):
        # Advance the cursor and re-enter this callback for the next page.
        variables['after'] = paging['end_cursor']
        next_url = f'{self.graphql_url}query_hash={config_hash}&{urlencode(variables)}'
        yield response.follow(
            next_url,
            callback=self.edge_followed_by,
            cb_kwargs={
                'username': username,
                'user_id': user_id,
                'config_path': deepcopy(config_path),
                'config_hash': deepcopy(config_hash),
                'variables': deepcopy(variables),
            })
    # Emit one item per user listed on this page.
    for user in section.get('edges'):
        yield InstagramItem(
            user_id=user_id,
            utype=config_path,
            uid=user['node']['id'],
            username=user['node']['username'],
            photo=user['node']['profile_pic_url'],
        )
def subscribers_parse(self, response, user_id, username, variables):
    """Walk the `edge_followed_by` pages and yield one item per subscriber."""
    body = json.loads(response.text)
    followed_by = body.get('data').get('user').get('edge_followed_by')
    paging = followed_by.get('page_info')
    if paging['has_next_page']:
        # Store the cursor and schedule the next page with the same callback.
        variables['after'] = paging['end_cursor']
        next_url = f'{self.graphql_url}query_hash={self.subscribers_hash}&{urlencode(variables)}'
        yield response.follow(
            next_url,
            callback=self.subscribers_parse,
            cb_kwargs={
                'user_id': user_id,
                'username': username,
                'variables': deepcopy(variables),
            })
    for subscriber in followed_by.get('edges'):
        node = subscriber['node']
        yield InstagramItem(
            source_id=user_id,
            source_name=username,
            user_id=node['id'],
            user_name=node['username'],
            user_fullname=node['full_name'],
            photo_url=node['profile_pic_url'],
            subs_type='subscriber')
def followers_parse(self, response, user_id, variables):
    """Yield an item for every follower on this page; follow pagination if present."""
    payload = json.loads(response.text)
    edge = payload.get('data').get('user').get('edge_followed_by')
    paging = edge.get('page_info')
    if paging['has_next_page']:
        # More followers remain: move the cursor forward and re-request.
        variables['after'] = paging['end_cursor']
        next_page = f'{self.graphql_link}query_hash={self.hash_followers}&{urlencode(variables)}'
        yield response.follow(
            next_page,
            callback=self.followers_parse,
            cb_kwargs={'user_id': user_id, 'variables': deepcopy(variables)},
        )
    for follower in edge.get('edges'):
        node = follower['node']
        yield InstagramItem(
            followers_of=user_id,
            name=self.parser_user,
            id=node['id'],
            username=node['username'],
            fullname=node['full_name'],
            profile_pic=node['profile_pic_url'],
            status='followers',
        )
def user_relations_parse(self, response: HtmlResponse, parsed_username, parsed_user_id, relation, variables):
    """Handle one page of a user's relations; parameters arrive via cb_kwargs."""
    data = json.loads(response.text)
    group_name = self.user_relations[relation]['json_group_name']
    relation_block = data['data']['user'][group_name]
    paging = relation_block['page_info']
    # If there is a next page, keep its cursor and re-enter this method.
    if paging.get('has_next_page'):
        variables['after'] = paging['end_cursor']
        next_url = f'{self.graphql_url}query_hash={self.user_relations[relation]["hash"]}&{urlencode(variables)}'
        yield response.follow(
            next_url,
            callback=self.user_relations_parse,
            cb_kwargs={'parsed_username': parsed_username,
                       'parsed_user_id': parsed_user_id,
                       'relation': relation,  # carry the relation type along
                       'variables': deepcopy(variables)},
        )
    # Turn every user edge in the response into a scrapy item.
    for user in relation_block['edges']:
        node = user['node']
        yield InstagramItem(
            parsed_username=parsed_username,
            parsed_user_id=parsed_user_id,
            relation=relation,
            relation_user_id=node['id'],
            relation_username=node['username'],
            relation_user_pic=node['profile_pic_url'],
            relation_user_all_data=node,
        )
def parse_items(self, url):
    """Scroll the profile at *url* with Selenium and yield recent post items.

    Yields InstagramItem instances for posts newer than the last-seen
    timestamp recorded in self.loaded[user] (empty string means "all").
    Stops after self.max_count posts, when an already-scraped post is
    reached, or after 3 scroll attempts that load nothing new.
    """
    try:
        matched = re.match(r'(https://www.instagram.com/)(\w+)(/)', url)
        if matched:
            user = matched.groups()[1]
            self.driver.get(url)
            # Post counter shown on the profile page, e.g. "1,234".
            num_posts = int(
                self.driver.find_elements_by_class_name("g47SY")
                [0].text.replace(",", ""))
            num_posts = min(num_posts, self.max_count)
            loaded_links = set()
            tries = 1
            while num_posts > 0:
                links = self.driver.find_elements_by_xpath(
                    ".//div[@class='v1Nh3 kIKUG _bz0w']/a")
                # Newest visible post already scraped earlier -> done.
                if self.is_already_scraped(links[0], user):
                    break
                links = set(links) - loaded_links
                # `links` is already differenced against loaded_links, so a
                # plain emptiness test replaces the original redundant
                # `len(links - loaded_links) == 0`.
                if not links:
                    if tries > 3:
                        break
                    tries += 1
                loaded_links.update(links)
                for link_obj in links:
                    link = link_obj.get_attribute('href')
                    matched = re.match(r'https://www.instagram.com/p/\w+/', link)
                    if matched:
                        num_posts -= 1
                        posted_at = self.get_posted_at_time(link_obj)
                        if posted_at > self.loaded.get(user, ''):
                            item = InstagramItem()
                            item['user'] = user
                            item['link'] = matched.group()
                            item["posted_at"] = posted_at
                            item['score'] = 10
                            yield item
                scroll_to_end(self.driver)
                time.sleep(tries)
    # Fixed: the original used Python 2 `print "..."` statements, which are
    # syntax errors under Python 3 (the rest of the file uses f-strings).
    except StaleElementReferenceException:
        print("StaleElementReferenceException. url::", url)
    except NoSuchElementException:
        print("NoSuchElementException. url::", url)
    except Exception:
        print("Exception. url::", url)
def parse_photo(self, response):
    """Render a post page with Selenium, expand all comments, and yield one item.

    Clicks the 'Load More' button in the comments section until it is gone,
    then re-parses the rendered page source as a TextResponse.
    """
    self.driver.get(response.url)
    # Click 'Load More' until the button disappears; the exception marks the
    # end of the comment list.  Narrowed from a bare `except:` so that
    # KeyboardInterrupt/SystemExit are no longer swallowed.
    try:
        while True:
            self.driver.find_element_by_xpath(
                '//button[@class="_l086v _ifrvy"]').click()
    except Exception:
        pass
    # All comments loaded: wrap the rendered page source for XPath parsing.
    response1 = TextResponse(url=response.url,
                             body=self.driver.page_source,
                             encoding='utf-8')
    li_class = response1.xpath("//li[@class='_nk46a']")
    data = InstagramItem()
    data['href'] = response1.url
    data['username'] = response1.xpath(
        ".//header/div//a[1]/@title").extract()
    data['username_href'] = response1.xpath(
        ".//header//div/a[1]/@href").extract()
    data['location'] = response1.xpath(
        ".//header//div//a[2]/@title").extract()
    data['location_href'] = response1.xpath(
        ".//header//div//a[2]/@href").extract()
    data['likes'] = response1.xpath(
        ".//span[@class='_tf9x3']/span/text()").extract()
    data['time'] = response1.xpath(
        ".//a[@class='_rmo1e']/time/@datetime").extract()
    data['comments'] = defaultdict()
    # Map each comment author (title attribute) to the comment text nodes.
    for i in li_class:
        try:
            data['comments'][str(
                i.xpath(".//a/@title").extract())] = i.xpath(
                    ".//span//text()").extract()
        except Exception:
            # Best-effort: skip malformed comment nodes (narrowed from bare except).
            pass
    yield data
def realRealParse(self, response):
    """Parse one hashtag media page, yield an item per post, follow pagination.

    Handles both the initial HTML response (window._sharedData embedded in a
    <script> tag) and subsequent GraphQL JSON responses, which use a
    different envelope.
    """
    try:
        # First page: JSON blob assigned to window._sharedData in the HTML.
        js = response.selector.xpath(
            '//script[contains(., "window._sharedData")]/text()').extract()
        js = js[0].replace("window._sharedData = ", "")
        jscleaned = js[:-1]  # strip the trailing ';'
        locations = json.loads(jscleaned)
        jsonResult = locations['entry_data']['TagPage'][0]['graphql'][
            'hashtag']['edge_hashtag_to_media']
    except Exception:
        # Paginated GraphQL responses: plain JSON body, different key path.
        # (Narrowed from a bare `except:` so BaseException is not swallowed.)
        locations = json.loads(response.body_as_unicode())
        jsonResult = locations['data']['hashtag']['edge_hashtag_to_media']
    for edge in jsonResult['edges']:
        item = InstagramItem()
        try:
            item['text'] = edge['node']['edge_media_to_caption']['edges'][
                0]['node']['text']
        except (IndexError, KeyError):
            item['text'] = ''  # posts without a caption
        timestamp = edge['node']['taken_at_timestamp']
        item['date'] = dt.fromtimestamp(timestamp).strftime(
            '%Y-%m-%d %H:%M:%S')
        shortcode = edge['node']['shortcode']
        item['shortcode'] = shortcode
        item['each_url'] = 'https://www.instagram.com/graphql/query/?query_hash=477b65a610463740ccdb83135b2014db&variables={"shortcode":"' + shortcode + '","child_comment_count":3,"fetch_comment_count":40,"parent_comment_count":24,"has_threaded_comments":true}'
        item['like_count'] = edge['node']['edge_liked_by']['count']
        if edge['node']["is_video"]:
            item["video_view_count"] = edge['node']["video_view_count"]
            item['explain'] = 'Video'
        else:
            item["video_view_count"] = -1
            item['explain'] = edge['node']['accessibility_caption']
        yield item
    hasNext = jsonResult['page_info']['has_next_page']
    afterKey = jsonResult['page_info']['end_cursor']
    # BUG FIX: has_next_page is a boolean, so the original
    # `if hasNext is not None:` was always true and a next-page request was
    # issued even on the final page (with end_cursor None).
    if hasNext:
        params = {
            'tag_name': self.keyword,
            'first': self.first,
            'after': afterKey
        }
        testUrl = 'https://www.instagram.com/graphql/query/?query_hash={}&variables={}'.format(
            self.hashKey, urllib.parse.quote_plus(str(params)))
        # str(dict) produces "+" padding and %27 quotes; normalize to the
        # JSON-ish form the endpoint expects.
        testUrl = re.sub(r"\+", "", testUrl)
        testUrl = re.sub(r'%27', '\"', testUrl)
        yield response.follow(
            url=testUrl,
            callback=self.realRealParse,
            meta={'queryKey': self.hashKey})
def realRealRealParse(self, response):
    """Parse a single post's GraphQL JSON response into one InstagramItem.

    Extracts caption, date, like count, location and the media list
    (handling GraphSidecar multi-media posts as well as single photo/video).
    """
    # Removed the "*" * 100 banner prints and the full-JSON dump — debug
    # leftovers that flooded the log on every response.
    post = json.loads(
        response.css("pre::text").get())['data']['shortcode_media']
    item = InstagramItem()
    try:
        item['text'] = post['edge_media_to_caption']['edges'][0]['node'][
            'text']
    except (IndexError, KeyError):
        item['text'] = ''  # posts without a caption (narrowed from bare except)
    timestamp = post['taken_at_timestamp']
    item['date'] = dt.fromtimestamp(timestamp).strftime(
        '%Y-%m-%d %H:%M:%S')
    item['shortcode'] = post['shortcode']
    item['each_url'] = response.url
    item['like_count'] = post['edge_media_preview_like']['count']
    if post['location']:
        # Map location id -> name, keep the raw address JSON alongside.
        item['location'] = {post['location']['id']: post['location']['name']}
        item['address_json'] = post['location']['address_json']
    else:
        item['location'] = ''
        item['address_json'] = ''
    mediaList = []
    if post['__typename'] == 'GraphSidecar':
        # Multi-media post: one descriptor per child node.
        for edge in post['edge_sidecar_to_children']['edges']:
            node = edge['node']
            edgeDic = {}
            is_video = node['is_video']
            edgeDic['is_video'] = is_video
            edgeDic['media_url'] = node['display_url']
            if is_video:
                edgeDic["video_view_count"] = node["video_view_count"]
                edgeDic['explain'] = 'Video'
            else:
                edgeDic["video_view_count"] = -1
                edgeDic['explain'] = node['accessibility_caption']
            mediaList.append(edgeDic)
    else:
        # Single photo/video post.  (Removed the redundant `mediaList = []`
        # reassignment — the list is already empty here.)
        edgeDic = {}
        is_video = post['is_video']
        edgeDic['is_video'] = is_video
        edgeDic['media_url'] = post['display_url']
        if is_video:
            edgeDic["video_view_count"] = post["video_view_count"]
            edgeDic['explain'] = 'Video'
        else:
            edgeDic["video_view_count"] = -1
            edgeDic['explain'] = post['accessibility_caption']
        mediaList.append(edgeDic)
    item['mediaList'] = mediaList
    yield item