def parse_page(self, response):
    '''
    parse page does multiple things:
        1) loads replied-to-comments page one-by-one (for DFS)
        2) retrieves not-replied-to comments
    '''
    #loads replied-to comments pages
    # Select the index-th comment <div> that owns a reply thread: class name
    # is exactly 2 chars, it has an @id starting with a digit, and it
    # contains a "comment_replies" child.
    path = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]' + '[' + str(
        response.meta['index']) + ']'
    for reply in response.xpath(path):
        # XPath conditional-concat trick: strips the href at '&' for
        # profile.php links and at '?' otherwise (substring with
        # `1 div false` yields an empty string), so `source` is the bare
        # profile path of the comment author.
        source = reply.xpath(
            "substring-before(.//h3/a/@href, concat(substring('&', 1 div contains(.//h3/a/@href, 'profile.php')), substring('?', 1 div not(contains(.//h3/a/@href, 'profile.php')))))"
        ).extract()
        answer = reply.xpath(
            './/a[contains(@href,"repl")]/@href').extract()
        # last matching "reply" link opens the thread page
        ans = response.urljoin(answer[::-1][0])
        self.logger.info('{} nested comment @ page {}'.format(
            str(response.meta['index']), ans))
        yield scrapy.Request(ans,
                             callback=self.parse_reply,
                             meta={
                                 'reply_to': source,
                                 'url': response.url,
                                 'index': response.meta['index'],
                                 'flag': 'init'
                             })
    #loads regular comments
    # Only once no nested comment matches the current index, i.e. the DFS
    # over reply threads is exhausted for this page.
    if not response.xpath(path):
        # same node shape as `path` but WITHOUT a reply thread
        path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
        for i, reply in enumerate(response.xpath(path2)):
            self.logger.info('{} regular comment @ page {}'.format(
                i, response.url))
            new = ItemLoader(item=CommentsItem(), selector=reply)
            new.context['lang'] = self.lang
            # same author-href trimming trick as above
            new.add_xpath(
                'source',
                "substring-before(.//h3/a/@href, concat(substring('&', 1 div contains(.//h3/a/@href, 'profile.php')), substring('?', 1 div not(contains(.//h3/a/@href, 'profile.php')))))"
            )
            new.add_xpath('text', './/div[h3]/div[1]//text()')
            new.add_xpath('date', './/abbr/text()')
            new.add_xpath(
                'reactions',
                './/a[contains(@href,"reaction/profile")]//text()')
            new.add_value('url', response.url)
            yield new.load_item()
    #previous comments
    # Follow the "see next" pagination link, restarting the DFS (index=1).
    if not response.xpath(path):
        for next_page in response.xpath(
                './/div[contains(@id,"see_next")]'):
            new_page = next_page.xpath('.//@href').extract()
            new_page = response.urljoin(new_page[0])
            self.logger.info('New page to be crawled {}'.format(new_page))
            yield scrapy.Request(new_page,
                                 callback=self.parse_page,
                                 meta={'index': 1})
def parse_reply(self, response):
    '''
    parse reply to comments, root comment is added if flag

    Driven by response.meta['flag']:
      * 'init' -- first page of a reply thread: emit the ROOT comment and
        every reply on the page, then follow the "back" link (older
        replies) or return to the comment page via parse_post.
      * 'back' -- an older page of the same thread: emit replies only and
        keep walking backwards.
    '''
    if response.meta['flag'] == 'init':
        #parse root comment (its @id count differs from the reply nodes)
        for root in response.xpath(
                '//div[contains(@id,"root")]/div/div/div[count(@id)!=1 and contains("0123456789", substring(@id,1,1))]'
        ):
            new = ItemLoader(item=CommentsItem(), selector=root)
            new.context['lang'] = self.lang
            new.add_xpath('source', './/h3/a/text()')
            new.add_xpath('source_url', './/h3/a/@href')
            # profile-image scraping is disabled; keep the field but empty
            new._add_value('profile_img', "")
            new.add_value('reply_to', 'ROOT')
            new.add_xpath('text', './/div[1]//text()')
            # BUG FIX: scope the date lookup to this comment's own node --
            # response.xpath('.//abbr/text()') matched the first <abbr> of
            # the whole page, stamping every item with the same date.
            date_string = root.xpath('.//abbr/text()').get()
            date = parse_date2([date_string], {'lang': self.lang})
            new._add_value('date', date)
            new.add_xpath(
                'reactions',
                './/a[contains(@href,"reaction/profile")]//text()')
            yield new.load_item()

        #parse all replies in the page
        for reply in response.xpath(
                '//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'
        ):
            new = ItemLoader(item=CommentsItem(), selector=reply)
            new.context['lang'] = self.lang
            new.add_xpath('source', './/h3/a/text()')
            new.add_xpath('source_url', './/h3/a/@href')
            new._add_value('profile_img', "")
            new.add_value('reply_to', response.meta['reply_to'])
            new.add_xpath('text', './/div[h3]/div[1]//text()')
            # BUG FIX: per-reply date (see root loop above)
            date_string = reply.xpath('.//abbr/text()').get()
            date = parse_date2([date_string], {'lang': self.lang})
            new._add_value('date', date)
            new.add_xpath(
                'reactions',
                './/a[contains(@href,"reaction/profile")]//text()')
            yield new.load_item()

        back = response.xpath(
            '//div[contains(@id,"comment_replies_more_1")]/a/@href'
        ).extract()
        if back:
            self.logger.info('Back found, more nested comments')
            back_page = response.urljoin(back[0])
            yield scrapy.Request(back_page,
                                 callback=self.parse_reply,
                                 priority=1000,
                                 meta={
                                     'reply_to': response.meta['reply_to'],
                                     'flag': 'back',
                                     'url': response.meta['url'],
                                     'index': response.meta['index'],
                                     'group': response.meta['group']
                                 })
        else:
            # thread exhausted: resume the post crawl on the next nested
            # comment (index + 1)
            next_reply = response.meta['url']
            self.logger.info(
                'Nested comments crawl finished, heading to proper page: {}'
                .format(response.meta['url']))
            yield scrapy.Request(next_reply,
                                 callback=self.parse_post,
                                 meta={
                                     'index': response.meta['index'] + 1,
                                     'group': response.meta['group']
                                 })

    elif response.meta['flag'] == 'back':
        #parse all comments
        for reply in response.xpath(
                '//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'
        ):
            new = ItemLoader(item=CommentsItem(), selector=reply)
            new.context['lang'] = self.lang
            new.add_xpath('source', './/h3/a/text()')
            new.add_xpath('source_url', './/h3/a/@href')
            new._add_value('profile_img', "")
            new.add_value('reply_to', response.meta['reply_to'])
            new.add_xpath('text', './/div[h3]/div[1]//text()')
            # BUG FIX: per-reply date (see 'init' branch)
            date_string = reply.xpath('.//abbr/text()').get()
            date = parse_date2([date_string], {'lang': self.lang})
            new._add_value('date', date)
            new.add_xpath(
                'reactions',
                './/a[contains(@href,"reaction/profile")]//text()')
            yield new.load_item()

        #keep going backwards
        back = response.xpath(
            '//div[contains(@id,"comment_replies_more_1")]/a/@href'
        ).extract()
        self.logger.info('Back found, more nested comments')
        if back:
            back_page = response.urljoin(back[0])
            yield scrapy.Request(back_page,
                                 callback=self.parse_reply,
                                 priority=1000,
                                 meta={
                                     'reply_to': response.meta['reply_to'],
                                     'flag': 'back',
                                     'url': response.meta['url'],
                                     'index': response.meta['index'],
                                     'group': response.meta['group']
                                 })
        else:
            next_reply = response.meta['url']
            self.logger.info(
                'Nested comments crawl finished, heading to home page: {}'.
                format(response.meta['url']))
            yield scrapy.Request(next_reply,
                                 callback=self.parse_post,
                                 meta={
                                     'index': response.meta['index'] + 1,
                                     'group': response.meta['group']
                                 })


# =============================================================================
# CRAWL REACTIONS
# =============================================================================
#    def parse_reactions(self,response):
#        new = ItemLoader(item=CommentsItem(),response=response, parent=response.meta['item'])
#        new.context['lang'] = self.lang
#        new.add_xpath('likes',"//a[contains(@href,'reaction_type=1')]/span/text()")
#        new.add_xpath('ahah',"//a[contains(@href,'reaction_type=4')]/span/text()")
#        new.add_xpath('love',"//a[contains(@href,'reaction_type=2')]/span/text()")
#        new.add_xpath('wow',"//a[contains(@href,'reaction_type=3')]/span/text()")
#        new.add_xpath('sigh',"//a[contains(@href,'reaction_type=7')]/span/text()")
#        new.add_xpath('grrr',"//a[contains(@href,'reaction_type=8')]/span/text()")
#        yield new.load_item()
#
#    #substitute
#    yield new.load_item()
#    #response --> reply/root
#    reactions = response.xpath(".//a[contains(@href,'reaction/profile')]/@href")
#    reactions = response.urljoin(reactions[0].extract())
#    if reactions:
#        yield scrapy.Request(reactions, callback=self.parse_reactions, meta={'item':new})
#    else:
#        yield new.load_item()
def parse_post(self, response):
    '''
    parse post does multiple things:
        1) loads replied-to-comments page one-by-one (for DFS)
        2) call parse_reply on the nested comments
        3) adds simple (not-replied-to) comments
        4) follows to new comment page
    '''
    #load replied-to comments pages
    #select nested comment one-by-one matching with the index: response.meta['index']
    path = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]' + '[' + str(
        response.meta['index']) + ']'
    # 'group' is only present when crawling a Facebook group; None otherwise
    group_flag = response.meta[
        'group'] if 'group' in response.meta else None

    for reply in response.xpath(path):
        source = reply.xpath('.//h3/a/text()').extract()
        answer = reply.xpath(
            './/a[contains(@href,"repl")]/@href').extract()
        # last "reply" link on the node opens the thread page
        ans = response.urljoin(answer[::-1][0])
        self.logger.info('{} nested comment'.format(
            str(response.meta['index'])))
        # high priority keeps the DFS on the nested thread before new pages
        yield scrapy.Request(ans,
                             callback=self.parse_reply,
                             priority=1000,
                             meta={
                                 'reply_to': source,
                                 'url': response.url,
                                 'index': response.meta['index'],
                                 'flag': 'init',
                                 'group': group_flag
                             })

    #load regular comments
    if not response.xpath(path):  #prevents from exec
        path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
        for i, reply in enumerate(response.xpath(path2)):
            self.logger.info('{} regular comment'.format(i + 1))
            new = ItemLoader(item=CommentsItem(), selector=reply)
            new.context['lang'] = self.lang
            new.add_xpath('source', './/h3/a/text()')
            new.add_xpath('source_url', './/h3/a/@href')
            # profile-image scraping is disabled; keep the field but empty
            new._add_value('profile_img', "")
            new.add_xpath('text', './/div[h3]/div[1]//text()')
            new.add_xpath('img', './/div[h3]/div[2]/img/@src')
            # BUG FIX: scope the date lookup to this comment's node --
            # response.xpath('.//abbr/text()') matched the first <abbr> of
            # the whole page, stamping every comment with the same date.
            date_string = reply.xpath('.//abbr/text()').get()
            date = parse_date2([date_string], {'lang': self.lang})
            new._add_value('date', date)
            new.add_xpath(
                'reactions',
                './/a[contains(@href,"reaction/profile")]//text()')
            yield new.load_item()

    #new comment page
    if not response.xpath(path):
        #for groups: only "see_prev" pagination exists
        next_xpath = './/div[contains(@id,"see_next")]'
        prev_xpath = './/div[contains(@id,"see_prev")]'
        if not response.xpath(next_xpath) or group_flag == 1:
            for next_page in response.xpath(prev_xpath):
                new_page = next_page.xpath('.//@href').extract()
                new_page = response.urljoin(new_page[0])
                self.logger.info(
                    'New page to be crawled {}'.format(new_page))
                yield scrapy.Request(new_page,
                                     callback=self.parse_post,
                                     meta={
                                         'index': 1,
                                         'group': 1
                                     })
        else:
            for next_page in response.xpath(next_xpath):
                new_page = next_page.xpath('.//@href').extract()
                new_page = response.urljoin(new_page[0])
                self.logger.info(
                    'New page to be crawled {}'.format(new_page))
                yield scrapy.Request(new_page,
                                     callback=self.parse_post,
                                     meta={
                                         'index': 1,
                                         'group': group_flag
                                     })
def parse_reply(self, response):
    '''
    parse reply to comments, root comment is added if flag

    'init' emits the ROOT comment plus the replies of the first thread
    page; 'back' emits replies of older thread pages. XPaths come from the
    module-level xAll_* / xREPLY_ constants.
    '''
    if response.meta['flag'] == 'init':
        #parse root comment
        for root in response.xpath(xAll_ROOT_DIV):
            new = ItemLoader(item=CommentsItem(), selector=root)
            new.context['lang'] = self.lang
            new.add_xpath('source', xREPLY_['attributes']['source'])
            new.add_xpath('source_url', xREPLY_['attributes']['source_url'])
            new.add_value('reply_to', 'ROOT')
            new.add_xpath('text', xREPLY_['attributes']['text_root'])
            new.add_xpath('date', xREPLY_['attributes']['date'])
            new.add_xpath('reactions', xREPLY_['attributes']['reactions'])
            new.add_value('url', response.url)
            yield new.load_item()

        #parse all replies in the page
        for reply in response.xpath(xAll_REPLIES_DIV):
            new = ItemLoader(item=CommentsItem(), selector=reply)
            new.context['lang'] = self.lang
            # CONSISTENCY FIX: this loop previously omitted 'source'
            # (only 'source_url' was collected), unlike the root loop
            # above and the 'back' branch below.
            new.add_xpath('source', xREPLY_['attributes']['source'])
            new.add_xpath('source_url', xREPLY_['attributes']['source_url'])
            new.add_value('reply_to', response.meta['reply_to'])
            new.add_xpath('text', xREPLY_['attributes']['text_child'])
            new.add_xpath('date', xREPLY_['attributes']['date'])
            # CONSISTENCY FIX: use the shared constant instead of a
            # duplicated XPath literal (the 'back' branch already does).
            new.add_xpath('reactions', xREPLY_['attributes']['reactions'])
            new.add_value('url', response.url)
            yield new.load_item()

        back = response.xpath(
            '//div[contains(@id,"comment_replies_more_1")]/a/@href').extract()
        if back:
            self.logger.info('Back found, more nested comments')
            back_page = response.urljoin(back[0])
            yield scrapy.Request(back_page,
                                 callback=self.parse_reply,
                                 priority=1000,
                                 meta={'reply_to': response.meta['reply_to'],
                                       'flag': 'back',
                                       'url': response.meta['url'],
                                       'index': response.meta['index'],
                                       'group': response.meta['group']})
        else:
            # thread exhausted: resume on the next nested comment
            next_reply = response.meta['url']
            self.logger.info(
                'Nested comments crawl finished, heading to proper page: {}'.format(
                    response.meta['url']))
            yield scrapy.Request(next_reply,
                                 callback=self.parse_post,
                                 meta={'index': response.meta['index'] + 1,
                                       'group': response.meta['group']})

    elif response.meta['flag'] == 'back':
        #parse all comments
        # NOTE(review): literal XPath duplicated here; presumably equal to
        # xAll_REPLIES_DIV -- confirm before unifying.
        for reply in response.xpath(
                '//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'):
            new = ItemLoader(item=CommentsItem(), selector=reply)
            new.context['lang'] = self.lang
            new.add_xpath('source', xREPLY_['attributes']['source'])
            new.add_xpath('source_url', xREPLY_['attributes']['source_url'])
            new.add_value('reply_to', response.meta['reply_to'])
            new.add_xpath('text', xREPLY_['attributes']['text_child'])
            new.add_xpath('date', xREPLY_['attributes']['date'])
            new.add_xpath('reactions', xREPLY_['attributes']['reactions'])
            new.add_value('url', response.url)
            yield new.load_item()

        #keep going backwards
        back = response.xpath(
            '//div[contains(@id,"comment_replies_more_1")]/a/@href').extract()
        self.logger.info('Back found, more nested comments')
        if back:
            back_page = response.urljoin(back[0])
            yield scrapy.Request(back_page,
                                 callback=self.parse_reply,
                                 priority=1000,
                                 meta={'reply_to': response.meta['reply_to'],
                                       'flag': 'back',
                                       'url': response.meta['url'],
                                       'index': response.meta['index'],
                                       'group': response.meta['group']})
        else:
            next_reply = response.meta['url']
            self.logger.info(
                'Nested comments crawl finished, heading to home page: {}'.format(
                    response.meta['url']))
            yield scrapy.Request(next_reply,
                                 callback=self.parse_post,
                                 meta={'index': response.meta['index'] + 1,
                                       'group': response.meta['group']})
def parse_post(self, response):
    '''
    parse post does multiple things:
        1) loads replied-to-comments page one-by-one (for DFS)
        2) call parse_reply on the nested comments
        3) adds simple (not-replied-to) comments
        4) follows to new comment page
    '''
    #load replied-to comments pages
    #select nested comment one-by-one matching with the index: response.meta['index']
    path = xNESTED_COMMENT_['root'] % (str(response.meta['index']))
    # 'group' is only present when crawling a Facebook group; None otherwise
    group_flag = response.meta['group'] if 'group' in response.meta else None
    for reply in response.xpath(path):
        source = reply.xpath(
            xNESTED_COMMENT_['attributes']['source']
        ).extract()
        answer = reply.xpath(xNESTED_COMMENT_['attributes']['answer']).extract()
        # last matching link opens the reply-thread page
        ans = response.urljoin(answer[::-1][0])
        self.logger.info('{} nested comment'.format(str(response.meta['index'])))
        # high priority keeps the DFS on the thread before new pages
        yield scrapy.Request(ans,
                             callback=self.parse_reply,
                             priority=1000,
                             meta={'reply_to': source,
                                   'url': response.url,
                                   'index': response.meta['index'],
                                   'flag': 'init',
                                   'group': group_flag})
    #load regular comments
    if not response.xpath(path):  #prevents from exec
        path2 = xREGULAR_COMMENT_['root']
        for i, reply in enumerate(response.xpath(path2)):
            self.logger.info('{} regular comment'.format(i + 1))
            new = ItemLoader(item=CommentsItem(), selector=reply)
            new.context['lang'] = self.lang
            new.add_xpath('source', xREGULAR_COMMENT_['attributes']['source'])
            new.add_xpath('source_url', xREGULAR_COMMENT_['attributes']['source_url'])
            new.add_xpath('text', xREGULAR_COMMENT_['attributes']['text'])
            new.add_xpath('date', xREGULAR_COMMENT_['attributes']['date'])
            new.add_xpath('reactions', xREGULAR_COMMENT_['attributes']['reactions'])
            new.add_value('url', response.url)
            yield new.load_item()
    #new comment page
    if not response.xpath(path):
        #for groups: groups only expose "see_prev" pagination
        next_xpath = xNEXT_COMMENTS_['root']
        prev_xpath = xPREV_COMMENTS_DIV
        if not response.xpath(next_xpath) or group_flag == 1:
            for next_page in response.xpath(prev_xpath):
                new_page = next_page.xpath(xNEXT_COMMENTS_['attributes']['new_page']).extract()
                new_page = response.urljoin(new_page[0])
                self.logger.info('New page to be crawled {}'.format(new_page))
                yield scrapy.Request(new_page,
                                     callback=self.parse_post,
                                     meta={'index': 1, 'group': 1})
        else:
            for next_page in response.xpath(next_xpath):
                # NOTE(review): literal './/@href' duplicated here while
                # the branch above uses
                # xNEXT_COMMENTS_['attributes']['new_page']; presumably
                # identical -- confirm before unifying.
                new_page = next_page.xpath('.//@href').extract()
                new_page = response.urljoin(new_page[0])
                self.logger.info('New page to be crawled {}'.format(new_page))
                yield scrapy.Request(new_page,
                                     callback=self.parse_post,
                                     meta={'index': 1, 'group': group_flag})
def parse_post(self, response):
    '''
    parse post does multiple things:
        1) loads replied-to-comments page one-by-one
        2) call parse_reply on the nested comments
        3) adds simple (not-replied-to) comments
        4) goes to new comment page
    '''
    #load replied-to comments pages
    # index-th comment <div> that owns a reply thread
    path = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]' + '[' + str(
        response.meta['index']) + ']'
    # 'group' is only present when crawling a Facebook group; None otherwise
    group_flag = response.meta[
        'group'] if 'group' in response.meta else None
    post_id = response.meta['post_id']
    current_date = response.meta['current_date']
    for reply in response.xpath(path):
        source = reply.xpath('.//h3/a/text()').extract()
        answer = reply.xpath(
            './/a[contains(@href,"repl")]/@href').extract()
        # last "reply" link on the node opens the thread page
        ans = response.urljoin(answer[::-1][0])
        self.logger.info('{} nested comment'.format(
            str(response.meta['index'])))
        yield scrapy.Request(ans,
                             callback=self.parse_reply,
                             priority=1000,
                             meta={
                                 'reply_to': source,
                                 'url': response.url,
                                 'index': response.meta['index'],
                                 'flag': 'init',
                                 'group': group_flag,
                                 'post_id': post_id,
                                 'current_date': current_date
                             })
    #load regular comments
    if not response.xpath(path):
        path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
        for i, reply in enumerate(response.xpath(path2)):
            self.logger.info('{} regular comment'.format(i + 1))
            new = ItemLoader(item=CommentsItem(), selector=reply)
            new.context['lang'] = self.lang
            new.add_xpath('source', './/h3/a/text()')
            new.add_xpath('source_url', './/h3/a/@href')
            new.add_xpath('text', './/div[h3]/div[1]//text()')
            new.add_value('reply_to', 'ROOT')
            # NOTE(review): unlike the other parse_post variants, no
            # 'date' field is collected here -- confirm intentional.
            new.add_xpath(
                'reactions',
                './/a[contains(@href,"reaction/profile")]//text()')
            new.add_value('url', response.url)
            new.add_value('post_id', post_id)
            yield new.load_item()
    #new comment page
    if not response.xpath(path):
        next_xpath = './/div[contains(@id,"see_next")]'
        prev_xpath = './/div[contains(@id,"see_prev")]'
        if not response.xpath(next_xpath) or group_flag == 1:
            for next_page in response.xpath(prev_xpath):
                new_page = next_page.xpath('.//@href').extract()
                new_page = response.urljoin(new_page[0])
                self.logger.info(
                    'New page to be crawled {}'.format(new_page))
                yield scrapy.Request(new_page,
                                     callback=self.parse_post,
                                     meta={
                                         'index': 1,
                                         'group': 1,
                                         'post_id': post_id,
                                         'current_date': current_date
                                     })
            # NOTE(review): this fires after the pagination request has
            # already been yielded; CloseSpider stops the whole spider
            # once the target date is reached -- confirm placement.
            if self.date >= current_date:
                raise CloseSpider('Reached date: {}'.format(self.date))
        else:
            for next_page in response.xpath(next_xpath):
                new_page = next_page.xpath('.//@href').extract()
                new_page = response.urljoin(new_page[0])
                self.logger.info(
                    'New page to be crawled {}'.format(new_page))
                yield scrapy.Request(new_page,
                                     callback=self.parse_post,
                                     meta={
                                         'index': 1,
                                         'group': group_flag,
                                         'post_id': post_id,
                                         'current_date': current_date
                                     })
            # see NOTE(review) above on CloseSpider placement
            if self.date >= current_date:
                raise CloseSpider('Reached date: {}'.format(self.date))
def parse_page(self, response):
    '''
    parse page does multiple things:
        1) loads replied-to-comments page one-by-one (for DFS)
        2) call parse_reply on the nested comments
        3) adds simple (not-replied-to) comments
        4) follows to new comment page
    '''
    index = response.meta['index']
    # 'group' is present only when crawling a Facebook group
    group_flag = response.meta['group'] if 'group' in response.meta else None

    # index-th comment <div> that owns a reply thread (DFS over threads)
    nested_path = ('.//div[string-length(@class) = 2 and count(@id)=1 and '
                   'contains("0123456789", substring(@id,1,1)) and '
                   './/div[contains(@id,"comment_replies")]]'
                   + '[' + str(index) + ']')
    nested = response.xpath(nested_path)

    for comment in nested:
        reply_to = comment.xpath('.//h3/a/text()').extract()
        reply_links = comment.xpath(
            './/a[contains(@href,"repl")]/@href').extract()
        # the last matching link opens the thread page
        target = response.urljoin(reply_links[-1])
        self.logger.info('{} nested comment @ page {}'.format(
            str(index), target))
        yield scrapy.Request(target,
                             callback=self.parse_reply,
                             meta={'reply_to': reply_to,
                                   'url': response.url,
                                   'index': index,
                                   'flag': 'init',
                                   'group': group_flag})

    # once the DFS over reply threads is exhausted for this page,
    # collect the plain (not-replied-to) comments and paginate
    if not nested:
        flat_path = ('.//div[string-length(@class) = 2 and count(@id)=1 and '
                     'contains("0123456789", substring(@id,1,1)) and '
                     'not(.//div[contains(@id,"comment_replies")])]')
        for position, comment in enumerate(response.xpath(flat_path)):
            self.logger.info('{} regular comment @ page {}'.format(
                position, response.url))
            loader = ItemLoader(item=CommentsItem(), selector=comment)
            loader.context['lang'] = self.lang
            loader.add_xpath('source', './/h3/a/text()')
            loader.add_xpath('source_url', './/h3/a/@href')
            loader.add_xpath('text', './/div[h3]/div[1]//text()')
            loader.add_xpath('date', './/abbr/text()')
            loader.add_xpath(
                'reactions',
                './/a[contains(@href,"reaction/profile")]//text()')
            loader.add_value('url', response.url)
            yield loader.load_item()

        # pagination: groups only expose "see_prev"; pages use "see_next"
        see_next = response.xpath('.//div[contains(@id,"see_next")]')
        see_prev = response.xpath('.//div[contains(@id,"see_prev")]')
        if not see_next or group_flag == 1:
            links, next_group = see_prev, 1
        else:
            links, next_group = see_next, group_flag
        for link in links:
            new_page = response.urljoin(link.xpath('.//@href').extract()[0])
            self.logger.info('New page to be crawled {}'.format(new_page))
            yield scrapy.Request(new_page,
                                 callback=self.parse_page,
                                 meta={'index': 1, 'group': next_group})
def parse_reply(self, response):
    '''
    parse reply to comments, root comment is added if flag

    meta['flag'] == 'init': first page of a reply thread, emit ROOT plus
    replies; meta['flag'] == 'back': older thread page, emit replies only.
    '''
    if response.meta['flag'] == 'init':
        #parse root comment (its @id count differs from reply nodes)
        for root in response.xpath(
                '//div[contains(@id,"root")]/div/div/div[count(@id)!=1 and contains("0123456789", substring(@id,1,1))]'
        ):
            new = ItemLoader(item=CommentsItem(), selector=root)
            new.context['lang'] = self.lang
            new.add_xpath('source', './/h3/a/text()')
            new.add_xpath('source_url', './/h3/a/@href')
            new.add_value('reply_to', 'ROOT')
            new.add_xpath('text', './/div[1]//text()')
            new.add_xpath('date', './/abbr/text()')
            new.add_xpath(
                'reactions',
                './/a[contains(@href,"reaction/profile")]//text()')
            new.add_value('url', response.url)
            yield new.load_item()
        #parse all replies in the page
        for reply in response.xpath(
                '//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'
        ):
            new = ItemLoader(item=CommentsItem(), selector=reply)
            new.context['lang'] = self.lang
            new.add_xpath('source', './/h3/a/text()')
            new.add_xpath('source_url', './/h3/a/@href')
            new.add_value('reply_to', response.meta['reply_to'])
            new.add_xpath('text', './/div[h3]/div[1]//text()')
            new.add_xpath('date', './/abbr/text()')
            new.add_xpath(
                'reactions',
                './/a[contains(@href,"reaction/profile")]//text()')
            new.add_value('url', response.url)
            yield new.load_item()

        # follow the "more replies" (back) link if present
        back = response.xpath(
            '//div[contains(@id,"comment_replies_more_1")]/a/@href'
        ).extract()
        if back:
            self.logger.info('Back found, more nested comments')
            back_page = response.urljoin(back[0])
            yield scrapy.Request(back_page,
                                 callback=self.parse_reply,
                                 priority=100,
                                 meta={
                                     'reply_to': response.meta['reply_to'],
                                     'flag': 'back',
                                     'url': response.meta['url'],
                                     'index': response.meta['index'],
                                     'group': response.meta['group']
                                 })
        else:
            # thread exhausted: resume the page crawl on the next
            # nested comment (index + 1)
            next_reply = response.meta['url']
            self.logger.info(
                'Nested comments crawl finished, heading to proper page: {}'
                .format(response.meta['url']))
            yield scrapy.Request(next_reply,
                                 callback=self.parse_page,
                                 meta={
                                     'index': response.meta['index'] + 1,
                                     'group': response.meta['group']
                                 })

    elif response.meta['flag'] == 'back':
        #parse all comments
        for reply in response.xpath(
                '//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'
        ):
            new = ItemLoader(item=CommentsItem(), selector=reply)
            new.context['lang'] = self.lang
            new.add_xpath('source', './/h3/a/text()')
            new.add_xpath('source_url', './/h3/a/@href')
            new.add_value('reply_to', response.meta['reply_to'])
            new.add_xpath('text', './/div[h3]/div[1]//text()')
            new.add_xpath('date', './/abbr/text()')
            new.add_xpath(
                'reactions',
                './/a[contains(@href,"reaction/profile")]//text()')
            new.add_value('url', response.url)
            yield new.load_item()

        #keep going backwards
        back = response.xpath(
            '//div[contains(@id,"comment_replies_more_1")]/a/@href'
        ).extract()
        # NOTE(review): logged before checking `back`, so this line also
        # appears when no back link exists
        self.logger.info('Back found, more nested comments')
        if back:
            back_page = response.urljoin(back[0])
            yield scrapy.Request(back_page,
                                 callback=self.parse_reply,
                                 priority=100,
                                 meta={
                                     'reply_to': response.meta['reply_to'],
                                     'flag': 'back',
                                     'url': response.meta['url'],
                                     'index': response.meta['index'],
                                     'group': response.meta['group']
                                 })
        else:
            next_reply = response.meta['url']
            self.logger.info(
                'Nested comments crawl finished, heading to home page: {}'.
                format(response.meta['url']))
            yield scrapy.Request(next_reply,
                                 callback=self.parse_page,
                                 meta={
                                     'index': response.meta['index'] + 1,
                                     'group': response.meta['group']
                                 })


# =============================================================================
# CRAWL REACTIONS
# =============================================================================
#    def parse_reactions(self,response):
#        new = ItemLoader(item=CommentsItem(),response=response, parent=response.meta['item'])
#        new.context['lang'] = self.lang
#        new.add_xpath('likes',"//a[contains(@href,'reaction_type=1')]/span/text()")
#        new.add_xpath('ahah',"//a[contains(@href,'reaction_type=4')]/span/text()")
#        new.add_xpath('love',"//a[contains(@href,'reaction_type=2')]/span/text()")
#        new.add_xpath('wow',"//a[contains(@href,'reaction_type=3')]/span/text()")
#        new.add_xpath('sigh',"//a[contains(@href,'reaction_type=7')]/span/text()")
#        new.add_xpath('grrr',"//a[contains(@href,'reaction_type=8')]/span/text()")
#        yield new.load_item()
#
#    #substitute
#    yield new.load_item()
#    #response --> reply/root
#    reactions = response.xpath(".//a[contains(@href,'reaction/profile')]/@href")
#    reactions = response.urljoin(reactions[0].extract())
#    if reactions:
#        yield scrapy.Request(reactions, callback=self.parse_reactions, meta={'item':new})
#    else:
#        yield new.load_item()
def parse_reply(self, response):
    '''
    parse reply to comments, root comment is added if flag

    On the first visit (flag == 'init') the root comment is emitted as
    well; on backward pages (flag == 'back') only replies are collected.
    When no older page remains, crawling resumes on the comment page
    stored in meta['url'].
    '''
    meta = response.meta

    def _build_item(selector, reply_to, text_xpath):
        # assemble one CommentsItem from a comment <div>
        loader = ItemLoader(item=CommentsItem(), selector=selector)
        loader.context['lang'] = self.lang
        loader.add_xpath('source', './/h3/a/text()')
        loader.add_value('reply_to', reply_to)
        loader.add_xpath('text', text_xpath)
        loader.add_xpath('date', './/abbr/text()')
        loader.add_value('url', response.url)
        return loader.load_item()

    replies_xpath = ('//div[contains(@id,"root")]/div/div/div[count(@id)=1 '
                     'and contains("0123456789", substring(@id,1,1))]')
    back_xpath = '//div[contains(@id,"comment_replies_more_1")]/a/@href'

    if meta['flag'] == 'init':
        # root comment node: @id count differs from the reply nodes
        root_xpath = ('//div[contains(@id,"root")]/div/div/div[count(@id)!=1 '
                      'and contains("0123456789", substring(@id,1,1))]')
        for root in response.xpath(root_xpath):
            yield _build_item(root, 'ROOT', './/div[1]//text()')
        for reply in response.xpath(replies_xpath):
            yield _build_item(reply, meta['reply_to'],
                              './/div[h3]/div[1]//text()')

        back = response.xpath(back_xpath).extract()
        if back:
            self.logger.info('Back found, trying to go back')
            yield scrapy.Request(response.urljoin(back[0]),
                                 callback=self.parse_reply,
                                 priority=100,
                                 meta={'reply_to': meta['reply_to'],
                                       'flag': 'back',
                                       'url': meta['url'],
                                       'index': meta['index']})
        else:
            self.logger.info(
                'Nested comments crawl finished, heading to home page: {}'.
                format(meta['url']))
            yield scrapy.Request(meta['url'],
                                 dont_filter=True,
                                 callback=self.parse_page,
                                 meta={'index': meta['index'] + 1})

    elif meta['flag'] == 'back':
        for reply in response.xpath(replies_xpath):
            yield _build_item(reply, meta['reply_to'],
                              './/div[h3]/div[1]//text()')

        # keep walking backwards through older reply pages
        back = response.xpath(back_xpath).extract()
        self.logger.info('Back found, trying to go back')
        if back:
            yield scrapy.Request(response.urljoin(back[0]),
                                 callback=self.parse_reply,
                                 priority=100,
                                 meta={'reply_to': meta['reply_to'],
                                       'flag': 'back',
                                       'url': meta['url'],
                                       'index': meta['index']})
        else:
            self.logger.info(
                'Nested comments crawl finished, heading to home page: {}'.
                format(meta['url']))
            yield scrapy.Request(meta['url'],
                                 dont_filter=True,
                                 callback=self.parse_page,
                                 meta={'index': meta['index'] + 1})
def parse_reply(self, response):
    """
    Parse the replies of one comment; ``response.meta["flag"]`` selects
    the phase:

      - ``"init"``: first reply page; emits the root comment (marked
        ``reply_to="ROOT"``) and then the replies.
      - ``"back"``: a previous reply page reached while paging
        backwards through the "show previous replies" link.

    Propagates ``post_id``/``group``/``index`` through request meta so
    ``parse_post`` can resume the DFS when the reply chain ends.
    """
    # from scrapy.utils.response import open_in_browser
    # open_in_browser(response)
    if response.meta["flag"] == "init":
        # parse root comment (container div without a numeric @id)
        for root in response.xpath(
                '//div[contains(@id,"root")]/div/div/div[count(@id)!=1 and contains("0123456789", substring(@id,1,1))]'
        ):
            new = ItemLoader(item=CommentsItem(), selector=root)
            new.context["lang"] = self.lang
            new.add_xpath("source", ".//h3/a/text()")
            new.add_xpath("source_url", ".//h3/a/@href")
            new.add_value("reply_to", "ROOT")
            new.add_xpath("text", ".//div[1]//text()")
            new.add_xpath("date", ".//abbr/text()")
            new.add_xpath(
                "reactions",
                './/a[contains(@href,"reaction/profile")]//text()')
            new.add_value("post_id", response.meta["post_id"])
            new.add_value("url", response.url)
            yield new.load_item()

        # parse all replies in the page
        for reply in response.xpath(
                '//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'
        ):
            new = ItemLoader(item=CommentsItem(), selector=reply)
            new.context["lang"] = self.lang
            new.add_xpath("source", ".//h3/a/text()")
            new.add_xpath("source_url", ".//h3/a/@href")
            new.add_value("reply_to", response.meta["reply_to"])
            new.add_xpath("text", ".//div[h3]/div[1]//text()")
            new.add_xpath("date", ".//abbr/text()")
            new.add_xpath(
                "reactions",
                './/a[contains(@href,"reaction/profile")]//text()')
            new.add_value("post_id", response.meta["post_id"])
            new.add_value("url", response.url)
            yield new.load_item()

        # "show previous replies" link, if any
        back = response.xpath(
            '//div[contains(@id,"comment_replies_more_1")]/a/@href'
        ).extract()
        if back:
            self.logger.info("Back found, more nested comments")
            back_page = response.urljoin(back[0])
            yield scrapy.Request(
                back_page,
                callback=self.parse_reply,
                priority=1000,
                meta={
                    "reply_to": response.meta["reply_to"],
                    "flag": "back",
                    "url": response.meta["url"],
                    "index": response.meta["index"],
                    "post_id": response.meta["post_id"],
                    "group": response.meta["group"],
                },
            )
        else:
            # no more reply pages: resume parse_post at the next index
            next_reply = response.meta["url"]
            self.logger.info(
                "Nested comments crawl finished, heading to proper page: {}"
                .format(response.meta["url"]))
            yield scrapy.Request(
                next_reply,
                callback=self.parse_post,
                meta={
                    "index": response.meta["index"] + 1,
                    "post_id": response.meta["post_id"],
                    "group": response.meta["group"],
                },
            )

    elif response.meta["flag"] == "back":
        # parse all comments on this (earlier) reply page
        for reply in response.xpath(
                '//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'
        ):
            new = ItemLoader(item=CommentsItem(), selector=reply)
            new.context["lang"] = self.lang
            new.add_xpath("source", ".//h3/a/text()")
            new.add_xpath("source_url", ".//h3/a/@href")
            new.add_value("reply_to", response.meta["reply_to"])
            new.add_xpath("text", ".//div[h3]/div[1]//text()")
            new.add_xpath("date", ".//abbr/text()")
            new.add_xpath(
                "reactions",
                './/a[contains(@href,"reaction/profile")]//text()')
            new.add_value("post_id", response.meta["post_id"])
            new.add_value("url", response.url)
            yield new.load_item()

        # keep going backwards
        back = response.xpath(
            '//div[contains(@id,"comment_replies_more_1")]/a/@href'
        ).extract()
        if back:
            # fixed: the original logged 'Back found' unconditionally,
            # before checking the link actually exists
            self.logger.info("Back found, more nested comments")
            back_page = response.urljoin(back[0])
            yield scrapy.Request(
                back_page,
                callback=self.parse_reply,
                priority=1000,
                meta={
                    "reply_to": response.meta["reply_to"],
                    "flag": "back",
                    "url": response.meta["url"],
                    "index": response.meta["index"],
                    "post_id": response.meta["post_id"],
                    "group": response.meta["group"],
                },
            )
        else:
            next_reply = response.meta["url"]
            self.logger.info(
                "Nested comments crawl finished, heading to home page: {}".
                format(response.meta["url"]))
            yield scrapy.Request(
                next_reply,
                callback=self.parse_post,
                meta={
                    "index": response.meta["index"] + 1,
                    "post_id": response.meta["post_id"],
                    "group": response.meta["group"],
                },
            )
def parse_post(self, response):
    """
    parse post does multiple things:
        1) loads replied-to-comments page one-by-one (for DFS)
        2) call parse_reply on the nested comments
        3) adds simple (not-replied-to) comments
        4) follows to new comment page

    meta keys used: 'index' (DFS position, required), 'post_id' and
    'group' (optional), 'testpath' (ids seen on the previous page,
    optional — used as an infinite-loop guard).
    """
    # select nested comment one-by-one matching with the index:
    # response.meta['index']
    path = (
        './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]'
        + "[" + str(response.meta["index"]) + "]")
    # ids of every comment container on the page; if they are identical
    # to the previous page's ids, the "next" link looped back ("bomb")
    testpath = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1))]/@id'
    bomb = 0  # 1 = this page repeats the previous one, stop crawling it
    try:
        if response.xpath(testpath) != []:
            if response.meta["testpath"] == response.xpath(
                    testpath).extract():
                bomb = 1
    except KeyError:
        # first visit: no 'testpath' recorded in meta yet
        # (narrowed from a bare `except Exception`)
        self.logger.info("seems ok to continue")
    # fixed: the original `try: post_id = response.meta["post_id"]`
    # left post_id UNDEFINED on the except path ("single post"),
    # causing a NameError at new.add_value("post_id", post_id) below
    post_id = response.meta.get("post_id")
    if post_id is None:
        self.logger.info("single post")
    testpath = response.xpath(testpath).extract()
    group_flag = response.meta[
        "group"] if "group" in response.meta else None
    if bomb == 0:
        # 1) nested comment at the current DFS index, if any
        for reply in response.xpath(path):
            source = reply.xpath(".//h3/a/text()").extract()
            answer = reply.xpath(
                './/a[contains(@href,"repl")]/@href').extract()
            ans = response.urljoin(answer[::-1][0])
            self.logger.info("{} nested comment".format(
                str(response.meta["index"])))
            yield scrapy.Request(
                ans,
                callback=self.parse_reply,
                priority=1000,
                meta={
                    "reply_to": source,
                    "url": response.url,
                    "index": response.meta["index"],
                    "flag": "init",
                    # fixed: use the local post_id (may be None for a
                    # single post) instead of response.meta["post_id"],
                    # which raised KeyError when the key was absent
                    "post_id": post_id,
                    "group": group_flag,
                },
            )
        # load regular comments
        if not response.xpath(path):  # prevents from exec
            path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
            for i, reply in enumerate(response.xpath(path2)):
                self.logger.info("{} regular comment".format(i + 1))
                new = ItemLoader(item=CommentsItem(), selector=reply)
                new.context["lang"] = self.lang
                new.add_xpath("source", ".//h3/a/text()")
                new.add_xpath("source_url", ".//h3/a/@href")
                new.add_xpath("text", ".//div[h3]/div[1]//text()")
                new.add_xpath("date", ".//abbr/text()")
                new.add_xpath(
                    "reactions",
                    './/a[contains(@href,"reaction/profile")]//text()')
                new.add_value("post_id", post_id)
                new.add_value("url", response.url)
                yield new.load_item()
        # new comment page
        if not response.xpath(path):
            # for groups the pager runs backwards ("see_prev")
            next_xpath = './/div[contains(@id,"see_next")]'
            prev_xpath = './/div[contains(@id,"see_prev")]'
            if not response.xpath(next_xpath) or group_flag == 1:
                for next_page in response.xpath(prev_xpath):
                    new_page = next_page.xpath(".//@href").extract()
                    new_page = response.urljoin(new_page[0])
                    self.logger.info(
                        "New page to be crawled {}".format(new_page))
                    yield scrapy.Request(
                        new_page,
                        callback=self.parse_post,
                        meta={
                            "index": 1,
                            "post_id": post_id,
                            "testpath": testpath,
                            "group": 1,
                        },
                    )
            else:
                for next_page in response.xpath(next_xpath):
                    new_page = next_page.xpath(".//@href").extract()
                    new_page = response.urljoin(new_page[0])
                    self.logger.info(
                        "New page to be crawled {}".format(new_page))
                    yield scrapy.Request(
                        new_page,
                        callback=self.parse_post,
                        meta={
                            "index": 1,
                            "post_id": post_id,
                            "testpath": testpath,
                            "group": group_flag,
                        },
                    )
def parse_reply(self, response):
    '''
    Parse the replies of one comment; ``response.meta['flag']`` selects
    the phase ('init' = first reply page, also emits the root comment;
    'back' = earlier reply page reached while paging backwards).

    For every comment this variant also fires enrichment requests:
    the commenter profile (parse_profile) and the per-reaction page
    (parse_reactions), each carrying the partially-built item in meta.
    The item itself is yielded directly only when NEITHER enrichment
    request was issued (check == 0), to avoid emitting it twice.
    '''
    # from scrapy.utils.response import open_in_browser
    # open_in_browser(response)
    if response.meta['flag'] == 'init':
        #parse root comment
        for root in response.xpath(
                '//div[contains(@id,"root")]/div/div/div[count(@id)!=1 and contains("0123456789", substring(@id,1,1))]'
        ):
            new = ItemLoader(item=CommentsItem(), selector=root)
            new.context['lang'] = self.lang
            new.add_xpath('source', './/h3/a/text()')
            new.add_xpath('source_url', './/h3/a/@href')
            new.add_value('reply_to', 'ROOT')
            new.add_xpath('text', './/div[1]//text()')
            new.add_xpath('date', './/abbr/text()')
            new.add_xpath(
                'reactions',
                './/a[contains(@href,"reaction/profile")]//text()')
            new.add_value('url', response.url)
            """
            PROFILE REACTIONS SECTION
            selector is the root comment here (matches the for loop)
            """
            # fixed: guard against a missing source_url — the original
            # indexed [0] unconditionally and raised IndexError
            source_urls = new.get_collected_values('source_url')
            profile = ("https://mbasic.facebook.com" +
                       source_urls[0]) if source_urls else None
            self.logger.debug('profile %s', profile)
            item = new.load_item()
            check = 0
            if profile:
                check += 1
                yield scrapy.Request(profile,
                                     callback=self.parse_profile,
                                     meta={'item': item})
            # separate loader: reaction HREFs (the main loader collects
            # reaction text instead)
            temp = ItemLoader(item=CommentsItem(), selector=root)
            temp.context['lang'] = self.lang
            temp.add_xpath(
                'reactions',
                './/a[contains(@href,"reaction/profile")]/@href')
            reactions = temp.get_collected_values('reactions')
            if reactions:
                check += 1
                reactions = "https://mbasic.facebook.com" + reactions[0]
                yield scrapy.Request(reactions,
                                     callback=self.parse_reactions,
                                     meta={'item': item})
            if check == 0:
                # no enrichment request issued: emit the item directly
                yield item

        #parse all replies in the page
        for reply in response.xpath(
                '//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'
        ):
            new = ItemLoader(item=CommentsItem(), selector=reply)
            new.context['lang'] = self.lang
            new.add_xpath('source', './/h3/a/text()')
            new.add_xpath('source_url', './/h3/a/@href')
            new.add_value('reply_to', response.meta['reply_to'])
            new.add_xpath('text', './/div[h3]/div[1]//text()')
            new.add_xpath('date', './/abbr/text()')
            new.add_xpath(
                'reactions',
                './/a[contains(@href,"reaction/profile")]//text()')
            new.add_value('url', response.url)
            """
            PROFILE REACTIONS SECTION (same pattern as above,
            selector is the reply)
            """
            # fixed: guard against a missing source_url (IndexError)
            source_urls = new.get_collected_values('source_url')
            profile = ("https://mbasic.facebook.com" +
                       source_urls[0]) if source_urls else None
            item = new.load_item()
            check = 0
            if profile:
                check += 1
                yield scrapy.Request(profile,
                                     callback=self.parse_profile,
                                     meta={'item': item})
            temp = ItemLoader(item=CommentsItem(), selector=reply)
            temp.context['lang'] = self.lang
            temp.add_xpath(
                'reactions',
                './/a[contains(@href,"reaction/profile")]/@href')
            reactions = temp.get_collected_values('reactions')
            if reactions:
                check += 1
                reactions = "https://mbasic.facebook.com" + reactions[0]
                yield scrapy.Request(reactions,
                                     callback=self.parse_reactions,
                                     meta={'item': item})
            if check == 0:
                yield item

        # "show previous replies" link, if any
        back = response.xpath(
            '//div[contains(@id,"comment_replies_more_1")]/a/@href'
        ).extract()
        if back:
            self.logger.info('Back found, more nested comments')
            back_page = response.urljoin(back[0])
            yield scrapy.Request(back_page,
                                 callback=self.parse_reply,
                                 priority=1000,
                                 meta={
                                     'reply_to': response.meta['reply_to'],
                                     'flag': 'back',
                                     'url': response.meta['url'],
                                     'index': response.meta['index'],
                                     'group': response.meta['group']
                                 })
        else:
            # no more reply pages: resume parse_post at the next index
            next_reply = response.meta['url']
            self.logger.info(
                'Nested comments crawl finished, heading to proper page: {}'
                .format(response.meta['url']))
            yield scrapy.Request(next_reply,
                                 callback=self.parse_post,
                                 meta={
                                     'index': response.meta['index'] + 1,
                                     'group': response.meta['group']
                                 })

    elif response.meta['flag'] == 'back':
        """
        adds random time pauses to prevent blocking
        DOWNSIDE: the algorithm will go slower, but still runs pretty
        quickly; the greater the length of time, the more likely you'll
        go undetected, but if you're using a large amount of data, this
        may be unreasonable
        """
        # NOTE(review): time.sleep blocks the whole Twisted reactor, so
        # this stalls every concurrent request — deliberate trade-off
        r = randrange(0, 20)
        time.sleep(r)
        #parse all comments
        for reply in response.xpath(
                '//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'
        ):
            new = ItemLoader(item=CommentsItem(), selector=reply)
            new.context['lang'] = self.lang
            new.add_xpath('source', './/h3/a/text()')
            new.add_xpath('source_url', './/h3/a/@href')
            new.add_value('reply_to', response.meta['reply_to'])
            new.add_xpath('text', './/div[h3]/div[1]//text()')
            new.add_xpath('date', './/abbr/text()')
            new.add_xpath(
                'reactions',
                './/a[contains(@href,"reaction/profile")]//text()')
            new.add_value('url', response.url)
            """
            PROFILE REACTIONS SECTION (same pattern as in 'init')
            """
            # fixed: guard against a missing source_url (IndexError)
            source_urls = new.get_collected_values('source_url')
            profile = ("https://mbasic.facebook.com" +
                       source_urls[0]) if source_urls else None
            check = 0
            item = new.load_item()
            if profile:
                check += 1
                # was a bare print(1) — use the spider logger instead
                self.logger.debug('requesting profile %s', profile)
                yield scrapy.Request(profile,
                                     callback=self.parse_profile,
                                     meta={'item': item})
            temp = ItemLoader(item=CommentsItem(), selector=reply)
            temp.context['lang'] = self.lang
            temp.add_xpath(
                'reactions',
                './/a[contains(@href,"reaction/profile")]/@href')
            reactions = temp.get_collected_values('reactions')
            if reactions:
                check += 1
                reactions = "https://mbasic.facebook.com" + reactions[0]
                # was a bare print(2)
                self.logger.debug('requesting reactions %s', reactions)
                yield scrapy.Request(reactions,
                                     callback=self.parse_reactions,
                                     meta={'item': item})
            if check == 0:
                # was a bare print(3)
                self.logger.debug('no enrichment, yielding item directly')
                yield item

        #keep going backwards
        back = response.xpath(
            '//div[contains(@id,"comment_replies_more_1")]/a/@href'
        ).extract()
        if back:
            # fixed: the original logged 'Back found' unconditionally,
            # before checking the link actually exists
            self.logger.info('Back found, more nested comments')
            back_page = response.urljoin(back[0])
            yield scrapy.Request(back_page,
                                 callback=self.parse_reply,
                                 priority=1000,
                                 meta={
                                     'reply_to': response.meta['reply_to'],
                                     'flag': 'back',
                                     'url': response.meta['url'],
                                     'index': response.meta['index'],
                                     'group': response.meta['group']
                                 })
        else:
            next_reply = response.meta['url']
            self.logger.info(
                'Nested comments crawl finished, heading to home page: {}'.
                format(response.meta['url']))
            yield scrapy.Request(next_reply,
                                 callback=self.parse_post,
                                 meta={
                                     'index': response.meta['index'] + 1,
                                     'group': response.meta['group']
                                 })
def parse_post(self, response):
    '''
    parse post does multiple things:
        1) loads replied-to-comments page one-by-one (for DFS)
        2) call parse_reply on the nested comments
        3) adds simple (not-replied-to) comments, firing profile and
           reactions enrichment requests for each
        4) follows to new comment page

    meta keys used: 'index' (DFS position, required) and
    'group' (optional; 1 means paging backwards via "see_prev").
    '''
    #load replied-to comments pages
    #select nested comment one-by-one matching with the index: response.meta['index']
    path = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]' + '[' + str(
        response.meta['index']) + ']'
    group_flag = response.meta[
        'group'] if 'group' in response.meta else None

    # 1) nested comment at the current DFS index, if any
    for reply in response.xpath(path):
        source = reply.xpath('.//h3/a/text()').extract()
        answer = reply.xpath(
            './/a[contains(@href,"repl")]/@href').extract()
        ans = response.urljoin(answer[::-1][0])
        self.logger.info('{} nested comment'.format(
            str(response.meta['index'])))
        yield scrapy.Request(ans,
                             callback=self.parse_reply,
                             priority=1000,
                             meta={
                                 'reply_to': source,
                                 'url': response.url,
                                 'index': response.meta['index'],
                                 'flag': 'init',
                                 'group': group_flag
                             })
    #load regular comments
    if not response.xpath(path):  #prevents from exec
        path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
        for i, reply in enumerate(response.xpath(path2)):
            self.logger.info('{} regular comment'.format(i + 1))
            new = ItemLoader(item=CommentsItem(), selector=reply)
            new.context['lang'] = self.lang
            new.add_xpath('source', './/h3/a/text()')
            new.add_xpath('source_url', './/h3/a/@href')
            new.add_xpath('text', './/div[h3]/div[1]//text()')
            new.add_xpath('date', './/abbr/text()')
            new.add_xpath(
                'reactions',
                './/a[contains(@href,"reaction/profile")]//text()')
            new.add_value('url', response.url)
            """
            PROFILE REACTIONS SECTION
            adds functionality for adding profile and specific reaction
            data: gets the profile url, creates a new item; if the
            profile exists, add info to a new item and increment 'check'
            to signal that new information has been added and the item
            has already been forwarded; repeat for reactions. The item
            is yielded directly only when check == 0.
            """
            # fixed: guard against a missing source_url — the original
            # indexed [0] unconditionally and raised IndexError
            source_urls = new.get_collected_values('source_url')
            profile = ("https://mbasic.facebook.com" +
                       source_urls[0]) if source_urls else None
            item = new.load_item()
            check = 0
            if profile:
                check += 1
                yield scrapy.Request(profile,
                                     callback=self.parse_profile,
                                     meta={'item': item})
            # separate loader: reaction HREFs (the main loader collects
            # reaction text instead)
            temp = ItemLoader(item=CommentsItem(), selector=reply)
            temp.context['lang'] = self.lang
            temp.add_xpath(
                'reactions',
                './/a[contains(@href,"reaction/profile")]/@href')
            reactions = temp.get_collected_values('reactions')
            if reactions:
                check += 1
                reactions = "https://mbasic.facebook.com" + reactions[0]
                yield scrapy.Request(reactions,
                                     callback=self.parse_reactions,
                                     meta={'item': item})
            if check == 0:
                yield item
    #new comment page
    if not response.xpath(path):
        #for groups the pager runs backwards ("see_prev")
        next_xpath = './/div[contains(@id,"see_next")]'
        prev_xpath = './/div[contains(@id,"see_prev")]'
        if not response.xpath(next_xpath) or group_flag == 1:
            for next_page in response.xpath(prev_xpath):
                new_page = next_page.xpath('.//@href').extract()
                new_page = response.urljoin(new_page[0])
                self.logger.info(
                    'New page to be crawled {}'.format(new_page))
                yield scrapy.Request(new_page,
                                     callback=self.parse_post,
                                     meta={
                                         'index': 1,
                                         'group': 1
                                     })
        else:
            for next_page in response.xpath(next_xpath):
                new_page = next_page.xpath('.//@href').extract()
                new_page = response.urljoin(new_page[0])
                self.logger.info(
                    'New page to be crawled {}'.format(new_page))
                yield scrapy.Request(new_page,
                                     callback=self.parse_post,
                                     meta={
                                         'index': 1,
                                         'group': group_flag
                                     })