def download_image(self, response):
    # full_img_url = response.xpath('//a[contains(text(), "View")]')
    img_urls = response.meta['img_urls']
    remaining_images = response.meta['remaining_images']
    new = ItemLoader(item=FbcrawlItem(), response=response,
                     parent=response.meta['item'])
    new.context['lang'] = self.lang
    CDN_DOMAIN_MATCH = 'https://scontent-tpe1-1.xx.fbcdn.net'  # example of the CDN host matched by 'scontent' below
    # keep the first CDN-hosted image found on this photo page
    for img_element in response.xpath('//img/@src'):
        img_url = img_element.extract()
        if 'scontent' in img_url:
            img_urls.append(img_url)
            break
    if len(remaining_images) > 0:
        # recurse through the remaining photo pages, carrying state in meta
        img_url = remaining_images.pop()
        yield scrapy.Request(img_url,
                             callback=self.download_image,
                             meta={'remaining_images': remaining_images,
                                   'item': new,
                                   'img_urls': img_urls,
                                   'check_reactions': response.meta['check_reactions']})
    else:
        new.add_value('image_urls', img_urls)
        reaction_payload = response.meta['check_reactions']
        if reaction_payload['check']:
            yield scrapy.Request(reaction_payload['url'],
                                 callback=self.parse_reactions,
                                 meta={'item': new})
        else:
            yield new.load_item()
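# Note: the 'image_urls' field populated above follows the naming convention
# of Scrapy's built-in ImagesPipeline. If the collected URLs are meant to be
# downloaded by it, the project settings would need roughly the following
# (a sketch; the store path is an assumption, not taken from this code):
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = 'images'  # hypothetical output directory for downloaded files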
def parse_post(self, response):
    new = ItemLoader(item=FbcrawlItem(), response=response,
                     parent=response.meta['item'])
    new.context['lang'] = self.lang
    new.add_xpath('source', "//td/div/h3/strong/a/text() | //span/strong/a/text() | //div/div/div/a[contains(@href,'post_id')]/strong/text()")
    new.add_xpath('shared_from', '//div[contains(@data-ft,"top_level_post_id") and contains(@data-ft,\'"isShare":1\')]/div/div[3]//strong/a/text()')
    new.add_xpath('post_text', '//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()')

    # check reactions for old posts
    check_reactions = response.xpath("//a[contains(@href,'reaction/profile')]/div/div/text()").get()
    if not check_reactions:
        yield new.load_item()
    else:
        reactions = response.xpath("//div[contains(@id,'sentence')]/a[contains(@href,'reaction/profile')]/@href")
        reactions = response.urljoin(reactions[0].extract())
        yield scrapy.Request(reactions,
                             callback=self.parse_reactions,
                             meta={'item': new,
                                   'current_date': response.meta['current_date']})
def parse_post(self, response):
    new = ItemLoader(item=FbcrawlItem(), response=response,
                     parent=response.meta['item'])
    new.context['lang'] = self.lang
    new.add_xpath('source', "//td/div/h3/strong/a/text() | //span/strong/a/text() | //div/div/div/a[contains(@href,'post_id')]/strong/text()")
    new.add_xpath('shared_from', '//div[contains(@data-ft,"top_level_post_id") and contains(@data-ft,\'"isShare":1\')]/div/div[3]//strong/a/text()')
    # new.add_xpath('date','//div/div/abbr/text()')
    new.add_xpath('text', '//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()')

    # crawl links that lead to external websites (requires module-level
    # `import requests` and `import re`)
    try:
        internal_link = response.xpath("//div//a[contains(@href, '://')]/@href").get()
        internal_redirection_page = requests.get(internal_link)
        internal_redirection_page_content = internal_redirection_page.content.decode('utf-8').replace('\\', '')
        # find the first link-like substring in the redirecting page; this
        # includes shortened links such as "https://trib.al/<some_shortened_link>"
        shortened_external_link = re.findall('https:[a-zA-Z0-9/.?=\n_]*', internal_redirection_page_content)[0]
        # follow the shortened link to its final destination
        external_link = requests.get(shortened_external_link).url
        new.add_value('link', external_link)
    except Exception:
        new.add_value('link', '')

    # check reactions for old posts
    check_reactions = response.xpath("//a[contains(@href,'reaction/profile')]/div/div/text()").get()
    if not check_reactions:
        yield new.load_item()
    else:
        new.add_xpath('reactions', "//a[contains(@href,'reaction/profile')]/div/div/text()")
        reactions = response.xpath("//div[contains(@id,'sentence')]/a[contains(@href,'reaction/profile')]/@href")
        reactions = response.urljoin(reactions[0].extract())
        yield scrapy.Request(reactions, callback=self.parse_reactions,
                             meta={'item': new})
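# The redirect-resolution step above, isolated as a helper for clarity. This
# is a minimal sketch of the same behaviour: it assumes the interstitial page
# embeds a backslash-escaped shortener URL (e.g. "https:\/\/trib.al/...")
# that `requests` can then follow to its final destination.
import re
import requests

def resolve_external_link(redirect_url):
    """Return the final external URL behind a Facebook interstitial page,
    or '' if nothing link-like is found or a request fails."""
    try:
        body = requests.get(redirect_url).content.decode('utf-8').replace('\\', '')
        # first link-like substring, shortened links included
        shortened = re.findall('https:[a-zA-Z0-9/.?=\n_]*', body)[0]
        # requests follows redirects by default, so .url is the final address
        return requests.get(shortened).url
    except (requests.RequestException, UnicodeDecodeError, IndexError):
        return ''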
def parse_page(self, response):
    '''
    Parse the given page selecting the posts.
    Then ask recursively for another page.
    '''
    # select all posts
    for post in response.xpath("//div[contains(@id,'m_group_stories')]//div[contains(@data-ft,'top_level_post_id')]"):
        new = ItemLoader(item=FbcrawlItem(), selector=post)
        self.logger.info('Parsing post n = {}'.format(abs(self.count)))
        new.add_xpath('comments', "./div[2]/div[2]/a[1]/text()")

        # returns the full post link in a list
        post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
        temp_post = response.urljoin(post[0])
        self.count -= 1
        yield scrapy.Request(temp_post, self.parse_post, priority=1000,
                             meta={'item': new})

    # load the following page: try to click on "more"
    new_page = response.xpath("//div[contains(@id,'m_group_stories')]/div[2]/a/@href").extract()
    if not new_page:
        self.logger.info('Crawling has finished with no errors!')
    else:
        self.logger.info('new page')
        self.k -= 1
        new_page = response.urljoin(new_page[0])
        yield scrapy.Request(new_page, callback=self.parse_page)
def parse_post(self, response):
    new = ItemLoader(item=FbcrawlItem(), response=response,
                     parent=response.meta['item'])
    # XPath 1.0 conditional: cut the profile href at '&' when it is a
    # profile.php link, otherwise at '?' (see the note after this function)
    new.add_xpath('source', "substring-before(.//div[1]/div/div/div/table//strong[1]/a[1]/@href, concat(substring('&', 1 div contains(.//div[1]/div/div/div/table//strong[1]/a[1]/@href, 'profile.php')), substring('?', 1 div not(contains(.//div[1]/div/div/div/table//strong[1]/a[1]/@href, 'profile.php')))))")
    new.add_xpath('shared_from', '//div[contains(@data-ft,"top_level_post_id") and contains(@data-ft,\'"isShare":1\')]/div/div[3]//strong/a/text()')
    new.add_xpath('date', '//div/div/abbr/text()')
    new.add_xpath('text', '//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()')
    new.add_xpath('reactions', "//a[contains(@href,'reaction/profile')]/div/div/text()")
    new.add_value('url', response.url)

    reactions = response.xpath("//div[contains(@id,'sentence')]/a[contains(@href,'reaction/profile')]/@href")
    reactions = response.urljoin(reactions[0].extract())
    yield scrapy.Request(reactions, callback=self.parse_reactions,
                         meta={'item': new})
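# The 'source' expression above relies on the classic XPath 1.0 conditional
# idiom: substring(s, 1 div cond) evaluates to s when cond is true (start
# index 1) and to '' when it is false (1 div 0 = Infinity), so
# concat(substring(a, 1 div C), substring(b, 1 div not(C))) reads as
# "C ? a : b". A standalone illustration with lxml (assumed available; not
# part of the spider):
from lxml import etree

doc = etree.fromstring('<a href="/profile.php?id=42&amp;ref=x"/>')
expr = ("substring-before(//a/@href, concat("
        "substring('&', 1 div contains(//a/@href, 'profile.php')),"
        "substring('?', 1 div not(contains(//a/@href, 'profile.php')))))")
print(doc.xpath(expr))  # '/profile.php?id=42' -- profile.php links are cut at '&'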
def parse_group(self, response):
    '''
    Parse the given page selecting the posts.
    Then ask recursively for another page.
    '''
    # select all posts
    for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):
        new = ItemLoader(item=FbcrawlItem(), selector=post)
        self.logger.info('Parsing post n = {}'.format(abs(self.count)))
        new.add_xpath('comments', "./div[2]/div[2]/a[1]/text()")
        new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
        new.add_xpath('membersgroup', "//td/span[contains(@id,'u_0_2')]/text()")
        new.add_xpath('photosgroup', "//td/span[contains(@id,'u_0_4')]/text()")
        # page_url
        # new.add_value('url', response.url)

        # returns the full post link in a list
        post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
        temp_post = response.urljoin(post[0])
        self.count -= 1
        yield scrapy.Request(temp_post, self.parse_post, priority=self.count,
                             meta={'item': new})

    # load the following page: try to click on "more", otherwise look for the
    # appropriate year link (1-click only) and proceed to click on others
    new_group = response.xpath("//div[2]/a[contains(@href,'permalinks&refid')]/@href").extract()
    if new_group:  # guard against an IndexError on the last page
        new_group = response.urljoin(new_group[0])
        yield scrapy.Request(new_group, callback=self.parse_group)
def parse_page(self, response):
    '''
    Parse the given page selecting the posts.
    Then ask recursively for another page.
    '''
    # select all posts
    for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):
        new = ItemLoader(item=FbcrawlItem(), selector=post)
        self.logger.info('Parsing post n = {}'.format(abs(self.count)))
        new.add_xpath('comments', "./div[2]/div[2]/a[1]/text()")
        new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
        # page_url
        # new.add_value('url', response.url)

        # returns the full post link in a list
        post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
        temp_post = response.urljoin(post[0])
        self.count -= 1
        yield scrapy.Request(temp_post, self.parse_post, priority=self.count,
                             meta={'item': new})

    # load the following page: try to click on "more", otherwise look for the
    # appropriate year link (1-click only) and proceed to click on others
    new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
    if not new_page:
        if response.meta['flag'] == self.k and self.k >= self.year:
            self.logger.info('There are no more, flag set at = {}'.format(self.k))
            xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
            new_page = response.xpath(xpath).extract()
            if new_page:
                new_page = response.urljoin(new_page[0])
                self.k -= 1
                self.logger.info('Everything OK, new flag: {}'.format(self.k))
                yield scrapy.Request(new_page, callback=self.parse_page,
                                     meta={'flag': self.k})
            else:
                while not new_page:  # sometimes the years are skipped
                    self.logger.info('XPATH not found for year {}'.format(self.k - 1))
                    self.k -= 1
                    self.logger.info('Trying with previous year, flag={}'.format(self.k))
                    if self.k < self.year:
                        self.logger.info('The previous year to crawl is less than the parameter year: {} < {}'.format(self.k, self.year))
                        self.logger.info('This is not handled well, please re-run with -a year="{}" or less'.format(self.k))
                        break
                    xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
                    new_page = response.xpath(xpath).extract()
                if new_page:  # guard: the loop may exit via break with no link
                    self.logger.info('New page found with flag {}'.format(self.k))
                    new_page = response.urljoin(new_page[0])
                    self.k -= 1
                    self.logger.info('Now going with flag {}'.format(self.k))
                    yield scrapy.Request(new_page, callback=self.parse_page,
                                         meta={'flag': self.k})
        else:
            self.logger.info('Crawling has finished with no errors!')
    else:
        new_page = response.urljoin(new_page[0])
        if 'flag' in response.meta:
            self.logger.info('Page scraped, click on more! flag = {}'.format(response.meta['flag']))
            yield scrapy.Request(new_page, callback=self.parse_page,
                                 meta={'flag': response.meta['flag']})
        else:
            self.logger.info('FLAG DOES NOT ALWAYS REPRESENT ACTUAL YEAR')
            self.logger.info('First page scraped, click on more! Flag not set, default flag = {}'.format(self.k))
            yield scrapy.Request(new_page, callback=self.parse_page,
                                 meta={'flag': self.k})
def parse_post(self, response):
    new = ItemLoader(item=FbcrawlItem(), response=response,
                     parent=response.meta['item'])
    new.add_xpath('source', "//td/div/h3/strong/a/text() | //span/strong/a/text() | //div/div/div/a[contains(@href,'post_id')]/strong/text()")
    new.add_xpath('shared_from', '//div[contains(@data-ft,"top_level_post_id") and contains(@data-ft,\'"isShare":1\')]/div/div[3]//strong/a/text()')
    new.add_xpath('date', '//div/div/abbr/text()')
    new.add_xpath('text', '//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()')
    new.add_xpath('reactions', "//a[contains(@href,'reaction/profile')]/div/div/text()")

    reactions = response.xpath("//div[contains(@id,'sentence')]/a[contains(@href,'reaction/profile')]/@href")
    reactions = response.urljoin(reactions[0].extract())
    yield scrapy.Request(reactions, callback=self.parse_reactions,
                         meta={'item': new})
def parse_page(self, response):
    # select all posts
    for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):
        new = ItemLoader(item=FbcrawlItem(), selector=post)
        new.add_xpath('comments', ".//div/a[contains(text(),'comment')]/text()")
        new.add_xpath('url', ".//a[contains(text(),'Notizia completa')]/@href")

        # returns the full post link in a list ('Notizia completa' is the
        # Italian UI label for "Full story")
        post = post.xpath(".//a[contains(text(),'Notizia completa')]/@href").extract()
        temp_post = response.urljoin(post[0])
        yield scrapy.Request(temp_post, self.parse_post, dont_filter=True,
                             meta={'item': new})

    # follow the "more" link ('Altri' in the Italian UI), falling back to the
    # 2017 year link
    next_page = response.xpath("//div/a[contains(text(),'Altri')]/@href")
    if len(next_page) > 0:
        next_page = response.urljoin(next_page[0].extract())
        yield scrapy.Request(next_page, callback=self.parse_page)
    else:
        next_page = response.xpath("//div/a[contains(text(),'2017')]/@href")
        if len(next_page) > 0:
            next_page = response.urljoin(next_page[0].extract())
            yield scrapy.Request(next_page, callback=self.parse_page)
def parse_reactions(self, response):
    new = ItemLoader(item=FbcrawlItem(), response=response,
                     parent=response.meta['item'])
    new.add_xpath('likes', "//a[contains(@href,'reaction_type=1')]/span/text()")
    new.add_xpath('ahah', "//a[contains(@href,'reaction_type=4')]/span/text()")
    new.add_xpath('love', "//a[contains(@href,'reaction_type=2')]/span/text()")
    new.add_xpath('wow', "//a[contains(@href,'reaction_type=3')]/span/text()")
    new.add_xpath('sigh', "//a[contains(@href,'reaction_type=7')]/span/text()")
    new.add_xpath('grrr', "//a[contains(@href,'reaction_type=8')]/span/text()")
    yield new.load_item()
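# The reaction_type codes map one-to-one onto item fields, so the loader
# calls above can also be driven by a table. An equivalent sketch (same
# fields, same XPaths, unchanged behaviour):
REACTION_FIELDS = {1: 'likes', 4: 'ahah', 2: 'love', 3: 'wow', 7: 'sigh', 8: 'grrr'}

def parse_reactions(self, response):
    new = ItemLoader(item=FbcrawlItem(), response=response,
                     parent=response.meta['item'])
    for code, field in REACTION_FIELDS.items():
        new.add_xpath(field, "//a[contains(@href,'reaction_type={}')]/span/text()".format(code))
    yield new.load_item()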
def parse_rispostina(self, response):
    # select all replies
    for daje in response.xpath("//div[contains(@id,'root')]/div/div/div"):
        new = ItemLoader(item=FbcrawlItem(), selector=daje)
        new.add_xpath('source', ".//h3/a/text()")  # | ./div/div/h3/a/text()
        new.add_xpath('text', ".//span[not(contains(text(),' · ')) and not(contains(text(),'Visualizza'))]/text() | .//div/text()")
        yield new.load_item()
def parse_post(self, response):
    new = ItemLoader(item=FbcrawlItem(), response=response,
                     parent=response.meta['item'])
    new.context['lang'] = self.lang
    new.add_xpath('source', "//td/div/h3/strong/a/text() | //span/strong/a/text() | //div/div/div/a[contains(@href,'post_id')]/strong/text()")
    new.add_xpath('shared_from', '//div[contains(@data-ft,"top_level_post_id") and contains(@data-ft,\'"isShare":1\')]/div/div[3]//strong/a/text()')
    # new.add_xpath('date','//div/div/abbr/text()')
    new.add_xpath('text', '//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()')

    # check reactions for old posts; defer the reactions request until the
    # image chain (if any) has finished
    check_reactions = response.xpath("//a[contains(@href,'reaction/profile')]/div/div/text()").get()
    if check_reactions:
        new.add_xpath('reactions', "//a[contains(@href,'reaction/profile')]/div/div/text()")
        reactions = response.xpath("//div[contains(@id,'sentence')]/a[contains(@href,'reaction/profile')]/@href")
        reactions = response.urljoin(reactions[0].extract())
        reaction_payload = {'check': True, 'url': reactions}
    else:
        reaction_payload = {'check': False}

    image_path = response.xpath('//div[@data-ft]/div[@class]/a/@href')
    if image_path.get() and self.parse_image:
        # collect the /photo.php links and walk them one request at a time
        img_prefix = '/photo.php'
        found_img_urls = []
        for selected_url in image_path:
            url = selected_url.extract()
            if img_prefix in url:
                found_img_urls.append(response.urljoin(url))
        if len(found_img_urls) > 0:
            first_url = found_img_urls.pop()
            yield scrapy.Request(first_url,
                                 callback=self.download_image,
                                 meta={'remaining_images': found_img_urls,
                                       'item': new,
                                       'img_urls': [],
                                       'check_reactions': reaction_payload})
        else:
            yield new.load_item()
    else:
        new.add_value('image_urls', [])
        if reaction_payload['check']:
            yield scrapy.Request(reaction_payload['url'],
                                 callback=self.parse_reactions,
                                 meta={'item': new})
        else:
            yield new.load_item()
def parse_post(self, response):
    # keep a running log of every crawled post URL
    with open('comment_urls.csv', 'a+') as f:
        f.write(str(response.url) + '\n')

    new = ItemLoader(item=FbcrawlItem(), response=response,
                     parent=response.meta['item'])
    new.add_xpath('source', "//td/div/h3/strong/a/text() | //span/strong/a/text() | //div/div/div/a[contains(@href,'post_id')]/strong/text()")
    new.add_xpath('shared_from', '//div[contains(@data-ft,"top_level_post_id") and contains(@data-ft,\'"isShare":1\')]/div/div[3]//strong/a/text()')
    new.add_xpath('date', '//div/div/abbr/text()')

    # strip semicolons from the text so it cannot break the ';'-delimited output
    content = response.xpath('//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()').extract()
    contents = [c.replace(';', ' ') for c in content]
    new.add_value('text', contents)

    new.add_xpath('reactions', "//a[contains(@href,'reaction/profile')]/div/div/text()")
    reactions = response.xpath("//div[contains(@id,'sentence')]/a[contains(@href,'reaction/profile')]/@href")
    reactions = response.urljoin(reactions[0].extract())
    yield scrapy.Request(reactions, callback=self.parse_reactions,
                         meta={'item': new})
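# The ';' stripping above suggests the output is a semicolon-delimited CSV.
# An alternative sketch using the standard csv module, which quotes fields
# instead of mutating them (append_row and its arguments are hypothetical,
# not part of this codebase):
import csv

def append_row(path, row):
    # newline='' prevents blank lines on Windows; quoting is handled by csv
    with open(path, 'a', newline='') as f:
        csv.writer(f, delimiter=';').writerow(row)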
def parse_page(self, response):
    # select all comment blocks on the photo page
    for post in response.xpath('//div[@id="MPhotoContent"]/div/div/div/div/div[not(contains(@id,"see"))]'):
        new = ItemLoader(item=FbcrawlItem(), selector=post)
        new.add_xpath('source', "./div/h3/a/text()")
        new.add_xpath('text', "div/div/span[not(contains(text(),' · '))]/text() | ./div/div/text()")
        yield new.load_item()

    # follow every replies link ('rispost' matches the Italian UI label)
    rispostina = response.xpath('//div/a[contains(text(),"rispost")]/@href')
    for i in range(len(rispostina)):
        risp = response.urljoin(rispostina[i].extract())
        yield scrapy.Request(risp, callback=self.parse_rispostina)

    next_page = response.xpath("//div[contains(@id,'see_next')]/a/@href")
    if len(next_page) > 0:
        next_page = response.urljoin(next_page[0].extract())
        yield scrapy.Request(next_page, callback=self.parse_page)
def parse_reactions(self, response):
    new = ItemLoader(item=FbcrawlItem(), response=response,
                     parent=response.meta['item'])
    new.context['lang'] = self.lang
    new.add_xpath('likes', "//a[contains(@href,'reaction_type=1')]/span/text()")
    new.add_xpath('haha', "//a[contains(@href,'reaction_type=4')]/span/text()")
    new.add_xpath('love', "//a[contains(@href,'reaction_type=2')]/span/text()")
    new.add_xpath('wow', "//a[contains(@href,'reaction_type=3')]/span/text()")
    new.add_xpath('sigh', "//a[contains(@href,'reaction_type=7')]/span/text()")
    new.add_xpath('angry', "//a[contains(@href,'reaction_type=8')]/span/text()")
    new.add_xpath('care', "//a[contains(@href,'reaction_type=16')]/span/text()")
    yield new.load_item()

    current_date = response.meta['current_date']
    if self.date >= current_date:
        # time.sleep(60)
        raise CloseSpider('Reached date: {}'.format(self.date))
def parse_page(self, response):
    '''
    Parse the given page selecting the posts.
    Then ask recursively for another page.
    '''
    # # open page in browser for debug
    # from scrapy.utils.response import open_in_browser
    # open_in_browser(response)

    # select all posts
    for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):
        many_features = post.xpath('./@data-ft').get()
        date = parse_date2([many_features])
        current_date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
        if self.date > current_date:
            raise CloseSpider('Reached date: {}'.format(self.date))

        new = ItemLoader(item=FbcrawlItem(), selector=post)
        self.logger.info('Parsing post n = {}'.format(abs(self.count)))
        new.add_xpath('comments', './div[2]/div[2]/a[1]/text()')
        new.add_xpath('date', './@data-ft')
        new.add_xpath('post_id', './@data-ft')
        new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
        # page_url
        # new.add_value('url', response.url)

        # returns the full post link in a list
        post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
        temp_post = response.urljoin(post[0])
        self.count -= 1
        yield scrapy.Request(temp_post, self.parse_post, priority=self.count,
                             meta={'item': new})

    # load the following page, try to click on "more";
    # after a few pages the "more" link might disappear: if it is not present,
    # look for the highest year not parsed yet, click it once and keep looking
    # for "more"
    new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
    if not new_page:
        if response.meta['flag'] == self.k and self.k >= self.year:
            self.logger.info('There are no more, flag set at = {}'.format(self.k))
            xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
            new_page = response.xpath(xpath).extract()
            if new_page:
                new_page = response.urljoin(new_page[0])
                self.k -= 1
                self.logger.info('Everything OK, new flag: {}'.format(self.k))
                yield scrapy.Request(new_page, callback=self.parse_page,
                                     meta={'flag': self.k})
            else:
                while not new_page:
                    # sometimes years are skipped; this handles small year gaps
                    self.logger.info('XPATH not found for year {}'.format(self.k - 1))
                    self.k -= 1
                    self.logger.info('Trying with previous year, flag={}'.format(self.k))
                    if self.k < self.year:
                        self.logger.info('The previous year to crawl is less than the parameter year: {} < {}'.format(self.k, self.year))
                        self.logger.info('This is not handled well, please re-run with -a year="{}" or less'.format(self.k))
                        break
                    xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
                    new_page = response.xpath(xpath).extract()
                if new_page:  # guard: the loop may exit via break with no link
                    self.logger.info('New page found with flag {}'.format(self.k))
                    new_page = response.urljoin(new_page[0])
                    self.k -= 1
                    self.logger.info('Now going with flag {}'.format(self.k))
                    yield scrapy.Request(new_page, callback=self.parse_page,
                                         meta={'flag': self.k})
        else:
            self.logger.info('Crawling has finished with no errors!')
    else:
        new_page = response.urljoin(new_page[0])
        if 'flag' in response.meta:
            self.logger.info('Page scraped, click on more! new_page = {} flag = {}'.format(new_page, date))
            yield scrapy.Request(new_page, callback=self.parse_page,
                                 meta={'flag': response.meta['flag']})
        else:
            # self.logger.info('FLAG DOES NOT ALWAYS REPRESENT ACTUAL YEAR')
            self.logger.info('First page scraped, click on more {}! Flag not set, default flag = {}'.format(new_page, date))
            yield scrapy.Request(new_page, callback=self.parse_page,
                                 meta={'flag': self.k})
def parse_page(self, response):
    for post in response.xpath("//article[contains(@data-ft,'top_level_post_id')]"):
        try:
            many_features = post.xpath('./@data-ft').get()
            date = parse_date([many_features], {'lang': self.lang})
            current_date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S') if date is not None else date
            if current_date is None:
                date_string = post.xpath('.//abbr/text()').get()
                date = parse_date2([date_string], {'lang': self.lang})
                current_date = datetime(date.year, date.month, date.day) if date is not None else date
            date = str(date)

            # if the 'date' argument is reached, stop crawling
            if self.date > current_date:
                raise CloseSpider('Reached date: {}'.format(self.date))

            new = ItemLoader(item=FbcrawlItem(), selector=post)
            if abs(self.count) + 1 > self.max:
                raise CloseSpider('Reached max num of post: {}. Crawling finished'.format(abs(self.count)))
            self.logger.info('Parsing post n = {}, post_date = {}'.format(abs(self.count) + 1, date))
            new.add_xpath('comments', './div[2]/div[2]/a[1]/text()')
            new.add_value('date', date)
            new.add_xpath('post_id', './@data-ft')
            new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
            # page_url
            # new.add_value('url', response.url)

            # returns the full post link in a list
            post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
            temp_post = response.urljoin(post[0])
            self.count -= 1
            yield scrapy.Request(temp_post, self.parse_post, priority=self.count,
                                 meta={'item': new})
        except CloseSpider:
            raise  # a bare except here would swallow the stop conditions above
        except Exception:
            continue

    # load the following page, try to click on "more";
    # after a few pages the "more" link might disappear: if it is not present,
    # look for the highest year not parsed yet, click it once and go back to
    # clicking "more". This only gets the "more" button link if one exists;
    # new_page is different for groups
    if self.group == 1:
        new_page = response.xpath("//div[contains(@id,'stories_container')]/div[2]/a/@href").extract()
        print(new_page)
    else:
        new_page = response.xpath('//*[@id="structured_composer_async_container"]/div[2]/a/@href').extract()
        # with open('0.html', 'wb') as f:
        #     f.write(response.body)
        # ^ this is why lang is needed: the link is matched by position and by
        # English text ('Recent' is filtered out below)

    if not new_page or 'Recent' in str(response.xpath('//*[@id="structured_composer_async_container"]/div[2]/a').extract()):
        self.logger.info('[!] "more" link not found, will look for a "year" link')
        # self.k is the year link that we look for
        if response.meta['flag'] == self.k and self.k >= self.year:
            xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
            new_page = response.xpath(xpath).extract()
            if new_page:
                new_page = response.urljoin(new_page[0])
                self.k -= 1
                self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k, new_page))
                yield scrapy.Request(new_page, callback=self.parse_page,
                                     meta={'flag': self.k})
            else:
                while not new_page:
                    # sometimes years are skipped; this handles small year gaps
                    self.logger.info('Link not found for year {}, trying with previous year {}'.format(self.k, self.k - 1))
                    self.k -= 1
                    if self.k < self.year:
                        raise CloseSpider('Reached date: {}. Crawling finished'.format(self.date))
                    xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
                    new_page = response.xpath(xpath).extract()
                self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k, new_page))
                new_page = response.urljoin(new_page[0])
                self.k -= 1
                yield scrapy.Request(new_page, callback=self.parse_page,
                                     meta={'flag': self.k})
        else:
            self.logger.info('Crawling has finished with no errors!')
    else:
        new_page = response.urljoin(new_page[0])
        if 'flag' in response.meta:
            self.logger.info('Page scraped, clicking on "more"! new_page = {}'.format(new_page))
            yield scrapy.Request(new_page, callback=self.parse_page,
                                 meta={'flag': response.meta['flag']})
        else:
            self.logger.info('First page scraped, clicking on "more"! new_page = {}'.format(new_page))
            yield scrapy.Request(new_page, callback=self.parse_page,
                                 meta={'flag': self.k})
def parse_page(self, response):
    '''
    Parse the given page selecting the posts.
    Then ask recursively for another page.
    '''
    # select all posts
    for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):
        new = ItemLoader(item=FbcrawlItem(), selector=post)
        new.add_xpath('comments', "./div[2]/div[2]/a[1]/text()")
        new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
        new.add_xpath('reactions', ".//a[contains(@aria-label,'reactions')]/text()")
        # page_url
        # new.add_value('url', response.url)

        # returns the full post link in a list
        post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
        temp_post = response.urljoin(post[0])
        yield scrapy.Request(temp_post, self.parse_post, meta={'item': new})

    # load the following page; the flag walks the year links backwards:
    # flag 0 clicks '2018', 1 clicks '2017', 2 clicks '2016', 3 clicks '2015',
    # 4 clicks '2014'
    next_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
    if len(next_page) == 0:
        if response.meta['flag'] == 4 and self.year <= 2014:
            self.logger.info('2014 reached, flag = 5')
            next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2014')]/@href").extract()
            self.logger.info('next_page = {}'.format(next_page[0]))
            new_page = response.urljoin(next_page[0])
            yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag': 5})
        elif response.meta['flag'] == 3 and self.year <= 2015:
            self.logger.info('2015 reached, flag = 4')
            next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2015')]/@href").extract()
            self.logger.info('next_page = {}'.format(next_page[0]))
            new_page = response.urljoin(next_page[0])
            yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag': 4})
        elif response.meta['flag'] == 2 and self.year <= 2016:
            self.logger.info('2016 reached, flag = 3')
            next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2016')]/@href").extract()
            self.logger.info('next_page = {}'.format(next_page[0]))
            new_page = response.urljoin(next_page[0])
            yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag': 3})
        elif response.meta['flag'] == 1 and self.year <= 2017:
            self.logger.info('2017 reached, flag = 2')
            next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2017')]/@href").extract()
            self.logger.info('next_page = {}'.format(next_page[0]))
            new_page = response.urljoin(next_page[0])
            yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag': 2})
        elif response.meta['flag'] == 0 and self.year <= 2018:
            self.logger.info('2018 reached, flag = 1')
            next_page = response.xpath("//div/a[contains(@href,'time') and contains(text(),'2018')]/@href").extract()
            self.logger.info('next_page = {}'.format(next_page[0]))
            new_page = response.urljoin(next_page[0])
            yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag': 1})
    else:
        new_page = response.urljoin(next_page[0])
        if 'flag' in response.meta:
            yield scrapy.Request(new_page, callback=self.parse_page,
                                 meta={'flag': response.meta['flag']})
        else:
            yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag': 0})
def parse_page(self, response):
    '''
    Parse the given page selecting the posts.
    Then ask recursively for another page.
    '''
    # # open page in browser for debug
    # from scrapy.utils.response import open_in_browser
    # open_in_browser(response)

    # select all posts
    for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):
        many_features = post.xpath('./@data-ft').get()
        date = parse_date([many_features], {'lang': self.lang})
        current_date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S') if date is not None else date
        if current_date is None:
            date_string = post.xpath('.//abbr/text()').get()
            date = parse_date2([date_string], {'lang': self.lang})
            current_date = datetime(date.year, date.month, date.day) if date is not None else date
        date = str(date)

        # if the 'date' argument is reached, stop crawling
        if self.date > current_date:
            raise CloseSpider('Reached date: {}'.format(self.date))

        new = ItemLoader(item=FbcrawlItem(), selector=post)
        if abs(self.count) + 1 > self.max:
            raise CloseSpider('Reached max num of post: {}. Crawling finished'.format(abs(self.count)))
        self.logger.info('Parsing post n = {}, post_date = {}'.format(abs(self.count) + 1, date))
        new.add_xpath('comments', './div[2]/div[2]/a[1]/text()')
        new.add_value('date', date)
        new.add_xpath('post_id', './@data-ft')
        new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
        # page_url
        # new.add_value('url', response.url)

        # returns the full post link in a list
        post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
        temp_post = response.urljoin(post[0])
        self.count -= 1
        yield scrapy.Request(temp_post, self.parse_post, priority=self.count,
                             meta={'item': new})

    # load the following page, try to click on "more";
    # after a few pages the "more" link might disappear: if it is not present,
    # look for the highest year not parsed yet, click it once and go back to
    # clicking "more". new_page is different for groups
    if self.group == 1:
        new_page = response.xpath("//div[contains(@id,'stories_container')]/div[2]/a/@href").extract()
    else:
        new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
        # ^ this is why lang is needed: 'ent' excludes the English 'Recent' link

    if not new_page:
        self.logger.info('[!] "more" link not found, will look for a "year" link')
        # self.k is the year link that we look for
        if response.meta['flag'] == self.k and self.k >= self.year:
            xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
            new_page = response.xpath(xpath).extract()
            if new_page:
                new_page = response.urljoin(new_page[0])
                self.k -= 1
                self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k, new_page))
                yield scrapy.Request(new_page, callback=self.parse_page,
                                     meta={'flag': self.k})
            else:
                while not new_page:
                    # sometimes years are skipped; this handles small year gaps
                    self.logger.info('Link not found for year {}, trying with previous year {}'.format(self.k, self.k - 1))
                    self.k -= 1
                    if self.k < self.year:
                        raise CloseSpider('Reached date: {}. Crawling finished'.format(self.date))
                    xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
                    new_page = response.xpath(xpath).extract()
                self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k, new_page))
                new_page = response.urljoin(new_page[0])
                self.k -= 1
                yield scrapy.Request(new_page, callback=self.parse_page,
                                     meta={'flag': self.k})
        else:
            self.logger.info('Crawling has finished with no errors!')
    else:
        new_page = response.urljoin(new_page[0])
        if 'flag' in response.meta:
            self.logger.info('Page scraped, clicking on "more"! new_page = {}'.format(new_page))
            yield scrapy.Request(new_page, callback=self.parse_page,
                                 meta={'flag': response.meta['flag']})
        else:
            self.logger.info('First page scraped, clicking on "more"! new_page = {}'.format(new_page))
            yield scrapy.Request(new_page, callback=self.parse_page,
                                 meta={'flag': self.k})
def parse_page(self, response):
    '''
    Parse the given page for all the posts.
    Then ask recursively for another page.
    '''
    # select all posts
    for post in response.xpath("//article[contains(@data-ft,'top_level_post_id')]"):
        many_features = post.xpath('./@data-ft').get()
        date = parse_date([many_features], {'lang': self.lang})
        current_date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S') if date is not None else date
        if current_date is None:
            date_string = post.xpath('.//abbr/text()').get()
            date = parse_date2([date_string], {'lang': self.lang})
            current_date = datetime(date.year, date.month, date.day) if date is not None else date
        date = str(date)

        # the 'date' stop check is commented out here and deferred to
        # parse_reactions (via 'current_date' in meta below), so that posts
        # already queued are still captured before the spider stops
        # prev_date = self.date - d1.timedelta(days=1)
        # if prev_date >= current_date:
        # if self.date >= current_date:
        #     time.sleep(60)
        #     raise CloseSpider('Reached date: {}'.format(self.date))

        new = ItemLoader(item=FbcrawlItem(), selector=post)
        if abs(self.count) + 1 > self.max:
            raise CloseSpider('Reached max num of post: {}. Crawling finished'.format(abs(self.count)))
        self.logger.info('Parsing post n = {}, post_date = {}'.format(abs(self.count) + 1, date))
        new.add_xpath('comments', './footer/div[2]/a[1]/text()')
        new.add_xpath('reactions', "./footer/div[2]/span/a[1]/text()")
        new.add_xpath('content', ".//tr/td/h3/text()")
        # new.add_xpath('image', ".//div/div[2]/div/a/@href")
        new.add_value('date', date)
        new.add_xpath('post_id', './@data-ft')

        story_url = url_strip(post.xpath(".//a[contains(@href,'footer')]/@href").extract())
        post_url = response.urljoin(story_url)
        new.add_value('url', post_url)

        # returns the full post link in a list
        post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
        temp_post = response.urljoin(post[0])
        self.count -= 1
        yield scrapy.Request(temp_post, self.parse_post, priority=self.count,
                             meta={'item': new,
                                   'current_date': current_date})

    # load the following page, try to click on "more";
    # if the "more" link is not present, look for the highest year not parsed
    # yet. new_page is different for groups
    if self.group == 1:
        new_page = response.xpath("//div[contains(@id,'stories_container')]/div[2]/a/@href").extract()
    else:
        new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()

    if not new_page:
        self.logger.info('[!] "more" link not found, will look for a "year" link')
        # self.k is the year link that we look for
        if response.meta['flag'] == self.k and self.k >= self.year:
            xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
            new_page = response.xpath(xpath).extract()
            if new_page:
                new_page = response.urljoin(new_page[0])
                self.k -= 1
                self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k, new_page))
                yield scrapy.Request(new_page, callback=self.parse_page,
                                     meta={'flag': self.k})
            else:
                while not new_page:
                    # sometimes years are skipped; this handles small year gaps
                    self.logger.info('Link not found for year {}, trying with previous year {}'.format(self.k, self.k - 1))
                    self.k -= 1
                    if self.k < self.year:
                        raise CloseSpider('Reached date: {}. Crawling finished'.format(self.date))
                    xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
                    new_page = response.xpath(xpath).extract()
                self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k, new_page))
                new_page = response.urljoin(new_page[0])
                self.k -= 1
                yield scrapy.Request(new_page, callback=self.parse_page,
                                     meta={'flag': self.k})
        else:
            self.logger.info('Crawling has finished with no errors!')
    else:
        new_page = response.urljoin(new_page[0])
        if 'flag' in response.meta:
            self.logger.info('Page scraped, clicking on "more"! new_page = {}'.format(new_page))
            yield scrapy.Request(new_page, callback=self.parse_page,
                                 meta={'flag': response.meta['flag']})
        else:
            self.logger.info('First page scraped, clicking on "more"! new_page = {}'.format(new_page))
            yield scrapy.Request(new_page, callback=self.parse_page,
                                 meta={'flag': self.k})