# Shared imports assumed by the spider methods below. These functions are
# excerpts from Scrapy spider classes; project helpers such as
# save_to_db_poem, refine_poetry, get_language_list_for_url, remove_tags,
# and the constant ARTICLE_MIN_LEN live in the project's own modules.
import json
import re
import sys
import time
import traceback

import scrapy


def l4_parse_poetry(self, response):
    ''' Level 4: parse poetry page, extract poetry, and save. '''
    #print response.url
    try:
        # extract poem
        #stanza_selectors = response.xpath("//div[@class='poemContainer']/div[@class='PoemTextHost ']/div[@class='PoemDisplay OrgTextDisplay ']/div")
        stanza_selectors = response.xpath("//div[contains(@class,'poemContainer')]/div[contains(@class,'PoemTextHost')]/div[contains(@class,'PoemDisplay') and contains(@class,'OrgTextDisplay')]/div")
        poem = ''
        for s in stanza_selectors:
            line_selectors = s.xpath("./p")
            for l in line_selectors:
                line = l.xpath(".//text()").extract()
                line = ''.join(line)
                line = line.strip()
                ##print line
                poem = poem + line + '\n'
            poem = poem + '\n'
        poem = poem[0:-1]  # strip the last '\n' from the poem
        #print poem

        # extract the title of the poem
        title = response.xpath("//div[@class='shayariContainerDiv']/div[@class='left_pan_shayari']/div[@class='shayari_first']/h1/text()").extract()[0]

        # extract the poet name
        #poet = response.xpath("//div[@class='artist_img_descrpt']/div[@class='about_artist']/h2/text()").extract()[0]
        # The poet name must be in English, which is why the one above is
        # discarded and the name is derived from the profile link instead.
        poet_href = response.xpath("//div[@class='artist_img']//a/@href").extract()[0]
        p = re.compile(ur'poets/(.+)/')  # href="/poets/anjum-tarazi/?lang=Hi"
        poet = p.search(poet_href).group(1)
        poet = poet.replace('-', ' ')

        # check response.url for language information: https://.....xyz/?lang=hi
        tmp = response.url.split('?')
        url = tmp[0]
        language = tmp[1].split('=')[1]

        data = {}
        data['poem'] = poem
        data['url'] = url
        data['title'] = title.strip()
        data['author'] = poet.title()
        data['language'] = language

        # Store this information in the DB
        save_to_db_poem(data)
    except:
        print "ERROR: l4_parse_poetry: Unexpected error:", sys.exc_info()[0]
        _trace = ''
        for frame in traceback.extract_tb(sys.exc_info()[2]):
            fname, lineno, fn, text = frame
            print "DBG:: Error in %s on line %d" % (fname, lineno)
            _trace = _trace + "Error in %s on line %d" % (fname, lineno)
        with open(self.LOGFILE, "a") as outfile:
            t = time.asctime(time.localtime(time.time()))
            json.dump({'link': response.url, 'error': 'parsing failed',
                       'trace': _trace, 'time': t}, outfile, indent=4)
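
# NOTE (suggestion, not part of the original code): appending successive
# json.dump() objects to self.LOGFILE yields a file that is not itself valid
# JSON. A hypothetical alternative is a JSON-lines log, one object per line:
def log_error_jsonl(logfile, record):
    with open(logfile, "a") as outfile:
        json.dump(record, outfile)
        outfile.write('\n')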
def l4_parse_poetry(self, response):
    ''' Level 4: parse poetry page, extract poetry, and save. '''
    #print response.url
    try:
        # extract data
        content = response.xpath("//div[@id='ImageHost']//div[@class='PoemDisplay']/span").extract()[0]
        title = response.xpath("//div[@class='shayariContainerDiv']/div[@class='left_pan_shayari']/div[@class='shayari_first']/h1/text()").extract()[0]
        #poet = response.xpath("//div[@class='artist_img_descrpt']/div[@class='about_artist']/h2/text()").extract()[0]
        # The poet name must be in English, which is why the one above is
        # discarded and the name is derived from the profile link instead.
        poet_href = response.xpath("//div[@class='artist_img']//a/@href").extract()[0]
        p = re.compile(ur'poets/(.+)/')  # href="/poets/anjum-tarazi/?lang=Hi"
        poet = p.search(poet_href).group(1)
        poet = poet.replace('-', ' ')
        # remove the outer span tag (remove_tags is assumed to be a project
        # helper; if it is w3lib.html.remove_tags, which_ones must be a
        # tuple, i.e. remove_tags(content, ('span',)))
        poem = remove_tags(content, 'span')
    except:
        print "ERROR: l4_parse_poetry: failed to extract data"
        with open("exception_poetry.txt", "a") as outfile:
            json.dump({'link': response.url, 'error': 'parsing failed'}, outfile, indent=4)
        return  # poem/title/poet would be undefined below

    try:
        # check response.url for language information: https://.....xyz/?lang=hi
        tmp = response.url.split('?')
        url = tmp[0]
        language = tmp[1].split('=')[1]

        # Correct the line order of the poem, then save
        refined_poem = refine_poetry(poem, url, language)
        if refined_poem:
            data = {}
            data['poem'] = refined_poem
            data['url'] = url
            data['title'] = title
            data['author'] = poet.title()
            data['language'] = language

            # Store this information in the DB
            save_to_db_poem(data)
        else:
            with open("exception_poetry.txt", "a") as outfile:
                json.dump({'link': response.url, 'error': 'refine_poetry failed'}, outfile, indent=4)
    except:
        print "ERROR: l4_parse_poetry: Unexpected error:", sys.exc_info()[0]
        for frame in traceback.extract_tb(sys.exc_info()[2]):
            fname, lineno, fn, text = frame
            print "DBG:: Error in %s on line %d" % (fname, lineno)
        with open("exception_poetry.txt", "a") as outfile:
            # str(), because an exception class is not JSON serializable
            json.dump({'link': response.url, 'error': str(sys.exc_info()[0])}, outfile, indent=4)
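
# NOTE: refine_poetry() is a project helper not shown in these excerpts; per
# the comment above, it corrects the line order of the poem. The sketch below
# is only a hypothetical placeholder for its contract (text in, refined text
# or None out); the real line-reordering logic is site-specific and is not
# reproduced here.
def refine_poetry(poem, url, language):
    if not poem or not poem.strip():
        return None
    # Assumed minimal normalization: collapse surplus blank lines between
    # stanzas and strip leading/trailing whitespace per stanza.
    stanzas = [s.strip() for s in poem.split('\n\n') if s.strip()]
    if not stanzas:
        return None
    return '\n\n'.join(stanzas)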
def l3_parse_article_page(self, response):
    """ First check for the page containing div[@class='poem'] in the XPATH.
        1. If found, extract the poem and save it in the database.
        2. If not found, the page is an author/journal page containing a
           list of poems, so extract the article links and crawl each one.
    """
    try:
        print "DBG:: l3_parse_article_page: Extracting poem from Article page"
        p = response.xpath("//div[@id='mw-content-text']/div[@class='poem']//p").extract()
        if not p:
            # No poem container; hand over to the link-extraction branch below.
            raise ValueError("no div[@class='poem'] on this page")
        poem = " ".join(x.encode('utf-8') for x in p)
        try:
            h1 = response.xpath("//h1[@id='firstHeading']//text()").extract()[0].encode('utf-8')
            title = h1
            author = h1.split('/')[-1]

            data = {}
            data['poem'] = poem
            data['url'] = response.url.encode('utf-8')
            data['title'] = title
            data['author'] = author.title()
            data['language'] = 'hi'  # the content of this site is in Hindi

            # Store this information in the DB
            save_to_db_poem(data)
        except:
            print "ERROR:: l3_parse_article_page: Title not found"
    except:
        # Extract article links from the Author page and generate a request for each
        try:
            print "DBG:: l3_parse_article_page: Extracting poem links from Author page"
            articles = response.xpath("//div[@id='mw-content-text']/ul/li/a/@href").extract()
            articles_links = [self.domain_name + x for x in articles]
            for url in articles_links:
                # Check whether an entry for ``url`` exists in the DB, and
                # find the list of languages its content is already stored in.
                lang_list = get_language_list_for_url(url)
                # Now crawl the poetry page only for the remaining languages.
                for lang in (x for x in self.LANGUAGES if x not in lang_list):
                    #print "Visiting Article: ", url
                    yield scrapy.Request(url, callback=self.l3_parse_article_page)
        except:
            print "DBG:: Nothing found in Author page!!!"
            print "ERROR: l3_parse_article_page: Unexpected error:", sys.exc_info()[0]
            for frame in traceback.extract_tb(sys.exc_info()[2]):
                fname, lineno, fn, text = frame
                print "DBG:: Error in %s on line %d" % (fname, lineno)
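
# NOTE: get_language_list_for_url() is a project DB helper not shown in these
# excerpts. The sketch below is a hypothetical illustration of its assumed
# contract: return the languages in which a poem from ``url`` is already
# stored, so a page is not crawled twice for the same language. The sqlite3
# backend, 'poetry.db' file, and 'poem' table are assumptions, not confirmed
# by the code above.
def get_language_list_for_url(url):
    import sqlite3
    conn = sqlite3.connect('poetry.db')  # assumed DB file
    cur = conn.execute("SELECT language FROM poem WHERE url = ?", (url,))
    langs = [row[0] for row in cur.fetchall()]
    conn.close()
    return langs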
def parse_article_page(self, response):
    self.count_visit_article = self.count_visit_article + 1
    try:
        print "Extracting poem ", self.count_visit_article, " from Article page"
        p = response.xpath("/html/body/table//tr[2]/td/table//tr/td/pre").extract()
        poem = " ".join(x.encode('utf-8') for x in p)
        if poem:
            title = response.xpath("/html/body/table//tr[2]/td/table//tr/td/p/strong/text()").extract()[0].encode('utf-8')
            self.count_articles = self.count_articles + 1

            data = {}
            data['index'] = self.count_articles
            data['title'] = title
            data['author'] = ''
            data['poem'] = poem
            data['url'] = response.url.encode('utf-8')

            # Store this information in the DB
            save_to_db_poem(data)
        else:
            print "First method failed, trying another xpath"
            p = response.xpath("/html/body/center/p").extract()
            poem = " ".join(x.encode('utf-8') for x in p)
            if poem:
                title = response.xpath("/html/body/center/p[1]/strong/text()").extract()[0].encode('utf-8')
                author = response.xpath("/html/body/center/p[1]/a/em/text()").extract()[0].encode('utf-8')
                self.count_articles = self.count_articles + 1

                data = {}
                data['index'] = self.count_articles
                data['title'] = title    # already utf-8 encoded above
                data['author'] = author  # already utf-8 encoded above
                data['poem'] = poem
                data['url'] = response.url.encode('utf-8')

                # Store this information in the DB
                save_to_db_poem(data)
            else:
                print "Both methods failed; log the page to a file for further processing"
    except:
        print "Error Article page!!!"
        with open("up_exception_article.txt", "a") as outfile:
            json.dump({'index': self.count_visit_article, 'link': response.url}, outfile, indent=4)
def parse_poetry(self, response):
    ''' Parse poetry page, extract poetry, and save. '''
    self.logger.debug("parse_poetry: IN.")
    try:
        # extract poem
        stanza_selectors = response.xpath("//div[contains(@class,'mainContentBody')]/div[contains(@class,'poemPageContentBody')]/div/div")
        poem = ''
        for s in stanza_selectors:
            line_selectors = s.xpath(".//p")
            for l in line_selectors:
                line = l.xpath(".//text()").extract()
                line = ''.join(line)
                line = line.strip()
                ##print line
                poem = poem + line + '\n'
            poem = poem + '\n'
        poem = poem[0:-1]  # strip the last '\n' from the poem
        #print poem

        # extract the title of the poem
        title = response.xpath("//div[contains(@class,'mainContentBody')]/div[contains(@class,'poemPageContentHeader')]/h1/text()").extract()[0]

        # extract the poet name (it must be in English)
        poet_href = response.xpath("//div[contains(@class,'mainContentBody')]/div[contains(@class,'poemPageContentHeader')]//a[contains(@class,'ghazalAuthor')]/@href").extract()[0]
        p = re.compile(ur'poets/(.+)/')  # href="/poets/anjum-tarazi/?lang=Hi"
        poet = p.search(poet_href).group(1)
        poet = poet.replace('-', ' ')

        # check response.url for language information: https://.....xyz/?lang=hi
        tmp = response.url.split('?')
        url = tmp[0]
        language = tmp[1].split('=')[1]

        data = {}
        data['poem'] = poem
        data['url'] = url
        data['title'] = title.strip()
        data['author'] = poet.title()
        data['language'] = language

        # Store this information in the DB
        save_to_db_poem(data)
    except:
        self.logger.error("parse_poetry: %s", sys.exc_info()[0])
        _trace = ''
        for frame in traceback.extract_tb(sys.exc_info()[2]):
            fname, lineno, fn, text = frame
            self.logger.error("error in %s on line %d" % (fname, lineno))
            _trace = _trace + "error in %s on line %d" % (fname, lineno)
        with open(self.LOGFILE, "a") as outfile:
            t = time.asctime(time.localtime(time.time()))
            json.dump({'link': response.url, 'error': 'parsing poetry failed',
                       'trace': _trace, 'time': t}, outfile, indent=4)
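
# Standalone illustration of the stanza/line extraction pattern used in
# parse_poetry() above, runnable with scrapy.Selector outside a spider.
# The HTML snippet is hypothetical markup assumed for the demo.
from scrapy.selector import Selector

def _demo_poem_extraction():
    html = ("<div class='mainContentBody'><div class='poemPageContentBody'>"
            "<div>"
            "<div><p>line one</p><p>line two</p></div>"
            "<div><p>second stanza</p></div>"
            "</div></div></div>")
    sel = Selector(text=html)
    poem = ''
    for stanza in sel.xpath("//div[contains(@class,'poemPageContentBody')]/div/div"):
        for p in stanza.xpath(".//p"):
            poem = poem + ''.join(p.xpath(".//text()").extract()).strip() + '\n'
        poem = poem + '\n'
    return poem[0:-1]  # "line one\nline two\n\nsecond stanza\n"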
def parse_poetry_page(self, response):
    """ Parse the poetry page.
        1. First check whether the page contains a poetry or not.
        2. If no poetry is found, call parse_url_list, because the page may
           contain a list of poems/collections/books.
        3. If poetry is found, extract the poem and save it in the database.
    """
    self.logger.debug("parse_poetry_page: IN.")
    self.crawler.stats.inc_value('kangaroo/poetry_page_visit')
    flag_poetry_found = True

    ##
    # Find out if the page contains a Poetry
    try:
        p = response.xpath("//div[@id='mw-content-text']/div[@class='poem']//p").extract()
        if len(p) == 0:
            # On some pages the poetry is not inside div[@class='poem'], e.g.
            # http://www.kavitakosh.org/kk/%E0%A4%AE%E0%A4%A8_%E0%A4%B2%E0%A4%BE%E0%A4%97%E0%A5%8D%E0%A4%AF%E0%A5%8B_%E0%A4%AE%E0%A5%87%E0%A4%B0%E0%A5%8B_%E0%A4%AF%E0%A4%BE%E0%A4%B0_%E0%A4%AB%E0%A4%BC%E0%A4%95%E0%A5%80%E0%A4%B0%E0%A5%80_%E0%A4%AE%E0%A5%87%E0%A4%82_/_%E0%A4%95%E0%A4%AC%E0%A5%80%E0%A4%B0
            p = response.xpath("//div[@id='mw-content-text']//p").extract()
            if len(p):
                # Now check the length of the text under the <p> tags, because
                # the page may contain empty <p> tags, or a one-line text
                # stating that the poetry is not available, etc. Here we assume
                # that the length of a poetry is greater than ARTICLE_MIN_LEN.
                p_t = response.xpath("//div[@id='mw-content-text']//p/text()").extract()
                p_t = "".join(x.encode('utf-8') for x in p_t)
                if len(p_t) <= ARTICLE_MIN_LEN:
                    flag_poetry_found = False
            else:
                flag_poetry_found = False
        else:
            # Check the length of the article
            p_t = response.xpath("//div[@id='mw-content-text']/div[@class='poem']//p/text()").extract()
            p_t = "".join(x.encode('utf-8') for x in p_t)
            if len(p_t) <= ARTICLE_MIN_LEN:
                flag_poetry_found = False
    except:
        self.logger.error("parse_poetry_page: xpath error.")
        flag_poetry_found = False

    ##
    # If poetry was not found...
    if flag_poetry_found is False:
        # The page may contain a list of poems/collections/books.
        self.logger.info('parse_poetry_page: no poetry found on this page.')
        return self.parse_url_list(response)

    ##
    # If poetry was found, this is a poetry page; extract the poetry.
    try:
        h1 = response.xpath("//h1[@id='firstHeading']//text()").extract()[0].encode('utf-8')
        h1_list = h1.split('/')
        title = '/'.join(h1_list[:-1])
        poet = h1_list[-1]

        # Process and create Poetry
        poem = " ".join(x.encode('utf-8') for x in p)

        data = {}
        data['poem'] = poem
        data['url'] = response.url.encode('utf-8')
        data['title'] = title
        data['author'] = poet.strip()
        data['language'] = 'hi'  # the content of this site is in Hindi

        # Store this information in the DB
        save_to_db_poem(data)
        self.crawler.stats.inc_value('kangaroo/poetry_found')
    except:
        self.logger.error("parse_poetry_page: %s", sys.exc_info()[0])
        _trace = ''
        for frame in traceback.extract_tb(sys.exc_info()[2]):
            fname, lineno, fn, text = frame
            self.logger.error("error in %s on line %d" % (fname, lineno))
            _trace = _trace + "error in %s on line %d" % (fname, lineno)
        with open(self.LOGFILE, "a") as outfile:
            t = time.asctime(time.localtime(time.time()))
            json.dump({'link': response.url, 'error': 'parsing poetry failed',
                       'trace': _trace, 'time': t}, outfile, indent=4)
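
# NOTE: save_to_db_poem() is imported from the project's storage module and
# is not shown in these excerpts. The sketch below only illustrates the record
# shape the spiders above build (poem, url, title, author, language, and an
# optional index); the sqlite3 backend, 'poetry.db' file, and 'poem' table are
# assumptions, not the project's actual implementation.
def save_to_db_poem(data):
    import sqlite3
    conn = sqlite3.connect('poetry.db')  # assumed DB file
    conn.execute("CREATE TABLE IF NOT EXISTS poem ("
                 "url TEXT, language TEXT, title TEXT, author TEXT, "
                 "poem TEXT, PRIMARY KEY (url, language))")
    conn.execute("INSERT OR REPLACE INTO poem "
                 "(url, language, title, author, poem) "
                 "VALUES (?, ?, ?, ?, ?)",
                 (data['url'], data.get('language', ''), data['title'],
                  data['author'], data['poem']))
    conn.commit()
    conn.close()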