def parse_poetry_list(self, response):
    ''' Parse the poet's poetry list. '''
    i = response.url.find('&id=')
    self.logger.debug('parse_poetry_list: %s', response.url[i + 4:])

    data = json.loads(response.body)
    poetries = data['Data']
    total = data['Total']
    errors = data['Errors']
    count = len(poetries)
    self.logger.info('parse_poetry_list: result has %d of %d poetries',
                     count, total)

    for poetry in poetries:
        # Extract info about the poetry and crawl its page.
        content_slug = poetry['ContentSlug']
        type_slug = poetry['TypeSlug']

        # Create the URL.
        url = self.domain_name + type_slug + '/' + content_slug

        # Check if an entry for ``url`` exists in the DB, and find out
        # the list of languages in which the content is already stored.
        lang_list = get_language_list_for_url(url)

        # Crawl the poetry page only for the remaining languages.
        for lang in (x for x in self.LANGUAGES if x not in lang_list):
            url_t = url + '?lang=' + lang
            yield scrapy.Request(url_t, callback=self.parse_poetry)
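# For reference, parse_poetry_list (and l3_parse_poetry_list below) assumes
# the endpoint returns a JSON payload shaped like the following. The keys
# are taken from the code above; the values are illustrative only, not
# real data:
#
#   {
#       "Data":   [{"ContentSlug": "some-poem", "TypeSlug": "nazm"}, ...],
#       "Total":  120,
#       "Errors": null
#   }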
def l3_parse_poetry_list(self, response):
    ''' Level 3: parse the poet's poetry list. '''
    i = response.url.find('&id=')
    self.logger.debug('l3_parse_poetry_list: %s', response.url[i + 4:])

    data = json.loads(response.body)
    poetries = data['Data']
    total = data['Total']
    errors = data['Errors']
    count = len(poetries)
    self.logger.info('l3_parse_poetry_list: result has %d of %d poetries',
                     count, total)

    for poetry in poetries:
        # Extract info about the poetry and crawl its page.
        content_slug = poetry['ContentSlug']
        type_slug = poetry['TypeSlug']

        # Create the URL.
        url = self.domain_name + type_slug + '/' + content_slug

        # Check if an entry for ``url`` exists in the DB, and find out
        # the list of languages in which the content is already stored.
        lang_list = get_language_list_for_url(url)

        # Crawl the poetry page only for the remaining languages.
        for lang in (x for x in self.LANGUAGES if x not in lang_list):
            url_t = url + '?lang=' + lang
            yield scrapy.Request(url_t, callback=self.l4_parse_poetry)
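# ``get_language_list_for_url`` is called by every callback in this section
# but defined elsewhere. A minimal, self-contained sketch of the contract it
# has to fulfil, assuming a simple sqlite3 table (the table name ``poems``
# and its schema are assumptions for illustration, not the project's actual
# storage layer):
import sqlite3

def get_language_list_for_url(url, db_path='kangaroo.db'):
    """Return the language codes already stored in the DB for ``url``."""
    with sqlite3.connect(db_path) as conn:
        rows = conn.execute(
            'SELECT language FROM poems WHERE url = ?', (url,)).fetchall()
    return [lang for (lang,) in rows]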
def parse_url_list(self, response):
    ''' Find the poem/collection/book list and generate a request for each item. '''
    self.logger.debug("parse_url_list: extracting poem/collection links.")
    self.crawler.stats.inc_value('kangaroo/list_page_visit')

    # Initialize so a parse failure below leaves us with an empty set
    # instead of an undefined name.
    urls = set()
    try:
        # All URLs in the main body content div, i.e. mw-content-text.
        urls_all = set(response.xpath(
            "//div[@id='mw-content-text']//a/@href").extract())

        # Exclude the URLs present in breadcrumbs and info boxes.
        urls_exclude = set(response.xpath(
            "//div[@id='mw-content-text']//div[@id='kkrachna' or "
            "@class='kkrachna' or @id='extrainfobox' or "
            "@class='noarticletext']//a/@href").extract())

        urls = urls_all - urls_exclude
        self.logger.debug("parse_url_list: %d urls found.", len(urls))
    except Exception:
        self.logger.error("parse_url_list: %s", sys.exc_info()[0])
        _trace = ''
        for frame in traceback.extract_tb(sys.exc_info()[2]):
            fname, lineno, fn, text = frame
            self.logger.error("error in %s on line %d", fname, lineno)
            _trace = _trace + "error in %s on line %d" % (fname, lineno)
        with open(self.LOGFILE, "a") as outfile:
            t = time.asctime(time.localtime(time.time()))
            json.dump({'link': response.url,
                       'error': 'parse_url_list failed',
                       'trace': _trace,
                       'time': t},
                      outfile, indent=4)

    urls = [self.domain_name + x for x in urls]
    for url in urls:
        # Check if an entry for ``url`` exists in the DB, and find out
        # the list of languages in which the content is already stored.
        lang_list = get_language_list_for_url(url)

        # Crawl the poetry page only for the remaining languages. Note
        # that the URL is identical for every language here, so Scrapy's
        # default dupefilter will collapse the duplicate requests.
        for lang in (x for x in self.LANGUAGES if x not in lang_list):
            yield scrapy.Request(url, callback=self.parse_poetry_page)
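# On failure, parse_url_list appends a JSON record like the one below to
# self.LOGFILE. The keys come from the json.dump call above; the values
# here are illustrative placeholders only:
#
#   {
#       "link": "https://example.org/some/list/page",
#       "error": "parse_url_list failed",
#       "trace": "error in spider.py on line 42",
#       "time": "Mon Jan  1 00:00:00 2024"
#   }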
def l3_parse_article_page(self, response):
    """
    Check whether the page contains div[@class='poem'] in its body:
    1. If found, extract the poem and save it in the database.
    2. If not found, the page is an author/journal page containing a list
       of poems; generate a request for each listed article (each one is
       parsed by this same callback).
    """
    p = response.xpath(
        "//div[@id='mw-content-text']/div[@class='poem']//p").extract()
    if p:
        self.logger.debug("l3_parse_article_page: extracting poem from article page")
        poem = " ".join(p)
        try:
            h1 = response.xpath("//h1[@id='firstHeading']//text()").extract()[0]
            title = h1
            author = h1.split('/')[-1]
            data = {}
            data['poem'] = poem
            data['url'] = response.url
            data['title'] = title
            data['author'] = author.title()
            data['language'] = 'hi'  # The contents of this site are in Hindi.

            # Store this information in the DB.
            save_to_db_poem(data)
        except Exception:
            self.logger.error("l3_parse_article_page: title not found")
    else:
        # Extract article links from the author page and generate a
        # request for each.
        try:
            self.logger.debug("l3_parse_article_page: extracting poem links from author page")
            articles = response.xpath(
                "//div[@id='mw-content-text']/ul/li/a/@href").extract()
            articles_links = [self.domain_name + x for x in articles]
            for url in articles_links:
                # Check if an entry for ``url`` exists in the DB, and find out
                # the list of languages in which the content is already stored.
                lang_list = get_language_list_for_url(url)

                # Crawl the poetry page only for the remaining languages.
                for lang in (x for x in self.LANGUAGES if x not in lang_list):
                    yield scrapy.Request(url, callback=self.l3_parse_article_page)
        except Exception:
            self.logger.debug("l3_parse_article_page: nothing found in author page")
            self.logger.error("l3_parse_article_page: unexpected error: %s",
                              sys.exc_info()[0])
            for frame in traceback.extract_tb(sys.exc_info()[2]):
                fname, lineno, fn, text = frame
                self.logger.error("error in %s on line %d", fname, lineno)
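# ``save_to_db_poem`` is also defined outside this section. A sketch of a
# compatible implementation against the same assumed sqlite3 table as in the
# ``get_language_list_for_url`` sketch above — an illustration of the
# expected contract, not the project's actual code:
def save_to_db_poem(data, db_path='kangaroo.db'):
    """Insert one poem record; ``data`` carries poem/url/title/author/language."""
    with sqlite3.connect(db_path) as conn:
        conn.execute(
            'INSERT INTO poems (url, title, author, language, poem) '
            'VALUES (?, ?, ?, ?, ?)',
            (data['url'], data['title'], data['author'],
             data['language'], data['poem']))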
def l2_parse_author_page(self, response):
    """
    Parse the author page:
    1. Extract the dates of birth and death.
    2. Save the author's details.
    3. Crawl further to scrape his/her articles.
    """
    name = None
    birth = None   # date of birth
    death = None   # date of death

    try:
        name = response.xpath("//h1[@id='firstHeading']//text()").extract()[0]
    except Exception:
        self.logger.error("l2_parse_author_page: name not found")

    try:
        birth = response.xpath(
            "//div[@id='mw-content-text']/table[@id='kkparichay-box']"
            "//div[@id='kkparichay-dob']/text()").extract()[0]
    except Exception:
        pass

    try:
        death = response.xpath(
            "//div[@id='mw-content-text']/table[@id='kkparichay-box']"
            "//div[@id='kkparichay-dod']/text()").extract()[0]
    except Exception:
        pass

    data = {}
    data['name'] = name
    data['birth'] = birth
    data['death'] = death
    data['url'] = response.url

    # Store this information in the DB.
    save_to_db_author(data)

    # Parse the page, find the article list, and generate a request for
    # each article link found on the author page.
    try:
        self.logger.debug("l2_parse_author_page: extracting poem links from author page")
        articles = response.xpath(
            "//div[@id='mw-content-text']/ul/li/a/@href").extract()
        articles_links = [self.domain_name + x for x in articles]
        for url in articles_links:
            # Check if an entry for ``url`` exists in the DB, and find out
            # the list of languages in which the content is already stored.
            lang_list = get_language_list_for_url(url)

            # Crawl the poetry page only for the remaining languages.
            for lang in (x for x in self.LANGUAGES if x not in lang_list):
                yield scrapy.Request(url, callback=self.l3_parse_article_page)
    except Exception:
        self.logger.error("l2_parse_author_page: nothing found in author page")
        self.logger.error("l2_parse_author_page: unexpected error: %s",
                          sys.exc_info()[0])
        for frame in traceback.extract_tb(sys.exc_info()[2]):
            fname, lineno, fn, text = frame
            self.logger.error("error in %s on line %d", fname, lineno)
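# ``save_to_db_author`` likewise lives outside this section; a matching
# sketch, with the ``authors`` table name and columns assumed purely for
# illustration:
def save_to_db_author(data, db_path='kangaroo.db'):
    """Insert one author record; ``data`` carries name/birth/death/url."""
    with sqlite3.connect(db_path) as conn:
        conn.execute(
            'INSERT INTO authors (name, birth, death, url) '
            'VALUES (?, ?, ?, ?)',
            (data['name'], data['birth'], data['death'], data['url']))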