def __init__(self, starting_url):
    '''Set up logging and the url-content helper for the crawler.

    Keyword arguments:
    starting_url -- URL to start crawling.
    '''
    # Configure the logger before anything else happens.
    self.setup_log()
    # Helper object responsible for fetching and parsing page content.
    self.url_content_retrieve = UrlContentRetrieve(starting_url)
class HttpLinksCollector:
    '''Class to manage links from url.

    Created on 27/09/2012

    @author: Ricardo García Fernández
    @mail: [email protected]
    '''

    def __init__(self, starting_url):
        '''Initialize to set urlContentRetrieve object with startingURL.

        Keyword arguments:
        starting_url -- URL to start crawling.
        '''
        # Setup Log
        self.setup_log()
        # Define url content retrieve to use
        self.url_content_retrieve = UrlContentRetrieve(starting_url)

    def retrieve_links(self, target_url, depth=1, level=1):
        '''Retrieve links from url content until defined depth organized
        in levels.

        Keyword arguments:
        target_url -- URL to analyze content and retrieve links.
        depth -- Depth of links to analyze.
        level -- Level in which start to analyze.

        Returns a dict mapping each link found on target_url to the
        (recursively collected) dict of its own sub-links; empty when the
        depth is exhausted or the page content could not be fetched.
        '''
        links = {}
        # Recurse only while the current level is within the requested depth.
        if depth >= level:
            soup_code = self.url_content_retrieve.url_content(target_url)
            # A falsy soup means the page could not be fetched/parsed.
            if soup_code:
                formatted_links = (
                    self.url_content_retrieve.retrieve_formatted_links(
                        soup_code))
                for link in formatted_links:
                    self.logger.info(self.print_depth(level) + " " + link)
                    try:
                        # FIX: Python-3-compatible `except ... as` syntax
                        # (was the Python-2-only `except ValueError, e`).
                        links[link] = self.retrieve_links(
                            link, depth, level + 1)
                    except ValueError as value_error:
                        # Invalid URL: log it and continue with the
                        # remaining links instead of aborting the crawl.
                        self.logger.error("URL is not correct:\t" + link
                                          + "\nException:\t"
                                          + str(value_error)
                                          + "\nStack trace:\t"
                                          + traceback.format_exc())
        return links