from urlparse import urlparse, urljoin
from urllib2 import Request, urlopen, URLError, HTTPError
from BeautifulSoup import BeautifulSoup as soup
import parse_robot  # third-party robots.txt parser providing RobotFileParserLookalike


def get_page(url):
    try:
        parse_dict = urlparse(url)
        if parse_dict.scheme:
            base_url = parse_dict.scheme + '://' + parse_dict.netloc
            robot_url = urljoin(base_url, '/robots.txt')
            # use one consistent agent string for the robots.txt check and the request
            useragent = 'jobcrawler 1.1'
            parse_robot.user_agent = useragent
            robot_parse = parse_robot.RobotFileParserLookalike()
            robot_parse.set_url(robot_url)
            robot_parse.read()
            if not robot_parse.can_fetch(useragent, url):
                print "This seed page cannot be crawled based on robots.txt"
                return soup(''), ''
            else:
                try:
                    request = Request(url)
                    request.add_header('User-Agent', useragent)
                    response = urlopen(request)
                    # response.info().type is the Content-Type of the reply
                    if response.info().type not in ['text/html']:
                        return soup(''), ''
                    the_page = response.read()
                    return soup(the_page), url
                # HTTPError is a subclass of URLError, so it must be caught first
                except HTTPError as _400_to_500:
                    print "The server could not fulfill the request"
                    print 'Error code:', check_response(_400_to_500.code)
                except URLError as connection_error:
                    print "Failed to reach server"
                    print 'Reason:', connection_error.reason
        # URL without a scheme (or a failed fetch above): nothing to return
        return soup(''), ''
    except URLError:
        print 'FAILED TO REACH SERVER::' + url
        return soup(''), ''
    except Exception:
        print 'Check url again: ' + url
        return soup(''), ''

# we will write separate models that will fetch urls and insert them into models
# print get_page('http://www.facebook.com/recover/initiate')
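# `check_response` is called above but not defined in this section. A minimal
# sketch of what it could look like, assuming it only maps an HTTP status code
# to the standard reason phrase; the name is from the calls above, but the
# body here is an assumption, not the original implementation:
import httplib

def check_response(status_code):
    # httplib.responses maps e.g. 404 -> 'Not Found'
    return '%s %s' % (status_code, httplib.responses.get(status_code, 'Unknown status'))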
from urllib import quote
from httplib import BadStatusLine
import socket
import sys
import traceback

from bs4 import BeautifulSoup as soup  # this version parses with lxml
from bson import ObjectId

# Revised version: same fetch logic, plus MongoDB crawl statistics and broader
# error handling. `db` is a pymongo database handle configured elsewhere.
def get_page(url):
    try:
        parse_dict = urlparse(url)
        if parse_dict.scheme:
            base_url = parse_dict.scheme + '://' + parse_dict.netloc
            robot_url = urljoin(base_url, '/robots.txt')
            # one consistent agent string for the robots.txt check and the request
            useragent = 'jooble 1.1 (http://about.me/jooble)'
            parse_robot.user_agent = useragent
            robot_parse = parse_robot.RobotFileParserLookalike()
            robot_parse.set_url(robot_url)
            robot_parse.read()
            if not robot_parse.can_fetch(useragent, url):
                print "This seed page cannot be crawled based on robots.txt"
                db.crawler_web_statistic.update({"_id": ObjectId("517dc20440ade61b20becb7d")},
                                                {"$inc": {"Number_of_excluded_urls": 1}}, safe=True)
                return soup('', 'lxml'), ''
            else:
                # count robots.txt checks that allowed the crawl
                db.crawler_web_statistic.update({"_id": ObjectId("517dc20440ade61b20becb7d")},
                                                {"$inc": {"Number_of_robotstxt_request": 1}}, safe=True)
                request = Request(url)
                request.add_header('User-Agent', useragent)
                try:
                    response = urlopen(request)
                except URLError:
                    # retry with the URL percent-quoted; quote() with this safe
                    # set does not really work well, but it rescues some URLs
                    response = urlopen(quote(url, safe="%/:=&?~#+!$,;'@()*[]"))
                # count every HTTP request made (the original if/else branches
                # incremented the same counter either way)
                db.crawler_web_statistic.update({"_id": ObjectId("517dc20440ade61b20becb7d")},
                                                {"$inc": {"Number_of_Http_Request": 1}}, safe=True)
                if response.info().type not in ['text/html']:
                    return soup('', 'lxml'), ''
                the_page = response.read()
                # count the successful fetch here; the original `else` clause on
                # the try was unreachable because the try block always returns
                db.crawler_http_status_errors.update({"_id": ObjectId("5180bfa440ade62017d1120c")},
                                                     {"$inc": {"ok_200": 1}}, safe=True)
                print 'ok'
                return soup(the_page, 'lxml'), url
        # URL without a scheme: nothing to fetch
        return soup('', 'lxml'), ''
    # HTTPError is a subclass of URLError, so it must be caught first
    except HTTPError as _400_to_500:
        print "The server could not fulfill the request"
        print 'Error code:', check_response(_400_to_500.code), _400_to_500.reason
        db.crawler_http_status_errors.update({"_id": ObjectId("5180bfa440ade62017d1120c")},
                                             {"$inc": {"from_400_500": 1}}, safe=True)
        db.crawler_error_log.insert({'error_type': str(sys.exc_info()), 'from_module': str(__file__)})
        return soup('', 'lxml'), ''
    except URLError:
        print 'FAILED TO REACH SERVER::' + url
        db.crawler_http_status_errors.update({"_id": ObjectId("5180bfa440ade62017d1120c")},
                                             {"$inc": {"Url_errors": 1}}, safe=True)
        db.crawler_error_log.insert({'error_type': str(sys.exc_info()), 'from_module': str(__file__)})
        return soup('', 'lxml'), ''
    except BadStatusLine:
        print "BadStatusLine: status code is unknown"
        db.crawler_error_log.insert({'error_type': str(sys.exc_info()), 'from_module': str(__file__)})
        db.crawler_http_status_errors.update({"_id": ObjectId("5180bfa440ade62017d1120c")},
                                             {"$inc": {"Bad_status_line": 1}}, safe=True)
        return soup('', 'lxml'), ''
    except socket.timeout:
        print "Socket timeout: fetch failed"
        db.crawler_http_status_errors.update({"_id": ObjectId("5180bfa440ade62017d1120c")},
                                             {"$inc": {"Socket_time_out": 1}}, safe=True)
        return soup('', 'lxml'), ''
    except Exception:
        print 'Check url again: ' + url
        traceback.print_exc()
        db.crawler_http_status_errors.update({"_id": ObjectId("5180bfa440ade62017d1120c")},
                                             {"$inc": {"Ungrouped": 1}}, safe=True)
        db.crawler_error_log.insert({'error_type': str(sys.exc_info()), 'from_module': str(__file__)})
        return soup('', 'lxml'), ''
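# The function above assumes `db` and the two statistics documents already
# exist. A minimal setup sketch; the database name and the zero-initialised
# counter fields are assumptions inferred from the $inc updates above, not
# confirmed configuration (the fixed ObjectIds would come from these inserts):
from pymongo import Connection

db = Connection().jobcrawler  # assumed database name
db.crawler_web_statistic.insert({
    'Number_of_Http_Request': 0,
    'Number_of_robotstxt_request': 0,
    'Number_of_excluded_urls': 0,
}, safe=True)
db.crawler_http_status_errors.insert({
    'ok_200': 0, 'Url_errors': 0, 'from_400_500': 0,
    'Bad_status_line': 0, 'Socket_time_out': 0, 'Ungrouped': 0,
}, safe=True)

# Example call (hypothetical seed URL):
# page, fetched_url = get_page('http://example.com/jobs')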