def iscrawlable(page_url):
    """Return True when the host's robots.txt permits any agent ("*") to fetch page_url.

    An unreachable or unreadable robots.txt is treated as "do not crawl".
    """
    try:
        split_url = urlparse.urlsplit(page_url)
        robots_txt_url = canonicalization.mycanonicalization(
            "%s://%s/robots.txt" % (split_url.scheme, split_url.netloc))
        parser = robotparser.RobotFileParser()
        parser.set_url(robots_txt_url)
        parser.read()
        return parser.can_fetch("*", page_url)
    except IOError:
        # Could not retrieve robots.txt: err on the side of not crawling.
        return False
__author__ = 'Moorthy' import canonicalization print canonicalization.mycanonicalization("HTTP://www.Example.com/SomeFile.html") print canonicalization.mycanonicalization("http://www.example.com:80") print canonicalization.mycanonicalization("http://www.example.com/a/../c.html") print canonicalization.mycanonicalization("http://www.example.com/a.html#anything") print canonicalization.mycanonicalization("http://www.example.com//a.html") print canonicalization.mycanonicalization("http://www.example.com/c.html?a=10")
def downloadandgeturls(page_url, docid): returnval = {} urlfilenamelist = page_url.rsplit('/',1) urlfilename = urlfilenamelist[-1] parsedurl = urlparse.urlsplit(page_url) if isvaliddomain(parsedurl.netloc): try: page_url.decode('ascii') except UnicodeEncodeError: returnval['iserror'] = True returnval['ispoliteenabled'] = False returnval['errormsg'] = "Unicode Encode Error in URL" return returnval if iscrawlable(page_url): urllist = [] try: f = urllib.urlopen(page_url) except IOError: returnval['iserror'] = True returnval['ispoliteenabled'] = False returnval['errormsg'] = "URL failed to open." return returnval headers = f.info().headers if isvalidheader(headers): try: html_page = f.read() soup = BeautifulSoup(html_page) for link in soup.findAll('a'): urlvalue = link.get('href') if not (urlvalue is None): if urlvalue.startswith("http"): try: urllist.append(canonicalization.mycanonicalization(urlvalue)) except AttributeError: print "Not valid URL." except ValueError: print "Invalid IPv6 URL." else: try: if urlvalue.startswith("//"): finalurl = canonicalization.mycanonicalization(parsedurl.scheme+":"+urlvalue) elif urlvalue.startswith("/"): finalurl = canonicalization.mycanonicalization(parsedurl.scheme+"://"+parsedurl.netloc+"//"+urlvalue) else: new_page_url = page_url.replace(urlfilename, urlvalue) finalurl = canonicalization.mycanonicalization(new_page_url) if urlvalue not in restrictedurlvalue: urllist.append(finalurl) except AttributeError: print "Not valid URL." 
docdict = {} docdict['page_url'] = page_url docdict['raw_html'] = unicode(html_page, errors='ignore') docdict['outlinks'] = set(urllist) docdict['headerdata'] = headers texts = soup.findAll(text=True) visible_texts = filter(visible, texts) single_text = '\n'.join(visible_texts) if not isrelavantpage(single_text): returnval['iserror'] = True returnval['errormsg'] = "No keywords" returnval['ispoliteenabled'] = True return returnval docdict['clean_text'] = single_text fileutils.writedocument(docid, docdict) uniqueurllist = set(urllist) returnval['iserror'] = False returnval['urllist'] = uniqueurllist returnval['ispoliteenabled'] = True return returnval except IOError: print traceback.format_exc() returnval['iserror'] = True returnval['errormsg'] = "Invalid URL." returnval['ispoliteenabled'] = False return returnval else: returnval['iserror'] = True returnval['errormsg'] = "Header is invalid." returnval['ispoliteenabled'] = False return returnval else: returnval['iserror'] = True returnval['errormsg'] = "Not crawlable URL." returnval['ispoliteenabled'] = False return returnval else: returnval['iserror'] = True returnval['errormsg'] = "Domain is banned" returnval['ispoliteenabled'] = False return returnval