def crawl(self, currentURL):
    """Fetch one URL and gather its address, certificate and link data.

    Resolves the URL's network location on port 443, attempts an SSL
    handshake to obtain the peer certificate chain, downloads the page and
    extracts its hyperlinks with ``MyParser``.

    Args:
        currentURL: URL string of the page to crawl.

    Returns:
        A tuple ``(ipAddr, domainCerts, pageSource, childLinks)``:
        ``ipAddr`` is the ``socket.getaddrinfo`` result for port 443,
        ``domainCerts`` is the peer certificate chain, or ``None`` when the
        handshake or the connection failed, ``pageSource`` is the body read
        from ``urllib.urlopen`` and ``childLinks`` is the value returned by
        ``MyParser.get_hyperlinks``.
    """
    # Only the scheme and network location of the parsed URL are used here.
    scheme, domain = urlparse(currentURL)[:2]
    ipAddr = socket.getaddrinfo(domain, 443)

    # RETRIEVE SSL CERTIFICATES (IF APPLICABLE)
    # Default to None so the return below is well defined when the handshake
    # or the connection fails.
    domainCerts = None
    tmpSocket = None
    try:
        # Uses the javax.net.ssl.* and java.security.cert.* libraries to
        # obtain the certificate chain.
        factory = HttpsURLConnection.getDefaultSSLSocketFactory()
        tmpSocket = factory.createSocket(domain, 443)
        tmpSocket.startHandshake()
        session = tmpSocket.getSession()
        domainCerts = session.getPeerCertificateChain()
    except SSLHandshakeException:
        # Thrown if the domain does not support SSL.
        print('javax.net.ssl.SSLHandshakeException with domain: ' + domain)
    except ConnectException:
        print('java.net.ConnectException with domain: ' + domain)
    finally:
        # The socket is only needed for the handshake; release it either way.
        if tmpSocket is not None:
            tmpSocket.close()

    # RETRIEVE PAGE SOURCE
    response = urllib.urlopen(currentURL)
    try:
        pageSource = response.read()
    finally:
        response.close()

    # PARSE PAGE SOURCE FOR LINKS
    myparser = MyParser(scheme, domain)
    myparser.parse(pageSource)
    # NOTE(review): `depth` is not defined in this method; it must come from
    # an enclosing/module scope or this line raises NameError -- confirm.
    myparser.sort_hyperlinks(depth)
    childLinks = myparser.get_hyperlinks()

    return ipAddr, domainCerts, pageSource, childLinks
def crawlDomain(self, seed):
    """Collect IP addresses and SSL certificates for a seed URL's domain.

    Args:
        seed: A two-item sequence ``(currentURL, depth)``. Only the URL is
            used by this method.

    Returns:
        A ``DomainDetail`` populated with the domain name, one ``IPAddress``
        per IPv4 entry returned by ``socket.getaddrinfo`` for port 443, and
        one ``Certificate`` per entry of the peer certificate chain (an empty
        list when the SSL handshake or the connection failed).
    """
    currentURL, _depth = seed
    # Only the network location of the parsed URL is needed.
    domain = urlparse(currentURL)[1]

    # TODO: check for regular traffic on port 80.
    # TODO: think of a way to grab information on servers that host on
    # non-standard ports.
    ipAddr = socket.getaddrinfo(domain, 443)

    # RETRIEVE SSL CERTIFICATES (IF APPLICABLE)
    # None means "no chain available". A non-None placeholder would reach
    # the certificate loop below and fail on the getIssuerDN() call.
    domainCerts = None
    tmpSocket = None
    try:
        # Uses the javax.net.ssl.* and java.security.cert.* libraries to
        # obtain the certificate chain.
        factory = HttpsURLConnection.getDefaultSSLSocketFactory()
        tmpSocket = factory.createSocket(domain, 443)
        tmpSocket.startHandshake()
        session = tmpSocket.getSession()
        domainCerts = session.getPeerCertificateChain()
    except SSLHandshakeException:
        # Thrown if the domain does not support SSL.
        print('javax.net.ssl.SSLHandshakeException with domain: ' + domain)
    except ConnectException:
        print('java.net.ConnectException with domain: ' + domain)
    finally:
        # The socket is only needed for the handshake; release it either way.
        if tmpSocket is not None:
            tmpSocket.close()

    tempdd = DomainDetail()  # temporary DomainDetail being filled in
    tempdd.setDomainName(domain)

    # Store the IP addresses obtained from getaddrinfo().
    ipAddresses = []
    for family, _socktype, _proto, _canonname, sockaddr in ipAddr:
        # IPAddress takes four octets, so only IPv4 results can be stored;
        # IPv6 sockaddr tuples have a different shape and no dotted octets.
        if family != socket.AF_INET:
            continue
        oct1, oct2, oct3, oct4 = sockaddr[0].split('.')
        ipAddresses.append(
            IPAddress(int(oct1), int(oct2), int(oct3), int(oct4)))
    tempdd.setIPAddresses(ipAddresses)
    print("IP Addresses for '" + domain + "' stored")

    # Parse and store certificate information.
    certArray = []
    if domainCerts is not None:
        for position, javaCert in enumerate(domainCerts):
            cert = Certificate()
            cert.setFullCertificate(pprint.pformat(javaCert))
            cert.setIssuer(javaCert.getIssuerDN().getName())
            cert.setHierarchy(position)  # index within the chain
            cert.setSubject(javaCert.getSubjectDN().getName())
            cert.setValidFrom(javaCert.getNotBefore())
            cert.setValidTill(javaCert.getNotAfter())
            # TODO: basic constraint is not extracted yet; stored as 0.
            cert.setBasicConstraint(0)
            certArray.append(cert)
    # NOTE(review): the certificate list is stored even when it is empty;
    # confirm this matches the intended behaviour for domains without SSL.
    tempdd.setCertificates(certArray)
    print("certs for '" + domain + "' stored")

    return tempdd