def setFiles(self, html, url):
    """Partition the anchor hrefs of *html* into external links and
    internal files.

    External http(s) links are appended to ``self._externals``;
    same-site links with a recognised file extension are absolutized
    (when relative) and handed to ``self.filterFiles()``.
    """
    self.setHTML(html)
    internal_files = []
    for anchor in self.getHTML().findAll('a', href=True):
        target = anchor['href']
        if UrlUtils.externalLink(url, target) and UrlUtils.containsHTTP(target):
            self._externals.append(target)
            continue
        # Relative link: anchor it to the site before classifying.
        if UrlUtils.containsHTTP(target) is False:
            target = UrlUtils.assertSiteWithFile(url, target)
        if ExtensionsFile.hasExtension(target) and not UrlUtils.externalLink(url, target):
            internal_files.append(target)
    self.filterFiles(internal_files)
def __setDomain(self, domain):
    """Normalize *domain* to a bare host name and store it.

    Strips an http/https scheme and a leading ``www.``, then discards
    everything from the first ``/`` on; the result goes to
    ``self._domain``.
    """
    host = domain
    if UrlUtils.containsHTTP(domain):
        host = host.replace('http://', '').replace('https://', '')
    if UrlUtils.containsWWW(host):
        host = host.replace('www.', '')
    # Keep only the host part, dropping any path component.
    host, _, _ = host.partition('/')
    self._domain = host
def getIp(self, url):
    """Resolve *url* to its IPv4 address string.

    Accepts a bare host name, a host with a path, or a full http(s)
    URL: the scheme and any path component are stripped before the
    DNS lookup.

    Raises ``socket.gaierror`` if the host cannot be resolved.
    """
    # The original duplicated the lookup in two branches (and wrapped
    # one in a redundant str()); replace()/sub() are no-ops when the
    # scheme/path is absent, so sanitizing unconditionally also fixes
    # non-http inputs that carried a path (previously passed to
    # gethostbyname unmodified, which always failed).
    host = url.replace('http://', '').replace('https://', '')
    host = sub('/.*', '', host)
    return gethostbyname(host).strip()
def setImages(self, html, url):
    """Collect every ``<img>`` src in *html* and record the list.

    Relative sources (no http/https scheme) are absolutized against
    *url*; the stripped URLs are passed to ``self.__allImages()``.
    """
    self.setHTML(html)
    collected = []
    for tag in self.getHTML().findAll('img'):
        src = tag.get('src')
        # Relative src: rebuild it as an absolute site URL.
        if UrlUtils.containsHTTP(src) is False:
            src = UrlUtils.assertSiteWithFile(url, tag.get('src'))
        collected.append(src.strip())
    self.__allImages(collected)
def __setIp(self, url):
    """Resolve *url*'s host and record its IP via ``self.__listOfIps``.

    Best-effort: DNS resolution failures are swallowed so one dead
    host does not abort the caller.
    """
    try:
        if UrlUtils.containsHTTP(url):
            # Strip the scheme and any path before the DNS lookup.
            url = url.replace('http://', '').replace('https://', '')
            url = sub('/.*', '', url)
        # Both original branches made the same call; deduplicated here.
        self.__listOfIps(gethostbyname(url))
    except OSError:
        # socket.gaierror/herror subclass OSError. The original bare
        # `except:` also hid KeyboardInterrupt and programming errors.
        pass
def searchAndAddLinksFromMain(self, html, url):
    """Collect crawlable links from the anchor tags of *html*.

    Relative hrefs (not a fragment, not already containing *url*, no
    http scheme) are absolutized as ``http://<url>/<page>``; other
    hrefs are kept only when ``self.pageOrExternal()`` accepts them.

    Returns a de-duplicated list of URL strings.
    """
    urls = []
    for link in html.findAll('a', href=True):
        page = link['href']
        try:
            if page[0] != '#' and url not in page and not UrlUtils.containsHTTP(page):
                urls.append('http://' + url + '/' + page)
            elif type(page) is str:
                if self.pageOrExternal(page, url):
                    urls.append(page)
            elif type(page) is list:
                # BUG FIX: the original iterated the *builtin* `list`
                # type ("for string in list:"), which raised TypeError
                # and was silently swallowed — this branch never
                # collected anything. Iterate the href list itself.
                for candidate in page:
                    if self.pageOrExternal(candidate, url):
                        urls.append(candidate)
        except Exception:
            # Empty hrefs raise IndexError on page[0]; skip just that
            # link (narrowed from the original bare `except:`).
            continue
    # BUG FIX: the original computed list(set(urls)) but discarded the
    # result; return the intended de-duplicated list.
    return list(set(urls))