def getBrokenlinks(url):
    """Count broken outbound links on the page behind *url*.

    Spawns one worker thread per valid absolute link and counts workers
    whose result is truthy (i.e. the link was found to be broken).

    Args:
        url: project Urlattributes-like object exposing getsoup() and geturl().

    Returns:
        int: number of links judged broken.

    Raises:
        WebcredError: if the page itself cannot be fetched/parsed.
    """
    broken_links = 0
    threads = []
    try:
        soup = url.getsoup()
    except WebcredError:
        # Re-raise as-is: preserves the exception type, message and the
        # original traceback (the old code rebuilt it from e.message).
        raise
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are no longer swallowed; any other failure means the URL is bad.
        raise WebcredError('Url is broken')

    for link in soup.find_all('a', href=True):
        uri = link.get('href')
        # TODO should it include inner links as well?
        if not uri.startswith(('http://', 'https://')):
            # Relative link: resolve against the page's own URL.
            uri = url.geturl() + uri
        if validators.url(uri):
            # NOTE(review): 'funcBrokenllinks' (double l) looks like a typo,
            # but it is a dispatch key — confirm against MyThread's registry
            # before renaming.
            t = MyThread(Method='funcBrokenllinks', Name='brokenlinks', Url=uri)
            t.start()
            threads.append(t)

    for t in threads:
        t.join()
        # NOTE(review): getImgratio calls t.freemem() after join(); this
        # function does not — confirm whether worker memory should be
        # released here as well.
        if t.getResult():
            broken_links += 1
    return broken_links
def getImgratio(url):
    """Return the text-to-total-size ratio of the page behind *url*.

    Fetches the size of every valid absolute image on the page in worker
    threads, then computes text_size / (text_size + total_image_size).

    Args:
        url: project Urlattributes-like object exposing getsize(), getsoup()
            and geturl().

    Returns:
        float: ratio in (0, 1]; or, for backward compatibility, the error
        message string when the page size itself cannot be fetched.

    Raises:
        WebcredError: if the ratio cannot be computed (e.g. both sizes are 0).
    """
    total_img_size = 0
    threads = []
    try:
        text_size = url.getsize()
    except WebcredError as e:
        # Kept for backward compatibility: callers receive the message
        # string instead of a ratio when the page size is unavailable.
        return e.message

    soup = url.getsoup()
    # Fan out one worker per valid absolute image URL to sum image sizes.
    for link in soup.find_all('img', src=True):
        uri = link.get('src', None)
        if not uri.startswith(('http://', 'https://')):
            # Relative src: resolve against the page's own URL.
            uri = url.geturl() + uri
        if validators.url(uri):
            try:
                uri = Urlattributes(uri)
                t = MyThread(Method='funcImgratio', Name='Imgratio', Url=uri)
                t.start()
                threads.append(t)
            except WebcredError:
                # Even if a particular image is not accessible, we don't
                # mind it — deliberately best-effort.
                pass

    for t in threads:
        t.join()
        t.freemem()
        size = t.getResult()
        # Workers may return non-int sentinels on failure; count ints only.
        if isinstance(size, int):
            total_img_size += size

    try:
        total_size = total_img_size + text_size
        ratio = float(text_size) / total_size
    except (ValueError, ZeroDivisionError):
        # BUG FIX: a zero total_size raises ZeroDivisionError, which the
        # original `except ValueError` never caught — the intended
        # WebcredError was silently skipped and the raw error propagated.
        raise WebcredError('Error in fetching images')
    return ratio