import base64
import time

# Database, Parser, connect_to_tor, PATH and TORBUNDLEHEADER are defined
# elsewhere in this project and are used here unchanged.


def crawl(urloc: str) -> tuple[str, list]:
    """Fetch a single URL over Tor, persist the result, and return
    (urloc, list of newly discovered .onion URLs to crawl next)."""
    db = Database(PATH)
    parser = Parser()
    session = connect_to_tor()

    # Select here to find if the URL is already in the db.
    try:
        if len(db.isCrawled(urloc)) > 0:
            # URL already crawled; nothing new to report.
            return urloc, []
    except Exception as e:
        print(e)

    def store(insert_data: dict) -> None:
        # Insert the record; if the URL already exists (UNIQUE constraint),
        # update the existing row instead.
        try:
            db.insert(insert_data)
        except Exception as e:
            if "UNIQUE constraint failed" in str(e):
                try:
                    db.update(insert_data)
                except Exception:
                    pass

    try:
        try:
            r = session.get(urloc, headers=TORBUNDLEHEADER, timeout=20)
            r.raise_for_status()
        except Exception as err:
            # Record the failure so the URL is not immediately retried.
            store({
                "protocol": "Error",
                "url": urloc,
                "data": base64.b64encode(str(err).encode()),
                "lastvisit": int(time.time()),
            })
            # Return an empty link list on fetch failure so callers
            # always receive a (str, list) tuple.
            return urloc, []

        # Successful fetch: store the page and extract its outgoing links.
        urls = parser.urlExtractor(urloc, r.text)
        store({
            "protocol": urloc.split("://")[0],
            "url": urloc,
            "data": base64.b64encode(r.content),
            "lastvisit": int(time.time()),
        })

        retUrls = []
        for key, links in urls.items():
            if key in ("http", "https"):
                # Crawl only http(s) links that point at .onion sites.
                for url in links:
                    if parser.tldExtractor(url) == "onion":
                        retUrls.append(url)
        return urloc, retUrls
    except Exception:
        return urloc, []
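
# A minimal sketch of how crawl() could drive a breadth-first crawl loop.
# The seed URL and the single-threaded frontier/seen bookkeeping below are
# assumptions for illustration, not part of the original module.
if __name__ == "__main__":
    seed = "http://exampleonionaddress.onion"  # hypothetical seed URL
    frontier = [seed]
    seen = set(frontier)
    while frontier:
        current = frontier.pop(0)
        _, new_urls = crawl(current)
        # Enqueue only URLs we have not scheduled before.
        for url in new_urls:
            if url not in seen:
                seen.add(url)
                frontier.append(url)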