def processQueue(): socket.setdefaulttimeout(35) socks.socket.setdefaulttimeout(35) dbConnection = psycopg2.connect(dbConnectionString) if not TorHandler.isQueueEmpty(dbConnection): currentNode = TorHandler.getNextNodeFromQueue(dbConnection) print "New node from queue:", currentNode.fingerprint, currentNode.nickname, currentNode.address testTorNode(dbConnection, currentNode) time.sleep(5) else: msg = "Queue empty, filling queue" logMessage(msg, "queue-fill") print msg socket.socket = socks._orgsocket TorHandler.fillQueue(dbConnection) logMessage("Queue filled", "queue-fill-success")
# NOTE(review): this is a mid-function fragment — the `try:` that the later
# `except`/`finally` clauses attach to begins BEFORE this excerpt, as do the
# bindings of `currentNode`, `timeouts`, `dbConnection`, and `tor_process`.
# Indentation below is reconstructed; confirm against the original file.
if currentNode.hasFTP and timeouts < 2:
    # test FTP — attempt a honeypot FTP login through the node.
    try:
        print "Node supports FTP, using honey connection"
        # presumably 192.168.1.3 is the local honeypot host — TODO confirm
        testFTP(currentNode, "192.168.1.3")
        currentNode.madeFTPLogin = True
    except (socket.error, socks.Socks5Error, socks.GeneralProxyError, socks.ProxyError), e:
        print "problems with allocating FTP socket", e.message, traceback.format_exc()
        if "TTL expired" in e.message or "timed out" in e.message:
            # Count timeouts so repeated failures stop further FTP attempts
            # (guarded by `timeouts < 2` above).
            timeouts = timeouts + 1
        else:
            # Anything other than a timeout is unexpected — re-raise to the
            # outer handlers below.
            raise
# Outer handlers (their matching `try:` is above this excerpt):
except (socket.error, socks.Socks5Error, socks.GeneralProxyError, socks.ProxyError), e:
    if "TTL expired" in e.message or "timed out" in e.message:
        print "node seems offline or broken, removing from queue..."
        TorHandler.removeNodeFromQueue(dbConnection, currentNode)
except Exception, e:
    # Unknown failure: log fingerprint + traceback for later inspection.
    logMsg = "unknown exception:\nfingerprint: " + currentNode.fingerprint + "\nexception: " + e.message + "\nstacktrace:\n" + traceback.format_exc()
    logMessage(logMsg, "exception")
finally:
    # Close the connection to the Tor network
    # Persist whatever state was gathered for this node, pass or fail.
    TorHandler.saveChangesToDB(dbConnection, currentNode)
    print "terminating tor process (if any)"
    try:
        # Best-effort shutdown: terminate, give it a second, then kill.
        tor_process.terminate()
        time.sleep(1)
        tor_process.kill()
    except Exception, e:
        print "there is no tor process (or spoon)"
print "done with", currentNode.fingerprint, currentNode.nickname, "\n"
def main():
    """Entry point for the .onion scraper.

    Connects to CouchDB (creating the 'sites' db if missing), launches a Tor
    process via TorHandler, seeds the work queue with ``args.url``, and
    scrapes domains breadth-first in batches of up to five threads.
    Exits via ``sys.exit`` when the database or Tor control port is
    unreachable.  Side effects only — no return value.
    """
    # TODO: Add command line switch to set db server? Or config file?
    dbserv = couchdb.Server('http://localhost:5984/')

    # CouchDB connection and db creation
    try:
        db = dbserv['sites']
    except socket.error as e:
        print("[E] Could not connect to the database!")
        if DEBUG:
            print("Error message: {0}".format(e))
        sys.exit(
            "Please make sure that the database has been started, and try again"
        )
    except couchdb.http.ResourceNotFound:
        if DEBUG:
            print("[I] Building initial database...")
        db = dbserv.create('sites')

    # start the Tor process
    handler = TorHandler.TorHandler()
    if not handler.start_tor():
        print(
            "[E] There was an error launching Tor. It may already be running.")
    if not handler.start_controller():
        print("[E] Could not connect to control port!")
        sys.exit("Please kill all running Tor instances and try again")

    domains = queue.Queue(0)
    url = check_http(args.url)
    domains.put(url)

    # Prints endpoint information if debugging is enabled
    if DEBUG:
        print(handler.check_endpoint())

    print("\nScraping for .ONION domains:\n")

    # Sets up DB entry for initial site being scraped
    if url not in db:
        current_time = datetime.datetime.now()
        urldoc = DB_Structure(_id=check_http(url),
                              url=check_http(url),
                              ref='None',
                              Discovered=current_time,
                              LastAccessed=None,
                              title='')
        urldoc.store(db)

    # Main scraping loop
    # Gathers domains into the database, and continues to scrape through
    # each subsequent domain.
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as e:
        while domains.qsize() > 0:
            # Drain up to five domains into the next batch of workers
            # (replaces the original duplicated qsize>4 / else loops).
            scrape_array = [
                domains.get() for _ in range(min(domains.qsize(), 5))
            ]
            scraper = [
                e.submit(scrape_site, x, domains, db, handler)
                for x in scrape_array
            ]
            # Hackish way to make the threads wait until the queue is
            # populated again, or until all threads are done.
            # BUG FIX: the original built ``[s.result for s in
            # as_completed(scraper)]`` — ``s.result`` is a bound method that
            # was never called, so worker exceptions were silently dropped.
            # concurrent.futures.wait() blocks until all futures finish
            # without raising, preserving the best-effort behaviour while
            # actually performing the intended wait.
            if domains.qsize() == 0:
                concurrent.futures.wait(scraper)

    print("\nScraping Complete.")

    if not handler.kill_tor():
        print("[E] Error killing the Tor process! It may still be running.")
    else:
        print("\nTor Instance Killed.")