def main(config_file, restart):
    cparser = ConfigParser()
    cparser.read(config_file)
    config = Config(cparser)
    config.cache_server = get_cache_server(config, restart)
    crawler = Crawler(config, restart)
    crawler.start()
def main(config_file, restart):
    file_count = 1
    # Reads data.txt to check if it has data and moves it to a different file
    if os.path.exists("data.txt") and os.path.getsize("data.txt") > 2:
        # Generate a new file name by incrementing the record counter
        while os.path.exists("records/data_record" + str(file_count) + ".txt"):
            file_count += 1
        with open("data.txt", "r") as infile:
            with open("records/data_record" + str(file_count) + ".txt", "w") as outfile:
                outfile.write(infile.read())
    # Reads subdomains.txt to check if it has data and moves it to a different file
    if os.path.exists("subdomains.txt") and os.path.getsize("subdomains.txt") > 2:
        # Generate a new file using the same record counter
        with open("subdomains.txt", "r") as infile:
            with open("records/subdomains_record" + str(file_count) + ".txt", "w") as outfile:
                outfile.write(infile.read())
    # Create or overwrite subdomains.txt
    # with open("subdomains.txt", "w") as file_contents:
    #     file_contents.write("{}")
    cparser = ConfigParser()
    cparser.read(config_file)
    config = Config(cparser)
    config.cache_server = get_cache_server(config, restart)
    crawler = Crawler(config, restart)
    crawler.start()
def main(config_file, restart):
    # Open the config.ini file and read its settings
    cparser = ConfigParser()
    cparser.read(config_file)
    config = Config(cparser)
    config.cache_server = get_cache_server(config, restart)
    # Set up the crawler
    crawler = Crawler(config, restart)
    crawler.start()
def main(config_file, restart):
    cparser = ConfigParser()
    cparser.read(config_file)
    config = Config(cparser)
    config.cache_server = get_cache_server(config, restart)
    if restart:
        # On a fresh start, remove any scraper output files left over from a previous run
        for filename in config.get_scraper_files():
            if os.path.exists(filename):
                os.remove(filename)
    crawler = Crawler(config, restart)
    crawler.start()
def main(config_file, restart):
    # Optionally clear out data and log files from a previous run
    if DELETE_DATA_FILES:
        for file in DATA_FILES:
            if os.path.exists(file):
                os.remove(file)
    if DELETE_LOG_FILES:
        for file in LOG_FILES:
            if os.path.exists(file):
                os.remove(file)
    cparser = ConfigParser()
    cparser.read(config_file)
    config = Config(cparser)
    config.cache_server = get_cache_server(config, restart)
    crawler = Crawler(config, restart)
    crawler.start()
def main(config_file, restart): print("main begin") cparser = ConfigParser() print("parser created") cparser.read(config_file) print("config file successfully read") config = Config(cparser) print("some other config") config.cache_server = get_cache_server(config, restart) print("cache server obtained") crawler = Crawler(config, restart) print("Crawler created") crawler.start()
def main(config_file, restart):
    crawler = None
    try:
        cparser = ConfigParser()
        cparser.read(config_file)
        global config
        config = Config(cparser)
        config.cache_server = get_cache_server(config, restart)
        crawler = Crawler(config, restart)
        crawler.start()
    except KeyboardInterrupt:
        if crawler:
            crawler.frontier.save.sync()
        print('Keyboard Interrupt Detected !!')
    finally:
        if crawler:
            crawler.frontier.close()
        print('Goodbye !!')
def main(config_file, restart):
    ### Saving the list of text words in a separate file in case the program
    ### crashes and the in-memory list of words is lost.
    ### Opening textlist.txt in write mode truncates it if it already exists,
    ### so the previous contents are erased; this fresh start happens only
    ### when the --restart parameter is given. The list of URLs is handled
    ### the same way.
    if restart:
        tmp = open('textlist.txt', 'w')
        tmp.close()
        tmp2 = open('urllist.txt', 'w')
        tmp2.close()
    cparser = ConfigParser()
    cparser.read(config_file)
    config = Config(cparser)
    config.cache_server = get_cache_server(config, restart)
    crawler = Crawler(config, restart)
    crawler.start()
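# All of the main() variants above are meant to be called from a command-line
# entry point that supplies config_file and restart. The driver below is a
# minimal sketch for illustration only: the flag names (--restart,
# --config_file) and the default path "config.ini" are assumptions, not taken
# from the variants above.
from argparse import ArgumentParser

if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("--restart", action="store_true", default=False,
                        help="Discard any saved state and start the crawl fresh.")
    parser.add_argument("--config_file", type=str, default="config.ini",
                        help="Path to the crawler configuration file.")
    args = parser.parse_args()
    main(args.config_file, args.restart)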