import unittest
import os
import time
from multiprocessing import Queue

import scrape_logger
import scrapeKL_gui

level = "WARNING"
#level = "INFO"
#level = "DEBUG"
logger = scrape_logger.setup_logger(level)

storage_directory = "scrapes"


class Test(unittest.TestCase):
    def test_scrapeThread(self):
        company_names = {
            2048: "talenom",
            1102: "cramo",
            1091: "sanoma",
            1196: "afarak group"
        }
        #company_names = None   # For scraping every company
        showProgress = False
        queue = Queue()
        time0 = time.time()
        qThread = scrapeKL_gui.scrapeThread(
            storage_directory, company_names, showProgress, queue)
        qThread.start()
        qThread.wait()
import unittest

import scrape_logger
import speed

level = "WARNING"
#level = "INFO"
#level = "DEBUG"
logger_root = scrape_logger.setup_logger(level)
logger_speed = scrape_logger.setup_logger(level, name="speed")


class Test(unittest.TestCase):
    def test_speed(self):
        # smoketest
        speed.run_speedtest(1)


if __name__ == '__main__':
    unittest.main()
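# scrape_logger itself is not shown in this excerpt. Judging only from the
# call sites above -- setup_logger(level), setup_logger(level, name="speed"),
# set_logger_level(logger, "WARNING") -- a minimal sketch of the module could
# look like the following. This is an assumption, not the repo's actual
# implementation:
import logging


def setup_logger(level="INFO", name="root"):
    logger = logging.getLogger(name)
    if not logger.handlers:     # avoid stacking duplicate handlers
        handler = logging.StreamHandler()
        handler.setFormatter(
            logging.Formatter("%(levelname)s:%(name)s: %(message)s"))
        logger.addHandler(handler)
    set_logger_level(logger, level)
    return logger


def set_logger_level(logger, level):
    # Logger.setLevel accepts both level names ("WARNING") and
    # numeric levels (30), so this covers both call styles seen above.
    logger.setLevel(level)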
                scrapeKL.print_calculations(self.filename, c_id, c_name)
            # TODO: Filter and organize functions do not work atm
            elif sender.text() == self.filter_str:
                scrapeKL.filter_companies(self.filename)
            elif sender.text() == self.organize_str:
                scrapeKL.organize_companies(self.filename)
            else:
                raise ScrapeGuiException(
                    'Unexpected "sender.text()": [{}]'.format(
                        sender.text()))
        else:
            logger.info("No file selected")


if __name__ == '__main__':
    logger = scrape_logger.setup_logger("DEBUG")

    # Storage
    storage_directory = "scrapes"
    if not os.path.isdir(storage_directory):
        os.makedirs(storage_directory)
        logger.debug("storage-folder created: [{}]".format(storage_directory))

    # GUI
    app = QApplication(sys.argv)
    window = Window(storage_directory)   # do not shadow the Window class
    window.show()
    sys.exit(app.exec_())
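# ScrapeGuiException is raised above but is not defined in this excerpt.
# By analogy with ScrapeException in the scraping module, a minimal
# definition (an assumption, not the repo's actual code) would be:
class ScrapeGuiException(Exception):
    pass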
def main(arguments):
    # Logging
    if arguments["--debug"]:
        level = "DEBUG"
    else:
        level = "INFO"
    logger = scrape_logger.setup_logger(level)
    logger.debug(arguments)

    # Storage
    storage_directory = "scrapes"
    if not os.path.isdir(storage_directory):
        os.makedirs(storage_directory)
        logger.debug("storage-folder created: [{}]".format(storage_directory))

    # Shared arguments
    if arguments["<file>"]:
        filename = os.path.join(storage_directory, arguments["<file>"])
    else:
        filename = storage.get_latest_metrics_filename(storage_directory)
    c_name = arguments["--name"]
    c_id_list_in = arguments["--id"]
    c_id_list = []
    if c_id_list_in:
        for c_id in c_id_list_in:
            try:
                c_id_list.append(int(c_id))
            except ValueError:
                raise ScrapeKLException(
                    "Id {} is not an integer.".format(c_id))
    logger.debug("c_id_list: {}; c_name: {}".format(c_id_list, c_name))

    # Function calling
    if arguments["scrape"]:
        company_names = None
        if arguments["--id"]:
            company_names = {}
            for c_id in arguments["--id"]:
                try:
                    company_names[int(c_id)] = None
                except ValueError:
                    raise ScrapeKLException(
                        "Id {} is not an integer.".format(c_id))
        logger.debug("company_names to scrape: {}".format(company_names))
        time0 = time.time()
        scrape_companies(storage_directory, company_names)
        print("Scraping took: {:.2f} s".format(time.time() - time0))
    elif arguments["names"]:
        if arguments["<file>"]:
            names_filename = arguments["<file>"]
            print_all_names(names_filename=names_filename)
        else:
            print_all_names(storage_directory=storage_directory)
    elif arguments["metrics"]:
        print_metrics(filename, c_id_list, c_name)
    elif arguments["collection"]:
        print_collection(filename, c_id_list, c_name)
    elif arguments["filtered"]:
        print_filtered(filename, c_id_list, c_name)
    elif arguments["passed"]:
        print_passed_names(filename)
    elif arguments["list_files"]:
        all_filenames = os.listdir(storage_directory)
        for f in sorted(all_filenames):
            if f.endswith(".json"):
                print(f)
    elif arguments["speed"]:
        times = arguments["<times>"]
        if times:
            try:
                times = int(times)
            except ValueError:
                raise ScrapeKLException(
                    "Times {} is not an integer.".format(times))
            assert times > 0
        else:
            logger.info("Using default: times = 5")
            times = 5
        # The speed test needs its own logger so the printed report stays
        # clean: the speed logger gets the level chosen by the user, while
        # the scrape ("root") logger is turned down to WARNING.
        _speed_logger = scrape_logger.setup_logger(logger.level, "speed")
        scrape_logger.set_logger_level(logger, "WARNING")
        speed.run_speedtest(times)
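# main() expects a docopt-style arguments dict. The module docstring that
# docopt would parse is not shown here; a hypothetical usage block covering
# the keys referenced above would be along these lines (command and option
# names are taken from the code, the exact layout is a guess):
"""Usage:
  scrapeKL.py scrape [--id=<id>...] [--debug]
  scrapeKL.py names [<file>] [--debug]
  scrapeKL.py (metrics | collection | filtered) [<file>] [--id=<id>...]
              [--name=<name>] [--debug]
  scrapeKL.py passed [<file>] [--debug]
  scrapeKL.py list_files [--debug]
  scrapeKL.py speed [<times>] [--debug]
"""
from docopt import docopt   # the docopt package: pip install docopt

if __name__ == '__main__':
    main(docopt(__doc__))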
import re
import logging
import traceback
import time
from datetime import date
from multiprocessing import Process, Queue

import requests
from bs4 import BeautifulSoup

import scrape_logger

#logger = logging.getLogger('root')
level = "INFO"
#level = "DEBUG"
logger = scrape_logger.setup_logger(level, "scraping")
scrape_logger.set_logger_level(logger, level)

url_basic = "http://www.kauppalehti.fi/5/i/porssi/"
osingot_url = url_basic + "osingot/osinkohistoria.jsp"
osingot_yritys_url = url_basic + "osingot/osinkohistoria.jsp?klid={}"
kurssi_url = url_basic + "porssikurssit/osake/index.jsp?klid={}"
kurssi_tulostiedot_url = url_basic + \
    "porssikurssit/osake/tulostiedot.jsp?klid={}"

date_format = "%Y-%m-%d"                # YYYY-MM-DD
date_short_format = "%y-%m-%d"          # YY-MM-DD
datetime_format = "%y-%m-%d_%H-%M-%S"   # YY-MM-DD_HH-MM-SS: for filename

date_pattern_0 = re.compile(r"^\d{4}-\d{2}-\d{2}$")     # YYYY-MM-DD
date_pattern_1 = re.compile(r"^\d{2}\.\d{2}\.\d{4}$")   # DD.MM.YYYY
date_pattern_2 = re.compile(r"^\d{2}/\d{2}$")           # MM/YY


class ScrapeException(Exception):
    pass
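# A quick illustration (not part of the module) of which string shapes the
# three compiled date patterns accept; the sample dates are arbitrary:
assert date_pattern_0.match("2017-06-30")           # YYYY-MM-DD
assert date_pattern_1.match("30.06.2017")           # DD.MM.YYYY
assert date_pattern_2.match("06/17")                # MM/YY
assert date_pattern_1.match("2017-06-30") is None   # wrong separator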
    whole_run_time = time.time() - time0
    avg_time = whole_run_time / times
    compared = avg_time - old_avg_time
    if old_avg_time:
        compared_percent = 100 * compared / old_avg_time
    else:
        compared_percent = 0
    logger.info(
        "- Scraping took:\t\t{:6.2f} s\n".format(whole_run_time) +
        "Average time per scraping:\t{:6.2f} s\n".format(avg_time) +
        "Compared to old average:\t{:+6.2f} s --> {:+.1f} %".format(
            compared, compared_percent))
    return whole_run_time


if __name__ == '__main__':
    logger_root = scrape_logger.setup_logger("WARNING")
    logger = scrape_logger.setup_logger(name="speed")

    times = 5   # default
    if len(sys.argv) == 2:
        try:
            times = int(sys.argv[1])
        except ValueError:
            pass

    run_speedtest(times)
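# Worked example of the comparison math above (numbers are made up):
# with old_avg_time = 10.0 s and avg_time = 9.5 s,
#   compared         = 9.5 - 10.0          = -0.50 s
#   compared_percent = 100 * (-0.5) / 10.0 = -5.0 %
# so the report line would end with "-0.50 s --> -5.0 %".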