class SeleniumAccess(object):
    """Open and close the remote Selenium driver used for scraping."""

    logger = None
    config = None
    driver = None

    def __init__(self, config, level_log):
        """Keep the configuration and create the class logger.

        :param config: mapping read through ``get``; ``open_selenium`` looks
            up its ``"urlSelenium"`` entry.
        :param level_log: log level handed to the project ``Logger``.
        """
        self.logger = Logger(self.__class__.__name__, level_log).get()
        self.config = config

    def open_selenium(self):
        """Open a remote Chrome driver and store it in ``self.driver``."""
        self.logger.debug("Open Selenium")
        self.driver = webdriver.Remote(
            command_executor=self.config.get("urlSelenium"),
            desired_capabilities=DesiredCapabilities.CHROME)
        self.logger.debug("IS selenium open %r", self.driver is not None)

    def close_selenium(self):
        """End the driver session, if one is open, and forget the driver."""
        if self.driver is None:
            return
        try:
            # quit() ends the whole session; close() only closes the current
            # window and can leave the session alive on the Selenium server.
            self.driver.quit()
        finally:
            # Never keep a reference to a driver that was torn down, even
            # when the teardown itself failed.
            self.driver = None
def __init__(self, file_config, level_log):
    """Load the JSON configuration and create the MongoDB access object.

    :param file_config: path of the JSON configuration file.
    :param level_log: log level handed to ``Logger`` and ``MongoDBAccess``.

    When the file cannot be read the error is logged and the instance
    falls back to an empty configuration.
    """
    self.level_log = level_log
    self.logger = Logger(self.__class__.__name__, level_log).get()
    try:
        # The context manager closes the file even if parsing fails.
        with open(file_config, "r") as config_file:
            self.config = json.loads(config_file.read())
        self.mongodbaccess = MongoDBAccess(self.config, level_log)
    except IOError:
        self.logger.error("File Error: %s", file_config)
        self.config = {}
        self.mongodbaccess = MongoDBAccess({}, level_log)
    self.logger.info("Inicio: %s", datetime.datetime.now())
def test_logger_test():
    """Check that an error sent through ``Logger`` lands in log/test.log."""
    sys.path.insert(0, "../test")
    try:
        # Start from a clean file so the assertion only sees this run.
        os.remove("log/test.log")
    except OSError:
        print("file don't exist")
    logger = Logger("test", "DEBUG").get()
    logger.error("Error")
    # Read through a context manager so the handle is always closed.
    with open("log/test.log", "r") as log_file:
        data = log_file.read()
    assert " ERROR:log_namespace.test Error" in data
def __init__(self, config, levelLog):
    """Create the logger and connect to the database named in ``config``.

    :param config: mapping read through ``get``; the ``"url"`` entry
        (default ``""``) is given to ``MongoClient`` and ``"nameDB"``
        selects the database.
    :param levelLog: log level handed to the project ``Logger``.

    Connection problems are logged, not raised.
    """
    class_name = self.__class__.__name__
    self.logger = Logger(class_name, levelLog).get()
    # NOTE(review): the level is set to INFO right after the logger was
    # built with levelLog - confirm that overriding it is intended.
    self.logger.setLevel('INFO')
    try:
        self.logger.debug(config.get("url", ""))
        client = MongoClient(config.get("url", ""))
        self._client = client
        database_name = config.get("nameDB")
        self.db_access = client[database_name]
        self.logger.info("-- INFO -- DATA BASE CONECT OK")
    except ConfigurationError:
        self.logger.error("ConfigurationErr")
    except ConnectionFailure:
        self.logger.error("ConnectionFailure")
    except OperationFailure:
        self.logger.error("Authentication failure")
def __init__(self, level_log):
    """Create the logger and load the 2018 bank holidays on the instance.

    :param level_log: log level handed to the project ``Logger``.

    Each holiday is stored as a ``[year, month, day]`` list in
    ``self.bank_holidays``.
    """
    self.logger = Logger(self.__class__.__name__, level_log).get()
    # The attribute exists before __init__ runs, so appending in place
    # would add the same seven dates again on every instantiation and
    # share them between instances. Work on a private copy instead.
    inherited = getattr(self, "bank_holidays", None) or []
    self.bank_holidays = list(inherited) + [
        [2018, 5, 7],
        [2018, 6, 4],
        [2018, 8, 6],
        [2018, 10, 29],
        [2018, 12, 25],
        [2018, 12, 26],
        [2018, 12, 27],
    ]
class CleanFlights(object):
    """Apply cleaning rules to the flights stored in ``vuelos``."""

    mongodbaccess = None
    logger = None

    def __init__(self, mongo_db_access, level_log):
        """Keep the database access object and create the class logger.

        :param mongo_db_access: object offering ``find``, ``insert`` and
            ``delete_one``.
        :param level_log: log level handed to the project ``Logger``.
        """
        self.logger = Logger(self.__class__.__name__, level_log).get()
        self.mongodbaccess = mongo_db_access
        self.logger.info("Inicio: %s", datetime.datetime.now())

    def clean(self):
        """Run every rule over each flight of ``vuelos``.

        :return: dict holding ``"total"`` (flights visited) plus the
            counters merged in by ``accumulate_dic``.
        """
        self.logger.info("++INFO-- CLEAN FASE I")
        result = {"total": 0}
        vuelos = self.mongodbaccess.find("vuelos", {})
        if vuelos is None:
            # Nothing to iterate (MongoDBAccess.find returns None when the
            # database is not reachable).
            return result
        for vuelo in vuelos:
            result = self.analize_each_flight(result, vuelo)
        return result

    def analize_each_flight(self, result, vuelo):
        """Apply every rule to one flight and merge the counters.

        :param result: counters accumulated so far; updated and returned.
        :param vuelo: flight document.
        """
        # A plain loop works for any number of rules; the former apply()
        # call only worked while the rule list had exactly one entry.
        for rule in self.create_all_rules():
            accumulate_dic(result, rule(vuelo))
        result["total"] += 1
        return result

    def create_all_rules(self):
        """Return the list of rule callables run on every flight."""
        return [self.rule_older_than_15days]

    def rule_older_than_15days(self, elemento):
        """Move a flight older than 15 days from ``vuelos`` to ``vuelosOld``.

        :param elemento: flight document; ``"dateDirect"`` is compared with
            the current time minus 15 days.
        :return: ``{"deleted": 0 or 1, "inserted_old": 0 or 1}``.
        """
        date15 = datetime.datetime.now() - datetime.timedelta(days=15)
        deleted = 0
        inserted_old = 0
        date_direct = elemento.get("dateDirect")
        # A flight without a date cannot be aged, so it is left untouched.
        if date_direct is not None and date_direct < date15:
            if self.mongodbaccess.insert("vuelosOld", elemento) is not None:
                inserted_old = 1
            else:
                # Only report the failure when the backup really failed.
                self.logger.error(
                    "Error vuelo not insert backup but delete %s", elemento)
            self.mongodbaccess.delete_one(
                "vuelos", {"_id": elemento.get("_id")})
            deleted = 1
        return {"deleted": deleted, "inserted_old": inserted_old}
def __init__(self, mongo_db_access, level_log):
    """Create the class logger, keep the database access and log the start.

    :param mongo_db_access: database access object stored as
        ``self.mongodbaccess``.
    :param level_log: log level handed to the project ``Logger``.
    """
    class_name = self.__class__.__name__
    self.logger = Logger(class_name, level_log).get()
    self.mongodbaccess = mongo_db_access
    started_at = datetime.datetime.now()
    self.logger.info("Inicio: %s", started_at)
def __init__(self, config, level_log):
    """Create the class logger and keep the configuration.

    :param config: configuration object stored as ``self.config``.
    :param level_log: log level handed to the project ``Logger``.
    """
    class_name = self.__class__.__name__
    class_logger = Logger(class_name, level_log).get()
    self.logger = class_logger
    self.config = config
def __init__(self, config, mongo_db_access, level_log):
    """Wire the logger, database, Selenium and holiday collaborators.

    :param config: configuration handed to ``SeleniumAccess``.
    :param mongo_db_access: database access object stored as
        ``self.mongodbaccess``.
    :param level_log: log level shared by every collaborator.
    """
    class_name = self.__class__.__name__
    self.logger = Logger(class_name, level_log).get()
    self.mongodbaccess = mongo_db_access
    selenium = SeleniumAccess(config, level_log)
    self.seleniumaccess = selenium
    self.holidays = Holidays(level_log)
    started_at = datetime.datetime.now()
    self.logger.info("Inicio: %s", started_at)
class FindFlights(object):
    """Scrape the flight shown for each search url and store it."""

    seleniumaccess = None
    mongodbaccess = None
    logger = None
    holidays = None

    def __init__(self, config, mongo_db_access, level_log):
        """Wire the logger, database, Selenium and holiday collaborators.

        :param config: configuration handed to ``SeleniumAccess``.
        :param mongo_db_access: object offering ``insert`` and
            ``delete_one``.
        :param level_log: log level shared by every collaborator.
        """
        self.logger = Logger(self.__class__.__name__, level_log).get()
        self.mongodbaccess = mongo_db_access
        self.seleniumaccess = SeleniumAccess(config, level_log)
        self.holidays = Holidays(level_log)
        self.logger.info("Inicio: %s", datetime.datetime.now())

    def get_flights(self, urls):
        """Visit every url with one Selenium session and count the outcomes.

        :param urls: iterable of url documents (see ``url_to_flight``).
        :return: dict with the ``"save"``, ``"warn"`` and ``"error"``
            counters.
        """
        self.logger.info("Process each url")
        result = {"save": 0, "warn": 0, "error": 0}
        self.seleniumaccess.open_selenium()
        try:
            driver = self.seleniumaccess.driver
            time.sleep(1)
            driver.get("http://www.google.com")
            time.sleep(1)
            for url in urls:
                accumulate_dic(result, self.url_to_flight(url, driver))
        finally:
            # Always release the browser session, even when a url fails
            # with an unexpected exception.
            self.seleniumaccess.close_selenium()
        return result

    @staticmethod
    def _parse_price(precio_string):
        """Turn the scraped price text into a float.

        The first character is dropped, dots are removed and ``", "`` is
        replaced by a dot before the conversion.
        """
        # NOTE(review): the ", " pattern (comma followed by a space) looks
        # like it was meant to be "," - confirm against the real page text
        # before changing it, a wrong guess would store wrong prices.
        return float(precio_string[1:].replace(".", "").replace(", ", "."))

    @staticmethod
    def _print_failure(url, error):
        """Print the url that failed and the exception it raised."""
        print("****************************")
        print(url)
        print(error)

    def url_to_flight(self, url, driver):
        """Scrape one url, store the flight and remove the url from ``urls``.

        :param url: url document read through ``get`` (``"url"``,
            ``"type"``, ``"from"``, ``"to"``, ``"dateDirect"``,
            ``"dateReturn"``).
        :param driver: Selenium driver used to load the page.
        :return: a dict with exactly one of ``"save"``, ``"warn"`` or
            ``"error"`` set to 1.
        """
        driver.get(url.get("url", "http://google.es"))
        try:
            precio_string = driver.find_element_by_class_name(
                "gws-flights-results__price").text
            #navigate
            #driver.find_element_by_class_name("gws-flights-results__more").click()
            #driver.find_element_by_xpath("//*[contains(text(), 'SELECT FLIGHT')]").click()
            if url.get("type", "") == "o":
                type_class = "gws-flights-form__menu-label"
            else:
                type_class = "gws-flights-results__price-annotation"
            type_flight = driver.find_element_by_class_name(type_class).text
            url_insert = {
                "dBusqueda": datetime.datetime.now(),
                "precio": self._parse_price(precio_string),
                "type": type_flight,
                "horaS": driver.find_element_by_class_name(
                    "gws-flights-results__times").text,
                "horaLl": "",
                "company": driver.find_element_by_class_name(
                    "gws-flights-results__carriers").text,
                "duracion": driver.find_element_by_class_name(
                    "gws-flights-results__duration").text,
                "escalas": driver.find_element_by_class_name(
                    "gws-flights-results__itinerary-stops").text,
                "from": url.get("from", "XXX"),
                "to": url.get("to", "XXX"),
                "dateDirect": url.get("dateDirect", "XXX"),
                "dateReturn": url.get("dateReturn", "YYY"),
                "holidays": self.holidays.get_number_holidays(
                    url.get("dateDirect", "XXX"),
                    url.get("dateReturn", "YYY")),
            }
            self.logger.debug("Insert url elemento: %s", url_insert)
            self.mongodbaccess.insert("vuelos", url_insert)
            self.mongodbaccess.delete_one("urls", {"url": url.get("url", "")})
            print("from: {0}, to: {1}, dateDirect: {2}, dateReturn: {3}, price: {4}".format(
                url_insert["from"], url_insert["to"],
                url_insert["dateDirect"].strftime("%Y-%m-%d"),
                url_insert["dateReturn"].strftime("%Y-%m-%d"),
                url_insert["precio"]))
        except StaleElementReferenceException as error_ref:
            self._print_failure(url, error_ref)
            time.sleep(1)
            return {"save": 0, "warn": 0, "error": 1}
        except NoSuchElementException as error_no_such:
            self._print_failure(url, error_no_such)
            time.sleep(1)
            return {"save": 0, "warn": 1, "error": 0}
        except TimeoutException as error_time_out:
            print("-- ERROR -- TimeOut *****************")
            self._print_failure(url, error_time_out)
            return {"save": 0, "warn": 0, "error": 1}
        except ValueError as error_value:
            # A price text that cannot be converted must not abort the
            # whole run; count it as an error and move to the next url.
            self.logger.error("Value not valid for url %s: %s",
                              url, error_value)
            return {"save": 0, "warn": 0, "error": 1}
        return {"save": 1, "warn": 0, "error": 0}
class MongoDBAccess(object):
    """Thin wrapper around a MongoDB database.

    Every operation first checks the connection with ``status``; when the
    database cannot be used the problem is logged and ``None`` is returned.
    """

    db_access = None
    _client = None

    def __init__(self, config, levelLog):
        """Create the logger and connect to the database named in ``config``.

        :param config: mapping read through ``get``; ``"url"`` (default
            ``""``) is given to ``MongoClient`` and ``"nameDB"`` selects
            the database.
        :param levelLog: log level handed to the project ``Logger``.

        Connection problems are logged, not raised.
        """
        self.logger = Logger(self.__class__.__name__, levelLog).get()
        self.logger.setLevel('INFO')
        try:
            self.logger.debug(config.get("url", ""))
            self._client = MongoClient(config.get("url", ""))
            self.db_access = self._client[config.get("nameDB")]
            self.logger.info("-- INFO -- DATA BASE CONECT OK")
        except ConfigurationError:
            self.logger.error("ConfigurationErr")
        except ConnectionFailure:
            self.logger.error("ConnectionFailure")
        except OperationFailure:
            self.logger.error("Authentication failure")

    def status(self):
        """Return True when the server answers, False otherwise."""
        if self._client is None:
            return False
        try:
            self.logger.debug(self._client.server_info())
            return True
        except ConnectionFailure:
            self.logger.error("ConnectionFailure")
            return False
        except OperationFailure:
            self.logger.error("Authentication failure")
            return False

    def find_one(self, collection, query, sort=None):
        """Return the first document matching ``query`` or None.

        :param sort: optional dict mapping field name to direction; it is
            passed on as a list of ``(field, direction)`` pairs.
        """
        if not self.status():
            self.logger.error("Database Not INIT Find_one")
            return None
        # list() keeps this a real list on Python 3, where items() is a view.
        sort = None if sort is None else list(sort.items())
        self.logger.info("Access to collection: %s, query %s",
                         collection, query)
        return self.db_access[collection].find_one(query, sort=sort)

    def find(self, collection, query, sort=None, limite=None):
        """Return what the collection's ``find`` gives for ``query``.

        :param sort: optional dict mapping field name to direction.
        :param limite: maximum number of documents; None is sent as 0.
        """
        if not self.status():
            self.logger.error("Database Not INIT Find")
            return None
        self.logger.info(
            "Access to collection Multi: %s, query: %s, sort: %s, limit: %s",
            collection, query, sort, limite)
        limite = 0 if limite is None else limite
        sort = None if sort is None else list(sort.items())
        return self.db_access[collection].find(query, sort=sort, limit=limite)

    def update_one(self, collection, query, change, is_set="set"):
        """Update the first document matching ``query``.

        :param change: dict applied with the ``"$" + is_set`` operator.
        :return: the result of the collection's ``update_one``.
        """
        if not self.status():
            self.logger.error("Database Not INIT Update_one")
            return None
        self.logger.info(
            "Modify collection: %s, query: %s, modify: %s, set: %s",
            collection, query, change, is_set)
        setdollar = "$" + is_set
        return self.db_access[collection].update_one(
            query, {setdollar: change})

    def update_many(self, collection, query, change, is_set="set"):
        """Update every document matching ``query``.

        :param change: dict applied with the ``"$" + is_set`` operator.
        :return: the result of the collection's ``update_many``.
        """
        if not self.status():
            self.logger.error("Database Not INIT Update_many")
            return None
        self.logger.info(
            "Modify Many collection: %s, query: %s, modify: %s, set: %s",
            collection, query, change, is_set)
        setdollar = "$" + is_set
        return self.db_access[collection].update_many(
            query, {setdollar: change})

    def insert(self, collection, element):
        """Insert ``element``; return None on a duplicated key."""
        if not self.status():
            self.logger.error("Database Not INIT Insert")
            return None
        self.logger.debug("Insert collection: %s, data: %s",
                          collection, element)
        try:
            return self.db_access[collection].insert(element)
        except DuplicateKeyError:
            # The duplicate is reported to the caller as None.
            return None

    def delete_one(self, collection, element):
        """Delete the first document matching ``element``."""
        if not self.status():
            self.logger.error("Database Not INIT Delete_one")
            return None
        self.logger.info("Remove collection: %s, data: %s",
                         collection, element)
        return self.db_access[collection].delete_one(element)

    def delete_many(self, collection, element):
        """Delete every document matching ``element``."""
        if not self.status():
            self.logger.error("Database Not INIT Delete_many")
            return None
        self.logger.info("Remove collection: %s, data: %s",
                         collection, element)
        return self.db_access[collection].delete_many(element)

    def aggregate(self, collection, element):
        """Run the aggregation ``element`` on the collection."""
        if not self.status():
            self.logger.error("Database Not INIT Aggregate")
            return None
        self.logger.info("Aggregate collection: %s, data: %s",
                         collection, element)
        return self.db_access[collection].aggregate(element)

    def drop(self, collection):
        """Drop the collection and return the result of ``drop``."""
        if not self.status():
            self.logger.error("Database Not INIT Drop")
            return None
        self.logger.info("Drop collection: %s", collection)
        return self.db_access[collection].drop()
class Vuelos(object):
    """Entry point that rebuilds the pending urls and scrapes the flights."""

    level_log = None
    config = None
    mongodbaccess = None
    logger = None

    def __init__(self, file_config, level_log):
        """Load the JSON configuration and create the MongoDB access object.

        :param file_config: path of the JSON configuration file.
        :param level_log: log level handed to every collaborator.

        When the file cannot be read the error is logged and the instance
        falls back to an empty configuration.
        """
        self.level_log = level_log
        self.logger = Logger(self.__class__.__name__, level_log).get()
        try:
            # The context manager closes the file even if parsing fails.
            with open(file_config, "r") as config_file:
                self.config = json.loads(config_file.read())
            self.mongodbaccess = MongoDBAccess(self.config, level_log)
        except IOError:
            self.logger.error("File Error: %s", file_config)
            self.config = {}
            self.mongodbaccess = MongoDBAccess({}, level_log)
        self.logger.info("Inicio: %s", datetime.datetime.now())

    def _rebuild_urls(self):
        """Regenerate the pending urls and print how many were created."""
        urls = BuildUrls(self.mongodbaccess, self.level_log).build_urls()
        print("-- INFO -- construir urls -- numero de URLS: {0}".format(urls))

    def ejecutar(self, nivel):
        """Run the load process.

        :param nivel: ``"1"`` deletes today's flights and rebuilds every
            url (hard mode); any other value only rebuilds the urls when
            the last stored search is older than ``today()`` (soft mode).
        """
        print("++ INFO ++ MODULO PRINCIPAL MODO DE EJECUCION: {0}".format(
            nivel))
        if nivel == "1":
            print("-- INFO -- MODO 1 duro ejecuta y limpia los datos del dia")
            # Hard mode: drop today's data and start again.
            print("++ INFO ++ Vaciamos informacion del dia")
            print("-- INFO -- dia: {0}".format(today()))
            borrados = self.vaciar_dia()
            print("-- INFO -- vaciamos informacion -- Vuelos borrados del dia: {0}"
                  .format(borrados.deleted_count))
            self._rebuild_urls()
        else:
            print("-- INFO -- MODO 0 suave solo si hay datos que ejecutar")
            # Soft mode: look at what is pending and whether today was
            # already processed.
            pending = self.return_urls().count()
            day_not_processed = self.find_last_day() < today()
            if pending == 0:
                if day_not_processed:
                    # Nothing pending and the last search is older than
                    # today: first run of the day.
                    print("++ WARN ++ 1.1 PRIMERA VEZ DEL DIA creamos las URLS y seguimos")
                    self._rebuild_urls()
                else:
                    # Everything was already processed today.
                    print("++ WARN ++ 1.2 SE HA PROCESADO TODO Y NO HAY NADA QUE HACER")
            elif day_not_processed:
                # Urls were left over from a previous day: start again.
                print("** ERROR ** 2.1 AYER NO SE EJECUTARON TODOS LOS VUELOS")
                print("** ERROR ** vuelos pendientes {0}".format(pending))
                self.logger.error("AYER no se ejecutaron todos los vuelos")
                self._rebuild_urls()
            else:
                # Today's run was interrupted: carry on with what is left.
                print("++ WARN ++ 2.2 HA HABIDO UNA CANCELACION y el "
                      "SISTEMA SIGUE DESDE ESE PUNTO")
                print("++ WARN ++ vuelos pendientes {0}".format(pending))
                self.logger.error(
                    "Ha habido una cancelacion y se sigue desde ese punto")
        result = FindFlights(self.config, self.mongodbaccess, self.level_log)\
            .get_flights(self.return_urls())
        print("++ INFO -- TOTAL PROCESO, Save: {0}".format(
            result.get("save", 0)))
        print("++ INFO -- TOTAL PROCESO, errores sin Informacion: {0}".format(
            result.get("warn", 0)))
        print("++ INFO -- TOTAL PROCESO, errores NO ENCONTRADO: {0}".format(
            result.get("error", 0)))

    def vaciar_dia(self):
        """Delete the flights whose ``dBusqueda`` is later than ``today()``."""
        return self.mongodbaccess.delete_many(
            "vuelos", {"dBusqueda": {"$gt": today()}})

    def return_urls(self):
        """Return every document of the ``urls`` collection."""
        return self.mongodbaccess.find("urls", {})

    def find_last_day(self):
        """Return the most recent ``dBusqueda`` stored in ``vuelos``.

        2000-01-01 is returned when there is no flight, or when the flight
        found has no ``dBusqueda``, so the result can always be compared
        with a datetime.
        """
        print("++ INFO ++ find_last_day")
        oldest = datetime.datetime(2000, 1, 1)
        # One query is enough; the document is reused below.
        last = self.mongodbaccess.find_one(
            "vuelos", {}, sort={"dBusqueda": -1})
        if last is None:
            return oldest
        return last.get("dBusqueda", oldest)
def __init__(self, mongodbaccess, level_log):
    """Create the logger and the holiday helper and keep the database access.

    :param mongodbaccess: database access object stored on the instance.
    :param level_log: log level handed to ``Logger`` and ``Holidays``.
    """
    class_name = self.__class__.__name__
    self.logger = Logger(class_name, level_log).get()
    holiday_helper = Holidays(level_log)
    self.holidays = holiday_helper
    self.mongodbaccess = mongodbaccess
class BuildUrls(object):
    """Generate the urls to scrape from the active searches."""

    logger = None
    holidays = None
    mongodbaccess = None

    def __init__(self, mongodbaccess, level_log):
        """Create the logger and the holiday helper, keep the database access.

        :param mongodbaccess: object offering ``find``, ``find_one``,
            ``insert``, ``update_one`` and ``delete_many``.
        :param level_log: log level handed to ``Logger`` and ``Holidays``.
        """
        self.logger = Logger(self.__class__.__name__, level_log).get()
        self.holidays = Holidays(level_log)
        self.mongodbaccess = mongodbaccess

    def build_urls(self):
        """Empty ``urls`` and rebuild it from every active search.

        :return: number of urls stored.
        """
        deleted = self.mongodbaccess.delete_many("urls", {})
        # The access layer answers None when the database cannot be used.
        deleted_count = 0 if deleted is None else deleted.deleted_count
        self.logger.warning("-- INFO -- URLS deleted: %d", deleted_count)
        searches = self.find_elements_search()
        if searches is None:
            return 0
        return sum(
            self.build_urls_one_search(search) for search in searches)

    def find_elements_search(self):
        """Return the documents of ``busquedas`` whose ``activa`` is True."""
        return self.mongodbaccess.find("busquedas", {"activa": True})

    def build_urls_one_search(self, search):
        """Create the urls of one search, one outbound day at a time.

        The search is deactivated when it produces no url.

        :param search: search document read through ``get``.
        :return: number of urls stored for this search.
        """
        self.logger.warning("new element %s", search)
        sum_per_search = 0
        date_direct = search.get(
            "fromDateInit",
            datetime.datetime.now() + datetime.timedelta(days=1))
        # The end of the range does not change while looping.
        date_end = search.get("fromDateEnd", datetime.datetime.now())
        one_way = search.get("type", "o") == "o"
        while date_direct <= date_end:
            if one_way:
                sum_per_search += self.review_save_url_onetrip(
                    search, date_direct)
            else:
                sum_per_search += self.process_date_return(
                    search, date_direct)
            date_direct = date_direct + datetime.timedelta(days=1)
        if sum_per_search == 0:
            self.logger.warning(
                "-- INFO -- desactivate search, no generate urls: %s", search)
            self.mongodbaccess.update_one(
                "busquedas", {"_id": search["_id"]}, {"activa": False})
        return sum_per_search

    def process_date_return(self, search, date_direct):
        """Try every return date of the search for a fixed outbound date.

        :return: number of urls stored.
        """
        suma = 0
        date_return = search.get(
            "toDateInit",
            datetime.datetime.now() + datetime.timedelta(days=1))
        # The end of the range does not change while looping.
        date_end = search.get("toDateEnd", datetime.datetime.now())
        while date_return <= date_end:
            suma += self.review_save_url_return(
                search, date_direct, date_return)
            date_return = date_return + datetime.timedelta(days=1)
        return suma

    def review_save_url_onetrip(self, search, date_direct):
        """Store the url of a one-way flight when the date qualifies.

        The date must be in the future and reach the ``minHolidays`` of
        the search.

        :return: 1 when a url was stored, 0 otherwise.
        """
        if (date_direct > datetime.datetime.now()) and \
                (self.holidays.get_number_holidays(date_direct, date_direct)
                 >= search.get("minHolidays", 0)):
            return self.save_url(create_url(search, date_direct, date_direct))
        return 0

    def review_save_url_return(self, search, date_direct, date_return):
        """Store the url of a return flight when the dates qualify.

        The outbound date must be in the future and not after the return
        date, the stay (both days included) must be within ``minDays`` and
        ``maxDays``, and the holidays must reach ``minHolidays``.

        :return: 1 when a url was stored, 0 otherwise.
        """
        if date_direct > date_return:
            return 0
        if not date_direct > datetime.datetime.now():
            return 0
        # Both the outbound and the return day count as days of the trip.
        dif = (date_return - date_direct) + datetime.timedelta(days=1)
        longest = datetime.timedelta(days=search.get("maxDays", 0))
        if not dif <= longest:
            return 0
        shortest = datetime.timedelta(days=search.get("minDays", 0))
        if not dif >= shortest:
            return 0
        if self.holidays.get_number_holidays(date_direct, date_return) \
                >= search.get("minHolidays", 0):
            return self.save_url(
                create_url(search, date_direct, date_return))
        return 0

    def save_url(self, urls):
        """Insert the url document unless the same url is already stored.

        :return: 1 when inserted, 0 when it already existed.
        """
        if self.mongodbaccess.find_one(
                "urls", {"url": urls.get("url", "ERROR")}) is None:
            self.mongodbaccess.insert("urls", urls)
            return 1
        return 0