Example #1
 def load_crawler_configuration(self, path):
     files = load_config(path)
     result = list()
     # Collect the parsed YAML of every configuration file
     for f in files:
         try:
             result.append(load_yaml(f))
         except Exception as e:
             logging.error("Error {}".format(str(e)))
     return result
Example #2
def generate_file(filename, data):
    filename = get_path(filename)
    try:
        # The with block closes the file automatically
        with open(filename, "w+") as f:
            f.write(data)
        return True
    except Exception as e:
        logging.error(str(e))
        return False
Example #3
def create_confdir(CONF_PATH=CONF_PATH,
                   DUMP_PATH=DUMP_LOCATION,
                   HTML_PATH=HTML_LOCATION):
    # Use the arguments rather than the module-level defaults
    conf_dirs = [CONF_PATH, DUMP_PATH, HTML_PATH]
    for path in conf_dirs:
        if not create_dir(path):
            logging.error("Creating directory {} failed".format(path))
            return
    return True
Example #4
def load_config(path):
    if os.path.exists(path):
        if os.path.isdir(path):
            files_ = get_all(path)
            return files_
        else:
            return [path]
    else:
        logging.error("File/Folder does not exists !")
        return False
Example #5
 def __init__(self, **kwargs):
     self.base_url = None
     self.company_name = None
     self.product_list = list()
     self.currency = None
     for key, value in kwargs.items():
         try:
             setattr(self, key, value)
         except Exception:
             logging.error("Cannot set attribute {}".format(key))
Example #6
def get_loaded(driver, count=0):
    try:
        WebDriverWait(driver, 3).until(
            EC.presence_of_element_located((By.XPATH, '//body')))
        return True
    except TimeoutException:
        driver.refresh()
        logging.error("Loading took too long")
        if count < 3:
            # Retry, passing the driver to the recursive call
            return get_loaded(driver, count=count + 1)
        return False
Example #7
def load_crawler_configuration(path):
    files = load_config(path)
    f_yaml = list()
    for f in files:
        try:
            tmp = load_yaml(f)
        except Exception as e:
            logging.error("Error {} ".format(str(e)))
            continue
        else:
            f_yaml.append(tmp)
    return f_yaml
Example #8
 def get_element(self, elements):
     driver = self.driver
     to_element = None
     delay = 3
     for locator_, value in elements.items():
         type_ = elementFilterTool[locator_]
         try:
             WebDriverWait(driver, delay).until(
                 EC.presence_of_element_located((type_, value)))
         except TimeoutException:
             logging.error("Loading took too much time!")
             continue
         to_element = driver.find_element(type_, value)
     return to_element
Example #9
def run(path=CONF_PATH,
        force_headless=False,
        force_dump=True,
        dump_to_json=False,
        dump_location=DUMP_LOCATION):
    crawler_result = list()
    configs = load_crawler_configuration(path)
    failure = {"send": list(), "scrape": list()}
    for datas in configs:
        result = list()
        result_ = {"company": None, "data": list()}
        write_to_json = {"company": None, "data": list()}
        cfg = datas[0]
        cfg = flatten_dictionaries(cfg['config'])
        cfg['company_name'] = cfg.pop('name')
        _company_details = None
        for row in datas:
            if 'product' not in row:
                continue
            else:
                prods = row['product']
                prods_ = flatten_dictionaries(prods)
                d = ProductCrawler(cfg, is_headless=force_headless, **prods_)
                _company_details = d.company_detail
                d.config_worker()
                d.register_company()
                try:
                    dd = d.run()
                except Exception as e:
                    logging.error(str(e))
                else:
                    normalized_data = d.normalize(dd)
                    d.write_result(normalized_data)
                    for key, value in normalized_data.items():
                        if not value:
                            failure['scrape'].append(d.endpoint)
                            break
                    _tmp = d.crawler_result()
                    if d.dump_to_database:
                        result_['data'].append(_tmp)
                    write_to_json['data'].append(_tmp)
                    d.driver.quit()
                    result.append(dd)
        result_['company'] = _company_details
        write_to_json['company'] = _company_details
        if dump_to_json:
            dump_json_data(write_to_json)
        crawler_result.append(result_)
    return crawler_result
Example #10
def create_dir(path):
    try:
        os.mkdir(path)
        return path
    except FileNotFoundError:
        # Parent directory is missing: create it first, then retry
        head = os.path.split(path)[0]
        create_dir(head)
        return create_dir(path)
    except FileExistsError:
        return path
    except Exception as e:
        logging.error(e)
        return False
Example #11
 def scrape(self, configs):
     result = {"company": None, "data": list()}
     for config in configs:
         config['status'] = dict()
         crawler = config['config']
         _company_details = crawler.company_detail
         crawler.config_worker()
         # crawler.register_company()
         try:
             scraped_data = crawler.run()
         except Exception as e:
             logging.error(str(e))
             config['status']['scrape'] = {
                 "status": False,
                 "message": str(e)
             }
         else:
             config['status']['scrape'] = {
                 "status": True,
                 "message": "Success"
             }
             normalized_data = crawler.normalize(scraped_data)
             if not normalized_data:
                 return configs
             first_check = self.check_duplicate(normalized_data)
             cleaned_data = first_check
             crawler.write_result(cleaned_data)
             for key, value in normalized_data.items():
                 if not value:
                     config['status']['scrape'] = {
                         "status": False,
                         "message": "No Result!"
                     }
                     break
                 _tmp = crawler.crawler_result()
         crawler.driver.quit()
         result['company'] = _company_details
         result['data'].append(crawler.crawler_result())
         self.scrape_result = result
         self.flattened_data.append(crawler.flatten_data_result())
     return configs
Example #12
 def config_worker(self):
     self.worker = Worker(self.is_headless)
     worker = self.worker
     url = self.get_url()
     worker.get(url)
     wait = get_loaded(worker.driver)
     if not wait:
         logging.error("Page not loaded")
     self.driver = worker.driver
     if self.window_size:
         logging.debug("Resizing {} x {}".format(self.window_size_x,
                                                 self.window_size_y))
         self.driver.set_window_size(int(self.window_size_x),
                                     int(self.window_size_y))
     else:
         self.driver.maximize_window()
     self.action = worker.action
     self.config_action_chains()
Example #13
def send_data(es_handler, datasets):
    try:
        res = es_handler.search(index="domain_type")
        domain_type = [i["_source"]["type"] for i in res["hits"]["hits"]]
    except Exception:
        domain_type = ['.id', '.com', '.xyz', '.net', '.org', '.co.id',
                       '.web.id', '.my.id', '.biz.id', '.ac.id', '.sch.id',
                       '.biz', '.co', '.tv', '.io', '.info']
    result = list()
    for i in datasets:
        if i['_index'] == 'domain':
            if i['nm_domain_type'].lower() not in domain_type:
                continue
        try:
            res = es_handler.index(index=i.pop("_index"),
                                   id=i.pop("_id"), body=i)
        except Exception as e:
            logging.error(str(e))
            print(str(e))
            res = {"status": False}
        result.append(res)
    return result
Example #14
 def normalize(self, dataset):
     result = list()
     for data in dataset:
         product_type = data['nm_product_type']
         try:
             self.config[product_type]
         except KeyError:
             msg = "index {} must be initiated on configuration"
             logging.error(msg)
             return list()
         else:
             __datatmp = dict()
             __datatmp["_index"] = product_type
             for key, val in self.config[product_type].items():
                 required = self.config[product_type][key].get(
                     'required', False)
                 if required:
                     try:
                         __datatmp[key] = data[key]
                     except KeyError:
                         logging.error(data)
                         msg = "{} is required for product type {}".format(
                             key, product_type)
                         logging.error(msg)
                         return list()
                 else:
                     __datatmp[key] = data.get(key, "None")
         _id = json.dumps(__datatmp)
         _id = generate_id(_id)
         __datatmp['_id'] = _id
         result.append(__datatmp)
         if 'additional_features' in data.keys():
             parent_id = _id
             __datatmp_add = {
                 "_parent_id": parent_id,
                 "_parent_index": product_type,
                 "_index": "additional_features"
             }
             __datatmp_add = {
                 **__datatmp_add,
                 **data['additional_features']
             }
             _id = json.dumps(__datatmp_add)
             _id = generate_id(_id)
             __datatmp_add["_id"] = _id
             result.append(__datatmp_add)
     return result
Example #15
 def run(self):
     try:
         self.generate_actions(self.query)
         self.execute()
     except Exception as e:
         logging.error(str(e))