import configparser
import filecmp
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# get_dynamic_parent_folder is a project helper (not shown in these
# snippets) that appears to resolve the folder of the module defining
# the given class.


def read_configuration(self, config):
    """
    For values that are always site-specific, use the [] operator; where
    a default value makes sense, use get().
    - provider_name: tells us which site we are scraping
    - all_job_url: the url where all the jobs are listed
    - all_job_container_html_element / _class: if the page frame changes
      but the job list itself does not, we avoid scraping needlessly
    - single_job_html_tag: lets us iterate over the jobs on the page
    - cache: stores the html content of the last scraped page, so we
      only fetch new data when the page has actually changed
    - json: the scraped data, passed on to the converter
    """
    config = config['DEFAULT']
    self.provider_name = config['ProviderName']
    self.base_url = config['BaseUrl']
    self.all_job_url = config['AllJobUrl']
    self.all_job_container_html_element = \
        config['AllJobContainerHtmlElement']
    self.all_job_container_html_class = config['AllJobContainerHtmlClass']
    self.single_job_html_tag = config['SingleJobHtmlTag']
    self.single_job_href_tag = config['SingleJobHrefTag']
    self.cache = os.path.join(
        get_dynamic_parent_folder(self.__class__),
        config.get('Cache', '.cache.html'))
    self.json = os.path.join(
        get_dynamic_parent_folder(self.__class__),
        config.get('JSON', '.scraped_jobs.json'))
    self.job_attrs = {}
def __init__(self, config_file_name="scraper.ini"): config_file = os.path.join( get_dynamic_parent_folder(self.__class__), config_file_name) config = configparser.ConfigParser() config.read(config_file) self.read_configuration(config)
def __init__(self, config_file_name="converter.ini"): self.title = None self.job_type = None self.task = None self.place_of_work = None self.min_salary = None self.max_salary = None self.requirements = None self.working_hours = None self.other = None self.url = None config_file = os.path.join( get_dynamic_parent_folder(self.__class__), config_file_name) config = configparser.ConfigParser() config.read(config_file) self.read_configuration(config)
def download_and_save_current_html(self):
    """
    Downloads the html of the all-jobs page (base_url joined with
    all_job_url) and saves the job container to disk.
    :return: (saved file path, the extracted container soup)
    """
    url = urljoin(self.base_url, self.all_job_url)
    current_html = requests.get(url).text
    beautiful_soup = BeautifulSoup(current_html, 'html.parser')
    soup = beautiful_soup.find(
        self.all_job_container_html_element,
        class_=self.all_job_container_html_class
    )
    file_path = os.path.join(
        get_dynamic_parent_folder(self.__class__), '.current.html'
    )
    with open(file_path, 'w') as f:
        f.write(str(soup))
    return file_path, soup
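# Hypothetical usage sketch: EuDiakokScraper is the class named in
# cache_outdated below; that it takes the no-argument constructor shown
# above is an assumption.
if __name__ == "__main__":
    scraper = EuDiakokScraper()
    if scraper.cache_outdated():
        path, jobs = scraper.download_and_save_current_html()
        print("Saved a fresh job-list snapshot to", path)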
def cache_outdated(self):
    """
    Downloads the current state of the job board and compares it to
    the cache.
    :return: True if the cache is outdated
    """
    eu = EuDiakokScraper
    html = requests.get(eu.base_url).text
    soup = BeautifulSoup(html, 'html.parser')
    jobs = soup.find("div", id="munkakListaContainer")
    current = os.path.join(
        get_dynamic_parent_folder(EuDiakokScraper), ".current.html")
    with open(current, "w") as f:
        f.write(str(jobs))
    outdated = True
    if os.path.isfile(self.cache) and filecmp.cmp(current, self.cache):
        outdated = False
    if outdated:
        # Refresh (or create, on the first run) the cache so future
        # comparisons run against the latest snapshot.
        self.update_cache(str(jobs))
    os.remove(current)
    return outdated
def cache_outdated(self):
    """
    Downloads the current state of the job board and compares it to
    the cache.
    :return: True if the cache is outdated
    """
    sh = self.__class__
    html = requests.get(sh.budapest_jobs).text
    soup = BeautifulSoup(html, 'html.parser')
    jobs = soup.find("div", class_="categories")
    current = os.path.join(
        get_dynamic_parent_folder(sh), ".current.html")
    with open(current, "w") as f:
        f.write(str(jobs))
    outdated = True
    if os.path.isfile(self.cache) and filecmp.cmp(current, self.cache):
        outdated = False
    if outdated:
        self.update_cache(str(jobs))
    os.remove(current)
    return outdated
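# update_cache is called above but not shown in these snippets; a minimal
# sketch, assuming it simply overwrites the cache file with the new html:
def update_cache(self, html_content):
    # Persist the latest job-list html so the next cache_outdated()
    # call has something up to date to compare against.
    with open(self.cache, "w") as f:
        f.write(html_content)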
def __init__(self, cache_file_name=".cache.html"):
    self.cache = os.path.join(
        get_dynamic_parent_folder(self.__class__), cache_file_name)