def extract_job_offer_from_feed(feed_parsed):
    """Store the feed's entry links, scrape the new ones, and return the job offers.

    feed_parsed: a parsed feed object (e.g. from feedparser) whose
        ``entries`` each expose a ``link`` attribute.

    Returns:
        A set of job-offer objects produced by extract_job_content_feed_url.
    """
    create_table_db()
    for entry in feed_parsed.entries:
        add_job_link(entry.link)
    # Only parse links that are registered but not yet marked as scraped
    # (state 1). Set difference replaces the original manual loop.
    urls = set(extract_all_joblinks(-1)) - set(extract_all_joblinks(1))
    print("Number of entries will be parsed : ", len(urls))
    list_job_offers = {extract_job_content_feed_url(url) for url in urls}
    for url in urls:
        # add_job_link is a no-op for duplicates (the link was already
        # inserted above); set_state(url, 1) marks it as scraped.
        add_job_link(url)
        set_state(url, 1)
    return list_job_offers
def test_is_duplicate(self):
    """Adding the same link twice must keep exactly one row in the table."""
    create_table_db()
    for _ in range(2):
        add_job_link(self.joblink1)
        set_state(self.joblink1, 1)
    self.assertEqual(count_joblinks(), 1)
    delete_table_db()
def test_is_scraped(self):
    """is_scrarped_job_link returns 1 for scraped links, 0 for unknown ones."""
    create_table_db()
    add_job_link(self.joblink1)
    set_state(self.joblink1, 1)
    add_job_link(self.joblink2)
    set_state(self.joblink2, 1)
    # assertEqual gives a useful failure message ("0 != 1") where the
    # original assertTrue(x == 1) only reports "False is not true".
    self.assertEqual(is_scrarped_job_link(self.joblink1), 1)
    self.assertEqual(is_scrarped_job_link(self.joblink2), 1)
    # joblink3 was never added, so it must not be reported as scraped.
    self.assertEqual(is_scrarped_job_link(self.joblink3), 0)
    delete_table_db()
def test_extract_all_joblinks(self):
    """extract_all_joblinks returns the stored links filtered by state."""
    # Freshly added links (no set_state call) are returned for state -1,
    # in insertion order.
    create_table_db()
    add_job_link(self.joblink1)
    add_job_link(self.joblink2)
    self.assertEqual(
        list(extract_all_joblinks(-1)), [self.joblink1, self.joblink2]
    )
    delete_table_db()
    # With an explicit state argument, only links whose state matches
    # are returned.
    create_table_db()
    add_job_link(self.joblink1)
    set_state(self.joblink1, 1)
    add_job_link(self.joblink2)
    set_state(self.joblink2, 0)
    self.assertEqual(list(extract_all_joblinks(0)), [self.joblink2])
    delete_table_db()
def web_scrape_demo(location, url_2_scrape):
    """Web scrape the location, extract the job offer urls and then store to xlsx.

    location: location name, passed to web_scrape and embedded in the
        output filename.
    url_2_scrape: url passed through to web_scrape.

    Side effect: writes "jobs--<date>__<location>.xlsx" to the working
    directory and updates the links database.
    """
    create_table_db()
    for url in web_scrape(location, url_2_scrape):
        add_job_link(url)
    # Only scrape links that are registered but not yet marked as scraped
    # (state 1). Set difference replaces the original manual loop.
    urls = set(extract_all_joblinks(-1)) - set(extract_all_joblinks(1))
    list_job_offers = extract_job_content(urls)
    for url in urls:
        add_job_link(url)  # no-op for links already stored above
        set_state(url, 1)  # mark as scraped
    save_to_xlsx(
        "jobs--" + str(dt.date.today()) + "__" + location + ".xlsx",
        list_job_offers,
    )
def test_set_get_state(self):
    """get_state must return the value previously stored with set_state."""
    expected_state = 100
    create_table_db()
    add_job_link(self.joblink1)
    set_state(self.joblink1, expected_state)
    self.assertEqual(get_state(self.joblink1), expected_state)
    delete_table_db()
def test_check_if_exist(self):
    """check_if_exist is false before a link is added and true afterwards."""
    create_table_db()
    # Not present yet.
    self.assertFalse(check_if_exist(self.joblink1))
    add_job_link(self.joblink1)
    # Present after insertion.
    self.assertTrue(check_if_exist(self.joblink1))
    delete_table_db()
def test_count_joblinks(self):
    """count_joblinks reports the number of stored links."""
    create_table_db()
    add_job_link(self.joblink1)
    add_job_link(self.joblink2)
    self.assertEqual(count_joblinks(), 2)
    # BUG FIX: the original ended with a bare `delete_table_db` (no
    # parentheses), so the cleanup never ran and the table leaked into
    # subsequent tests.
    delete_table_db()
def test_create_db(self):
    """create_table_db must create the links.db file on disk."""
    db_file = "links.db"
    create_table_db()
    self.assertTrue(os.path.exists(db_file))
    delete_table_db()