Example #1
def extract_job_offer_from_feed(feed_parsed):

    # Make sure the link store exists, then register every link from the feed.
    create_table_db()
    for entry in feed_parsed.entries:
        add_job_link(entry.link)

    # State 1 marks links that were already scraped; new links default to -1.
    urls_scraped = extract_all_joblinks(1)
    urls_not_scraped = extract_all_joblinks(-1)

    # Keep only the links that have not been scraped yet.
    urls = set()
    for url in urls_not_scraped:
        if url not in urls_scraped:
            urls.add(url)
    print("Number of entries to be parsed:", len(urls))

    # Scrape each remaining link and collect the job offers.
    list_job_offers = set()
    for url in urls:
        current_job = extract_job_content_feed_url(url)
        list_job_offers.add(current_job)

    # Mark every processed link as scraped.
    for url in urls:
        add_job_link(url)
        set_state(url, 1)

    return list_job_offers
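
The helper functions used throughout these examples (create_table_db, add_job_link, set_state, get_state, extract_all_joblinks, count_joblinks, check_if_exist, is_scrarped_job_link, delete_table_db) are never shown. Below is a minimal sketch of what they could look like, assuming a sqlite3-backed store; the table and column names are my assumptions, while the "links.db" file name and the default state of -1 are inferred from the tests (examples #4 and #9).

import sqlite3

DB_FILE = "links.db"  # file name taken from test_create_db (example #9)

def create_table_db():
    # New links start in state -1 ("not scraped"), as example #4 implies.
    with sqlite3.connect(DB_FILE) as con:
        con.execute("CREATE TABLE IF NOT EXISTS joblinks "
                    "(url TEXT PRIMARY KEY, state INTEGER DEFAULT -1)")

def delete_table_db():
    with sqlite3.connect(DB_FILE) as con:
        con.execute("DROP TABLE IF EXISTS joblinks")

def add_job_link(url):
    # INSERT OR IGNORE keeps duplicates out (see test_is_duplicate).
    with sqlite3.connect(DB_FILE) as con:
        con.execute("INSERT OR IGNORE INTO joblinks (url) VALUES (?)", (url,))

def set_state(url, state):
    with sqlite3.connect(DB_FILE) as con:
        con.execute("UPDATE joblinks SET state = ? WHERE url = ?", (state, url))

def get_state(url):
    with sqlite3.connect(DB_FILE) as con:
        row = con.execute("SELECT state FROM joblinks WHERE url = ?",
                          (url,)).fetchone()
    return row[0] if row else None

def extract_all_joblinks(state):
    # ORDER BY rowid keeps insertion order, as test_extract_all_joblinks expects.
    with sqlite3.connect(DB_FILE) as con:
        rows = con.execute("SELECT url FROM joblinks WHERE state = ? "
                           "ORDER BY rowid", (state,)).fetchall()
    return [r[0] for r in rows]

def count_joblinks():
    with sqlite3.connect(DB_FILE) as con:
        return con.execute("SELECT COUNT(*) FROM joblinks").fetchone()[0]

def check_if_exist(url):
    return get_state(url) is not None

def is_scrarped_job_link(url):
    # Returns 1 when the link's state is 1 ("scraped"), else 0 (see example #3).
    return 1 if get_state(url) == 1 else 0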
Example #2
	def test_is_duplicate(self):
		create_table_db()
		add_job_link(self.joblink1)
		set_state(self.joblink1, 1)
		add_job_link(self.joblink1)
		set_state(self.joblink1, 1)
		self.assertEqual(count_joblinks(), 1)
		delete_table_db()
Example #3
	def test_is_scraped(self):
		create_table_db()
		add_job_link(self.joblink1)
		set_state(self.joblink1, 1)
		add_job_link(self.joblink2)
		set_state(self.joblink2, 1)
		self.assertEqual(is_scrarped_job_link(self.joblink1), 1)
		self.assertEqual(is_scrarped_job_link(self.joblink2), 1)
		self.assertEqual(is_scrarped_job_link(self.joblink3), 0)
		delete_table_db()
Example #4
	def test_extract_all_joblinks(self):
		create_table_db()
		add_job_link(self.joblink1)
		add_job_link(self.joblink2)
		urls = extract_all_joblinks(-1)
		self.assertEqual(len(urls), 2)
		self.assertEqual(urls[0], self.joblink1)
		self.assertEqual(urls[1], self.joblink2)
		delete_table_db()
		create_table_db()
		add_job_link(self.joblink1)
		set_state(self.joblink1, 1)
		add_job_link(self.joblink2)
		set_state(self.joblink2, 0)
		urls = extract_all_joblinks(0)
		self.assertEqual(len(urls), 1)
		self.assertEqual(urls[0], self.joblink2)
		delete_table_db()
Example #5
import datetime as dt  # needed for the dated file name below

def web_scrape_demo(location, url_2_scrape):
    """Scrape the location, extract the job offer URLs, and store them to xlsx."""

    # Make sure the link store exists, then register every scraped link.
    create_table_db()
    job_urls = web_scrape(location, url_2_scrape)
    for url in job_urls:
        add_job_link(url)

    # State 1 marks links that were already scraped; new links default to -1.
    urls_scraped = extract_all_joblinks(1)
    urls_not_scraped = extract_all_joblinks(-1)

    # Keep only the links that have not been scraped yet.
    urls = set()
    for url in urls_not_scraped:
        if url not in urls_scraped:
            urls.add(url)

    # Scrape the remaining links, then mark them as scraped.
    list_job_offers = extract_job_content(urls)
    for url in urls:
        add_job_link(url)
        set_state(url, 1)

    save_to_xlsx("jobs--" + str(dt.date.today()) + "__" + location + ".xlsx",
                 list_job_offers)
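
save_to_xlsx is also not shown in these examples. A minimal sketch using openpyxl follows; the assumption that each job offer is an iterable of cell values (one spreadsheet row per offer) is mine, not the project's.

from openpyxl import Workbook

def save_to_xlsx(filename, job_offers):
    # Hypothetical sketch: write one row per job offer and save the workbook.
    wb = Workbook()
    ws = wb.active
    for offer in job_offers:
        ws.append(list(offer))
    wb.save(filename)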
Example #6
	def test_set_get_state(self):
		create_table_db()
		add_job_link(self.joblink1)
		set_state(self.joblink1, 100)
		self.assertEqual(get_state(self.joblink1), 100)
		delete_table_db()
Example #7
	def test_check_if_exist(self):
		create_table_db()
		self.assertFalse(check_if_exist(self.joblink1))
		add_job_link(self.joblink1)
		self.assertTrue(check_if_exist(self.joblink1))
		delete_table_db()
Example #8
	def test_count_joblinks(self):
		create_table_db()
		add_job_link(self.joblink1)
		add_job_link(self.joblink2)
		self.assertEqual(count_joblinks(), 2)
		delete_table_db()
Example #9
	def test_create_db(self):
		create_table_db()
		self.assertTrue(os.path.exists("links.db"))
		delete_table_db()
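
The test methods in examples #2-#4 and #6-#9 all reference self.joblink1 through self.joblink3 without defining them. A minimal unittest harness they could plug into might look like the following; the class name and example URLs are placeholders, not the project's actual fixtures.

import os
import unittest

class TestJobLinkDB(unittest.TestCase):
	def setUp(self):
		# Placeholder fixtures; the real suite defines its own links.
		self.joblink1 = "https://example.com/job/1"
		self.joblink2 = "https://example.com/job/2"
		self.joblink3 = "https://example.com/job/3"

	# ... the test_* methods from the examples above go here ...

if __name__ == "__main__":
	unittest.main()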