def main(url, page_number):
    """Collect the jobs reachable from ``url`` and store them as a CSV file.

    A webdriver is configured and handed, together with ``url`` and
    ``page_number``, to ``get_all_jobs``. The returned data is written without
    index or header row to the path produced by ``sf.format_filename``.

    Any exception raised while collecting or saving is printed instead of
    propagated. The driver is closed and the elapsed run time is printed
    whether or not the run succeeded.
    """
    started_at = time.time()
    # Update the executable path of chromedriver in config_webdriver.py if needed
    driver = cw.configure_webdriver()
    try:
        jobs = get_all_jobs(driver, url, page_number)
        # sf.format_filename fills the '{}' placeholder; per the original
        # author's note a timestamp is used there to keep file names unique.
        output_path = sf.format_filename('../data/{}-blueskypeople-jobs.csv')
        jobs.to_csv(output_path, index=False, header=False)
    except Exception as error:
        print(error)
    finally:
        driver.close()
        elapsed_seconds = round(time.time() - started_at)
        elapsed_minutes = round(elapsed_seconds / 60, 1)
        elapsed_hours = round(elapsed_minutes / 60, 1)
        print('DONE! ')
        print("\n\nRun time = {} seconds; {} minutes; {} hours".format(
            elapsed_seconds, elapsed_minutes, elapsed_hours))
def main():
    """Collect the jobs listed on mbrdental.co.uk and store them as a CSV file.

    Opens the jobs page, lets ``get_all_jobs`` gather the data through the
    driver, prints how many jobs were found and writes them (no index, no
    header row) to the path produced by ``sf.format_filename``.

    Exceptions are printed rather than raised; the driver is always closed and
    the elapsed run time is always reported.
    """
    started_at = time.time()
    driver = cw.configure_webdriver()
    try:
        driver.get('https://mbrdental.co.uk/jobs/')
        scraped = get_all_jobs(driver)
        print('Total jobs:', len(scraped))
        # The '{}' placeholder is filled in by sf.format_filename.
        output_path = sf.format_filename('../data/{}-mbrdental-jobs.csv')
        scraped.to_csv(output_path, index=False, header=False)
    except Exception as error:
        print(error)
    finally:
        driver.close()
        elapsed_seconds = round(time.time() - started_at)
        elapsed_minutes = round(elapsed_seconds / 60, 1)
        elapsed_hours = round(elapsed_minutes / 60, 1)
        print('DONE! ')
        print("\n\nRun time = {} seconds; {} minutes; {} hours".format(
            elapsed_seconds, elapsed_minutes, elapsed_hours))
def main():
    """Collect Diamond Dental Staff job details for the URLs in a CSV file.

    The path of the job-URL file is read from the first command-line argument
    (``sys.argv[1]``). Its first column is passed, together with the driver,
    to ``get_all_jobs``; the returned data is written (no index) to the path
    produced by ``sf.format_filename``.

    Exceptions, including a missing command-line argument, are printed rather
    than raised. The driver is always closed and the run time always reported.
    """
    program_start_time = time.time()
    driver = cw.configure_webdriver()
    try:
        parent_url = 'https://www.diamonddentalstaff.co.uk/careers'
        driver.get(parent_url)
        # Always switch window before requesting a new url using driver
        driver.switch_to.window(driver.window_handles[-1])

        filepath_job_url = sys.argv[1]
        print(filepath_job_url)
        # The job-URL files are saved with header=False, so the first row is
        # data. header=None stops pandas from consuming it as column names,
        # which would silently drop the first URL.
        job_urls = pd.read_csv(filepath_job_url, header=None).iloc[:, 0]
        print('Total jobs:', len(job_urls))
        print('Total unique jobs:', len(set(job_urls)))

        # job_urls is a pandas Series, so test its length explicitly: the
        # truth value of a Series is ambiguous and raises.
        if len(job_urls) != 0:
            data = get_all_jobs(job_urls, driver)
            file_name = '../data/{}-diamond-dental-staff-jobs.csv'
            file_name = sf.format_filename(file_name)
            data.to_csv(file_name, index=False)
        else:
            print('No jobs found')
    except Exception as e:
        print(str(e))
    finally:
        driver.close()
        seconds = round(time.time() - program_start_time)
        minutes = round(seconds / 60, 1)
        hours = round(minutes / 60, 1)
        print('DONE! ')
        print("\n\nRun time = {} seconds; {} minutes; {} hours".format(
            seconds, minutes, hours))
def main(sudo_password):
    """Collect the Diamond Dental Staff job URLs and store them as a CSV file.

    The careers page is opened and the ``src`` of the element named
    ``htmlComp-iframe`` is read; that address is then loaded and handed to
    ``get_all_job_urls``. When at least one URL comes back, the URLs are
    written as a single column (no index, no header row) to the path produced
    by ``sf.format_filename``.

    ``sudo_password`` is accepted but not used by this function.

    Exceptions are printed rather than raised; the driver is always closed and
    the elapsed run time is always reported.
    """
    started_at = time.time()
    driver = cw.configure_webdriver()
    try:
        driver.get('https://www.diamonddentalstaff.co.uk/careers')
        # Gets data source from iframe tag
        iframe = driver.find_element_by_name('htmlComp-iframe')
        iframe_data_src = iframe.get_attribute('src')
        # Always switch window before requesting a new url using driver
        driver.switch_to.window(driver.window_handles[-1])
        driver.get(iframe_data_src)

        job_urls = get_all_job_urls(driver)
        print('Total jobs:', len(job_urls))
        print('Total unique jobs:', len(set(job_urls)))

        if len(job_urls) == 0:
            print('No jobs found')
        else:
            output_path = sf.format_filename(
                '../data/' + '{}-diamond-dental-staff-job-url.csv')
            pd.Series(job_urls).to_csv(output_path, index=False, header=False)
            print('\tJob urls stored for diamond-dental-staff.')
    except Exception as error:
        print(error)
    finally:
        driver.close()
        elapsed_seconds = round(time.time() - started_at)
        elapsed_minutes = round(elapsed_seconds / 60, 1)
        elapsed_hours = round(elapsed_minutes / 60, 1)
        print('DONE! ')
        print("\n\nRun time = {} seconds; {} minutes; {} hours".format(
            elapsed_seconds, elapsed_minutes, elapsed_hours))
def main():
    """Collect the Browns Locum Link jobs and store them as a CSV file.

    Opens the results page, gathers job URLs with ``get_all_job_urls``,
    de-duplicates them and, when any remain, passes them to ``get_all_jobs``.
    The returned data is printed and written (no index, no header row) to the
    path produced by ``sf.format_filename``.

    Exceptions are printed rather than raised; the driver is always closed and
    the elapsed run time is always reported.
    """
    started_at = time.time()
    driver = cw.configure_webdriver()
    try:
        driver.get(
            'https://portal.brownslocumlink.com/Jobs/Results.aspx?JobResults=1')

        found_urls = get_all_job_urls(driver)
        print('Total jobs:', len(found_urls))
        print('Total unique jobs:', len(set(found_urls)))
        job_urls = list(set(found_urls))

        if len(job_urls) == 0:
            print('No jobs found')
        else:
            jobs = get_all_jobs(job_urls, driver)
            print(jobs)
            output_path = sf.format_filename('../data/{}-brownslocum-jobs.csv')
            jobs.to_csv(output_path, index=False, header=False)
    except Exception as error:
        print(error)
    finally:
        driver.close()
        elapsed_seconds = round(time.time() - started_at)
        elapsed_minutes = round(elapsed_seconds / 60, 1)
        elapsed_hours = round(elapsed_minutes / 60, 1)
        print('DONE! ')
        print("\n\nRun time = {} seconds; {} minutes; {} hours".format(
            elapsed_seconds, elapsed_minutes, elapsed_hours))
def main():
    """Collect Dental Elite job details for the URLs listed in a CSV file.

    The path of the job-URL file is read from the first command-line argument
    (``sys.argv[1]``). The URLs in its first column are de-duplicated and
    passed, together with the driver, to ``get_all_jobs``; the returned data
    is written (no index) to the path produced by ``sf.format_filename``.

    Exceptions, including a missing command-line argument, are printed rather
    than raised. The driver is always closed and the run time always reported.
    """
    program_start_time = time.time()
    driver = cw.configure_webdriver()
    try:
        parent_url = 'https://www.dentalelite.co.uk/jobs/'
        driver.get(parent_url)

        filepath_job_url = sys.argv[1]
        print(filepath_job_url)
        # The job-URL files are saved with header=False, so the first row is
        # data. header=None stops pandas from consuming it as column names,
        # which would silently drop the first URL.
        url_column = pd.read_csv(filepath_job_url, header=None).iloc[:, 0]
        job_urls = list(set(url_column))
        print('Total jobs:', len(url_column))
        print('Total unique jobs:', len(job_urls))

        if job_urls:
            data = get_all_jobs(job_urls, driver)
            file_name = '../data/{}-dental-elite-jobs.csv'
            file_name = sf.format_filename(file_name)
            data.to_csv(file_name, index=False)
        else:
            print('No jobs found')
    except Exception as e:
        print(str(e))
    finally:
        driver.close()
        seconds = round(time.time() - program_start_time)
        minutes = round(seconds / 60, 1)
        hours = round(minutes / 60, 1)
        print('DONE! ')
        print("\n\nRun time = {} seconds; {} minutes; {} hours".format(
            seconds, minutes, hours))
def main():
    """Collect Tempdent job details for the URLs listed in a CSV file.

    The path of the job-URL file is read from the first command-line argument
    (``sys.argv[1]``). The URLs in its first column are de-duplicated and
    passed, together with the driver, to ``get_all_jobs``; the returned data
    is written (no index) to the path produced by ``sf.format_filename``.

    Exceptions, including a missing command-line argument, are printed rather
    than raised. The driver is always closed and the run time always reported.
    """
    program_start_time = time.time()
    # Update the executable path of chromedriver in config_webdriver.py if needed
    driver = cw.configure_webdriver()
    try:
        parent_url = "https://www.tempdent.co.uk/jobs"
        driver.get(parent_url)

        filepath_job_url = sys.argv[1]
        print(filepath_job_url)
        # The job-URL files are saved with header=False, so the first row is
        # data. header=None stops pandas from consuming it as column names,
        # which would silently drop the first URL.
        url_column = pd.read_csv(filepath_job_url, header=None).iloc[:, 0]
        # Count and de-duplicate the URLs that were actually read. The
        # earlier code referenced an unassigned name here, which raised and
        # ended the run before any job was fetched.
        job_urls = list(set(url_column))
        print('Total jobs:', len(url_column))
        print('Total unique jobs:', len(job_urls))

        if job_urls:
            data = get_all_jobs(job_urls, driver)
            filename = '../data/{}-tempdent-jobs.csv'
            filename = sf.format_filename(filename)
            data.to_csv(filename, index=False)
        else:
            print('No jobs')
    except Exception as e:
        print(str(e))
    finally:
        driver.close()
        seconds = round(time.time() - program_start_time)
        minutes = round(seconds / 60, 1)
        hours = round(minutes / 60, 1)
        print('DONE! ')
        print("\n\nRun time = {} seconds; {} minutes; {} hours".format(
            seconds, minutes, hours))
def main():
    """Collect the Dental Elite job URLs and store them as a CSV file.

    Opens the jobs page, gathers URLs with ``get_all_job_urls`` and
    de-duplicates them. When any remain they are written as a single column
    (no index, no header row) to the path produced by ``sf.format_filename``.

    Exceptions are printed rather than raised; the driver is always closed and
    the elapsed run time is always reported.
    """
    started_at = time.time()
    driver = cw.configure_webdriver()
    try:
        driver.get('https://www.dentalelite.co.uk/jobs/')

        found_urls = get_all_job_urls(driver)
        print('Total jobs:', len(found_urls))
        print('Total unique jobs:', len(set(found_urls)))
        job_urls = list(set(found_urls))

        if len(job_urls) == 0:
            print('No jobs found')
        else:
            output_path = sf.format_filename(
                '../data/' + '{}-dental-elite-job-url.csv')
            pd.Series(job_urls).to_csv(output_path, index=False, header=False)
            print('\tJob urls stored for dental-elite.')
    except Exception as error:
        print(error)
    finally:
        driver.close()
        elapsed_seconds = round(time.time() - started_at)
        elapsed_minutes = round(elapsed_seconds / 60, 1)
        elapsed_hours = round(elapsed_minutes / 60, 1)
        print('DONE! ')
        print("\n\nRun time = {} seconds; {} minutes; {} hours".format(
            elapsed_seconds, elapsed_minutes, elapsed_hours))
def main():
    """Collect the Tempdent job URLs and store them as a CSV file.

    Opens the jobs page, gathers URLs with ``get_all_job_urls`` and
    de-duplicates them. When any remain they are written as a single column
    (no index, no header row) to the path produced by ``sf.format_filename``.

    Exceptions are printed rather than raised; the driver is always closed and
    the elapsed run time is always reported.
    """
    program_start_time = time.time()
    # Update the executable path of chromedriver in config_webdriver.py if needed
    driver = cw.configure_webdriver()
    try:
        parent_url = "https://www.tempdent.co.uk/jobs"
        driver.get(parent_url)

        # Keep the raw result so the "Total jobs" figure is counted before
        # de-duplication; otherwise total and unique are always equal.
        found_urls = list(get_all_job_urls(driver, parent_url))
        job_urls = list(set(found_urls))
        print('Total jobs:', len(found_urls))
        print('Total unique jobs:', len(job_urls))

        if job_urls:
            file_name = '../data/' + '{}-tempdent-job-url.csv'
            file_name = sf.format_filename(file_name)
            pd.Series(job_urls).to_csv(file_name, index=False, header=False)
            print('\tJob urls stored for tempdent.')
        else:
            print('No jobs')
    except Exception as e:
        print(str(e))
    finally:
        driver.close()
        seconds = round(time.time() - program_start_time)
        minutes = round(seconds / 60, 1)
        hours = round(minutes / 60, 1)
        print('DONE! ')
        print("\n\nRun time = {} seconds; {} minutes; {} hours".format(
            seconds, minutes, hours))
def main():
    """Collect the Medicruit dental jobs and store them as a CSV file.

    Loads the first results page, reads the page count from the ``href`` of
    the last link inside the ``pagination`` element, and passes the driver,
    the page count and the base URL to ``get_all_jobs``. The returned data is
    written (no index, no header row) to the path produced by
    ``sf.format_filename``.

    Exceptions are printed rather than raised; the driver is always closed and
    the elapsed run time is always reported.
    """
    started_at = time.time()
    driver = cw.configure_webdriver()
    try:
        # Base address; the page number is appended to the trailing 'page='.
        url = 'https://www.medicruit.co.uk/dental-jobs/?position-type=&location=&work-type=-1&position-date=-1&page='
        driver.switch_to.window(driver.window_handles[-1])
        driver.get(url + str(1))

        # The last pagination link's href ends in 'page=<n>'; that <n> is
        # taken as the number of pages.
        pagination = driver.find_element_by_class_name('pagination')
        last_link = pagination.find_elements_by_tag_name('a')[-1]
        last_href = last_link.get_attribute('href')
        total_pages = int(last_href.split('page=')[1])
        print('Total pages:', str(total_pages))

        jobs = get_all_jobs(driver, total_pages, url)
        output_path = sf.format_filename('../data/{}-medcruit-jobs.csv')
        jobs.to_csv(output_path, index=False, header=False)
    except Exception as error:
        print(error)
    finally:
        driver.close()
        elapsed_seconds = round(time.time() - started_at)
        elapsed_minutes = round(elapsed_seconds / 60, 1)
        elapsed_hours = round(elapsed_minutes / 60, 1)
        print('DONE! ')
        print("\n\nRun time = {} seconds; {} minutes; {} hours".format(
            elapsed_seconds, elapsed_minutes, elapsed_hours))
# Collect Indeed job details for the URLs in the file named by sys_argv[1]
# and store them as a CSV. sys_argv is expected to be defined by the
# surrounding code.
#
# Bind these before the try block so the cleanup in `finally` never meets an
# unbound name when webdriver configuration itself fails.
program_start_time = time.time()
driver = None
try:
    driver = cw.configure_webdriver()
    driver.switch_to.window(driver.window_handles[-1])

    filepath_job_url = sys_argv[1]
    print(filepath_job_url)
    # The search term is the part of the file name after 'url-', without the
    # extension, e.g. '...-indeed-job-url-dental-nurse.csv' -> 'dental-nurse'.
    search_term = filepath_job_url.split('url')[-1].split('.')[0][1:]
    # The job-URL files are saved with header=False, so the first row is
    # data. header=None stops pandas from consuming it as column names,
    # which would silently drop the first URL.
    job_urls = pd.read_csv(filepath_job_url, header=None).iloc[:, 0]

    # job_urls is a pandas Series, so test its length explicitly: the truth
    # value of a Series is ambiguous and raises.
    if len(job_urls) != 0:
        print('Total jobs for {}: {}'.format(search_term, len(job_urls)))
        data = get_all_jobs(job_urls, driver, search_term)
        file_name = '../data/' + '{}-indeed-jobs-' + search_term + '.csv'
        file_name = sf.format_filename(file_name)
        data.to_csv(file_name, index=False)
        print('Data stored for: {}. shape = {}'.format(search_term,
                                                       data.shape))
    else:
        # Format the message; passing search_term as a second print argument
        # left a literal '{}' in the output.
        print('No jobs found for file with urls {}'.format(search_term))
except Exception as e:
    print(str(e))
finally:
    if driver is not None:
        driver.close()
    seconds = round(time.time() - program_start_time)
    minutes = round(seconds/60, 1)
    hours = round(minutes/60, 1)
    print('DONE! ')
def main(job_title_list):
    """Collect Indeed job URLs for each search term and store them as CSVs.

    For every entry in ``job_title_list`` the matching indeed.co.uk search
    page (location "United Kingdom") is opened and ``get_all_job_urls``
    gathers the job URLs. The de-duplicated URLs are written as a single
    column (no index, no header row) to a per-term file whose path comes from
    ``sf.format_filename``.

    Args:
        job_title_list: Search terms, inserted verbatim into the query string.
            Any '+' in a term is replaced by '-' for the file name, so terms
            are presumably '+'-separated words -- confirm against the caller.

    An exception raised for any term is printed and ends the loop; remaining
    terms are not processed. The driver is always closed and the total run
    time is always reported.
    """
    program_start_time = time.time()
    # Update the executable path of chromedriver in config_webdriver.py if needed
    driver = cw.configure_webdriver()
    try:
        for job_title in job_title_list:
            parent_url = ('https://www.indeed.co.uk/jobs?q=' + job_title +
                          '&l=United+Kingdom')
            print('\n\nSearch term:', job_title)
            start = time.time()
            driver.switch_to.window(driver.window_handles[-1])
            driver.get(parent_url)

            # Keep the raw result so the "Total jobs" figure is counted
            # before de-duplication; otherwise total and unique are always
            # equal.
            found_urls = list(get_all_job_urls(driver))
            job_url_list = list(set(found_urls))
            print('\tTotal jobs for {}: {}'.format(job_title,
                                                   len(found_urls)))
            print('\tTotal unique jobs for {}: {}'.format(
                job_title, len(job_url_list)))

            if job_url_list:
                file_name = ('../data/' + '{}-indeed-job-url-' +
                             job_title.replace('+', '-') + '.csv')
                file_name = sf.format_filename(file_name)
                pd.Series(job_url_list).to_csv(file_name,
                                               index=False,
                                               header=False)
                print('\tJob urls stored for:', job_title)
            else:
                print('No jobs found for search term', job_title)

            seconds = round(time.time() - start)
            minutes = round(seconds / 60, 1)
            hours = round(minutes / 60, 1)
            print(
                "\tRun time for {} = {} seconds; {} minutes; {} hours.\n\n\n".
                format(job_title, seconds, minutes, hours))
    except Exception as e:
        print(str(e))
    finally:
        driver.close()
        seconds = round(time.time() - program_start_time)
        minutes = round(seconds / 60, 1)
        hours = round(minutes / 60, 1)
        print('DONE! ')
        print(
            "\n\nRun time for all job searches on indeed = {} seconds; {} minutes; {} hours"
            .format(seconds, minutes, hours))