def test_run():
    """Run the LinkedIn scraper end-to-end against a mix of query configurations."""
    # Quiet the noisy third-party loggers so scraper output stays readable.
    logging.getLogger('urllib3').setLevel(logging.WARN)
    logging.getLogger('selenium').setLevel(logging.WARN)

    scraper = LinkedinScraper(
        chrome_executable_path=None,
        chrome_options=None,
        headless=True,
        max_workers=1,
        slow_mo=1,
    )

    # Wire up every lifecycle callback in one pass.
    for event, handler in (
        (Events.DATA, on_data),
        (Events.ERROR, on_error),
        (Events.INVALID_SESSION, on_invalid_session),
        (Events.END, on_end),
    ):
        scraper.on(event, handler)

    search_queries = [
        # Bare query: exercises the global options below.
        Query(),
        Query(
            query='c#',
            options=QueryOptions(
                locations=['Finland'],
                optimize=False,
                apply_link=True,
                limit=33,
                filters=QueryFilters(
                    time=TimeFilters.WEEK,
                    experience=ExperienceLevelFilters.MID_SENIOR,
                ),
            ),
        ),
        Query(
            query='Engineer',
            options=QueryOptions(
                locations=['United States'],
                optimize=False,
                limit=27,
                filters=QueryFilters(
                    company_jobs_url='https://www.linkedin.com/jobs/search/?f_C=1441%2C17876832%2C791962%2C2374003%2C18950635%2C16140%2C10440912&geoId=92000000',
                    time=TimeFilters.MONTH,
                    type=[TypeFilters.FULL_TIME, TypeFilters.INTERNSHIP, TypeFilters.CONTRACT],
                ),
            ),
        ),
    ]

    scraper.run(
        queries=search_queries,
        # Global options: defaults applied to queries that do not override them.
        options=QueryOptions(
            locations=['United Kingdom'],
            limit=10,
            optimize=True,
        ),
    )
# Registers DATA/ERROR/END handlers, then builds two high-volume (limit=940) queries
# ('Data Engineer' and 'Data Scientist') over the same Asian locations, both filtered
# to relevant results from the last day. Note the trailing space in 'vietnam ' — it is
# part of the location string as written; confirm it is intentional.
# NOTE(review): this fragment ends mid-list — the closing ']' and the scraper.run(...)
# call are not visible in this chunk; confirm they exist downstream.
# Add event listeners scraper.on(Events.DATA, on_data) scraper.on(Events.ERROR, on_error) scraper.on(Events.END, on_end) queries = [ Query( query='Data Engineer', options=QueryOptions( locations=['indonesia','vietnam ','china','singapore','korea selatan','japan','Hong Kong SAR','taiwan'], optimize=False, limit=940, filters=QueryFilters( relevance=RelevanceFilters.RELEVANT, time=TimeFilters.DAY, ) ) ), Query( query='Data Scientist', options=QueryOptions( locations=['indonesia','vietnam ','china','singapore','korea selatan','japan','Hong Kong SAR','taiwan'], optimize=False, limit=940, filters=QueryFilters( relevance=RelevanceFilters.RELEVANT, time=TimeFilters.DAY, ) )
# Runs two queries (an anonymous optimized query with limit=0, and a 'search'-keyword
# query limited to Toronto full-time roles from the last week), then labels every
# collected row with industry='Finance' / jobType='Full Time' before assembling a
# pandas DataFrame. 'search', 'title', 'company', 'industry', 'occupation' and
# 'jobType' are defined earlier in the file, outside this chunk.
# NOTE(review): fragment is truncated at both ends — the scraper setup precedes it and
# the DataFrame dict literal continues past this chunk; confirm both halves exist.
scraper.on(Events.END, on_end) queries = [ Query(options=QueryOptions( optimize= True, # Blocks requests for resources like images and stylesheet limit=0 # Limit the number of jobs to scrape )), Query(query=search, options=QueryOptions( locations=['Toronto, Ontario, Canada'], optimize=True, limit=400, filters=QueryFilters( relevance=RelevanceFilters.RELEVANT, time=TimeFilters.WEEK, type=[TypeFilters.FULL_TIME], ))), ] scraper.run(queries) for i in range(len(title)): industry.append('Finance') occupation.append(search) jobType.append('Full Time') df = pandas.DataFrame( data={ "Title": title, "Company": company,
def scrapeLinkedinJobs(industries):
    """Scrape LinkedIn job postings for each industry keyword.

    Args:
        industries: Search keywords; one scraper worker thread is spawned per keyword.

    Returns:
        A list of ``{"queryText": <keyword>, "jobList": [<job dict>, ...]}`` entries,
        one per query that produced at least one result.
    """
    scraper = LinkedinScraper(
        # Custom Chrome executable path (e.g. /foo/bar/bin/chromedriver).
        chrome_executable_path='C:/Users/iyeng/Desktop/NTU/NTU Sem 4/CZ2006/JobsUpply/JobsUpply/chromedriver.exe',
        chrome_options=None,  # Custom Chrome options here
        headless=True,  # Overrides headless mode only if chrome_options is None
        # One Chrome driver per thread, so every industry query runs concurrently.
        max_workers=len(industries),
        slow_mo=2,  # Slow down the scraper to avoid 'Too many requests (429)' errors
    )

    # Build one Singapore-scoped query per industry keyword (comprehension replaces
    # the original range(len(...)) loop).
    queries = [
        Query(
            query=industry,
            options=QueryOptions(
                locations=['Singapore'],
                optimize=True,
                limit=6,
                filters=QueryFilters(
                    company_jobs_url=None,  # No company filter
                    relevance=RelevanceFilters.RECENT,
                    time=TimeFilters.MONTH,
                    type=[TypeFilters.FULL_TIME],
                    experience=None,
                ),
            ),
        )
        for industry in industries
    ]

    # Results accumulated by the DATA callback, keyed by originating query text.
    jobs_by_query = {}

    def on_data(data: EventData):
        # Flatten the event payload into a plain dict.
        job = {
            "title": data.title,
            "company": data.company,
            "place": data.place,
            "description": data.description,
            "linkedinUrl": data.link,
            "descriptionHTML": data.description_html,
            "employmentType": data.employment_type,
            "applyUrl": data.apply_link,
            "date": data.date,
            "seniority": data.seniority_level,
            "jobFunction": data.job_function,
            "industries": data.industries,
            # extract_skills_from_document returns a JSON string — parse it here.
            "skills": json.loads(extract_skills_from_document(data.description)),
        }
        # setdefault replaces the original if/else whose two branches both appended.
        jobs_by_query.setdefault(data.query, []).append(job)

    def on_error(error):
        print('[ON_ERROR]', error)

    def on_end():
        print('[ON_END]')

    scraper.on(Events.DATA, on_data)
    scraper.on(Events.ERROR, on_error)
    scraper.on(Events.END, on_end)
    scraper.run(queries)  # Blocks until all queries finish.

    return [{"queryText": q, "jobList": jobs} for q, jobs in jobs_by_query.items()]
# Register lifecycle callbacks on the scraper.
scraper.on(Events.DATA, on_data)
scraper.on(Events.ERROR, on_error)
scraper.on(Events.END, on_end)

# First query is anonymous (global defaults, resource blocking on); the second
# targets 'Engineer' roles in the US, restricted to specific companies.
anonymous_query = Query(
    options=QueryOptions(
        optimize=True,  # Blocks requests for resources like images and stylesheet
        limit=27,       # Limit the number of jobs to scrape
    )
)
engineer_query = Query(
    query='Engineer',
    options=QueryOptions(
        locations=['United States'],
        optimize=False,
        limit=5,
        filters=QueryFilters(
            # Filter by companies
            company_jobs_url='https://www.linkedin.com/jobs/search/?f_C=1441%2C17876832%2C791962%2C2374003%2C18950635%2C16140%2C10440912&geoId=92000000',
            relevance=RelevanceFilters.RECENT,
            time=TimeFilters.MONTH,
            type=[TypeFilters.FULL_TIME, TypeFilters.INTERNSHIP],
            experience=None,
        ),
    ),
)
queries = [anonymous_query, engineer_query]

scraper.run(queries)
# Registers handlers, runs a single 'Data' query scoped to Belgium with a company
# filter and entry/mid-senior experience levels, then runs the SAME queries a second
# time to capture a return value for CSV export. The author's own comment says the
# export path is currently not working; the double scraper.run(...) also means every
# job is scraped twice — confirm whether the first run should be removed.
# NOTE(review): fragment ends mid 'with open(...)' CSV block — the writer setup and
# row-writing code are not visible in this chunk.
# Add event listeners scraper.on(Events.DATA, on_data) scraper.on(Events.ERROR, on_error) scraper.on(Events.END, on_end) queries = [ Query( query='Data', options=QueryOptions( locations=['Belgium'], optimize=True, limit=10, filters=QueryFilters( company_jobs_url='https://www.linkedin.com/jobs/search/?f_C=1508%2C6754%2C3880216%2C631981%2C166278%2C963211%2C3625182%2C256009%2C157326%2C282760%2C3627928%2C1519%2C281207%2C18735883%2C10070%2C98774%2C15245937%2C3683364%2C251838%2C2642837&geoId=92000000', # Filter by companies relevance=RelevanceFilters.RECENT, time=TimeFilters.MONTH, type=[TypeFilters.FULL_TIME, TypeFilters.TEMPORARY], experience=[ExperienceLevelFilters.ENTRY_LEVEL, ExperienceLevelFilters.MID_SENIOR], ) ) ), ] scraper.run(queries) ## Currently not working (ie no data export) results = scraper.run(queries) import csv with open("jobs.csv", "a") as csvfile: fieldnames = ['Title', 'Date', 'Link','ID']
# NOTE(review): this fragment begins mid LinkedinScraper(...) constructor — the
# leading '1,' is the tail of an argument list (presumably max_workers=1) whose
# opening call is outside this chunk; confirm against the preceding lines.
# It then registers DATA/ERROR/END handlers and runs two queries: an anonymous
# optimized query with limit=0, and a recent full-time/internship 'Engineer'
# search limited to 5 jobs in Toronto.
1, # How many threads will be spawn to run queries concurrently (one Chrome driver for each thread) slow_mo= 0.4, # Slow down the scraper to avoid 'Too many requests (429)' errors ) # Add event listeners scraper.on(Events.DATA, on_data) scraper.on(Events.ERROR, on_error) scraper.on(Events.END, on_end) queries = [ Query(options=QueryOptions( optimize= True, # Blocks requests for resources like images and stylesheet limit=0 # Limit the number of jobs to scrape )), Query(query='Engineer', options=QueryOptions( locations=['Toronto, Ontario, Canada'], optimize=False, limit=5, filters=QueryFilters( relevance=RelevanceFilters.RECENT, time=TimeFilters.MONTH, type=[TypeFilters.FULL_TIME, TypeFilters.INTERNSHIP], experience=None, ))), ] scraper.run(queries)
# Registers handlers, then builds junior-level (internship/associate/entry) queries
# for 'Cloud Engineer' and 'Cloud architect' across Canada and the US with a shared
# limit of 9000. 'United-States' (hyphenated) looks unusual for a LinkedIn location
# string — confirm the scraper resolves it; compare the plain 'United States' used
# elsewhere in this file.
# NOTE(review): fragment ends mid-list (trailing comma after the second Query) —
# the closing ']' and scraper.run(...) are not visible in this chunk.
# Add event listeners scraper.on(Events.DATA, on_data) scraper.on(Events.ERROR, on_error) scraper.on(Events.END, on_end) limit = 9000 queries = [ Query(query='Cloud Engineer', options=QueryOptions(locations=['Canada', "United-States"], optimize=False, limit=limit, filters=QueryFilters( relevance=RelevanceFilters.RECENT, time=TimeFilters.MONTH, experience=[ ExperienceLevelFilters.INTERNSHIP, ExperienceLevelFilters.ASSOCIATE, ExperienceLevelFilters.ENTRY_LEVEL ]))), Query(query='Cloud architect', options=QueryOptions(locations=['Canada', 'United-States'], optimize=False, limit=limit, filters=QueryFilters( relevance=RelevanceFilters.RECENT, time=TimeFilters.MONTH, experience=[ ExperienceLevelFilters.INTERNSHIP, ExperienceLevelFilters.ASSOCIATE, ExperienceLevelFilters.ENTRY_LEVEL ]))),
# Registers handlers, then builds two 'human resources' entry-level queries over the
# last day (limit=1000 each): one narrowed to Sharnbrook, one covering the whole UK —
# the narrow query's results are presumably a subset of the broad one; confirm the
# duplication is intentional.
# NOTE(review): fragment is truncated mid QueryFilters(...) of the second query —
# the closing parentheses, list ']' and scraper.run(...) are not visible here.
# Add event listeners scraper.on(Events.DATA, on_data) scraper.on(Events.ERROR, on_error) scraper.on(Events.END, on_end) queries = [ Query( query='human resources', options=QueryOptions( locations=['Sharnbrook, England, United Kingdom'], optimize=False, limit=1000, filters=QueryFilters( relevance=RelevanceFilters.RECENT, time=TimeFilters.DAY, type=[TypeFilters.FULL_TIME, TypeFilters.CONTRACT, TypeFilters.TEMPORARY], experience=ExperienceLevelFilters.ENTRY_LEVEL, ) ) ), Query( query='human resources', options=QueryOptions( locations=['United Kingdom'], optimize=False, limit=1000, filters=QueryFilters( relevance=RelevanceFilters.RECENT, time=TimeFilters.DAY, type=[TypeFilters.FULL_TIME, TypeFilters.CONTRACT, TypeFilters.TEMPORARY], experience=ExperienceLevelFilters.ENTRY_LEVEL,