# Imports required by the snippets below (py-linkedin-jobs-scraper)
import logging

from linkedin_jobs_scraper import LinkedinScraper
from linkedin_jobs_scraper.events import Events, EventData
from linkedin_jobs_scraper.query import Query, QueryOptions, QueryFilters
from linkedin_jobs_scraper.filters import RelevanceFilters, TimeFilters, TypeFilters, ExperienceLevelFilters


def test_run():
    # Change other logger levels
    logging.getLogger('urllib3').setLevel(logging.WARN)
    logging.getLogger('selenium').setLevel(logging.WARN)

    scraper = LinkedinScraper(
        chrome_executable_path=None,
        chrome_options=None,
        headless=True,
        max_workers=1,
        slow_mo=1,
    )

    scraper.on(Events.DATA, on_data)
    scraper.on(Events.ERROR, on_error)
    scraper.on(Events.INVALID_SESSION, on_invalid_session)
    scraper.on(Events.END, on_end)

    queries = [
        Query(),
        Query(
            query='c#',
            options=QueryOptions(
                locations=['Finland'],
                optimize=False,
                apply_link=True,
                limit=33,
                filters=QueryFilters(
                    time=TimeFilters.WEEK,
                    experience=ExperienceLevelFilters.MID_SENIOR,
                )
            )
        ),
        Query(
            query='Engineer',
            options=QueryOptions(
                locations=['United States'],
                optimize=False,
                limit=27,
                filters=QueryFilters(
                    company_jobs_url='https://www.linkedin.com/jobs/search/?f_C=1441%2C17876832%2C791962%2C2374003%2C18950635%2C16140%2C10440912&geoId=92000000',
                    time=TimeFilters.MONTH,
                    type=[TypeFilters.FULL_TIME, TypeFilters.INTERNSHIP, TypeFilters.CONTRACT]
                )
            )
        ),
    ]

    scraper.run(
        queries=queries,
        # Global options, applied to queries that do not override them
        options=QueryOptions(
            locations=['United Kingdom'],
            limit=10,
            optimize=True,
        )
    )
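
# test_run() registers on_data, on_error, on_invalid_session and on_end, which are
# not defined in this snippet. A minimal sketch of such handlers (an assumption,
# modelled on the print-only callbacks in the library's README, not code from the
# original; the no-argument signature for on_invalid_session is assumed):
def on_data(data: EventData):
    # EventData carries the scraped fields for a single job posting
    print('[ON_DATA]', data.title, data.company, data.date, data.link)

def on_error(error):
    print('[ON_ERROR]', error)

def on_invalid_session():
    # Fired when the scraper's LinkedIn session is no longer valid
    print('[ON_INVALID_SESSION]')

def on_end():
    print('[ON_END]')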
def scrape(query, numresults):
    scraper = LinkedinScraper(
        chrome_executable_path=None,  # Custom Chrome executable path (e.g. /foo/bar/bin/chromedriver)
        chrome_options=None,  # Custom Chrome options here
        headless=False,  # Overrides headless mode only if chrome_options is None
        max_workers=1,  # How many threads will be spawned to run queries concurrently (one Chrome driver for each thread)
        slow_mo=1,  # Slow down the scraper to avoid 'Too many requests (429)' errors
    )

    # Add event listeners
    scraper.on(Events.DATA, on_data)
    scraper.on(Events.ERROR, on_error)
    scraper.on(Events.END, on_end)

    queries = [
        Query(
            query=query,
            options=QueryOptions(
                locations=['Montreal'],
                optimize=True,  # Blocks requests for resources like images and stylesheets
                limit=numresults  # Limit the number of jobs to scrape
            )
        ),
        # Query(
        #     query='database',
        #     options=QueryOptions(
        #         locations=['United States'],
        #         optimize=False,
        #         limit=5,
        #         filters=QueryFilters(
        #             # company_jobs_url='https://www.linkedin.com/jobs/search/?geoId=101174742&keywords=amazon&location=Canada',  # Filter by companies
        #             relevance=RelevanceFilters.RECENT,
        #             time=TimeFilters.MONTH,
        #             type=[TypeFilters.FULL_TIME, TypeFilters.INTERNSHIP],
        #             experience=None,
        #         )
        #     )
        # ),
    ]

    scraper.run(queries)
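
# scrape() above registers on_data, on_error and on_end but never returns the
# scraped jobs. One possible sketch (an assumption, not part of the original):
# collect the rows in a module-level list that the caller reads after scrape()
# returns.
scraped_jobs = []  # hypothetical accumulator filled by on_data

def on_data(data: EventData):
    scraped_jobs.append({
        'title': data.title,
        'company': data.company,
        'date': data.date,
        'link': data.link,
    })

def on_error(error):
    print('[ON_ERROR]', error)

def on_end():
    print('[ON_END]')

# Example usage:
# scrape('python developer', 20)
# print(f'{len(scraped_jobs)} jobs collected')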
queries = [
    Query(
        query='Business Intelligence',
        options=QueryOptions(
            locations=['indonesia', 'vietnam', 'china', 'singapore', 'south korea', 'japan', 'Hong Kong SAR', 'taiwan'],
            optimize=False,
            limit=940,
            filters=QueryFilters(
                relevance=RelevanceFilters.RELEVANT,
                time=TimeFilters.DAY,
            )
        )
    ),
    Query(
        query='Business Intelligence Analyst',
        options=QueryOptions(
            locations=['indonesia', 'vietnam', 'china', 'singapore', 'south korea', 'japan', 'Hong Kong SAR', 'taiwan'],
            optimize=False,
            limit=940,
            filters=QueryFilters(
                relevance=RelevanceFilters.RELEVANT,
                time=TimeFilters.DAY,
            )
        )
    ),
]

scraper.run(queries)

# Use a raw string so the backslashes in the Windows path are not treated as escape sequences
df.to_json(r'D:\BlankSpace.io\Code BlankSpace.io\LinkedIn.json', orient='records', lines=True)
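
# The export above writes a DataFrame called df that is never built in this
# snippet. A minimal sketch (an assumption, using pandas and an on_data handler
# that are not shown in the original) of how df could be assembled from the
# scraped events before the to_json call:
import pandas as pd

rows = []  # hypothetical accumulator, registered via scraper.on(Events.DATA, on_data)

def on_data(data: EventData):
    rows.append({
        'title': data.title,
        'company': data.company,
        'place': data.place,
        'date': data.date,
        'link': data.link,
    })

# After scraper.run(queries) has finished:
df = pd.DataFrame(rows)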
import json


def scrapeLinkedinJobs(industries):
    scraper = LinkedinScraper(
        # chrome_executable_path='D:/chromedriver.exe',  # Custom Chrome executable path (e.g. /foo/bar/bin/chromedriver)
        chrome_executable_path='C:/Users/iyeng/Desktop/NTU/NTU Sem 4/CZ2006/JobsUpply/JobsUpply/chromedriver.exe',
        chrome_options=None,  # Custom Chrome options here
        headless=True,  # Overrides headless mode only if chrome_options is None
        max_workers=len(industries),  # How many threads will be spawned to run queries concurrently (one Chrome driver for each thread)
        slow_mo=2,  # Slow down the scraper to avoid 'Too many requests (429)' errors
    )

    queries = []
    for industry in industries:
        paramQ = Query(
            query=industry,
            options=QueryOptions(
                locations=['Singapore'],
                optimize=True,
                limit=6,
                filters=QueryFilters(
                    company_jobs_url=None,  # Filter by companies
                    relevance=RelevanceFilters.RECENT,
                    time=TimeFilters.MONTH,
                    type=[TypeFilters.FULL_TIME],
                    experience=None,
                )))
        queries.append(paramQ)

    JobList = {}

    def on_data(data: EventData):
        jobData = {}
        jobData["title"] = data.title
        jobData["company"] = data.company
        jobData["place"] = data.place
        jobData["description"] = data.description
        jobData["linkedinUrl"] = data.link
        jobData["descriptionHTML"] = data.description_html
        jobData["employmentType"] = data.employment_type
        jobData["applyUrl"] = data.apply_link
        jobData["date"] = data.date
        jobData["seniority"] = data.seniority_level
        jobData["jobFunction"] = data.job_function
        jobData["industries"] = data.industries
        # extract_skills_from_document() is an external helper (not defined in this
        # snippet) that is expected to return a JSON string of skills
        jobData["skills"] = json.loads(extract_skills_from_document(data.description))

        if data.query not in JobList:
            JobList[data.query] = []
        JobList[data.query].append(jobData)

    def on_error(error):
        print('[ON_ERROR]', error)

    def on_end():
        print('[ON_END]')

    scraper.on(Events.DATA, on_data)
    scraper.on(Events.ERROR, on_error)
    scraper.on(Events.END, on_end)

    scraper.run(queries)

    JobList = [{"queryText": q, "jobList": JobList[q]} for q in JobList]
    return JobList
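
# scrapeLinkedinJobs() depends on extract_skills_from_document(), which is not part
# of this snippet. A hypothetical, keyword-matching stand-in (an assumption, purely
# for illustration; the real helper presumably does something more sophisticated):
def extract_skills_from_document(description: str) -> str:
    known_skills = ['python', 'sql', 'java', 'excel', 'tableau', 'aws']
    found = [skill for skill in known_skills if skill in description.lower()]
    return json.dumps(found)  # JSON string, so the caller's json.loads() works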
queries = [
    Query(
        query='Data',
        options=QueryOptions(
            locations=['Belgium'],
            optimize=True,
            limit=10,
            filters=QueryFilters(
                company_jobs_url='https://www.linkedin.com/jobs/search/?f_C=1508%2C6754%2C3880216%2C631981%2C166278%2C963211%2C3625182%2C256009%2C157326%2C282760%2C3627928%2C1519%2C281207%2C18735883%2C10070%2C98774%2C15245937%2C3683364%2C251838%2C2642837&geoId=92000000',  # Filter by companies
                relevance=RelevanceFilters.RECENT,
                time=TimeFilters.MONTH,
                type=[TypeFilters.FULL_TIME, TypeFilters.TEMPORARY],
                experience=[ExperienceLevelFilters.ENTRY_LEVEL, ExperienceLevelFilters.MID_SENIOR],
            )
        )
    ),
]

scraper.run(queries)

## Currently not working (i.e. no data export): scraper.run() does not return the
## scraped jobs (the data is only delivered through the DATA event), so there is
## nothing in `results` to write. See the sketch after this block for a fix.
results = scraper.run(queries)

import csv

with open("jobs.csv", "a") as csvfile:
    fieldnames = ['Title', 'Date', 'Link', 'ID']
    csv_writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    csv_writer.writeheader()
    csv_writer.writerows(results)
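
# A sketch of a working CSV export (an assumption, not part of the original):
# collect one row per job in the DATA event handler, then write the rows once the
# scraper has finished. Field names mirror the header used above; mapping the 'ID'
# column to EventData.job_id is an assumption.
job_rows = []  # hypothetical accumulator

def on_data(data: EventData):
    job_rows.append({
        'Title': data.title,
        'Date': data.date,
        'Link': data.link,
        'ID': data.job_id,
    })

scraper.on(Events.DATA, on_data)
scraper.run(queries)

with open("jobs.csv", "a", newline="") as csvfile:
    csv_writer = csv.DictWriter(csvfile, fieldnames=['Title', 'Date', 'Link', 'ID'])
    csv_writer.writeheader()
    csv_writer.writerows(job_rows)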
import json
import logging

import requests
from bs4 import BeautifulSoup


def linkedinsc():
    with open('C:/Projects/itjobseeker/public/jsondata/linkedin.json', 'r') as readfile:
        try:
            jsondata = json.load(readfile)
            stored_links = []
            for single_data in jsondata:
                stored_links.append(single_data['Page_URL'])
        except (json.JSONDecodeError, KeyError):
            jsondata = []
            stored_links = []

    # Change root logger level (default is WARN)
    logging.basicConfig(level=logging.INFO)

    def on_data(data: EventData):
        # print('[ON_DATA]', data.title, data.company, data.date, data.link, data.seniority_level, data.employment_type)
        link = data.link
        link = link.split('?', 1)[0]
        if link not in stored_links:
            stored_links.append(link)
            print("NEW JOB FOUND !!!", link)
            source = requests.get(data.link).text
            soup = BeautifulSoup(source, 'lxml')
            desct = soup.find('main', class_='main').get_text(strip=True)
            jsondata.append({
                'name': data.title,
                'company': data.company,
                'address': data.place,
                'deadline': data.date,
                'time': data.employment_type,
                'Page_URL': link,
                'desct': desct,
                'websitename': 'np.linkedin.com'
            })

    def on_error(error):
        print('[ON_ERROR]', error)

    def on_end():
        print('[ON_END]')

    scraper = LinkedinScraper(
        chrome_options=None,  # You can pass your custom Chrome options here
        headless=True,  # Overrides headless mode only if chrome_options is None
        max_workers=1,  # How many threads will be spawned to run queries concurrently (one Chrome driver for each thread)
        slow_mo=1.5,  # Slow down the scraper to avoid 'Too many requests (429)' errors
    )

    # Add event listeners
    scraper.on(Events.DATA, on_data)
    scraper.on(Events.ERROR, on_error)
    scraper.on(Events.END, on_end)

    queries = [
        # Query(
        #     options=QueryOptions(
        #         optimize=True,  # Blocks requests for resources like images and stylesheets
        #         limit=50  # Limit the number of jobs to scrape
        #     )
        # ),
        Query(
            query='it',
            options=QueryOptions(
                locations=['Nepal'],
                optimize=True,
                limit=70,
                # filters=QueryFilters(
                #     company_jobs_url='https://www.linkedin.com/jobs/search/?f_C=1441%2C17876832%2C791962%2C2374003%2C18950635%2C16140%2C10440912&geoId=92000000',  # Filter by companies
                #     relevance=RelevanceFilters.RECENT,
                #     time=TimeFilters.MONTH,
                #     type=[TypeFilters.FULL_TIME, TypeFilters.INTERNSHIP],
                #     experience=None,
                # )
            ))
    ]

    scraper.run(queries)

    with open('C:/Projects/itjobseeker/public/jsondata/linkedin.json', 'w') as outfile:
        json.dump(jsondata, outfile)

    print("linkedin done")