Пример #1
0
def scrape(query, numresults, locations=None):
    """Scrape LinkedIn job postings matching ``query``.

    Builds a single-worker, non-headless ``LinkedinScraper``, registers the
    module-level ``on_data``, ``on_error`` and ``on_end`` handlers, and runs
    a single query.

    Args:
        query: Search text passed to ``Query``.
        numresults: Passed to ``QueryOptions`` as ``limit`` (the number of
            jobs to scrape).
        locations: Optional list of location names to search in. When
            omitted, ``['Montreal']`` is used, which is the value this
            function previously hard-coded.
    """
    if locations is None:
        locations = ['Montreal']

    scraper = LinkedinScraper(
        chrome_executable_path=None,  # Custom Chrome executable path (e.g. /foo/bar/bin/chromedriver)
        chrome_options=None,  # Custom Chrome options here
        headless=False,  # Overrides headless mode only if chrome_options is None
        max_workers=1,  # How many threads will be spawned to run queries concurrently (one Chrome driver for each thread)
        slow_mo=1,  # Slow down the scraper to avoid 'Too many requests (429)' errors
    )

    # Add event listeners
    scraper.on(Events.DATA, on_data)
    scraper.on(Events.ERROR, on_error)
    scraper.on(Events.END, on_end)

    queries = [
        Query(
            query=query,
            options=QueryOptions(
                locations=locations,
                optimize=True,  # Blocks requests for resources like images and stylesheet
                limit=numresults,  # Limit the number of jobs to scrape
            ),
        ),
    ]

    scraper.run(queries)
Пример #2
0
    global df
    dictTemp = {"JobID": [data.job_id] , "Company": [data.company],"Title" : [data.title], "Place": [data.place],"Date": [data.date],"SeniorityLevel": [data.seniority_level],"JobFunction": [data.job_function],"EmployementType": [data.employment_type],"Industries": [data.industries],"description":[data.description]}
    dfTemp = pd.DataFrame(dictTemp)
    df = df.append(dfTemp,ignore_index = True)

def on_error(error):
    """Print ``error`` to stdout behind an ``[ON_ERROR]`` tag."""
    tag = '[ON_ERROR]'
    print(tag, error)


def on_end():
    """Print the ``[ON_END]`` marker to stdout."""
    marker = '[ON_END]'
    print(marker)

# Configure the scraper. The driver path is a raw string so that the Windows
# backslashes are taken literally rather than parsed as (invalid) escape
# sequences; the resulting path value is unchanged.
scraper = LinkedinScraper(
    chrome_executable_path=r'C:\chromedriver_win32\chromedriver.exe',  # Custom Chrome executable path (e.g. /foo/bar/bin/chromedriver)
    chrome_options=None,  # Custom Chrome options here
    headless=False,  # Overrides headless mode only if chrome_options is None
    max_workers=1,  # How many threads will be spawned to run queries concurrently (one Chrome driver for each thread)
    slow_mo=1.4,  # Slow down the scraper to avoid 'Too many requests (429)' errors
)

# Add event listeners
scraper.on(Events.DATA, on_data)
scraper.on(Events.ERROR, on_error)
scraper.on(Events.END, on_end)

queries = [
    Query(
        query='Data Engineer',
        options=QueryOptions(
            locations=['indonesia','vietnam ','china','singapore','korea selatan','japan','Hong Kong SAR','taiwan'],
Пример #3
0
for counter in range(2):
    if counter == 0:
        for search in searches:

            title = []
            company = []
            date = []
            link = []
            industry = []
            occupation = []
            jobType = []

            scraper = LinkedinScraper(
                chrome_options=
                None,  # You can pass your custom Chrome options here
                max_workers=
                1,  # How many threads will be spawn to run queries concurrently (one Chrome driver for each thread)
                slow_mo=
                1.2,  # Slow down the scraper to avoid 'Too many requests (429)' errors
            )

            # Add event listeners
            scraper.on(Events.DATA, on_data)
            scraper.on(Events.ERROR, on_error)
            scraper.on(Events.END, on_end)

            queries = [
                Query(options=QueryOptions(
                    optimize=
                    True,  # Blocks requests for resources like images and stylesheet
                    limit=0  # Limit the number of jobs to scrape
                )),
Пример #4
0
def test_run():
    """Run the scraper against three queries plus global options.

    Sets the ``urllib3`` and ``selenium`` loggers to warning level, builds a
    headless single-worker ``LinkedinScraper``, registers the module-level
    data / error / invalid-session / end handlers, and runs:

    * a ``Query()`` with no arguments,
    * a ``'c#'`` query for Finland (limit 33, past week, mid-senior),
    * an ``'Engineer'`` query for the United States (limit 27, past month,
      restricted by a company jobs URL and three job types).

    A ``QueryOptions`` for the United Kingdom (limit 10) is passed to
    ``run`` as the global options.
    """
    # Change other logger levels
    for logger_name in ('urllib3', 'selenium'):
        logging.getLogger(logger_name).setLevel(logging.WARNING)

    scraper = LinkedinScraper(
        chrome_executable_path=None,
        chrome_options=None,
        headless=True,
        max_workers=1,
        slow_mo=1,
    )

    scraper.on(Events.DATA, on_data)
    scraper.on(Events.ERROR, on_error)
    scraper.on(Events.INVALID_SESSION, on_invalid_session)
    scraper.on(Events.END, on_end)

    # Query built with no arguments of its own.
    bare_query = Query()

    csharp_filters = QueryFilters(
        time=TimeFilters.WEEK,
        experience=ExperienceLevelFilters.MID_SENIOR,
    )
    csharp_query = Query(
        query='c#',
        options=QueryOptions(
            locations=['Finland'],
            optimize=False,
            limit=33,
            filters=csharp_filters,
        ),
    )

    engineer_filters = QueryFilters(
        company_jobs_url='https://www.linkedin.com/jobs/search/?f_C=1441%2C17876832%2C791962%2C2374003%2C18950635%2C16140%2C10440912&geoId=92000000',
        time=TimeFilters.MONTH,
        type=[
            TypeFilters.FULL_TIME,
            TypeFilters.INTERNSHIP,
            TypeFilters.CONTRACT,
        ],
    )
    engineer_query = Query(
        query='Engineer',
        options=QueryOptions(
            locations=['United States'],
            optimize=False,
            limit=27,
            filters=engineer_filters,
        ),
    )

    queries = [bare_query, csharp_query, engineer_query]

    # Global options
    global_options = QueryOptions(
        locations=['United Kingdom'],
        limit=10,
        optimize=False,
    )

    scraper.run(queries=queries, options=global_options)
Пример #5
0
def scrapeLinkedinJobs(industries):
    """Scrape recent full-time Singapore job postings for each industry.

    One ``Query`` is built per entry of ``industries`` (limit 6 each) and the
    scraper is given one worker per query. Every scraped job is converted to
    a plain dict and grouped under the query text reported on the job.

    Args:
        industries: Sequence of search strings; each becomes one query.

    Returns:
        A list of ``{"queryText": <query>, "jobList": [<job dict>, ...]}``
        entries, one per query that produced at least one job. An empty
        ``industries`` returns ``[]`` without creating a scraper.
    """
    if not industries:
        # Nothing to search for; this also avoids requesting zero workers.
        return []

    scraper = LinkedinScraper(
        chrome_executable_path=
        'C:/Users/iyeng/Desktop/NTU/NTU Sem 4/CZ2006/JobsUpply/JobsUpply/chromedriver.exe',
        chrome_options=None,  # Custom Chrome options here
        headless=True,  # Overrides headless mode only if chrome_options is None
        max_workers=len(industries),  # One thread (and Chrome driver) per query
        slow_mo=2,  # Slow down the scraper to avoid 'Too many requests (429)' errors
    )

    queries = [
        Query(
            query=industry,
            options=QueryOptions(
                locations=['Singapore'],
                optimize=True,
                limit=6,
                filters=QueryFilters(
                    company_jobs_url=None,  # Filter by companies
                    relevance=RelevanceFilters.RECENT,
                    time=TimeFilters.MONTH,
                    type=[TypeFilters.FULL_TIME],
                    experience=None,
                ),
            ),
        )
        for industry in industries
    ]

    # Maps query text -> list of job dicts collected for that query.
    jobs_by_query = {}

    def on_data(data: EventData):
        """Convert one scraped job to a dict and file it under its query."""
        job = {
            "title": data.title,
            "company": data.company,
            "place": data.place,
            "description": data.description,
            "linkedinUrl": data.link,
            "descriptionHTML": data.description_html,
            "employmentType": data.employment_type,
            "applyUrl": data.apply_link,
            "date": data.date,
            "seniority": data.seniority_level,
            "jobFunction": data.job_function,
            "industries": data.industries,
            # extract_skills_from_document returns JSON text; store it parsed.
            "skills": json.loads(
                extract_skills_from_document(data.description)),
        }
        # setdefault creates the per-query list on first use, replacing the
        # duplicated append in the former if/else branches.
        jobs_by_query.setdefault(data.query, []).append(job)

    def on_error(error):
        print('[ON_ERROR]', error)

    def on_end():
        print('[ON_END]')

    scraper.on(Events.DATA, on_data)
    scraper.on(Events.ERROR, on_error)
    scraper.on(Events.END, on_end)

    scraper.run(queries)

    return [
        {"queryText": query_text, "jobList": jobs}
        for query_text, jobs in jobs_by_query.items()
    ]
Пример #6
0
def on_data(data: EventData):
    """Print a one-line summary of a scraped job behind an ``[ON_DATA]`` tag.

    The line holds the job's title, company, date and link, followed by the
    length of its description (not the description text itself).
    """
    summary = (
        data.title,
        data.company,
        data.date,
        data.link,
        len(data.description),
    )
    print('[ON_DATA]', *summary)


def on_error(error):
    """Print ``error`` to stdout behind an ``[ON_ERROR]`` tag."""
    label = '[ON_ERROR]'
    print(label, error)


def on_end():
    """Print the ``[ON_END]`` marker to stdout."""
    label = '[ON_END]'
    print(label)


# Headless, single-worker scraper with no custom Chrome path or options.
scraper = LinkedinScraper(
    slow_mo=1,  # Throttle the scraper to avoid 'Too many requests (429)' errors
    max_workers=1,
    headless=True,  # Only overrides headless mode while chrome_options is None
    chrome_options=None,  # No custom Chrome options
    chrome_executable_path=None,
)

# Register the handlers for scraped data, errors and end of run.
scraper.on(Events.DATA, on_data)
scraper.on(Events.ERROR, on_error)
scraper.on(Events.END, on_end)

queries = [
    Query(
        query='Data',
        options=QueryOptions(
            locations=['Belgium'],
            optimize=True,
def linkedinsc():
    """Scrape ``'it'`` jobs located in Nepal and add new ones to the JSON store.

    Loads the existing records from ``linkedin.json``, runs a single query
    (limit 70), and for every job whose link (without its query string) is
    not already stored, downloads the job page, extracts the text of its
    ``<main class="main">`` element and appends a record. The full list is
    then written back to the same file.

    A missing store file, invalid JSON, or records without a ``Page_URL``
    key all result in starting from an empty store.
    """
    json_path = 'C:/Projects/itjobseeker/public/jsondata/linkedin.json'

    try:
        with open(json_path, 'r') as readfile:
            jsondata = json.load(readfile)
        # A set gives constant-time "already stored?" checks.
        stored_links = {single_data['Page_URL'] for single_data in jsondata}
    except (FileNotFoundError, ValueError, KeyError, TypeError):
        # ValueError covers json.JSONDecodeError; KeyError/TypeError cover
        # records that do not have the expected shape.
        jsondata = []
        stored_links = set()

    # Change root logger level (default is WARN)
    logging.basicConfig(level=logging.INFO)

    def on_data(data: EventData):
        """Store the scraped job unless its link is already known."""
        # Compare on the URL with its query string removed.
        link = data.link.split('?', 1)[0]
        if link in stored_links:
            return

        stored_links.add(link)
        print("NEW JOB FOUND !!!", link)
        # NOTE(review): this request has no timeout and can block
        # indefinitely; consider passing one.
        source = requests.get(data.link).text
        soup = BeautifulSoup(source, 'lxml')
        main_section = soup.find('main', class_='main')
        # find() returns None when the element is absent; store an empty
        # description instead of failing on None.get_text.
        desct = main_section.get_text(strip=True) if main_section is not None else ''
        jsondata.append({
            'name': data.title,
            'company': data.company,
            'address': data.place,
            'deadline': data.date,
            'time': data.employment_type,
            'Page_URL': link,
            'desct': desct,
            'websitename': 'np.linkedin.com'
        })

    def on_error(error):
        print('[ON_ERROR]', error)

    def on_end():
        print('[ON_END]')

    scraper = LinkedinScraper(
        chrome_options=None,  # You can pass your custom Chrome options here
        headless=True,  # Overrides headless mode only if chrome_options is None
        max_workers=1,  # How many threads will be spawned to run queries concurrently (one Chrome driver for each thread)
        slow_mo=1.5,  # Slow down the scraper to avoid 'Too many requests (429)' errors
    )

    # Add event listeners
    scraper.on(Events.DATA, on_data)
    scraper.on(Events.ERROR, on_error)
    scraper.on(Events.END, on_end)

    queries = [
        Query(
            query='it',
            options=QueryOptions(
                locations=['Nepal'],
                optimize=True,
                limit=70,
            ))
    ]

    scraper.run(queries)
    with open(json_path, 'w') as outfile:
        json.dump(jsondata, outfile)
    print("linkedin done")