Example #1
def test_run():
    # Change other logger levels
    logging.getLogger('urllib3').setLevel(logging.WARN)
    logging.getLogger('selenium').setLevel(logging.WARN)

    scraper = LinkedinScraper(
        chrome_executable_path=None,
        chrome_options=None,
        headless=True,
        max_workers=1,
        slow_mo=1,
    )

    scraper.on(Events.DATA, on_data)
    scraper.on(Events.ERROR, on_error)
    scraper.on(Events.INVALID_SESSION, on_invalid_session)
    scraper.on(Events.END, on_end)

    queries = [
        Query(),

        Query(
            query='c#',
            options=QueryOptions(
                locations=['Finland'],
                optimize=False,
                apply_link=True,
                limit=33,
                filters=QueryFilters(
                    time=TimeFilters.WEEK,
                    experience=ExperienceLevelFilters.MID_SENIOR,
                )
            )
        ),

        Query(
            query='Engineer',
            options=QueryOptions(
                locations=['United States'],
                optimize=False,
                limit=27,
                filters=QueryFilters(
                    company_jobs_url='https://www.linkedin.com/jobs/search/?f_C=1441%2C17876832%2C791962%2C2374003%2C18950635%2C16140%2C10440912&geoId=92000000',
                    time=TimeFilters.MONTH,
                    type=[TypeFilters.FULL_TIME, TypeFilters.INTERNSHIP, TypeFilters.CONTRACT]
                )
            )
        ),
    ]

    scraper.run(
        queries=queries,
        # Global options
        options=QueryOptions(
            locations=['United Kingdom'],
            limit=10,
            optimize=True,
        )
    )
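All of the snippets on this page assume the same boilerplate: the imports from the linkedin_jobs_scraper package and the event callbacks (on_data, on_error, on_end, and here on_invalid_session) registered via scraper.on(...). Below is a minimal sketch of that assumed setup, modeled on the library README; the print-only handler bodies and the no-argument signature of on_invalid_session are placeholders, not the package's required form.

import logging

from linkedin_jobs_scraper import LinkedinScraper
from linkedin_jobs_scraper.events import Events, EventData
from linkedin_jobs_scraper.query import Query, QueryOptions, QueryFilters
from linkedin_jobs_scraper.filters import RelevanceFilters, TimeFilters, TypeFilters, ExperienceLevelFilters

logging.basicConfig(level=logging.INFO)


def on_data(data: EventData):
    # Called once per scraped job posting
    print('[ON_DATA]', data.title, data.company, data.date, data.link)


def on_error(error):
    print('[ON_ERROR]', error)


def on_invalid_session():
    # Fired when the scraper's authenticated session is no longer valid (signature assumed)
    print('[ON_INVALID_SESSION]')


def on_end():
    print('[ON_END]')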
Example #2
# Add event listeners

scraper.on(Events.DATA, on_data)
scraper.on(Events.ERROR, on_error)
scraper.on(Events.END, on_end)

queries = [
    Query(
        query='Data Engineer',
        options=QueryOptions(
            locations=['Indonesia', 'Vietnam', 'China', 'Singapore', 'South Korea', 'Japan', 'Hong Kong SAR', 'Taiwan'],
            optimize=False,
            limit=940,
            filters=QueryFilters(
                relevance=RelevanceFilters.RELEVANT,
                time=TimeFilters.DAY,
            )
        )
    ),
    Query(
        query='Data Scientist',
        options=QueryOptions(
            locations=['Indonesia', 'Vietnam', 'China', 'Singapore', 'South Korea', 'Japan', 'Hong Kong SAR', 'Taiwan'],
            optimize=False,
            limit=940,
            filters=QueryFilters(
                relevance=RelevanceFilters.RELEVANT,
                time=TimeFilters.DAY,
            )
        )
    ),
]

scraper.run(queries)
Example #3
            scraper.on(Events.END, on_end)

            queries = [
                Query(options=QueryOptions(
                    optimize=True,  # Blocks requests for resources like images and stylesheets
                    limit=0  # Limit the number of jobs to scrape
                )),
                Query(query=search,
                      options=QueryOptions(
                          locations=['Toronto, Ontario, Canada'],
                          optimize=True,
                          limit=400,
                          filters=QueryFilters(
                              relevance=RelevanceFilters.RELEVANT,
                              time=TimeFilters.WEEK,
                              type=[TypeFilters.FULL_TIME],
                          ))),
            ]

            scraper.run(queries)

            for i in range(len(title)):
                industry.append('Finance')
                occupation.append(search)
                jobType.append('Full Time')

            df = pandas.DataFrame(
                data={
                    "Title": title,
                    "Company": company,
Example #4
def scrapeLinkedinJobs(industries):

    scraper = LinkedinScraper(
        # chrome_executable_path='D:/chromedriver.exe', # Custom Chrome executable path (e.g. /foo/bar/bin/chromedriver)
        chrome_executable_path='C:/Users/iyeng/Desktop/NTU/NTU Sem 4/CZ2006/JobsUpply/JobsUpply/chromedriver.exe',
        chrome_options=None,  # Custom Chrome options here
        headless=True,  # Overrides headless mode only if chrome_options is None
        max_workers=len(industries),  # How many threads will be spawned to run queries concurrently (one Chrome driver for each thread)
        slow_mo=2,  # Slow down the scraper to avoid 'Too many requests (429)' errors
    )
    queries = []
    for i in range(len(industries)):
        paramQ = Query(
            query=industries[i],
            options=QueryOptions(
                locations=['Singapore'],
                optimize=True,
                limit=6,
                filters=QueryFilters(
                    company_jobs_url=None,  # Filter by companies
                    relevance=RelevanceFilters.RECENT,
                    time=TimeFilters.MONTH,
                    type=[TypeFilters.FULL_TIME],
                    experience=None,
                )))
        queries.append(paramQ)

    JobList = {}

    def on_data(data: EventData):
        jobData = {}
        jobData["title"] = data.title
        jobData["company"] = data.company
        jobData["place"] = data.place
        jobData["description"] = data.description
        jobData["linkedinUrl"] = data.link
        jobData["descriptionHTML"] = data.description_html
        jobData["employmentType"] = data.employment_type
        jobData["applyUrl"] = data.apply_link
        jobData["date"] = data.date
        jobData["seniority"] = data.seniority_level
        jobData["jobFunction"] = data.job_function
        jobData["industries"] = data.industries
        jobData["skills"] = json.loads(
            extract_skills_from_document(data.description))
        # Group jobs by the query text that produced them
        JobList.setdefault(data.query, []).append(jobData)

    def on_error(error):
        print('[ON_ERROR]', error)

    def on_end():
        print('[ON_END]')

    scraper.on(Events.DATA, on_data)
    scraper.on(Events.ERROR, on_error)
    scraper.on(Events.END, on_end)

    scraper.run(queries)

    JobList = [{"queryText": q, "jobList": JobList[q]} for q in JobList.keys()]
    return JobList
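A possible call site for the function above; the industry keywords are purely illustrative.

# Hypothetical usage: one query (and one Chrome worker) per industry keyword
jobs_by_industry = scrapeLinkedinJobs(['Finance', 'Software', 'Healthcare'])
for entry in jobs_by_industry:
    print(entry['queryText'], len(entry['jobList']), 'jobs scraped')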
Example #5
# Add event listeners
scraper.on(Events.DATA, on_data)
scraper.on(Events.ERROR, on_error)
scraper.on(Events.END, on_end)

queries = [
    Query(
        options=QueryOptions(
            optimize=True,  # Blocks requests for resources like images and stylesheets
            limit=27  # Limit the number of jobs to scrape
        )
    ),
    Query(
        query='Engineer',
        options=QueryOptions(
            locations=['United States'],
            optimize=False,
            limit=5,
            filters=QueryFilters(
                company_jobs_url='https://www.linkedin.com/jobs/search/?f_C=1441%2C17876832%2C791962%2C2374003%2C18950635%2C16140%2C10440912&geoId=92000000',  # Filter by companies
                relevance=RelevanceFilters.RECENT,
                time=TimeFilters.MONTH,
                type=[TypeFilters.FULL_TIME, TypeFilters.INTERNSHIP],
                experience=None,
            )
        )
    ),
]

scraper.run(queries)
Example #6
# Add event listeners
scraper.on(Events.DATA, on_data)
scraper.on(Events.ERROR, on_error)
scraper.on(Events.END, on_end)

queries = [
    Query(
        query='Data',
        options=QueryOptions(
            locations=['Belgium'],
            optimize=True,
            limit=10,
            filters=QueryFilters(
                company_jobs_url='https://www.linkedin.com/jobs/search/?f_C=1508%2C6754%2C3880216%2C631981%2C166278%2C963211%2C3625182%2C256009%2C157326%2C282760%2C3627928%2C1519%2C281207%2C18735883%2C10070%2C98774%2C15245937%2C3683364%2C251838%2C2642837&geoId=92000000',  # Filter by companies
                relevance=RelevanceFilters.RECENT,
                time=TimeFilters.MONTH,
                type=[TypeFilters.FULL_TIME, TypeFilters.TEMPORARY],
                experience=[ExperienceLevelFilters.ENTRY_LEVEL, ExperienceLevelFilters.MID_SENIOR],
            )
        )
    ),
]

scraper.run(queries)

## Currently not working (i.e. no data export)

results = scraper.run(queries)
import csv
with open("jobs.csv", "a") as csvfile:
    fieldnames = ['Title', 'Date', 'Link', 'ID']
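The fragment above breaks off and, as its comment says, exports nothing: the scraped jobs are delivered through the Events.DATA callback rather than returned by scraper.run(), so results carries no data. One way to get a CSV out is to collect rows in the callback and write them after the run finishes. A sketch under that assumption follows; the mapping from EventData fields to the fieldnames above is my own choice, and where the original appended to jobs.csv this version simply rewrites it.

import csv

from linkedin_jobs_scraper.events import Events, EventData

rows = []


def on_data_csv(data: EventData):
    # Accumulate one row per job as the scraper emits it
    rows.append({
        'Title': data.title,
        'Date': data.date,
        'Link': data.link,
        'ID': getattr(data, 'job_id', ''),  # job_id may not be present in every version of EventData
    })


scraper.on(Events.DATA, on_data_csv)
scraper.run(queries)

# Write everything collected by the callback once the run has finished
with open('jobs.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=['Title', 'Date', 'Link', 'ID'])
    writer.writeheader()
    writer.writerows(rows)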
Example #7
    1,  # How many threads will be spawned to run queries concurrently (one Chrome driver for each thread)
    slow_mo=0.4,  # Slow down the scraper to avoid 'Too many requests (429)' errors
)

# Add event listeners
scraper.on(Events.DATA, on_data)
scraper.on(Events.ERROR, on_error)
scraper.on(Events.END, on_end)

queries = [
    Query(options=QueryOptions(
        optimize=True,  # Blocks requests for resources like images and stylesheets
        limit=0  # Limit the number of jobs to scrape
    )),
    Query(query='Engineer',
          options=QueryOptions(
              locations=['Toronto, Ontario, Canada'],
              optimize=False,
              limit=5,
              filters=QueryFilters(
                  relevance=RelevanceFilters.RECENT,
                  time=TimeFilters.MONTH,
                  type=[TypeFilters.FULL_TIME, TypeFilters.INTERNSHIP],
                  experience=None,
              ))),
]

scraper.run(queries)
Example #8
# Add event listeners
scraper.on(Events.DATA, on_data)
scraper.on(Events.ERROR, on_error)
scraper.on(Events.END, on_end)

limit = 9000
queries = [
    Query(query='Cloud Engineer',
          options=QueryOptions(locations=['Canada', 'United States'],
                               optimize=False,
                               limit=limit,
                               filters=QueryFilters(
                                   relevance=RelevanceFilters.RECENT,
                                   time=TimeFilters.MONTH,
                                   experience=[
                                       ExperienceLevelFilters.INTERNSHIP,
                                       ExperienceLevelFilters.ASSOCIATE,
                                       ExperienceLevelFilters.ENTRY_LEVEL
                                   ]))),
    Query(query='Cloud architect',
          options=QueryOptions(locations=['Canada', 'United States'],
                               optimize=False,
                               limit=limit,
                               filters=QueryFilters(
                                   relevance=RelevanceFilters.RECENT,
                                   time=TimeFilters.MONTH,
                                   experience=[
                                       ExperienceLevelFilters.INTERNSHIP,
                                       ExperienceLevelFilters.ASSOCIATE,
                                       ExperienceLevelFilters.ENTRY_LEVEL
                                   ]))),
]

scraper.run(queries)
Example #9
# Add event listeners
scraper.on(Events.DATA, on_data)
scraper.on(Events.ERROR, on_error)
scraper.on(Events.END, on_end)

queries = [
    Query(
        query='human resources',
        options=QueryOptions(
            locations=['Sharnbrook, England, United Kingdom'],
            optimize=False,
            limit=1000,
            filters=QueryFilters(
                relevance=RelevanceFilters.RECENT,
                time=TimeFilters.DAY,
                type=[TypeFilters.FULL_TIME, TypeFilters.CONTRACT, TypeFilters.TEMPORARY],
                experience=ExperienceLevelFilters.ENTRY_LEVEL,
            )
        )
    ),
    Query(
        query='human resources',
        options=QueryOptions(
            locations=['United Kingdom'],
            optimize=False,
            limit=1000,
            filters=QueryFilters(
                relevance=RelevanceFilters.RECENT,
                time=TimeFilters.DAY,
                type=[TypeFilters.FULL_TIME, TypeFilters.CONTRACT, TypeFilters.TEMPORARY],
                experience=ExperienceLevelFilters.ENTRY_LEVEL,