Example #1
    def start_requests(self):
        '''return iterable of job links'''

        # skip crawling when a backlog of unprocessed Indeed jobs already exists
        with CommonFuncs.get_db() as db:
            todoforsite = db.query(UnprocessedJob).filter(UnprocessedJob.bot_type == 'Indeed_Bot').all()
        if len(todoforsite) >= 100:
            return

        start_time = datetime.now()

        # build the keyword query string and location list from the job profile
        job_profile = CommonFuncs.get_job_profile()
        locations = CommonFuncs.get_locations_list(job_profile)
        query_list = CommonFuncs.build_query_string(job_profile=job_profile, or_delim='or', bracket1='(', bracket2=')', adv_supp=True)
        query_string = query_list[0]

        if not query_string:
            return

        ##########
        # URL ENCODE EACH QUERY
        ##########
        start_urls = []
        for location in locations:
            query_dict = {'q':query_string, 'l':location}
            encoded_query = urllib.parse.urlencode(query_dict, safe='')
            job_url = JOB_SITE_LINKS['Indeed']['query'] + '&' + encoded_query
            start_urls.append(job_url)

        # CommonFuncs.log('time spent building start_urls for Indeed: ' + str(datetime.now() - start_time))

        ##########
        # GET URL RESPONSES AND CALL PARSE FUNCTION TO ITERATE OVER PAGES
        ##########
        for url in start_urls:
            yield scrapy.Request(url=url, callback=self.parse)
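
For reference, the URL-encoding step above can be exercised on its own. A minimal sketch, with hypothetical query values and a stand-in base URL (the real one lives in JOB_SITE_LINKS['Indeed']['query']):

import urllib.parse

query_dict = {'q': '(python) or (scrapy)', 'l': 'Boston, MA'}
encoded_query = urllib.parse.urlencode(query_dict, safe='')
# safe='' forces reserved characters to be percent-encoded; spaces become '+':
# q=%28python%29+or+%28scrapy%29&l=Boston%2C+MA
job_url = 'https://www.indeed.com/jobs?sort=date' + '&' + encoded_query  # hypothetical base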
Example #2
    def start_requests(self):
        '''return iterable of job links'''

        # skip crawling when a backlog of unprocessed Ziprecruiter jobs already exists
        with CommonFuncs.get_db() as db:
            todoforsite = db.query(UnprocessedJob).filter(
                UnprocessedJob.bot_type == 'Ziprecruiter_Bot').all()
        if len(todoforsite) >= 100:
            return

        start_time = datetime.now()

        job_profile = CommonFuncs.get_job_profile()
        locations = CommonFuncs.get_locations_list(job_profile)
        query_list = CommonFuncs.build_query_string(job_profile=job_profile,
                                                    or_delim='',
                                                    bracket1='',
                                                    bracket2='',
                                                    adv_supp=False)

        if not query_list:
            return

        ##########
        # URL ENCODE EACH QUERY
        ##########
        start_urls = []
        for location in locations:
            for query_string in query_list:
                bot = CommonFuncs.get_bot('Ziprecruiter_Bot')
                if bot.is_running:  # verify that the bot is running before continuing to the next page
                    query_dict = {'search': query_string, 'location': location}
                    encoded_query = urllib.parse.urlencode(query_dict, safe='')
                    job_url = JOB_SITE_LINKS['Ziprecruiter'][
                        'query'] + '&' + encoded_query
                    start_urls.append(job_url)
                    response = html.fromstring(requests.get(job_url).content)
                    temp = response.xpath(
                        "//menu[@class='select-menu-submenu t_filter_dropdown_titles']/a/@href"
                    )
                    temp = [
                        JOB_SITE_LINKS['Ziprecruiter']['job_site_base'] + i
                        for i in temp
                    ]
                    start_urls += temp  # append all of the links from filtering by job title
                    temp = response.xpath(
                        "//menu[@class='select-menu-submenu t_filter_dropdown_companies']/a/@href"
                    )
                    temp = [
                        JOB_SITE_LINKS['Ziprecruiter']['job_site_base'] + i
                        for i in temp
                    ]
                    start_urls += temp  # append all of the links from filtering by company
                else:
                    return

        msg = 'time spent building start_urls for Ziprecruiter: ' + str(
            datetime.now() - start_time)
        # CommonFuncs.log( msg )
        print(msg)

        ##########
        # GET URL RESPONSES AND CALL PARSE FUNCTION TO ITERATE OVER PAGES
        ##########
        print('TOTAL START URLs: ' + str(len(start_urls)))
        for i, url in enumerate(start_urls, 1):
            print('LINK#: ' + str(i) + ' WORKING ON NEW START URL: ' + url)
            yield scrapy.Request(url=url, callback=self.parse)
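
The page-fetching detour inside the loop (harvesting the title and company filter links from the results page) can be sketched in isolation. The URL, menu classes, and base below mirror the snippet but are assumptions about ZipRecruiter's markup, not verified selectors:

import requests
from lxml import html

page_url = 'https://www.ziprecruiter.com/candidate/search?search=python'  # hypothetical
tree = html.fromstring(requests.get(page_url).content)
# relative links from the job-title filter dropdown, as in the snippet's XPath
relative_links = tree.xpath(
    "//menu[@class='select-menu-submenu t_filter_dropdown_titles']/a/@href")
job_site_base = 'https://www.ziprecruiter.com'  # hypothetical
start_urls = [job_site_base + link for link in relative_links]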
Example #3
    def get_api_results(self, desired_result_count=1):
        '''return job json objects from the indeed api.'''

        job_profile = CommonFuncs.get_job_profile()

        # GET LOCATION IN JOB PROFILE
        locations = CommonFuncs.get_locations_list(job_profile)

        # KEYWORDS CONNECTED BY OR
        query_list = CommonFuncs.build_query_string(job_profile=job_profile,
                                                    or_delim='or',
                                                    bracket1='(',
                                                    bracket2=')',
                                                    adv_supp=True)
        query_string = query_list[0]

        new_jobs_queue = queue.Queue(maxsize=0)
        new_jobs = None

        limit = '25'  # 25 is the max results per request
        lookback_period = '60'  # default lookback period
        client_id = {}
        api = None

        # CONNECT TO INDEED API FOR JOB QUERIES
        try:
            client_id = json.load(open(API_KEYS_PATH, 'r'))
            api = IndeedClient(publisher=client_id['publisher_id'])
        except Exception:
            raise ValueError('No publisher id found. Filtering aborted.')

        filters = {
            'q': query_string,
            'l': '',
            'userip': "1.2.3.4",
            'useragent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2)",
            "raw": "False",
            "sort": "date",
            "radius": job_profile.radius,
            "limit": limit,
            "fromage": lookback_period,
        }

        # FIND NEW JOB JSON OBJECT USING INDEED API
        # GET NEW JOBS

        for location in locations:  # iterate over each location
            filters['l'] = location
            filters['q'] = query_string

            # THREAD-BRAINED APPROACH to get all results at once
            def get_results(i):
                '''fetch one page of results; enqueue each until the desired count is reached'''
                filters['start'] = i
                # get up to 25 results, using the provided filters with the given start index
                results = json.loads(
                    CommonFuncs.convertBytesToString(
                        api.search(**filters)))['results']
                for job in results:
                    if new_jobs_queue.unfinished_tasks < desired_result_count:
                        new_jobs_queue.put(job)

            result_count = int(
                json.loads(
                    CommonFuncs.convertBytesToString(
                        api.search(**filters)))['totalResults'])

            list_of_filter_starts = [
                str(i) for i in range(0, result_count, 25)
            ]  # build list of start positions

            for item in list_of_filter_starts:
                if not new_jobs_queue.unfinished_tasks < desired_result_count:
                    break
                get_results(item)

            new_jobs = list(
                new_jobs_queue.queue)  # snapshot of every result collected so far

        # RETURN JOBS
        if new_jobs:
            if desired_result_count == 1:  # just return a single job, not in a list
                return new_jobs[0]
            elif desired_result_count <= len(new_jobs):  # enough new jobs: return the requested number in a list
                return new_jobs[0:desired_result_count]
            else:  # fewer new jobs available than requested: return all that could be found
                return new_jobs
        else:
            return []  # no new jobs found
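
The stop condition above leans on Queue.unfinished_tasks, which grows with every put() and so doubles as a count of collected items. A standalone sketch with fabricated page data (the real pages come from api.search):

import queue

desired_result_count = 7
page_size = 25  # Indeed's max results per request
total_results = 60  # hypothetical totalResults value

new_jobs_queue = queue.Queue(maxsize=0)
offsets = [str(i) for i in range(0, total_results, page_size)]  # '0', '25', '50'
for start in offsets:
    if not new_jobs_queue.unfinished_tasks < desired_result_count:
        break
    page = [{'jobkey': start + '-' + str(n)} for n in range(page_size)]  # fabricated page
    for job in page:
        if new_jobs_queue.unfinished_tasks < desired_result_count:
            new_jobs_queue.put(job)

print(len(list(new_jobs_queue.queue)))  # 7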
Example #4
    def apply(self, job):
        '''apply to job, create and commit job object to db, and return job object.
        return False if the job link is invalid.'''

        job_profile = CommonFuncs.get_job_profile()

        if not CommonFuncs.is_valid_url(job):
            return False

        self.driver.get(job)

        # look for an "Apply" button and open the application form if one exists
        btns = self.driver.find_elements(By.TAG_NAME, 'button')
        apply_form_opened = False
        for btn in btns:
            if "Apply" in btn.text:
                btn.click()
                apply_form_opened = True
                break

        # CREATE JOB OBJECT
        new_job = Job()
        new_job.app_date = datetime.now()
        new_job.link_to_job = job
        try:
            new_job.job_title = self.driver.find_element(
                By.CLASS_NAME, 'job_title').text
        except:
            pass
        try:
            company_link = self.driver.find_element(
                By.CLASS_NAME,
                'hiring_company_text').find_element(By.TAG_NAME, 'a')
            new_job.company = company_link.text
            new_job.company_site = company_link.get_attribute('href')
        except:
            pass
        try:
            new_job.location = self.driver.find_element(
                By.CLASS_NAME, 'location_text').text
        except:
            pass
        new_job.job_site = CommonFuncs.fetch_domain_name(job)
        new_job.applied = False

        try:
            name = self.driver.find_element(By.ID, 'name')
            name.clear()
            name.send_keys(job_profile.applicant_name)
        except:
            pass
        try:
            email = self.driver.find_element(By.ID, 'email_address')
            email.clear()
            email.send_keys(job_profile.email)
        except:
            pass
        try:
            phone = self.driver.find_element(By.ID, 'phone_number')
            phone.clear()
            phone.send_keys(job_profile.phone_number)
        except:
            pass
        resume_file_path = None
        try:
            # the profile stores resumes as a string-encoded list; take the first path
            resume_file_path = eval(job_profile.resume)[0]
            resume_file_path = resume_file_path.replace('/', '//')
            self.driver.find_element(By.ID, 'resume').send_keys(resume_file_path)
        except:
            pass
        try:
            self.driver.find_element(By.ID, 'contact_create_form').submit()
            new_job.applied = True
        except:
            pass

        # 1-click apply does not have an edit resume button, so the resume must be changed after clicking the button
        if not apply_form_opened:  # if the form did not open try to click the 1-click apply btn
            links = self.driver.find_elements(By.PARTIAL_LINK_TEXT, 'pply')
            for link in links:
                try:
                    link.click()
                    self.driver.get(
                        JOB_SITE_LINKS['Ziprecruiter']['applied_jobs'])
                    applied_jobs_list = self.driver\
                        .find_element(By.CLASS_NAME, 'appliedJobsList')\
                        .find_elements(By.TAG_NAME, 'li')
                    last_job_applied_to = applied_jobs_list[0]
                    resume_edit_item = last_job_applied_to\
                        .find_element(By.CLASS_NAME, 'dropdown')\
                        .find_element(By.TAG_NAME, 'ul')\
                        .find_elements(By.TAG_NAME, 'li')[1] \
                        .find_element(By.TAG_NAME, 'a')
                    resume_edit_link = resume_edit_item.get_attribute('href')
                    self.driver.get(resume_edit_link)
                    self.driver.find_element(
                        By.ID, 'resumeInput').send_keys(resume_file_path)
                    self.driver.find_element(By.ID, 'replaceResume').submit()
                    new_job.applied = True
                    break
                except:
                    pass

        try:
            self.driver.find_element(By.ID, 'zip_resume_verify').click()
        except:
            pass
        try:
            # poll until the resume-upload spinner is hidden
            sleep(3)
            while True:
                resumeLoading = self.driver.find_element(
                    By.ID, 'zipresumeLoading')
                if not resumeLoading.is_displayed():
                    break
                sleep(1)
        except:
            pass
        try:
            self.driver.find_element(By.ID, 'zip_resume_verify').click()
        except:
            pass

        return new_job


# if __name__ =='__main__':
#     with CommonFuncs.get_driver(visible=True) as driver:
#         z=Ziprecruiter_Bot(driver=driver)
#         jobsiteaccount = JobSiteAccount()
#         jobsiteaccount.username = "******"
#         jobsiteaccount.password = "******"
#         z.login(jobsiteaccount)
#         z.apply(n)
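
As an aside, the sleep-and-poll loop in apply() maps naturally onto Selenium's explicit waits. A minimal sketch, assuming the same 'zipresumeLoading' element id:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_resume_upload(driver, timeout=30):
    '''block until the resume-upload spinner is hidden or the timeout elapses'''
    WebDriverWait(driver, timeout).until(
        EC.invisibility_of_element_located((By.ID, 'zipresumeLoading')))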