Example #1
                    data.append(None)
                    data.append(None)
                else:
                    data.append(current_job.location.city)
                    data.append(current_job.location.country)
                x += 1

        x = 0
        for education in p.education:
            if x < 3:
                data.append(education.education_name)
                data.append(education.degree_name)
                data.append(education.major)
                data.append(education.year)
                x += 1

    for h in range(len(headers)):
        worksheet.write(0, h, headers[h])

    for j in range(len(data)):
        worksheet.write(i + 1, j, data[j])

workbook.close()

if any(scraper.interrupted for scraper in scrapers):
    message_to_user(
        "The scraping didnt end correctly due to Human Check. The excel file was generated but it will "
        "contain some entries reporting an error string.", config)
else:
    message_to_user('Scraping successfully ended.', config)
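
The write loop above assumes an xlsxwriter workbook created earlier in the script. A minimal sketch of that setup; the file name and header list are illustrative assumptions, not the project's actual values:

import xlsxwriter

# Hypothetical output file; the real name comes from the script's config.
workbook = xlsxwriter.Workbook('results.xlsx')
worksheet = workbook.add_worksheet()

# Hypothetical columns; one flat `data` row is built per scraped profile.
headers = ['Name', 'Email', 'Job City', 'Job Country']
data = []
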
    def scrap_profile(self, profile_linkedin_url,
                      profile_known_graduation_date):

        if not is_url_valid(profile_linkedin_url):
            return ScrapingResult('BadFormattedLink')

        # Scraping of the profile may fail due to human check forced by LinkedIn
        try:

            # Delays (in seconds) for operations that must wait for the page to finish loading
            loading_pause_time = 2
            loading_scroll_time = 1

            # Opening of the profile page
            self.browser.get(profile_linkedin_url)

            if str(self.browser.current_url).strip() != profile_linkedin_url.strip():
                if self.browser.current_url == 'https://www.linkedin.com/in/unavailable/':
                    return ScrapingResult('ProfileUnavailable')
                else:
                    raise HumanCheckException

            # Scraping the Email Address from Contact Info (email)

            # > click on 'Contact info' link on the page
            self.browser.execute_script(
                "(function(){try{for(i in document.getElementsByTagName('a')){let el = document.getElementsByTagName('a')[i]; "
                "if(el.innerHTML.includes('Contact info')){el.click();}}}catch(e){}})()"
            )
            time.sleep(loading_pause_time)

            # > gets email from the 'Contact info' popup
            try:
                email = self.browser.execute_script(
                    "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ let "
                    "el = "
                    "document.getElementsByClassName('pv-contact-info__contact-type')[i]; if(el.className.includes("
                    "'ci-email')){ "
                    "return el.children[2].children[0].innerText; } }} catch(e){return '';}})()"
                )
            except:
                email = 'N/A'

            # Scraping the Phone from Contact Info
            try:
                phone = self.browser.execute_script(
                    "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ let "
                    "el = "
                    "document.getElementsByClassName('pv-contact-info__contact-type')[i]; if(el.className.includes("
                    "'ci-phone')){ "
                    "return el.children[2].children[0].innerText; } }} catch(e){return '';}})()"
                )
            except:
                phone = 'N/A'

            # Scraping the Birthday from Contact Info
            try:
                birthday = self.browser.execute_script(
                    "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ let "
                    "el = "
                    "document.getElementsByClassName('pv-contact-info__contact-type')[i]; if(el.className.includes("
                    "'ci-birthday')){ "
                    "return el.children[2].children[0].innerText; } }} catch(e){return '';}})()"
                )
            except:
                birthday = 'N/A'

            # Scraping the Date Connected from Contact Info
            try:
                connectedDate = self.browser.execute_script(
                    "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ let "
                    "el = "
                    "document.getElementsByClassName('pv-contact-info__contact-type')[i]; if(el.className.includes("
                    "'ci-connected')){ "
                    "return el.children[2].children[0].innerText; } }} catch(e){return '';}})()"
                )

                self.browser.execute_script(
                    "document.getElementsByClassName('artdeco-modal__dismiss')[0].click()"
                )
            except:
                connectedDate = 'N/A'

            # Loading the entire page (LinkedIn loads content asynchronously based on your scrolling)
            window_height = self.browser.execute_script(
                "return window.innerHeight")
            scrolls = 1
            while scrolls * window_height < self.browser.execute_script(
                    "return document.body.offsetHeight"):
                self.browser.execute_script(
                    f"window.scrollTo(0, {window_height * scrolls});")
                time.sleep(loading_scroll_time)
                scrolls += 1
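
            # Note: scrolling one viewport at a time with a fixed sleep gives
            # LinkedIn's lazily loaded sections time to render; an explicit
            # wait on a concrete element (see the sketch after this method)
            # would be more robust than a hard-coded loading_scroll_time.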

            try:
                self.browser.execute_script(
                    "document.getElementsByClassName('pv-profile-section__see-more-inline')[0].click()"
                )
                time.sleep(loading_pause_time)
            except:
                pass

            # Get all the job positions
            try:
                job_positions = self.browser.find_element_by_id(
                    'experience-section').find_elements_by_tag_name('li')
            except:
                job_positions = []

            # Get all the education entries
            try:
                educations = self.browser.find_element_by_id(
                    'education-section').find_elements_by_tag_name('li')
            except:
                educations = []

            # Parsing of the page html structure
            soup = BeautifulSoup(self.browser.page_source, 'lxml')

            # Scraping the Name (using soup)
            try:
                name_div = soup.find('div', {'class': 'flex-1 mr5'})
                name_loc = name_div.find_all('ul')
                headline = name_div.find_all('h2')
                headline = headline[0].get_text().strip()
                profile_name = name_loc[0].find('li').get_text().strip()
                locationNConnection = name_loc[1].find_all('li')
                location = locationNConnection[0].get_text().strip()
                try:
                    connection = locationNConnection[1].find('a').find(
                        'span').get_text().strip()
                except:
                    connection = locationNConnection[1].find(
                        'span').get_text().strip()
            except:
                return ScrapingResult('ERROR IN SCRAPING NAME')

            # Scraping the Description (expanding the clamped text first)
            try:
                self.browser.execute_script(
                    "document.getElementsByClassName('lt-line-clamp__more')[0].click()"
                )
                time.sleep(loading_pause_time)
            except:
                pass

            try:
                if (self.browser.execute_script(
                        "return (els = document.getElementsByClassName('pv-oc')[0].getElementsByClassName('lt-line-clamp__line').length)"
                )):
                    profile_desc = self.browser.execute_script(
                        "return (function(){els = document.getElementsByClassName('pv-oc')[0].getElementsByClassName('lt-line-clamp__line');results = [];for (var i=0; i < els.length; i++){results.push(els[i].innerText);}return results;})()"
                    )

                else:
                    profile_desc = self.browser.execute_script(
                        "return (function(){els = document.getElementsByClassName('pv-oc')[0].getElementsByClassName('lt-line-clamp__raw-line');results = [];for (var i=0; i < els.length; i++){results.push(els[i].innerText);}return results;})()"
                    )

            except:
                profile_desc = []

            # Parsing skills
            try:
                self.browser.execute_script(
                    "document.getElementsByClassName('pv-skills-section__additional-skills')[0].click()"
                )
                time.sleep(loading_pause_time)
            except:
                pass

            try:
                skills = self.browser.execute_script(
                    "return (function(){els = document.getElementsByClassName('pv-skill-category-entity');results = [];for (var i=0; i < els.length; i++){results.push(els[i].getElementsByClassName('pv-skill-category-entity__name-text')[0].innerText);}return results;})()"
                )
            except:
                skills = []

            education_list = []
            # Parsing the education entries
            if len(educations) > 0:
                x = 1
                for education in educations:
                    try:
                        # Locate this education entry in the parsed HTML
                        exp_section = soup.find('section',
                                                {'id': 'education-section'})
                        exp_section = exp_section.find('ul')
                        div_tags = exp_section.contents[x].find('div')
                        a_tags = div_tags.find('a')
                        x += 1

                        # Scraping of the education entry - school name, degree, major, year
                        try:
                            education_name = a_tags.find(
                                'h3').get_text().strip()

                        except:
                            education_name = None

                        try:
                            education_degree_name = a_tags.find_all(
                                'p')[0].get_text().strip()
                        except:
                            education_degree_name = None

                        try:
                            education_major = a_tags.find_all(
                                'p')[1].get_text().strip()
                        except:
                            education_major = None

                        try:
                            education_year = a_tags.find_all(
                                'p')[2].get_text().strip()
                        except:
                            education_year = None

                        education_list.append(
                            Education(education_name=education_name,
                                      degree_name=education_degree_name,
                                      major=education_major,
                                      year=education_year))

                    except:
                        pass

            for x in range(3 - len(educations)):
                education_list.append(
                    Education(education_name=None,
                              degree_name=None,
                              major=None,
                              year=None))

            last_job = []
            # Parsing the job positions
            if len(job_positions) > 0:
                # Parse job positions to extract the relative date ranges
                job_positions_data_ranges = []
                x = 1
                for job_position in job_positions:
                    # Get the date range of the job position
                    try:
                        date_range_element = job_position.find_element_by_class_name(
                            'pv-entity__date-range')
                        date_range_spans = date_range_element.find_elements_by_tag_name(
                            'span')
                        date_range = date_range_spans[1].text

                        job_positions_data_ranges.append(date_range)

                        # Scraping of this job position from the parsed HTML
                        exp_section = soup.find('section',
                                                {'id': 'experience-section'})
                        exp_section = exp_section.find('ul')
                        div_tags = exp_section.contents[x].find('div')
                        a_tags = div_tags.find('a')
                        x += 1

                        # Scraping of the last Job - company_name, job_title
                        try:
                            last_job_company_name = a_tags.find_all(
                                'p')[1].get_text().strip()
                            last_job_title = a_tags.find(
                                'h3').get_text().strip()

                            spans = a_tags.find_all('span')
                        except:
                            last_job_company_name = a_tags.find_all(
                                'span')[1].get_text().strip()
                            last_job_title = exp_section.find('ul').find(
                                'li').find_all('span')[2].get_text().strip()
                            spans = exp_section.find('ul').find('li').find_all(
                                'span')

                        last_job_company_name = last_job_company_name.replace(
                            'Full-time', '').replace('Part-time', '').strip()

                        # Scraping of last Job - location
                        last_job_location = Location()
                        next_span_is_location = False
                        for span in spans:
                            if next_span_is_location:
                                last_job_location.parse_string(
                                    span.get_text().strip())
                                break
                            if span.get_text().strip() == 'Location':
                                next_span_is_location = True

                        # # Scraping of Industry related to last Job
                        # last_job_company_url = a_tags.get('href')
                        # if last_job_company_url not in self.industries_dict:
                        #     try:
                        #         self.browser.get('https://www.linkedin.com' + last_job_company_url)
                        #         self.industries_dict[last_job_company_url] = self.browser.execute_script(
                        #             "return document.getElementsByClassName("
                        #             "'org-top-card-summary-info-list__info-item')["
                        #             "0].innerText")
                        #     except:
                        #         self.industries_dict[last_job_company_url] = 'N/A'

                        # last_job_company_industry = self.industries_dict[last_job_company_url]

                        last_job.append(
                            Job(
                                position=last_job_title,
                                company=Company(
                                    name=last_job_company_name,
                                    #industry=last_job_company_industry
                                ),
                                location=last_job_location))

                    except:
                        last_job.append(
                            Job(
                                position=None,
                                company=Company(
                                    name=None,
                                    #industry=last_job_company_industry
                                ),
                                location=None))

                for x in range(4 - len(job_positions)):
                    last_job.append(
                        Job(
                            position=None,
                            company=Company(name=None,
                                            #industry=last_job_company_industry
                                            ),
                            location=None))

                print(
                    "profile_name {} \n headline {} \n location {} \n connection {} \n profile_desc {} \n email {} \n phone {} \n birthday {} \n connectedDate {} \n skills {} \n last_job[0] {} \n last_job[1] {} \n last_job[2] {} \n last_job[3] {} \n education {} \n"
                    .format(profile_name, headline, location, connection,
                            profile_desc, email, phone, birthday,
                            connectedDate, skills, last_job[0], last_job[1],
                            last_job[2], last_job[3], education_list[0]))

                return ScrapingResult(
                    Profile(
                        profile_name, headline, location, connection,
                        connectedDate, phone, birthday, profile_desc, email,
                        skills, last_job,
                        JobHistorySummary(profile_known_graduation_date,
                                          job_positions_data_ranges),
                        education_list))

            else:
                return ScrapingResult(Profile(profile_name, email, skills))

        except HumanCheckException:

            if self.headless_option:
                raise CannotProceedScrapingException

            linkedin_logout(self.browser)

            linkedin_login(self.browser,
                           self.config.get('linkedin', 'username'),
                           self.config.get('linkedin', 'password'))

            while self.browser.current_url != 'https://www.linkedin.com/feed/':
                message_to_user('Please execute manual check', self.config)
                time.sleep(30)

            return self.scrap_profile(profile_linkedin_url,
                                      profile_known_graduation_date)
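
The fixed time.sleep pauses in scrap_profile can be replaced with explicit waits. A minimal sketch using Selenium's WebDriverWait; the selector is an assumption based on the section ids this example reads:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_element(browser, css_selector, timeout=10):
    # Block until the element is present in the DOM, or raise TimeoutException.
    return WebDriverWait(browser, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, css_selector)))

# e.g. wait_for_element(browser, '#experience-section') before reading the jobs
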
config = ConfigParser()
config.read('config.ini')

if config.get('system', 'os') == 'linux':
    display = Display(visible=0, size=(800, 800))
    display.start()

headless_option = len(sys.argv) >= 2 and sys.argv[1] == 'HEADLESS'

# Creation of a new instance of Chrome
browser = webdriver.Chrome(executable_path=config.get('system', 'driver'), options=get_browser_options(headless_option, config))

# Doing login on LinkedIn
linkedin_login(browser, config.get('linkedin', 'username'), config.get('linkedin', 'password'))

message_to_user('Starting reading Linkedin profiles', config)

results = []
cont = 0
for query in open(config.get('profiles_data_by_name', 'input_file_name'), "r"):
    cont += 1
    print(f"Scraping {cont}")
    query = query.split(config.get('profiles_data_by_name', 'delimiter'))

    first_name = query[0]

    last_name = query[1]

    try:
        university = query[2].split(',')
    except:
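
The injected-JavaScript lookups in this example can also be written with Selenium's element API. A minimal sketch for the email field; the class names are copied from the JavaScript above, and the anchor-tag lookup is an assumption about LinkedIn's markup:

from selenium.common.exceptions import NoSuchElementException

def get_contact_email(browser):
    # Return the email text from the 'Contact info' popup, or 'N/A'.
    for el in browser.find_elements_by_class_name('pv-contact-info__contact-type'):
        if 'ci-email' in el.get_attribute('class'):
            try:
                return el.find_element_by_tag_name('a').text
            except NoSuchElementException:
                return 'N/A'
    return 'N/A'
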
Example #4
    def scrap_profile(self, profile_linkedin_url,
                      profile_known_graduation_date):

        if not is_url_valid(profile_linkedin_url):
            return ScrapingResult('BadFormattedLink')

        # Scraping of the profile may fail due to human check forced by LinkedIn
        try:

            # Delays (in seconds) for operations that must wait for the page to finish loading
            loading_pause_time = 2
            loading_scroll_time = 1

            # Opening of the profile page
            self.browser.get(profile_linkedin_url)

            if str(self.browser.current_url).strip() != profile_linkedin_url.strip():
                if self.browser.current_url == 'https://www.linkedin.com/in/unavailable/':
                    return ScrapingResult('ProfileUnavailable')
                else:
                    raise HumanCheckException

            # Scraping the Email Address from Contact Info (email)

            # > click on 'Contact info' link on the page
            self.browser.execute_script(
                "(function(){try{for(i in document.getElementsByTagName('a')){let el = document.getElementsByTagName('a')[i]; "
                "if(el.innerHTML.includes('Contact info')){el.click();}}}catch(e){}})()"
            )
            time.sleep(loading_pause_time)

            # > gets email from the 'Contact info' popup
            try:
                email = self.browser.execute_script(
                    "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ let "
                    "el = "
                    "document.getElementsByClassName('pv-contact-info__contact-type')[i]; if(el.className.includes("
                    "'ci-email')){ "
                    "return el.children[2].children[0].innerText; } }} catch(e){return '';}})()"
                )

                self.browser.execute_script(
                    "document.getElementsByClassName('artdeco-modal__dismiss')[0].click()"
                )
            except:
                email = 'N/A'

            # Loading the entire page (LinkedIn loads content asynchronously based on your scrolling)
            window_height = self.browser.execute_script(
                "return window.innerHeight")
            scrolls = 1
            while scrolls * window_height < self.browser.execute_script(
                    "return document.body.offsetHeight"):
                self.browser.execute_script(
                    f"window.scrollTo(0, {window_height * scrolls});")
                time.sleep(loading_scroll_time)
                scrolls += 1

            try:
                self.browser.execute_script(
                    "document.getElementsByClassName('pv-profile-section__see-more-inline')[0].click()"
                )
                time.sleep(loading_pause_time)
            except:
                pass

            # Get all the job positions
            try:
                job_positions = self.browser\
                    .find_element_by_id('experience-section')\
                    .find_elements_by_tag_name('li')
            except NoSuchElementException:
                print("job_positions is null")
                job_positions = []

            # Get all the education entries
            try:
                education_positions = self.browser\
                    .find_element_by_id('education-section')\
                    .find_elements_by_tag_name('li')
            except NoSuchElementException:
                print("job_positions is null")
                education_positions = []

            # Parsing of the page html structure
            soup = BeautifulSoup(self.browser.page_source, 'lxml')

            # Scraping the Name (using soup)
            try:
                name_div = soup.find('div', {'class': 'flex-1 mr5'})
                name_loc = name_div.find_all('ul')
                profile_name = name_loc[0].find('li').get_text().strip()
            except:
                return ScrapingResult('ERROR IN SCRAPING NAME')

            # Parsing skills
            try:
                self.browser.execute_script(
                    "document.getElementsByClassName('pv-skills-section__additional-skills')[0].click()"
                )
                time.sleep(loading_pause_time)
            except:
                pass

            try:
                skills = self.browser.execute_script(
                    "return (function(){els = document.getElementsByClassName('pv-skill-category-entity');results = [];for (var i=0; i < els.length; i++){results.push(els[i].getElementsByClassName('pv-skill-category-entity__name-text')[0].innerText);}return results;})()"
                )
            except:
                skills = []

            # Parsing the job positions

            if len(job_positions) > 0:
                # Parse job positions to extract the relative date ranges
                js = self.parsing_jobs(job_positions)
                job_positions_data_ranges = js['job_positions_data_ranges']
                Jobs_array = js['Jobs_array']
                last_job = Jobs_array[0]

                if len(education_positions) > 0:
                    eds = self.parsing_educations(education_positions)

                    return ScrapingResult(
                        Profile(
                            profile_name, email, skills, last_job,
                            JobHistorySummary(profile_known_graduation_date,
                                              job_positions_data_ranges),
                            Jobs_array, eds))

                else:
                    return ScrapingResult(
                        Profile(
                            profile_name, email, skills, last_job,
                            JobHistorySummary(profile_known_graduation_date,
                                              job_positions_data_ranges),
                            Jobs_array))

            else:
                return ScrapingResult(Profile(profile_name, email, skills))

        except HumanCheckException:

            if self.headless_option:
                raise CannotProceedScrapingException

            linkedin_logout(self.browser)

            linkedin_login(self.browser,
                           self.config.get('linkedin', 'username'),
                           self.config.get('linkedin', 'password'))

            while self.browser.current_url != 'https://www.linkedin.com/feed/':
                message_to_user('Please execute manual check', self.config)
                time.sleep(30)

            return self.scrap_profile(profile_linkedin_url,
                                      profile_known_graduation_date)
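
Both examples hand the raw 'pv-entity__date-range' strings to JobHistorySummary unparsed. A minimal sketch of turning one of those strings into dates, assuming LinkedIn's usual "Jan 2018 – Present" rendering (the en-dash separator and the month format are assumptions):

from datetime import datetime

def parse_date_range(date_range):
    # Split "Jan 2018 – Present" into (start, end); end is None while ongoing.
    start_str, end_str = [part.strip() for part in date_range.split('–')]
    start = datetime.strptime(start_str, '%b %Y')
    end = None if end_str == 'Present' else datetime.strptime(end_str, '%b %Y')
    return start, end

# parse_date_range('Jan 2018 – Mar 2020')
# -> (datetime(2018, 1, 1, 0, 0), datetime(2020, 3, 1, 0, 0))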