def scrape_jobs(self):
    """Collect the job positions listed in the profile's experience section.

    Runs a JS snippet in the already-loaded profile page that walks the
    'experience-section' list and returns, per entry, the tuple
    [position, company_name, company_url, date_ranges, exp, job_location]
    (each field '' when missing). Each entry is then enriched with the
    company's industry/employee count via ``self.scrape_company_details``.

    Returns:
        list[Job]: one Job per successfully parsed position; entries whose
        company page could not be scraped are skipped (best-effort).
    """
    try:
        jobs = self.browser.execute_script(
            """return ( function(){ var jobs = []; var els = document.getElementById('experience-section').getElementsByTagName('ul')[0].getElementsByTagName('li'); for (var i=0;i<els.length; i++){ if(els[i].className!='pv-entity__position-group-role-item-fading-timeline'){ if(els[i].getElementsByClassName('pv-entity__position-group-role-item-fading-timeline').length>0){ } else { try {position = els[i].getElementsByClassName('pv-entity__summary-info')[0].getElementsByTagName('h3')[0].innerText;} catch(err) { position = ''; } try { company_name = els[i].getElementsByClassName('pv-entity__summary-info')[0].getElementsByClassName('pv-entity__secondary-title')[0].innerText;} catch (err) { company_name = ''; } try{date_ranges = els[i].getElementsByClassName('pv-entity__summary-info')[0].getElementsByClassName('pv-entity__date-range')[0].getElementsByTagName('span')[1].innerText;} catch (err) {date_ranges = ''; } try {exp=els[i].getElementsByClassName('pv-entity__summary-info')[0].getElementsByTagName('h4')[1].getElementsByTagName('span')[1].innerText;} catch(err) {exp='';} try{job_location = els[i].getElementsByClassName('pv-entity__summary-info')[0].getElementsByClassName('pv-entity__location')[0].getElementsByTagName('span')[1].innerText;} catch (err) {job_location = ''; } try{company_url =els[i].getElementsByTagName('a')[0].href;} catch (err) {company_url = ''; } jobs.push([position, company_name, company_url, date_ranges, exp, job_location]);}}} return jobs; })();""")
    except WebDriverException:
        # Experience section absent or page not fully loaded — no jobs.
        jobs = []

    parsed_jobs = []
    for job in jobs:
        try:
            # job[2] is the company URL scraped above.
            company_industry, company_employees = self.scrape_company_details(job[2])
            parsed_jobs.append(
                Job(
                    position=job[0],
                    company=Company(
                        name=job[1],
                        industry=company_industry,
                        employees=company_employees,
                    ),
                    location=Location(job[5]),
                    exp=job[4],
                    date_range=job[3]
                )
            )
        except Exception:
            # Best-effort: a job whose company details fail to parse is
            # skipped rather than aborting the whole scrape.  Was a bare
            # `except:` which also swallowed KeyboardInterrupt/SystemExit.
            pass
    return parsed_jobs
def scrap_profile(self, profile_linkedin_url, profile_known_graduation_date):
    """Scrape a single LinkedIn profile page into a ScrapingResult.

    Navigates to ``profile_linkedin_url``, expands/collects contact info,
    headline, location, description, skills, education and job history,
    and wraps everything in a Profile.  If LinkedIn redirects to a human
    check, logs out/in again and waits for the user to solve it, then
    retries recursively.

    Args:
        profile_linkedin_url: public profile URL (validated first).
        profile_known_graduation_date: passed through to JobHistorySummary.

    Returns:
        ScrapingResult: the Profile on success, or one of the sentinel
        results 'BadFormattedLink', 'ProfileUnavailable',
        'ERROR IN SCRAPING NAME'.

    Raises:
        CannotProceedScrapingException: human check hit while headless.
    """
    if not is_url_valid(profile_linkedin_url):
        return ScrapingResult('BadFormattedLink')

    # Scraping of the profile may fail due to human check forced by LinkedIn
    try:
        # Delays (seconds) between operations that must wait for page loads.
        loading_pause_time = 2
        loading_scroll_time = 1

        # Opening of the profile page
        self.browser.get(profile_linkedin_url)

        if not str(self.browser.current_url).strip() == profile_linkedin_url.strip():
            if self.browser.current_url == 'https://www.linkedin.com/in/unavailable/':
                return ScrapingResult('ProfileUnavailable')
            else:
                raise HumanCheckException

        # FIX: contact fields default to '' so a failed best-effort scrape
        # below cannot leave them unbound — a NameError at the final
        # print/Profile(...) would escape this try (it only catches
        # HumanCheckException).
        email = ''
        phone = ''
        birthday = ''
        connectedDate = ''

        # Scraping the Email Address from Contact Info (email)
        # > click on 'Contact info' link on the page
        self.browser.execute_script(
            "(function(){try{for(i in document.getElementsByTagName('a')){let el = document.getElementsByTagName('a')[i]; "
            "if(el.innerHTML.includes('Contact info')){el.click();}}}catch(e){}})()"
        )
        time.sleep(loading_pause_time)

        # > gets email from the 'Contact info' popup
        try:
            email = self.browser.execute_script(
                "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ let "
                "el = "
                "document.getElementsByClassName('pv-contact-info__contact-type')[i]; if(el.className.includes("
                "'ci-email')){ "
                "return el.children[2].children[0].innerText; } }} catch(e){return '';}})()"
            )
        except Exception:
            pass

        # Scraping the Phone from Contact Info
        try:
            phone = self.browser.execute_script(
                "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ let "
                "el = "
                "document.getElementsByClassName('pv-contact-info__contact-type')[i]; if(el.className.includes("
                "'ci-phone')){ "
                "return el.children[2].children[0].innerText; } }} catch(e){return '';}})()"
            )
        except Exception:
            pass

        # Scraping the Birthday from Contact Info
        try:
            birthday = self.browser.execute_script(
                "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ let "
                "el = "
                "document.getElementsByClassName('pv-contact-info__contact-type')[i]; if(el.className.includes("
                "'ci-birthday')){ "
                "return el.children[2].children[0].innerText; } }} catch(e){return '';}})()"
            )
        except Exception:
            pass

        # Scraping the Date Connected from Contact Info, then dismiss popup.
        try:
            connectedDate = self.browser.execute_script(
                "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ let "
                "el = "
                "document.getElementsByClassName('pv-contact-info__contact-type')[i]; if(el.className.includes("
                "'ci-connected')){ "
                "return el.children[2].children[0].innerText; } }} catch(e){return '';}})()"
            )
            self.browser.execute_script(
                "document.getElementsByClassName('artdeco-modal__dismiss')[0].click()"
            )
        except Exception:
            pass

        # Loading the entire page (LinkedIn loads content asynchronously
        # based on your scrolling) — scroll one viewport at a time.
        window_height = self.browser.execute_script("return window.innerHeight")
        scrolls = 1
        while scrolls * window_height < self.browser.execute_script(
                "return document.body.offsetHeight"):
            self.browser.execute_script(
                f"window.scrollTo(0, {window_height * scrolls});")
            time.sleep(loading_scroll_time)
            scrolls += 1

        # Expand any 'see more' sections (best-effort).
        try:
            self.browser.execute_script(
                "document.getElementsByClassName('pv-profile-section__see-more-inline')[0].click()"
            )
            time.sleep(loading_pause_time)
        except Exception:
            pass

        # Get all the job positions
        try:
            job_positions = self.browser.find_element_by_id(
                'experience-section').find_elements_by_tag_name('li')
        except Exception:
            job_positions = []

        # Get all the educations
        try:
            educations = self.browser.find_element_by_id(
                'education-section').find_elements_by_tag_name('li')
        except Exception:
            educations = []

        # Parsing of the page html structure
        soup = BeautifulSoup(self.browser.page_source, 'lxml')

        # Scraping the Name / headline / location / connections (using soup)
        try:
            name_div = soup.find('div', {'class': 'flex-1 mr5'})
            name_loc = name_div.find_all('ul')
            headline = name_div.find_all('h2')
            headline = headline[0].get_text().strip()
            profile_name = name_loc[0].find('li').get_text().strip()
            locationNConnection = name_loc[1].find_all('li')
            location = locationNConnection[0].get_text().strip()
            try:
                connection = locationNConnection[1].find('a').find(
                    'span').get_text().strip()
            except Exception:
                # Some layouts have no <a> wrapper around the count.
                connection = locationNConnection[1].find(
                    'span').get_text().strip()
        except Exception:
            return ScrapingResult('ERROR IN SCRAPING NAME')

        # Scraping the Desc (using soup): expand the clamp first, then read
        # either the clamped or the raw lines depending on which exist.
        try:
            self.browser.execute_script(
                "document.getElementsByClassName('lt-line-clamp__more')[0].click()"
            )
            time.sleep(loading_pause_time)
        except Exception:
            pass

        try:
            if (self.browser.execute_script(
                    "return (els = document.getElementsByClassName('pv-oc')[0].getElementsByClassName('lt-line-clamp__line').length)"
            )):
                profile_desc = self.browser.execute_script(
                    "return (function(){els = document.getElementsByClassName('pv-oc')[0].getElementsByClassName('lt-line-clamp__line');results = [];for (var i=0; i < els.length; i++){results.push(els[i].innerText);}return results;})()"
                )
            else:
                profile_desc = self.browser.execute_script(
                    "return (function(){els = document.getElementsByClassName('pv-oc')[0].getElementsByClassName('lt-line-clamp__raw-line');results = [];for (var i=0; i < els.length; i++){results.push(els[i].innerText);}return results;})()"
                )
        except Exception:
            profile_desc = []

        # Parsing skills (expand the additional-skills section first).
        try:
            self.browser.execute_script(
                "document.getElementsByClassName('pv-skills-section__additional-skills')[0].click()"
            )
            time.sleep(loading_pause_time)
        except Exception:
            pass

        try:
            skills = self.browser.execute_script(
                "return (function(){els = document.getElementsByClassName('pv-skill-category-entity');results = [];for (var i=0; i < els.length; i++){results.push(els[i].getElementsByClassName('pv-skill-category-entity__name-text')[0].innerText);}return results;})()"
            )
        except Exception:
            skills = []

        education_list = []
        # Parsing the education entries
        if len(educations) > 0:
            educations_data_ranges = []
            # contents[x] indexes into the <ul>'s child nodes; starts at 1
            # (contents interleaves text nodes) — TODO confirm on layout.
            x = 1
            for education in educations:
                try:
                    exp_section = soup.find('section', {'id': 'education-section'})
                    exp_section = exp_section.find('ul')
                    div_tags = exp_section.contents[x].find('div')
                    a_tags = div_tags.find('a')
                    x += 1
                    try:
                        education_name = a_tags.find('h3').get_text().strip()
                    except Exception:
                        # FIX: was `eudcation_name` (typo), which left
                        # education_name unbound and silently dropped the
                        # whole entry via the outer except.
                        education_name = None
                    try:
                        education_degree_name = a_tags.find_all(
                            'p')[0].get_text().strip()
                    except Exception:
                        education_degree_name = None
                    try:
                        education_major = a_tags.find_all(
                            'p')[1].get_text().strip()
                    except Exception:
                        education_major = None
                    try:
                        education_year = a_tags.find_all(
                            'p')[2].get_text().strip()
                    except Exception:
                        education_year = None
                    education_list.append(
                        Education(education_name=education_name,
                                  degree_name=education_degree_name,
                                  major=education_major,
                                  year=education_year))
                except Exception:
                    pass

        # Pad education_list to at least 3 entries so education_list[0]
        # (and positional consumers) never IndexError, even when the
        # education section is missing entirely.
        for x in range(3 - len(educations)):
            education_list.append(
                Education(education_name=None, degree_name=None,
                          major=None, year=None))

        last_job = []
        # Parsing the job positions
        if len(job_positions) > 0:
            # Parse job positions to extract the relative date ranges
            job_positions_data_ranges = []
            x = 1
            for job_position in job_positions:
                try:
                    # Get the date range of the job position
                    date_range_element = job_position.find_element_by_class_name(
                        'pv-entity__date-range')
                    date_range_spans = date_range_element.find_elements_by_tag_name(
                        'span')
                    date_range = date_range_spans[1].text
                    job_positions_data_ranges.append(date_range)

                    # Scraping of the job via the page soup
                    exp_section = soup.find('section', {'id': 'experience-section'})
                    exp_section = exp_section.find('ul')
                    div_tags = exp_section.contents[x].find('div')
                    a_tags = div_tags.find('a')
                    x += 1

                    # company_name / job_title: two alternative layouts.
                    try:
                        last_job_company_name = a_tags.find_all(
                            'p')[1].get_text().strip()
                        last_job_title = a_tags.find('h3').get_text().strip()
                        spans = a_tags.find_all('span')
                    except Exception:
                        last_job_company_name = a_tags.find_all(
                            'span')[1].get_text().strip()
                        last_job_title = exp_section.find('ul').find(
                            'li').find_all('span')[2].get_text().strip()
                        spans = exp_section.find('ul').find('li').find_all('span')

                    last_job_company_name = last_job_company_name.replace(
                        'Full-time', '').replace('Part-time', '').strip()

                    # Location: the span right after the literal 'Location'.
                    last_job_location = Location()
                    next_span_is_location = False
                    for span in spans:
                        if next_span_is_location:
                            last_job_location.parse_string(
                                span.get_text().strip())
                            break
                        if span.get_text().strip() == 'Location':
                            next_span_is_location = True

                    last_job.append(
                        Job(position=last_job_title,
                            company=Company(name=last_job_company_name),
                            location=last_job_location))
                except Exception:
                    # Keep positional alignment with a placeholder entry.
                    last_job.append(
                        Job(position=None,
                            company=Company(name=None),
                            location=None))

            # Pad to at least 4 entries: the print/Profile below index
            # last_job[0..3].
            for x in range(4 - len(job_positions)):
                last_job.append(
                    Job(position=None,
                        company=Company(name=None),
                        location=None))

            print(
                "profile_name {} \n headline {} \n location {} \n connection {} \n profile_desc {} \n email {} \n phone {} \n birthday {} \n connectedDate {} \n skills {} \n last_job {} \n last_job {} \n last_job {} \n last_job {} \n education {} \n"
                .format(profile_name, headline, location, connection,
                        profile_desc, email, phone, birthday, connectedDate,
                        skills, last_job[0], last_job[1], last_job[2],
                        last_job[3], education_list[0]))

            return ScrapingResult(
                Profile(
                    profile_name, headline, location, connection,
                    connectedDate, phone, birthday, profile_desc, email,
                    skills, last_job,
                    JobHistorySummary(profile_known_graduation_date,
                                      job_positions_data_ranges),
                    education_list))
        else:
            # No experience section: return the minimal profile.
            return ScrapingResult(Profile(profile_name, email, skills))

    except HumanCheckException:
        # LinkedIn forced a human check: cannot proceed headless; otherwise
        # re-login and wait for the user to solve it, then retry.
        if self.headless_option:
            raise CannotProceedScrapingException
        linkedin_logout(self.browser)
        linkedin_login(self.browser,
                       self.config.get('linkedin', 'username'),
                       self.config.get('linkedin', 'password'))
        while self.browser.current_url != 'https://www.linkedin.com/feed/':
            message_to_user('Please execute manual check', self.config)
            time.sleep(30)
        return self.scrap_profile(profile_linkedin_url,
                                  profile_known_graduation_date)
def parsing_jobs(self, job_positions):
    """Parse the selenium <li> job-position elements into Job objects.

    Each field (date range, title, company name/url, location) is scraped
    best-effort with "N/A" fallbacks; company details are enriched via
    ``self.get_company_data``.

    Args:
        job_positions: iterable of selenium WebElements, one per position.

    Returns:
        dict with keys 'Jobs_array' (list[Job]) and
        "job_positions_data_ranges" (list[str] of the scraped date ranges).
    """
    job_positions_data_ranges = []
    # Accumulated Job objects
    Jobs_array = []
    for job_position in job_positions:
        try:
            # Get the date range of the job position
            try:
                date_range_element = job_position.find_element_by_class_name(
                    'pv-entity__date-range')
                date_range_spans = date_range_element.find_elements_by_tag_name(
                    'span')
                date_range = date_range_spans[1].text
            except NoSuchElementException:
                date_range = "N/A"

            # Get the title
            try:
                title = job_position.find_element_by_tag_name('h3').text
            except NoSuchElementException:
                title = "N/A"

            # Get the company name, stripping employment-type suffixes.
            try:
                companyname = job_position.find_element_by_class_name(
                    'pv-entity__secondary-title').text.replace(
                        'Full-time', '').replace('Part-time', '').strip()
            except NoSuchElementException:
                companyname = "N/A"

            # Company page URL (first anchor in the entry).
            try:
                company_url_link = job_position.find_element_by_tag_name(
                    'a').get_attribute('href')
            except NoSuchElementException:
                company_url_link = "N/A"

            # Location (second span of the location element).
            try:
                companylocation_range_element = job_position.find_element_by_class_name(
                    'pv-entity__location')
                companylocation_spans = companylocation_range_element.find_elements_by_tag_name(
                    'span')
                companylocation = companylocation_spans[1].text
            except NoSuchElementException:
                companylocation = "N/A"

            job_positions_data_ranges.append(date_range)

            # Enrich from the company page; fall back to the values scraped
            # from the position entry when the company data is missing.
            info_company = self.get_company_data(company_url_link)
            try:
                if info_company['companyname'] == "N/A":
                    info_company['companyname'] = companyname
                if info_company['location'].full_string == "N/A":
                    loc = Location()
                    loc.parse_string(companylocation)
                    info_company['location'] = loc
            except Exception:
                # Was a bare `except:`; narrowed but behavior preserved.
                print("Oops!", sys.exc_info()[0], "occured.")

            print(info_company['industry'])
            print(info_company['companyname'])
            print(info_company['location'])

            trabajo_oo = Job(
                position=title.strip(),
                company=Company(name=info_company['companyname'].strip(),
                                industry=info_company['industry'].strip()),
                location=info_company['location'],
                daterange=date_range.strip())
            Jobs_array.append(trabajo_oo)
        except Exception:
            # Any unexpected failure: log and skip this position.
            print("Oops!, \n{}\n{}\n{}\noccured.".format(
                sys.exc_info()[0], sys.exc_info()[1], sys.exc_info()[2]))
            print("Job untacking error")

    return {
        'Jobs_array': Jobs_array,
        "job_positions_data_ranges": job_positions_data_ranges
    }
datettime: 2020/4/5 1:13 """ from utils import Company, CompanyPredict, get_data, dump_company code = '600519' date = '20181231' growth_rate = 1.3 invest_increase_rate = 1.15 df_profit_statement, df_balance_sheet, df_cash_flow = get_data(code, date) df_profit_statement base_company = Company(df_profit_statement, df_balance_sheet, df_cash_flow, date) company_flow = [base_company] # todo 增长率 growth_rates = [growth_rate] * 5 growth_rate_perp = 1.05 for g in growth_rates: cmp_ = CompanyPredict(company_flow[-1], g, invest_increase_rate) company_flow.append(cmp_) dump_company(code, company_flow) print('done')