Exemplo n.º 1
0
def _crawl(detail_link: str, driver: WebDriver) -> Info:
    """Load a detail page in ``driver`` and return its scraped fields.

    Args:
        detail_link: URL of the detail page to open.
        driver: Selenium WebDriver used to load and query the page.

    Returns:
        The result of passing the fields collected by
        ``_get_all_present_fields`` through ``_convert``.
    """
    driver.get(detail_link)
    # Fixed pause after navigation before the DOM is queried
    # (presumably to let the page finish rendering -- TODO confirm).
    time.sleep(5)

    accordeon = driver.find_elements_by_class_name('accordeon')

    try:
        # Get rid of the cookie banner blocking all clicks on this website.
        driver.find_element_by_class_name(
            'moove-gdpr-infobar-allow-all').click()
    except Exception:
        # Best effort: the banner may be absent or not clickable. Catching
        # Exception (rather than a bare except) keeps KeyboardInterrupt and
        # SystemExit propagating so the crawl can still be aborted.
        pass

    info = _get_all_present_fields(accordeon, driver)

    return _convert(info)
Exemplo n.º 2
0
# Hand-curated staff counts for values that are not plain integers.
# Keys are the <strong> text with all spaces removed.
_STAFF_OVERRIDES = {
    '6fulltime+2interns': 6,
    '2.8': 3,
    '3employeesand7volunteers': 3,
    '5employees': 5,
}

# Hand-curated member counts for values that are not plain integers.
# Keys are the <strong> text with all spaces and '+' characters removed.
_MEMBERS_OVERRIDES = {
    '384organisations': 384,
    '50delegations': 50,
    '120,000': 120000,
    'Around35': 35,
    'about5000': 5000,
    '46nationaluniversitysportsgoverningbodies': 46,
    'around1000membersin34branches': 1000,
    '44associations': 44,
    'approx.700': 700,
    '5,000,000': 5000000,
    '250associations': 250,
    '28memberorganisations': 28,
}


def _parse_count(raw: str, overrides: dict) -> Optional[int]:
    """Return ``raw`` as an int, else its override, else ``None``."""
    try:
        return int(raw)
    except ValueError:
        return overrides.get(raw)


def _extract_quick_facts(
        driver: WebDriver, idx: int
) -> Tuple[Optional[int], Optional[int], Optional[int], List[str]]:
    """Read the "quick facts" shown on the currently loaded page.

    Candidate elements are those with class ``ng-scope`` that contain at
    least one ``<strong>`` tag. For each fact, the first candidate whose text
    matches is used and the value is taken from its first ``<strong>`` tag.

    Args:
        driver: Selenium WebDriver positioned on the page to inspect.
        idx: Identifier of the current entry; only used in the messages
            printed when a value cannot be parsed.

    Returns:
        A tuple ``(founding_year, staff_number, members_number, languages)``.
        The three numbers are ``None`` when the fact is missing or, for staff
        and members, when the text is neither an integer nor a known
        override. ``languages`` is ``[]`` when the fact is missing.

    Raises:
        ValueError: If a "Founded" fact is present but its value is not an
            integer.
    """
    founding_year = None
    staff_number = None
    members_number = None
    languages: List[str] = []

    # Read each candidate's text once; every ``.text`` access goes through
    # the driver.
    candidates = [
        (element.text, element)
        for element in driver.find_elements_by_class_name('ng-scope')
        if element.find_elements_by_tag_name('strong')
    ]

    def first_strong_text(matches) -> Optional[str]:
        """Return the stripped <strong> text of the first matching fact."""
        for text, element in candidates:
            if matches(text):
                return element.find_element_by_tag_name('strong').text.strip()
        return None

    founded_raw = first_strong_text(lambda t: t.startswith('Founded'))
    staff_raw = first_strong_text(lambda t: t.strip().endswith('staff'))
    members_raw = first_strong_text(lambda t: t.strip().endswith('members'))
    languages_raw = first_strong_text(
        lambda t: t.startswith('Working languages'))

    if founded_raw is not None:
        founding_year = int(founded_raw)

    if staff_raw is not None:
        staff_string = staff_raw.replace(' ', '')
        staff_number = _parse_count(staff_string, _STAFF_OVERRIDES)
        if staff_number is None:
            # Only report values that really were skipped, not the ones
            # resolved through the override table.
            print(f'SKIPPING STAFF ({idx}) - {staff_string}')

    if members_raw is not None:
        members_string = members_raw.replace(' ', '').replace('+', '')
        members_number = _parse_count(members_string, _MEMBERS_OVERRIDES)
        if members_number is None:
            print(f'SKIPPING MEMBERS ({idx}) - {members_string}')

    if languages_raw is not None:
        # NOTE(review): this is the raw <strong> text (a str), not a list,
        # despite the List[str] annotation. Kept as-is so existing callers
        # see the same value -- confirm the intended shape before splitting.
        languages = languages_raw

    return founding_year, staff_number, members_number, languages