示例#1
0
def extract_globle_ceo_detail(page):
    """Collect the detail record produced by the ``get_globle_ceo_*`` helpers.

    Args:
        page: Page object handed unchanged to every helper
            (presumably a parsed HTML document -- confirm with callers).

    Returns:
        dict: The scraped fields plus a few fixed values
        (``duration_consecutive``, ``university_school``, ``type``,
        ``category_tags`` and ``credential``).
    """
    # Helpers are invoked in a fixed order; keep it in case any of them
    # depends on the state left by a previous one.
    description = get_globle_ceo_desc(page)
    versions = get_globle_ceo_version_info(page)
    total_versions = len(versions)
    quotes = get_globle_ceo_testimonials(page)
    key_takeaways = get_globle_ceo_takeaways(page)
    video = get_globle_ceo_video(page)
    audience = get_globle_ceo_who_attend_desc(page)
    detected_language = detect_language(description)

    overview = dict(
        desc=description,
        video_title=video["video_title"],
        video_url=video["video_url"],
    )

    # Every version entry is stamped with the total number of versions.
    for entry in versions:
        entry["version"] = total_versions

    # NOTE(review): the overview dict is stored under "desc" here, whereas
    # extract_trans_detail stores it under "overview" -- confirm intended.
    detail = {
        "desc": overview,
        "version_info_list": versions,
        "testimonials": quotes,
        "course_takeaways": key_takeaways,
        "who_attend_desc": audience,
        "languages": detected_language,
        "duration_consecutive": True,
        "university_school": "2222_EUR",
        "type": "Onsite",
        "category_tags": [],
        'credential': '',
    }
    return detail
示例#2
0
def extract_trans_detail(page):
    """Collect the detail record produced by the ``get_trans_*`` helpers.

    Args:
        page: Page object handed unchanged to every helper
            (presumably a parsed HTML document -- confirm with callers).

    Returns:
        dict: Keys ``overview``, ``version_info_list``, ``testimonials``,
        ``course_takeaways``, ``who_attend_desc`` and ``languages``.
    """
    # Helpers are invoked in a fixed order; keep it in case any of them
    # depends on the state left by a previous one.
    description = get_trans_desc(page)
    versions = get_trans_version_info(page)
    total_versions = len(versions)
    quotes = get_trans_testimonials(page)
    key_takeaways = get_trans_takeaways(page)
    video = get_trans_video(page)
    audience = get_trans_who_attend_desc(page)
    detected_language = detect_language(description)

    overview = dict(
        desc=description,
        video_title=video["video_title"],
        video_url=video["video_url"],
    )

    # Every version entry is stamped with the total number of versions.
    for entry in versions:
        entry["version"] = total_versions

    detail = {
        "overview": overview,
        "version_info_list": versions,
        "testimonials": quotes,
        "course_takeaways": key_takeaways,
        "who_attend_desc": audience,
        "languages": detected_language,
    }
    return detail
示例#3
0
def extract_bap_detail(page):
    """Collect the detail record produced by the ``get_bap_*`` helpers.

    Args:
        page: Page object handed unchanged to every helper
            (presumably a parsed HTML document -- confirm with callers).

    Returns:
        dict: The scraped fields plus the fixed values ``credential`` (empty
        string) and ``duration_consecutive`` (``True``).
    """
    # Helpers are invoked in a fixed order; keep it in case any of them
    # depends on the state left by a previous one.
    description = get_bap_desc(page)
    versions = get_bap_version_info(page)
    total_versions = len(versions)
    quotes = get_bap_testimonials(page)
    key_takeaways = get_bap_takeaways(page)
    video = get_bap_video(page)
    audience = get_bap_who_attend_desc(page)
    detected_language = detect_language(description)

    overview = dict(
        desc=description,
        video_title=video["video_title"],
        video_url=video["video_url"],
    )

    # Every version entry is stamped with the total number of versions.
    for entry in versions:
        entry["version"] = total_versions

    # NOTE(review): the overview dict is stored under "desc" here, whereas
    # extract_trans_detail stores it under "overview" -- confirm intended.
    detail = {
        "desc": overview,
        "version_info_list": versions,
        "testimonials": quotes,
        "course_takeaways": key_takeaways,
        "who_attend_desc": audience,
        "languages": detected_language,
        "credential": "",
        "duration_consecutive": True,
    }
    return detail
示例#4
0
def final_format_detail(details):
    """Apply the last in-place formatting pass to every course record.

    For each record this resets ``category_tags`` and ``credential`` to an
    empty string, maps or detects ``languages``, removes ``mailto:`` from
    the inquiry e-mail when it starts with that prefix, and prints the URL
    of any record that has no ``course_takeaways``.

    Args:
        details: Iterable of mutable course dicts; each must contain
            ``exec_ed_inquiry_cc_emails`` (and ``name`` when ``languages``
            is absent).

    Returns:
        The same ``details`` object, with its records modified in place.
    """
    for record in details:
        record["category_tags"] = ''

        if "languages" not in record:
            # No language scraped: fall back to detecting it from the name.
            record["languages"] = detect_language(record["name"])
        else:
            record["languages"] = language_map(record["languages"])

        email = record["exec_ed_inquiry_cc_emails"]
        if email.startswith("mailto:"):
            record["exec_ed_inquiry_cc_emails"] = email.replace("mailto:", '')

        # Surface records that still lack takeaways.
        if "course_takeaways" not in record:
            print(record["url"])

        record["credential"] = ''
    return details
示例#5
0
def check_attrs(details):
    """Normalise scraped course records and report missing attributes.

    Records whose URL is in the re-scrape list are set aside, fetched again
    over HTTP and expanded into two records (onsite and online version).
    All other records are normalised in place: hard-coded overrides for a
    few known URLs, tuition defaults, schedule/date fields, overview,
    language, location, type and inquiry e-mail clean-up. Finally every
    record is checked against the list of required attributes and each
    missing one is printed as ``"<url> no <attr>"``.

    Args:
        details: Iterable of mutable course dicts. Each must contain
            ``url`` and ``exec_ed_inquiry_cc_emails``.

    Returns:
        list: The normalised records followed by the re-scraped ones.

    Note:
        Performs network requests (``requests.get``) for re-scraped
        courses and for records lacking tuition or description data.
    """
    # Attributes every finished record is expected to carry, in the order
    # in which missing ones are reported.
    required_attrs = (
        'name',
        'url',
        'university_school',
        'category',
        'desc',
        'active',
        'type',
        'category_tags',
        'priority',
        'publish',
        'version',
        'location',
        'currency',
        'tuition_number',
        'tuition_note',
        'Repeatable',
        'effective_date_start',
        'effective_date_end',
        'duration_consecutive',
        'languages',
        'credential',
        'course_takeaways',
        'course_faculties',
        'who_attend_desc',
        'overview',
        'testimonials',
        'exec_ed_inquiry_cc_emails',
        'schedule',
    )
    # Courses that are fetched again and split into onsite/online versions.
    rescrape_urls = {
        "https://execedprograms.iese.edu/strategic-management/getting-things-done/",
        "https://execedprograms.iese.edu/strategic-management/artificial-intelligence/",
        "https://execedprograms.iese.edu/leadership-people-management/communication-skills/",
    }
    # Courses with hand-maintained session data: url -> start date.
    fixed_start_dates = {
        'https://execedprograms.iese.edu/leadership-people-management/high-performance-negotiator':
            '2021-06-05',
        "https://execedprograms.iese.edu/strategic-management/value-creation-effective-boards/":
            '2021-05-24',
        "https://execedprograms.iese.edu/leadership-people-management/positive-leader/":
            '2021-10-25',
    }
    trans_url = ('https://executiveeducation.iese.edu/es/'
                 'consejeros-directivos-seniors/transformacion-digital/')

    final_details = []
    re_scrape_course_detail = []
    for detail in details:
        url = detail["url"]
        if url in rescrape_urls:
            re_scrape_course_detail.append(copy.deepcopy(detail))
            continue

        if url in fixed_start_dates:
            detail['version'] = 1
            detail['location'] = 'Barcelona, ----, Spanish'
            detail['type'] = 'Onsite'
            detail["effective_date_start"] = fixed_start_dates[url]
            detail["schedule"] = [[
                detail["effective_date_start"], "", "", "formal"
            ]]

        detail.setdefault('tuition_note', '')
        duration_number = get_duration_number(detail)
        if 'tuition' in detail:
            detail['tuition_number'] = detail['tuition']

        # Prefer the legacy ``effective_*_date`` keys, but fall back to an
        # already present ``effective_date_*`` value so that the hard-coded
        # start dates above are not wiped out by an empty string.
        start_date = detail.pop('effective_start_date',
                                detail.get('effective_date_start', ''))
        end_date = detail.pop('effective_end_date',
                              detail.get('effective_date_end', ''))
        detail['effective_date_start'] = start_date
        detail['effective_date_end'] = end_date
        detail["schedule"] = [[start_date, end_date, duration_number, 'formal']]

        # Fetch the missing description before the overview is built, since
        # the overview reads ``detail['desc']``.
        if 'desc' not in detail and url == trans_url:
            source = requests.get(url).content
            page = bs4.BeautifulSoup(source, 'lxml')
            detail["desc"] = get_trans_desc(page).strip()
        if 'overview' not in detail:
            detail["overview"] = {
                'desc': detail['desc'],
                'video_url': detail.get('video_url', ''),
                'video_title': detail.get('video_title', '')
            }
        if 'languages' not in detail:
            detail['languages'] = detect_language(detail["name"])

        # format_type receives the raw (unformatted) location on purpose:
        # that is what the original pipeline passed.
        location = detail.get('location', '')
        detail["location"] = format_location(location, url)
        course_type = detail.get('type', '')
        detail['type'] = format_type(course_type, location, url)

        if 'currency' not in detail and 'tuition_number' not in detail:
            source = requests.get(url).content
            page = bs4.BeautifulSoup(source, 'lxml')
            info = extract_tuition_fee_info(page)[0]
            detail['currency'] = info['currency']
            detail['tuition_number'] = info['tuition_number']
            detail['tuition_note'] = info['tuition_note']

        if 'mailto:' in detail["exec_ed_inquiry_cc_emails"]:
            detail["exec_ed_inquiry_cc_emails"] = detail[
                "exec_ed_inquiry_cc_emails"].replace('mailto:', '').strip()
        final_details.append(detail)

    for detail in re_scrape_course_detail:
        source = requests.get(detail['url']).content
        page = bs4.BeautifulSoup(source, 'lxml')
        # Version-specific fields override the shared base record.
        final_details.append({**detail, **onsite_version_detail(page)})
        final_details.append({**detail, **online_version_detail(page)})

    for detail in final_details:
        # Intermediate scraping data; not part of the final record.
        detail.pop("version_info_list", None)
        for attr in required_attrs:
            if attr not in detail:
                print(f'{detail["url"]} no {attr}')
    print(len(final_details))
    return final_details