示例#1
0
def get_company_data(company: str, max_pages: int = 0) -> ScrapeResults:
    """Look up ``company`` and scrape its review and interview listings.

    :param company: Company name to search for.
    :param max_pages: Forwarded unchanged to ``Ut.scrape_list`` for both
        listings.
    :return: ``None`` when ``company`` is ``None`` or empty. Otherwise a
        ``ScrapeResults`` built from the search result: if the search did
        not find the company, both counts are ``-1`` and both lists are
        empty; if it did, elements 0 and 1 of each ``Ut.scrape_list``
        result are passed through.
    """
    # Nothing to search for: keep the historical "no result" value.
    if company is None or len(company) == 0:
        return None

    lookup = search_company_name(company)

    if lookup.found:
        # Reviews are scraped before interviews.
        reviews = Ut.scrape_list(lookup.BaseReviewUrl, Reviews.parse_html, max_pages)
        interviews = Ut.scrape_list(lookup.BaseInterviewUrl, Interviews.parse_html, max_pages)
        return ScrapeResults(
            lookup.GlassdoorId,
            lookup.GlassdoorName,
            lookup.BaseReviewUrl,
            reviews[0],
            reviews[1],
            lookup.BaseInterviewUrl,
            interviews[0],
            interviews[1],
        )

    # Company not found: report placeholder counts and empty listings.
    return ScrapeResults(
        lookup.GlassdoorId,
        lookup.GlassdoorName,
        lookup.BaseReviewUrl,
        -1,
        [],
        lookup.BaseInterviewUrl,
        -1,
        [],
    )
示例#2
0
def parse_company_overview(html_string) -> NameSearchResults:
    """Extract the employer name and id from a company overview page.

    The employer data is read from the ``'employer'`` section of the text
    returned by ``get_gd_globals``.

    :param html_string: HTML of the overview page.
    :return: ``NameSearchResults(name, id, found)``. ``found`` is ``True``
        only when an id was extracted; the id is then prefixed with ``"E"``.
        When no id was extracted the id is ``""`` and ``found`` is ``False``.
    """
    gd_globals = get_gd_globals(html_string)

    company_data = Ut.get_string_between(gd_globals, "'employer'", "}", "", False)
    # Collapse optional whitespace around the colon so the fixed markers
    # used below ('name':" and 'id':") match.
    company_data = re.sub(r"'name'\s*:\s*", "'name':", company_data)
    company_data = re.sub(r"'id'\s*:\s*", "'id':", company_data)

    glassdoor_name = Ut.get_string_between(company_data, "'name':\"", '"', "", False)
    raw_id = Ut.get_string_between(company_data, "'id':\"", '"', "", False)

    # Decide "found" on the raw id *before* adding the "E" prefix; testing
    # the prefixed value against "" could never be true.
    # NOTE(review): assumes get_string_between yields "" (its 4th argument)
    # when the markers are absent - confirm against Ut.
    found = bool(raw_id)
    glassdoor_id = "E" + raw_id if found else ""

    return NameSearchResults(glassdoor_name, glassdoor_id, found)
示例#3
0
def search_company_name(company: str) -> NameSearchResults:
    """Search for ``company`` and parse the matching employer overview.

    Downloads the company-reviews search page. If the page's gdGlobals text
    contains an ``'analyticsUrl'`` starting with ``/employerInfo`` the page
    is parsed directly; otherwise the first ``/Overview/`` link found in the
    page is downloaded and parsed instead. Progress is printed to stdout.

    :param company: Company name; it is passed through ``Ut.encode_url``
        before being appended to the search URL.
    :return: The result of ``parse_company_overview``, or
        ``NameSearchResults("", "", False)`` when the search page contains
        no overview link.
    """
    company = Ut.encode_url(company)
    url = baseUrl + "/Reviews/company-reviews.htm?sc.keyword=" + company
    html_string = Ut.download_string(url)
    gd_globals = get_gd_globals(html_string)

    # Raw string: "\s" is not a valid str escape and newer Python versions
    # warn about it in ordinary string literals.
    landed_on_employer_page = re.search(
        r"'analyticsUrl'\s*:\s*\"/employerInfo", gd_globals, re.MULTILINE) is not None

    if not landed_on_employer_page:
        print("Search list retrieved, taking the first result")

        overview_url = Ut.get_string_between(html_string, "href='/Overview/", ".htm'", "", True)
        if overview_url == "":
            print("The search did not find any matching companies")
            return NameSearchResults("", "", False)

        # Replace the search page with the first result's overview page.
        html_string = Ut.download_string(baseUrl + "/Overview/" + overview_url + ".htm")

    print("Employer info retrieved")
    return parse_company_overview(html_string)
示例#4
0
def get_gd_globals(html_string) -> str:
    """Return the ``window.gdGlobals`` section of a page.

    Delegates to ``Ut.get_string_between`` using ``"window.gdGlobals"`` as
    the start marker and ``"</script>"`` as the end marker.

    :param html_string: HTML of the page to search.
    :return: Whatever ``Ut.get_string_between`` returns for those markers.
    """
    start_marker = "window.gdGlobals"
    end_marker = "</script>"
    # The trailing "" and True are passed through unchanged.
    # NOTE(review): presumably a not-found default and a flag - confirm
    # against Ut.get_string_between.
    return Ut.get_string_between(html_string, start_marker, end_marker, "", True)
示例#5
0
def parse_html(html_string):
    """Turn an interviews page into a list of rows, one per interview entry.

    Each row is a list with, in order: company name, interview date, title,
    decoded experience, decoded offer, decoded difficulty, decoded
    "getting interview" value, application text, interview details and
    interview questions. The company name and the number of entries are
    printed to stdout.

    :param html_string: HTML of the interviews page.
    :return: List of rows (each a list of the fields above).
    """
    company_name = Ut.get_string_between(
        html_string, "</script><title>", "Interview Questions |", "")
    print("Company Name: " + company_name)

    interview_list = Ut.get_list_of_substrings(
        html_string, "<li class=' empReview cf ' id='InterviewReview", "</li>")
    if len(interview_list) == 0:
        # No regular entries: retry with the "lockedReview" variant of the
        # list-item marker.
        interview_list = Ut.get_list_of_substrings(
            html_string,
            "<li class=' lockedReview empReview cf ' id='InterviewReview_",
            "</li>")

    print("Interviews to parse: " + str(len(interview_list)))

    rows = []
    for entry in interview_list:
        interview_date = Ut.get_string_between(entry, "datetime=\"", "\">", "")

        # Title (e.g. "Analyst Interview")
        job_title = Ut.get_string_between(
            entry, "<span class='reviewer'>", "</span>", "").strip()

        # Take the block ending at the "Application" heading first; if that
        # is empty, fall back to the block ending at the "Interview" heading.
        experience = Ut.get_string_between(
            entry, "<div class='flex-grid'>",
            "<p class=\"strong margTopMd tightBot\">Application</p>", "").strip()
        if len(experience) == 0:
            experience = Ut.get_string_between(
                entry, "<div class='flex-grid'>",
                "<p class=\"strong margTopMd tightBot\">Interview</p>", "")

        # Experience, offer and difficulty are all decoded from that block.
        experience_label = decode_experience(experience)
        offer_label = decode_offer(experience)
        difficulty_label = decode_difficulty(experience)

        application = Ut.get_string_between(
            entry,
            "<p class='applicationDetails mainText truncateThis wrapToggleStr '>",
            "</p>", "")
        getting_interview = decode_getting_interview(application)

        # Interview description (verbatim)
        details = Ut.get_string_between(
            entry,
            "<p class='interviewDetails mainText truncateThis wrapToggleStr '>",
            "</p>", "")

        questions = Ut.get_string_between(
            entry,
            "<span class='interviewQuestion noPadVert truncateThis wrapToggleStr ' data-truncate-words='70'>",
            "class", "", True)

        rows.append([
            company_name,
            interview_date,
            job_title,
            experience_label,
            offer_label,
            difficulty_label,
            getting_interview,
            application,
            details,
            questions,
        ])

    return rows
def _pick_label(text, choices):
    """Return the label of the first ``(needle, label)`` pair whose needle occurs in ``text``, else ``text``."""
    for needle, label in choices:
        if needle in text:
            return label
    return text


def parse_html(html_string) -> List[List[str]]:
    """Turn a reviews page into a list of rows, one per review.

    Each row holds, in order: company name, review date, helpful count,
    review title, rating, current/former employee text, employee title,
    employee type, location, recommends, outlook, CEO opinion, time
    employed, pros, cons and advice to management. The number of reviews
    found is printed to stdout.

    :param html_string: HTML of the reviews page.
    :return: List of rows; empty when no review block is found.
    """
    html_list = Ut.get_list_of_substrings(html_string,
                                          "<div class=\"hreview\">",
                                          "empReview")
    print("Reviews to parse: " + str(len(html_list)))
    if len(html_list) == 0:
        return []

    # The company name is read from the whole page, not from a single
    # review, so extract it once instead of once per row.
    company_name = Ut.get_string_between(html_string, "'name':\"", '"', "")

    # Pros / Cons / Advice share the same markup around their heading text.
    section_head = "<p class=\"strong mb-0 mt-xsm\">"
    section_body = ("</p><p class=\"mt-0 mb-xsm v2__EIReviewDetailsV2__bodyColor "
                    "v2__EIReviewDetailsV2__lineHeightLarge "
                    "v2__EIReviewDetailsV2__isExpanded \">")

    output_listing: List[List[str]] = []
    for row in html_list:
        review_date = Ut.get_string_between(row, "dateTime=\"", "\">", "")

        # Helpful count: text inside the parentheses, digits only.
        helpful = Ut.get_string_between(row, "helpfulReviews", "</div>", "")
        helpful = Ut.get_string_between(helpful, "(", ")", "")
        helpful = re.sub("[^0-9]", "", helpful)

        title = Ut.get_string_between(
            row, "class=\"reviewLink\">", "</a>", "").strip("\"")

        rating = Ut.get_string_between(
            row,
            "<div class=\"v2__EIReviewsRatingsStylesV2__ratingNum v2__EIReviewsRatingsStylesV2__small\">",
            "</div>", "").strip("\"")

        # The author line carries both the employment status (text before
        # "Employee") and the job title (text after the first "-").
        author = Ut.get_string_between(
            row, "<span class=\"authorJobTitle middle\">", "</span>", "").strip()
        employee_status = Ut.get_string_between(">" + author, ">", "Employee", "")
        job_title = Ut.get_string_between(author + "</", "-", "</", "")

        # One paragraph feeds both "employee type" and "time employed", so
        # it is extracted once and classified twice.
        employment = Ut.get_string_between(row, "<p class=\"mainText mb-0\">",
                                           "</p>", "")
        employee_type = _pick_label(
            employment, (("full", "Full time"), ("half", "Half time")))
        time_employed = _pick_label(
            employment, (("More than a year", "More than a year"),
                         ("Less than a year", "Less than a year")))

        location = Ut.get_string_between(row, "<span class=\"authorLocation\">",
                                         "></span>", "")

        recommends = Ut.decode_box_color(
            Ut.get_string_between(row, "<i class", "Recommends</span>", "", False))

        outlook = _pick_label(
            Ut.get_string_between(
                row, "</i></div><div class=\"cell\"><span class='middle'>",
                "Outlook</span>", ""),
            (("Positive", "Positive"), ("Negative", "Negative"),
             ("Neutral", "Neutral")))

        ceo = Ut.decode_box_color(
            Ut.get_string_between(row, "<i class", "CEO</span>", "", False))

        pros, cons, advice = [
            Ut.get_string_between(row, section_head + heading + section_body,
                                  "</p>", "")
            for heading in ("Pros", "Cons", "Advice to Management")
        ]

        output_listing.append([
            company_name,
            review_date,
            helpful,
            title,
            rating,
            employee_status,
            job_title,
            employee_type,
            location,
            recommends,
            outlook,
            ceo,
            time_employed,
            pros,
            cons,
            advice,
        ])

    return output_listing
def _pick_label(text, choices):
    """Return the label of the first ``(needle, label)`` pair whose needle occurs in ``text``, else ``text``."""
    for needle, label in choices:
        if needle in text:
            return label
    return text


def parse_html(html_string) -> List[List[str]]:
    """Turn a reviews page (``<li class=' empReview`` markup) into rows.

    Each row holds, in order: company name, review date, helpful count,
    review title, rating, current/former employee text, employee title,
    employee type, location, recommends, outlook, CEO opinion, time
    employed, pros, cons and advice to management. The number of reviews
    found is printed to stdout.

    :param html_string: HTML of the reviews page.
    :return: List of rows; empty when no review block is found.
    """
    html_list = Ut.get_list_of_substrings(
        html_string, "<li class=' empReview",
        "</span></span></div></div></div></div></div></div></li>")
    print("Reviews to parse: " + str(len(html_list)))
    if len(html_list) == 0:
        return []

    # The company name is read from the page <title>, not from a single
    # review, so extract it once instead of once per row.
    company_name = Ut.get_string_between(html_string, "</script><title>",
                                         "Reviews |", "")

    output_listing: List[List[str]] = []
    for row in html_list:
        review_date = Ut.get_string_between(row, "datetime=\"", "\">", "")

        # Helpful count: drop the "Helpful" word and the parentheses.
        helpful = Ut.get_string_between(
            row, "<span class=\"helpfulCount subtle\">", "</span>", "")
        helpful = (helpful.replace("Helpful", "")
                          .replace("(", "")
                          .replace(")", "")
                          .strip())

        title = Ut.get_string_between(
            row, "<span class=\"summary \">", "</span>", "").strip("\"")

        rating = Ut.get_string_between(
            row, "<span class=\"rating\"><span class=\"value-title\" title=\"",
            "></span>", "").strip("\"")

        # The author line carries both the employment status (text before
        # "Employee") and the job title (text after the first "-").
        author = Ut.get_string_between(
            row, "<span class='authorJobTitle middle reviewer'>", "</span>",
            "").strip()
        employee_status = Ut.get_string_between(">" + author, ">", "Employee", "")
        job_title = Ut.get_string_between(author + "</", "-", "</", "")

        # One paragraph feeds both "employee type" and "time employed", so
        # it is extracted once and classified twice.
        employment = Ut.get_string_between(row, "<p class=' tightBot mainText'>",
                                           "</p>", "")
        employee_type = _pick_label(
            employment, (("full", "Full time"), ("half", "Half time")))
        time_employed = _pick_label(
            employment, (("More than a year", "More than a year"),
                         ("Less than a year", "Less than a year")))

        location = Ut.get_string_between(
            row, "<span class='authorLocation middle'>", "></span>", "")

        recommends = Ut.decode_box_color(
            Ut.get_string_between(row, "<i class", "Recommends</span>", "", False))

        outlook = _pick_label(
            Ut.get_string_between(
                row, "</i></div><div class=\"cell\"><span class='middle'>",
                "Outlook</span>", ""),
            (("Positive", "Positive"), ("Negative", "Negative"),
             ("Neutral", "Neutral")))

        ceo = Ut.decode_box_color(
            Ut.get_string_between(row, "<i class", "CEO</span>", "", False))

        pros = Ut.get_string_between(
            row, "<p class=' pros mainText truncateThis wrapToggleStr'>",
            "</p>", "")
        cons = Ut.get_string_between(
            row, "<p class=' cons mainText truncateThis wrapToggleStr'>",
            "</p>", "")
        advice = Ut.get_string_between(
            row, "<p class=' adviceMgmt mainText truncateThis wrapToggleStr'>",
            "</p>", "")

        output_listing.append([
            company_name,
            review_date,
            helpful,
            title,
            rating,
            employee_status,
            job_title,
            employee_type,
            location,
            recommends,
            outlook,
            ceo,
            time_employed,
            pros,
            cons,
            advice,
        ])

    return output_listing