def get_company_data(company: str, max_pages: int = 0) -> ScrapeResults:
    """Search for a company and scrape its review and interview listings.

    Args:
        company: Company name to search for.
        max_pages: Passed through unchanged to ``Ut.scrape_list`` for both
            listings.

    Returns:
        A ``ScrapeResults`` built from the search hit plus the first two
        items of each ``Ut.scrape_list`` result. If the search reports no
        match, both listings are filled with ``-1`` and an empty list
        instead. If ``company`` is ``None`` or empty, no search is made
        and ``None`` is returned.
    """
    # Nothing to look up: keep returning None, as callers may rely on it.
    if company is None or len(company) == 0:
        return None

    hit = search_company_name(company)

    if not hit.found:
        return ScrapeResults(
            hit.GlassdoorId,
            hit.GlassdoorName,
            hit.BaseReviewUrl,
            -1,
            [],
            hit.BaseInterviewUrl,
            -1,
            [],
        )

    reviews = Ut.scrape_list(hit.BaseReviewUrl, Reviews.parse_html, max_pages)
    interviews = Ut.scrape_list(hit.BaseInterviewUrl, Interviews.parse_html, max_pages)
    return ScrapeResults(
        hit.GlassdoorId,
        hit.GlassdoorName,
        hit.BaseReviewUrl,
        reviews[0],
        reviews[1],
        hit.BaseInterviewUrl,
        interviews[0],
        interviews[1],
    )
def parse_company_overview(html_string) -> NameSearchResults:
    """Read the employer name and id out of a company overview page.

    The ``'employer'`` section of the page's ``window.gdGlobals`` script is
    located, whitespace around the ``'name'`` / ``'id'`` colons is
    normalised, and both quoted values are extracted.

    Args:
        html_string: HTML of the overview page.

    Returns:
        ``NameSearchResults(name, id, found)``. When an id is present it is
        returned with an ``"E"`` prefix and ``found`` is ``True``. When no
        id can be extracted the id is ``""`` and ``found`` is ``False``.
    """
    gd_globals = get_gd_globals(html_string)
    company_data = Ut.get_string_between(gd_globals, "'employer'", "}", "", False)

    # Collapse optional whitespace around the colon so the fixed prefixes
    # used below match. Raw strings keep ``\s`` a regex escape, not an
    # (invalid) string escape.
    company_data = re.sub(r"'name'\s*:\s*", "'name':", company_data)
    company_data = re.sub(r"'id'\s*:\s*", "'id':", company_data)

    glassdoor_name = Ut.get_string_between(company_data, "'name':\"", '"', "", False)
    raw_id = Ut.get_string_between(company_data, "'id':\"", '"', "", False)

    # Decide "found" on the raw value: once the "E" prefix is attached the
    # string can never be empty, so testing afterwards always said "found".
    found = raw_id != ""
    glassdoor_id = "E" + raw_id if found else ""

    return NameSearchResults(glassdoor_name, glassdoor_id, found)
def search_company_name(company: str) -> NameSearchResults:
    """Search Glassdoor for a company and return its parsed overview data.

    The search URL is downloaded first. If the returned page already is an
    employer page (its ``gdGlobals`` script carries an ``/employerInfo``
    analytics URL) it is parsed directly. Otherwise the page is treated as
    a result list: the first ``/Overview/`` link is followed and that page
    is parsed.

    Args:
        company: Company name to search for; it is URL-encoded here.

    Returns:
        The result of ``parse_company_overview`` for the employer page, or
        ``NameSearchResults("", "", False)`` when the result list contains
        no overview link.
    """
    company = Ut.encode_url(company)
    url = baseUrl + "/Reviews/company-reviews.htm?sc.keyword=" + company
    html_string = Ut.download_string(url)
    gd_globals = get_gd_globals(html_string)

    # Raw string: ``\s`` must reach the regex engine untouched. The pattern
    # has no ``^``/``$`` anchors, so no MULTILINE flag is needed.
    is_employer_page = re.search(
        r"""'analyticsUrl'\s*:\s*"/employerInfo""", gd_globals) is not None

    if not is_employer_page:
        print("Search list retrieved, taking the first result")
        overview_url = Ut.get_string_between(html_string, "href='/Overview/", ".htm'", "", True)
        if overview_url == "":
            print("The search did not find any matching companies")
            return NameSearchResults("", "", False)
        html_string = Ut.download_string(baseUrl + "/Overview/" + overview_url + ".htm")

    print("Employer info retrieved")
    return parse_company_overview(html_string)
def get_gd_globals(html_string) -> str:
    """Extract the ``window.gdGlobals`` script section from a page.

    Args:
        html_string: HTML of a downloaded page.

    Returns:
        Whatever ``Ut.get_string_between`` yields for the span from
        ``window.gdGlobals`` to the next ``</script>``.
    """
    start_marker = "window.gdGlobals"
    end_marker = "</script>"
    return Ut.get_string_between(html_string, start_marker, end_marker, "", True)
def parse_html(html_string):
    """Convert one interview-listing page into a table of rows.

    The company name is taken once from the page ``<title>``. Each interview
    ``<li>`` entry then produces one row. If no regular entries are present,
    the ``lockedReview`` variant of the entry markup is tried instead. The
    company name and the number of entries are printed as progress output.

    Args:
        html_string: HTML of the interview page.

    Returns:
        A list with one inner list per interview. Columns, in order:
        company name, interview date, title, experience, offer, difficulty,
        getting-interview, application text, interview text, questions.
    """
    company_name = Ut.get_string_between(
        html_string, "</script><title>", "Interview Questions |", "")
    print("Company Name: " + company_name)

    entries = Ut.get_list_of_substrings(
        html_string, "<li class=' empReview cf ' id='InterviewReview", "</li>")
    if not entries:
        # No regular entries: retry with the "locked" entry markup.
        entries = Ut.get_list_of_substrings(
            html_string, "<li class=' lockedReview empReview cf ' id='InterviewReview_", "</li>")
    print("Interviews to parse: " + str(len(entries)))

    grid_open = "<div class='flex-grid'>"
    table = []
    for entry in entries:
        interview_date = Ut.get_string_between(entry, "datetime=\"", "\">", "")

        # Title (e.g. "Analyst Interview")
        title = Ut.get_string_between(entry, "<span class='reviewer'>", "</span>", "").strip()

        # The summary grid ends at the "Application" heading; when that
        # yields nothing, the "Interview" heading is used as the end marker.
        summary = Ut.get_string_between(
            entry, grid_open, "<p class=\"strong margTopMd tightBot\">Application</p>", "").strip()
        if len(summary) == 0:
            summary = Ut.get_string_between(
                entry, grid_open, "<p class=\"strong margTopMd tightBot\">Interview</p>", "")

        # Experience, offer and difficulty are all decoded from the summary.
        experience = decode_experience(summary)
        offer = decode_offer(summary)
        difficulty = decode_difficulty(summary)

        application = Ut.get_string_between(
            entry, "<p class='applicationDetails mainText truncateThis wrapToggleStr '>", "</p>", "")
        getting_interview = decode_getting_interview(application)

        # Interview description (verbatim)
        details = Ut.get_string_between(
            entry, "<p class='interviewDetails mainText truncateThis wrapToggleStr '>", "</p>", "")

        # Interview questions
        questions = Ut.get_string_between(
            entry,
            "<span class='interviewQuestion noPadVert truncateThis wrapToggleStr ' data-truncate-words='70'>",
            "class", "", True)

        table.append([
            company_name,
            interview_date,
            title,
            experience,
            offer,
            difficulty,
            getting_interview,
            application,
            details,
            questions,
        ])

    return table
def parse_html(html_string) -> List[List[str]]:
    """Convert one reviews page (``hreview`` markup) into a table of rows.

    The page is cut into one chunk per review and each chunk is reduced to a
    flat list of strings. The number of reviews is printed as progress
    output.

    Args:
        html_string: HTML of the reviews page.

    Returns:
        One inner list per review. Columns, in order: company name, review
        date, helpful count, review title, rating, current/past employee,
        employee title, employee type, location, recommends, outlook, CEO
        opinion, time employed, pros, cons, advice to management.
    """
    html_list = Ut.get_list_of_substrings(html_string, "<div class=\"hreview\">", "empReview")
    print(f"Reviews to parse: {len(html_list)}")

    # The company name comes from the page as a whole, not from a single
    # review, so it is looked up once rather than once per row.
    company_name = Ut.get_string_between(html_string, "'name':\"", '"', "")

    # Opening tag of the body paragraph that follows each Pros / Cons /
    # Advice heading.
    body_open = (
        '<p class="mt-0 mb-xsm v2__EIReviewDetailsV2__bodyColor '
        'v2__EIReviewDetailsV2__lineHeightLarge v2__EIReviewDetailsV2__isExpanded ">'
    )

    output_listing: List[List[str]] = []
    for row in html_list:
        row_list: List[str] = [company_name]

        # Review date
        row_list.append(Ut.get_string_between(row, "dateTime=\"", "\">", ""))

        # Helpful count: keep only the digits found inside the parentheses.
        helpful = Ut.get_string_between(row, "helpfulReviews", "</div>", "")
        helpful = Ut.get_string_between(helpful, "(", ")", "")
        row_list.append(re.sub("[^0-9]", "", helpful))

        # Title of the review
        title = Ut.get_string_between(row, "class=\"reviewLink\">", "</a>", "")
        row_list.append(title.strip("\""))

        # Rating of the review
        rating = Ut.get_string_between(
            row,
            "<div class=\"v2__EIReviewsRatingsStylesV2__ratingNum v2__EIReviewsRatingsStylesV2__small\">",
            "</div>", "")
        row_list.append(rating.strip("\""))

        # The author line yields two columns: the text before "Employee"
        # (current / former) and the text after the dash (job title).
        author = Ut.get_string_between(
            row, "<span class=\"authorJobTitle middle\">", "</span>", "").strip()
        row_list.append(Ut.get_string_between(">" + author, ">", "Employee", ""))
        row_list.append(Ut.get_string_between(author + "</", "-", "</", ""))

        # One paragraph feeds both the employee-type column here and the
        # time-employed column further down, so it is extracted once.
        employment = Ut.get_string_between(row, "<p class=\"mainText mb-0\">", "</p>", "")
        if "full" in employment:
            row_list.append("Full time")
        elif "half" in employment:
            row_list.append("Half time")
        else:
            row_list.append(employment)

        # Location
        # NOTE(review): the end marker "></span>" looks unusual — confirm
        # against the page markup.
        row_list.append(
            Ut.get_string_between(row, "<span class=\"authorLocation\">", "></span>", ""))

        # Recommends
        recommends = Ut.get_string_between(row, "<i class", "Recommends</span>", "", False)
        row_list.append(Ut.decode_box_color(recommends))

        # Outlook: reduce to the first known label present, else keep raw text.
        outlook = Ut.get_string_between(
            row, "</i></div><div class=\"cell\"><span class='middle'>", "Outlook</span>", "")
        row_list.append(next(
            (label for label in ("Positive", "Negative", "Neutral") if label in outlook),
            outlook))

        # CEO
        ceo = Ut.get_string_between(row, "<i class", "CEO</span>", "", False)
        row_list.append(Ut.decode_box_color(ceo))

        # Time employed: same reduction, using the paragraph read above.
        row_list.append(next(
            (label for label in ("More than a year", "Less than a year") if label in employment),
            employment))

        # Pros, Cons, Advice to management
        for heading in ("Pros", "Cons", "Advice to Management"):
            row_list.append(Ut.get_string_between(
                row, f'<p class="strong mb-0 mt-xsm">{heading}</p>{body_open}', "</p>", ""))

        output_listing.append(row_list)

    return output_listing
def parse_html(html_string) -> List[List[str]]:
    """Convert one reviews page (``<li class=' empReview`` markup) into rows.

    The page is cut into one chunk per review and each chunk is reduced to a
    flat list of strings. The number of reviews is printed as progress
    output.

    Args:
        html_string: HTML of the reviews page.

    Returns:
        One inner list per review. Columns, in order: company name, review
        date, helpful count, review title, rating, current/past employee,
        employee title, employee type, location, recommends, outlook, CEO
        opinion, time employed, pros, cons, advice to management.
    """
    html_list = Ut.get_list_of_substrings(
        html_string, "<li class=' empReview",
        "</span></span></div></div></div></div></div></div></li>")
    print(f"Reviews to parse: {len(html_list)}")

    # The company name comes from the page <title>, not from a single
    # review, so it is looked up once rather than once per row.
    company_name = Ut.get_string_between(html_string, "</script><title>", "Reviews |", "")

    output_listing: List[List[str]] = []
    for row in html_list:
        row_list: List[str] = [company_name]

        # Review date
        row_list.append(Ut.get_string_between(row, "datetime=\"", "\">", ""))

        # Helpful count: drop the "Helpful" label and the parentheses.
        helpful = Ut.get_string_between(row, "<span class=\"helpfulCount subtle\">", "</span>", "")
        row_list.append(
            helpful.replace("Helpful", "").replace("(", "").replace(")", "").strip())

        # Title of the review
        title = Ut.get_string_between(row, "<span class=\"summary \">", "</span>", "")
        row_list.append(title.strip("\""))

        # Rating of the review
        rating = Ut.get_string_between(
            row, "<span class=\"rating\"><span class=\"value-title\" title=\"", "></span>", "")
        row_list.append(rating.strip("\""))

        # The author line yields two columns: the text before "Employee"
        # (current / former) and the text after the dash (job title).
        author = Ut.get_string_between(
            row, "<span class='authorJobTitle middle reviewer'>", "</span>", "").strip()
        row_list.append(Ut.get_string_between(">" + author, ">", "Employee", ""))
        row_list.append(Ut.get_string_between(author + "</", "-", "</", ""))

        # One paragraph feeds both the employee-type column here and the
        # time-employed column further down, so it is extracted once.
        employment = Ut.get_string_between(row, "<p class=' tightBot mainText'>", "</p>", "")
        if "full" in employment:
            row_list.append("Full time")
        elif "half" in employment:
            row_list.append("Half time")
        else:
            row_list.append(employment)

        # Location
        # NOTE(review): the end marker "></span>" looks unusual — confirm
        # against the page markup.
        row_list.append(
            Ut.get_string_between(row, "<span class='authorLocation middle'>", "></span>", ""))

        # Recommends
        recommends = Ut.get_string_between(row, "<i class", "Recommends</span>", "", False)
        row_list.append(Ut.decode_box_color(recommends))

        # Outlook: reduce to the first known label present, else keep raw text.
        outlook = Ut.get_string_between(
            row, "</i></div><div class=\"cell\"><span class='middle'>", "Outlook</span>", "")
        row_list.append(next(
            (label for label in ("Positive", "Negative", "Neutral") if label in outlook),
            outlook))

        # CEO
        ceo = Ut.get_string_between(row, "<i class", "CEO</span>", "", False)
        row_list.append(Ut.decode_box_color(ceo))

        # Time employed: same reduction, using the paragraph read above.
        row_list.append(next(
            (label for label in ("More than a year", "Less than a year") if label in employment),
            employment))

        # Pros, Cons, Advice to management
        for css_name in ("pros", "cons", "adviceMgmt"):
            row_list.append(Ut.get_string_between(
                row, f"<p class=' {css_name} mainText truncateThis wrapToggleStr'>", "</p>", ""))

        output_listing.append(row_list)

    return output_listing