def findContests(spbuSite: str) -> List[str]:
    """Collect the contest links listed on an index page.

    Every ``<a>`` element that carries an ``href`` attribute contributes its
    target, in document order. The first collected link is dropped because it
    points back to the previous page rather than to a contest.

    Args:
        spbuSite: Address of the index page, passed to ``makeSoup``.

    Returns:
        The ``href`` values of all links except the first one. The list is
        empty when the page has no links or only the back link.
    """
    hrefs = [
        anchor["href"]
        for anchor in makeSoup(spbuSite).find_all("a")
        if anchor.has_attr("href")
    ]
    # Slice instead of ``del hrefs[0]`` so that a page without any links
    # yields an empty list rather than raising IndexError.
    return hrefs[1:]
def findDepartmentToLink(link: str) -> Tuple[Dict[str, str], int]:
    """Map every department listed on a page to its absolute link.

    Only anchors with an ``href`` that sit inside the ``<tbody>`` of a table
    whose ``class`` attribute contains ``ranepa-table`` are considered.
    ``printDot()`` is called once after the page has been scanned.

    Args:
        link: Address of the page, passed to ``makeSoup``.

    Returns:
        A pair ``(departments, count)``: ``departments`` maps the visible
        anchor text to ``RANEPA_ROOT`` concatenated with the anchor's
        ``href``; ``count`` is the number of entries in that mapping.
    """
    departmentLinks = {}  # {department: link}
    for table in makeSoup(link).find_all("table"):
        if not table.has_attr("class"):
            continue
        if "ranepa-table" not in table["class"]:
            continue
        for body in table.find_all("tbody"):
            for anchor in body.find_all("a"):
                if not anchor.has_attr("href"):
                    continue
                name = visibleSoupToString(anchor)
                departmentLinks[name] = RANEPA_ROOT + anchor["href"]
    printDot()
    return departmentLinks, len(departmentLinks)
def addForm(department, speciality, link):
    """Record the study forms offered for one speciality.

    Scans the page at ``link`` for anchors with an ``href`` inside the
    ``<tbody>`` of tables whose ``class`` contains ``ranepa-table``. For each
    such anchor the visible text is used as the form name,
    ``forms[department][speciality][form]`` is set to ``RANEPA_ROOT`` plus
    the ``href``, and ``formCount[0]`` is incremented. ``printDot()`` is
    called once after the page has been scanned.

    Errors are not propagated: they are reported through ``logError``
    together with the formatted traceback.

    Args:
        department: Key of the department in ``forms``.
        speciality: Key of the speciality in ``forms[department]``.
        link: Address of the page listing the forms, passed to ``makeSoup``.
    """
    try:
        for table in makeSoup(link).find_all("table"):
            if not (table.has_attr("class")
                    and "ranepa-table" in table["class"]):
                continue
            for tbody in table.find_all("tbody"):
                for a in tbody.find_all("a"):
                    if not a.has_attr("href"):
                        continue
                    form = visibleSoupToString(a)
                    forms[department][speciality][form] = (
                        RANEPA_ROOT + a["href"])
                    formCount[0] += 1
        printDot()
    except Exception:
        # Exception rather than BaseException: KeyboardInterrupt and
        # SystemExit must still be able to stop the program.
        logError("Ошибка в форме %s: %s" % (link, traceback.format_exc()))
def findAbits(abits: Dict[str, list], contestLink: str) -> None:
    """Parse one contest page and store its entries under the page link.

    The text of the ``<h3>`` element at position ``DEPARTMENT_NAME_IDX`` is
    attached to every contest as ``PROPERTY.DEPARTMENT``. ``printDot()`` is
    called once the page has been processed.

    Args:
        abits: Output mapping; ``abits[contestLink]`` is set to the
            concatenation of what ``getAbitsFromContest`` returns for each
            contest that ``getContestsOnPage`` finds on the page.
        contestLink: Address of the contest page, passed to ``makeSoup``.
    """
    page = makeSoup(contestLink)
    departmentName = visibleSoupToString(
        page.find_all("h3")[DEPARTMENT_NAME_IDX])
    # Properties shared by every contest on this page.
    shared = {PROPERTY.DEPARTMENT: departmentName}
    collected = []
    for contest in getContestsOnPage(soupToRawString(page)):
        collected.extend(getAbitsFromContest(contest, shared))
    abits[contestLink] = collected
    printDot()
def addEducationalProgram(department, speciality, form, link):
    """Record the educational programs offered for one study form.

    Scans the page at ``link`` for anchors with an ``href`` inside the
    ``<tbody>`` of tables whose ``class`` contains ``ranepa-table``. For each
    such anchor the visible text is used as the program name,
    ``educationalPrograms[department][speciality][form][program]`` is set to
    ``RANEPA_ROOT`` plus the ``href``, and ``educationalProgramCount[0]`` is
    incremented. ``printDot()`` is called once after the page has been
    scanned.

    Errors are not propagated: they are reported through ``logError``
    together with the formatted traceback.

    Args:
        department: Key of the department in ``educationalPrograms``.
        speciality: Key of the speciality under that department.
        form: Key of the study form under that speciality.
        link: Address of the page listing the programs, passed to
            ``makeSoup``.
    """
    try:
        for table in makeSoup(link).find_all("table"):
            if not (table.has_attr("class")
                    and "ranepa-table" in table["class"]):
                continue
            for tbody in table.find_all("tbody"):
                for a in tbody.find_all("a"):
                    if not a.has_attr("href"):
                        continue
                    educationalProgram = visibleSoupToString(a)
                    programs = educationalPrograms[department][speciality][form]
                    programs[educationalProgram] = RANEPA_ROOT + a["href"]
                    educationalProgramCount[0] += 1
        printDot()
    except Exception:
        # Exception rather than BaseException: KeyboardInterrupt and
        # SystemExit must still be able to stop the program.
        logError("Ошибка в образовательной программе %s: %s" %
                 (link, traceback.format_exc()))
def addAbits(department, speciality, form, educationalProgram, link):
    """Parse the applicant tables on one page and store them in ``links``.

    Only ``<section>`` elements with id ``list_budget`` or ``list_contract``
    are read; sections whose markup contains the "list is empty" cell are
    skipped. Column positions are detected from the header texts. The
    subject columns are the headers strictly between the competitive-sum
    column and the individual-achievements column. The "original" flag is
    read from the last cell of each row.

    On success ``links[link]`` is set to the list of parsed rows,
    ``abitCount[0]`` grows by the number of rows and ``printDot()`` is
    called. Errors are not propagated: they are reported through
    ``logError`` with the formatted traceback, and ``links[link]`` is then
    left untouched.

    Args:
        department: Stored as ``PROPERTY.DEPARTMENT`` in every row.
        speciality: Stored as ``PROPERTY.SPECIALITY`` in every row.
        form: Stored as ``PROPERTY.ONLY_IN_WALLS`` in every row.
        educationalProgram: Stored as ``PROPERTY.EDU_PROG`` in every row.
        link: Address of the page with the lists, passed to ``makeSoup``.
    """
    emptyMarker = 'style="text-align:center">Список пуст</td>'
    wantedSections = ("list_budget", "list_contract")
    originalIdx = -1  # the "original" flag is taken from the last cell
    try:
        abits = []
        for section in makeSoup(link).find_all("section"):
            if emptyMarker in soupToRawString(section):
                continue
            if not (section.has_attr("id")
                    and section["id"] in wantedSections):
                continue

            headers = section.find("thead").find_all("th")
            fioIdx = statusIdx = sumIdx = individualBonusIdx = None
            for idx, th in enumerate(headers):
                contents = visibleSoupToString(th)
                if "ФИО" in contents:
                    fioIdx = idx
                elif "Статус" in contents:
                    statusIdx = idx
                elif "Сумма конкурсных баллов" in contents:
                    sumIdx = idx
                elif "Сумма баллов по индивидуальным достижениям" in contents:
                    individualBonusIdx = idx

            # Explicit check instead of ``assert``: asserts are stripped
            # under ``python -O`` and a sentinel would then be used as a
            # real column index.
            required = {
                "ФИО": fioIdx,
                "Статус": statusIdx,
                "Сумма конкурсных баллов": sumIdx,
                "Сумма баллов по индивидуальным достижениям":
                    individualBonusIdx,
            }
            missing = [name for name, col in required.items() if col is None]
            if missing:
                raise ValueError(
                    "Required columns not found: " + ", ".join(missing))

            subjectsBeginIdx = sumIdx + 1
            subjects = [
                visibleSoupToString(th)
                for th in headers[subjectsBeginIdx:individualBonusIdx]
            ]
            forMoney = section["id"] == "list_contract"

            for tr in section.find("tbody").find_all("tr"):
                tds = tr.find_all("td")
                abit = dict()
                abit[PROPERTY.DEPARTMENT] = department
                abit[PROPERTY.SPECIALITY] = speciality
                abit[PROPERTY.EDU_PROG] = educationalProgram
                abit[PROPERTY.ONLY_IN_WALLS] = form
                abit[PROPERTY.FOR_MONEY] = forMoney
                abit[PROPERTY.ABIT_NAME] = visibleSoupToString(tds[fioIdx])
                abit[PROPERTY.CONTEST_TYPE] = visibleSoupToString(
                    tds[statusIdx])
                abit[PROPERTY.GRADES] = dict()
                # Non-numeric cells are stored as None for the sum and for
                # the individual subject grades.
                try:
                    abit[PROPERTY.SUM] = float(
                        visibleSoupToString(tds[sumIdx]))
                except ValueError:
                    abit[PROPERTY.SUM] = None
                for offset, subj in enumerate(subjects):
                    try:
                        abit[PROPERTY.GRADES][subj] = float(
                            visibleSoupToString(
                                tds[subjectsBeginIdx + offset]))
                    except ValueError:
                        abit[PROPERTY.GRADES][subj] = None
                # A non-numeric bonus is not tolerated: the ValueError
                # aborts the whole page and is logged below.
                abit[PROPERTY.EXTRA_BONUS] = float(
                    visibleSoupToString(tds[individualBonusIdx]))
                abit[PROPERTY.ORIGINAL] = visibleSoupToString(
                    tds[originalIdx]) == "Оригинал"
                abits.append(abit)

        abitCount[0] += len(abits)
        links[link] = abits
        printDot()
    except Exception:
        # Exception rather than BaseException: KeyboardInterrupt and
        # SystemExit must still be able to stop the program.
        logError("Ошибка в списке %s: %s" % (link, traceback.format_exc()))
def extractList(contestLists: Dict[str, dict], contestPage: str) -> None:
    """Parse one contest page and store its rows under the page address.

    The raw page markup is split on ``<br/>``. Fragments that do not contain
    ``table`` supply the list-wide properties (educational program,
    speciality, form and basis of study) and the names of the entrance
    exams. Column positions are then detected from the ``<th>`` cells of the
    first ``<tr>``, and every following ``<tr>`` becomes one entry.

    Does nothing when ``makeSoup`` returns ``None``. Otherwise
    ``contestLists[contestPage]`` is set to the list of parsed entries and
    ``printDot()`` is called.

    Args:
        contestLists: Output mapping from page address to parsed entries.
        contestPage: Address of the contest page, passed to ``makeSoup``.

    Raises:
        AssertionError: If the name, contest type, extra bonus or original
            column cannot be found in the header row.
    """
    soup = makeSoup(contestPage)
    if soup is None:
        return

    listProperties = dict()
    subjects = list()
    for fragment in soupToRawString(soup).split("<br/>"):
        if "table" in fragment:
            continue
        if "Образовательная программа:" in fragment:
            listProperties[PROPERTY.EDU_PROG] = getValue(fragment)
        elif "Направление:" in fragment:
            listProperties[PROPERTY.SPECIALITY] = getValue(fragment)
        elif "Форма обучения:" in fragment:
            listProperties[PROPERTY.ONLY_IN_WALLS] = getValue(fragment)
        elif "Основа обучения:" in fragment:
            listProperties[PROPERTY.FOR_MONEY] = getValue(fragment)
        elif "ВИ " in fragment:
            subject = fragment[fragment.rfind(":") + 1:].replace(
                "</b>", "").strip()
            subjects.append(subject)

    nameCol = None
    birthdayCol = None  # absent for master's programmes
    contestTypeCol = None
    sumCol = None  # absent for master's programmes
    sumExamCol = None  # absent for master's programmes
    # There may be no entrance exams at all, e.g. on an empty page.
    firstGradeCol = None
    extraBonusCol = None
    originalCol = None
    for i, th in enumerate(soup.find("tr").find_all("th")):
        text = visibleSoupToString(th)
        if "Фамилия Имя Отчество" in text or "ФИО" in text:
            nameCol = i
        elif "Дата рождения" in text:
            birthdayCol = i
        elif "Тип конкурса" in text:
            contestTypeCol = i
        elif "Σ общ" in text:
            sumCol = i
        elif "Σ ЕГЭ" in text:
            sumExamCol = i
        elif firstGradeCol is None and "ВИ " in text:
            # Only the first exam column is remembered; the rest follow it.
            firstGradeCol = i
        elif "Σ ИД" in text:
            extraBonusCol = i
        elif "Оригинал" in text:
            originalCol = i

    # Explicit raise instead of ``assert``: asserts are stripped under
    # ``python -O``. AssertionError is kept so callers see the same type.
    required = {
        "ФИО": nameCol,
        "Тип конкурса": contestTypeCol,
        "Σ ИД": extraBonusCol,
        "Оригинал": originalCol,
    }
    missing = [label for label, col in required.items() if col is None]
    if missing:
        raise AssertionError(
            "Required columns not found on %s: %s"
            % (contestPage, ", ".join(missing)))

    abits = []
    for tr in soup.find_all("tr")[1:]:  # skip the header row
        tds = tr.find_all("td")
        abit = dict(listProperties)
        abit[PROPERTY.ABIT_NAME] = visibleSoupToString(tds[nameCol])
        if birthdayCol is not None:
            abit[PROPERTY.BIRTHDAY] = visibleSoupToString(tds[birthdayCol])
        abit[PROPERTY.CONTEST_TYPE] = visibleSoupToString(
            tds[contestTypeCol])
        if sumCol is not None:
            abit[PROPERTY.SUM] = getFloatGrade(tds[sumCol])
        if sumExamCol is not None:
            abit[PROPERTY.SUM_EXAM] = getFloatGrade(tds[sumExamCol])
        if firstGradeCol is None:
            abit[PROPERTY.GRADES] = None
        else:
            abit[PROPERTY.GRADES] = dict()
            for offset, subject in enumerate(subjects):
                abit[PROPERTY.GRADES][subject] = getFloatGrade(
                    tds[firstGradeCol + offset])
        abit[PROPERTY.EXTRA_BONUS] = getFloatGrade(tds[extraBonusCol])
        abit[PROPERTY.ORIGINAL] = visibleSoupToString(
            tds[originalCol]) == "Да"
        abits.append(abit)

    contestLists[contestPage] = abits
    printDot()