def get_personal_details_table(url):
    soup = get_page(url)
    rows = soup.find('table', attrs={'cellspacing': '2', 'border': '0'}).find_all('tr')[1:]
    return rows
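# NOTE: every crawler below relies on `get_page` / `post_page` helpers and on
# module-level imports (re, urljoin, the nthu_library_url config, and the
# database models) defined elsewhere in the project. The following is a
# minimal sketch of what the two helpers are expected to do; the shared
# requests.Session is an assumption (it keeps login cookies alive between
# calls), not necessarily how the real helpers are implemented.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

_session = requests.Session()  # assumption: one shared session carries cookies


def get_page(url):
    """Fetch `url` and return it parsed as a BeautifulSoup document."""
    resp = _session.get(url)
    resp.encoding = resp.apparent_encoding  # guard against mis-detected encodings
    return BeautifulSoup(resp.text, 'html.parser')


def post_page(url, data=None):
    """POST `data` to `url` and return the raw response."""
    return _session.post(url, data=data)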
def login_action(account):
    """Log in to the library information system.

    :param account: credentials object exposing ``to_dict()``
    :return: page source response
    """
    soup = get_page(urljoin(nthu_library_url.info_system,
                            '?func=file&file_name=login1'))
    login_url = soup.find('form').attrs.get('action')
    resp = post_page(login_url, data=account.to_dict())
    return resp
def crawl_available_space():
    soup = get_page(nthu_library_url.available_space)
    infos = soup.find('section', 'status').find_all('tr')
    space = dict()
    for info in infos[1:]:
        item = info.find_all('td')
        text = item[0].text
        number = item[1].text
        space[text] = number
    return space
def _crawl_transfer(url):
    soup = get_page(url)
    links = soup.find('div', 'clearfix').find_all('a')
    transfer_detail = dict()
    for link in links[1:]:
        text = link.text
        href = link.get('href', '')
        transfer_detail[text] = urljoin(url, href)
    return transfer_detail
def _crawl_transfer(year, url):
    soup = get_page(url)
    links = soup.find('div', 'clearfix').find_all('a')
    for link in links[1:]:
        text = link.text
        href = link.get('href', '')
        target = urljoin(url, href)
        sub = get_or_create(Subject, text)
        t = get_or_create(Examtype, TRANSFER_EXAMS)
        sheet = Sheet(target, year, None, sub, t)
        db.session.add(sheet)
    db.session.commit()
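# `get_or_create`, `db`, and the Subject / Examtype / Department / Sheet
# models come from the project's database layer. A minimal sketch of the
# usual Flask-SQLAlchemy get-or-create pattern; the `name` column and the
# single-argument constructor are assumptions, not the project's confirmed
# schema.
def get_or_create(model, name):
    """Return the row of `model` named `name`, creating it if missing."""
    instance = model.query.filter_by(name=name).first()  # assumed `name` column
    if instance is None:
        instance = model(name)  # assumed single-argument constructor
        db.session.add(instance)
    return instance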
def _crawl_detail(url):
    soup = get_page(url)
    years = soup.find('table', 'listview').find_all('tr')
    department_detail = dict()
    for year in years[1:]:
        which_year = year.find_all('td')[0].text
        links = year.find_all('a')
        yearly_detail = dict()
        for link in links:
            text = link.text
            href = link.get('href', '')
            yearly_detail[text] = urljoin(url, href)
        department_detail[which_year] = yearly_detail
    return department_detail
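# Shape of the value returned by `_crawl_detail` above (the actual keys come
# from the scraped page; placeholders shown here):
# {
#     '<year>': {'<link text>': '<absolute URL>', ...},
#     ...
# }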
def crawl_past_year_questions():
    soup = get_page(nthu_library_url.past_year_questions_url)
    table = soup.find_all('div', 'clearfix')
    blocks = table[0].find_all('div', '')
    for block in blocks[1:]:
        links = block.find_all('a')
        for link in links:
            href = link.get('href', '')
            url = nthu_library_url.past_year_questions + href
            _crawl_detail(url)
    transfer_links = soup.find('ul', 'list02 clearfix').find_all('a')
    for transfer_link in transfer_links:
        href = transfer_link.get('href', '')
        url = nthu_library_url.past_year_questions + href
        _crawl_transfer(url)
def _crawl_detail(department_name, url):
    depart = get_or_create(Department, department_name)
    soup = get_page(url)
    years = soup.find('table', 'listview').find_all('tr')
    for year in years[1:]:
        which_year = year.find_all('td')[0].text
        links = year.find_all('a')
        for link in links:
            text = link.text
            href = link.get('href', '')
            target = urljoin(url, href)
            sub = get_or_create(Subject, text)
            t = get_or_create(Examtype, AFTER_GRADUATE_EXAMS)
            sheet = Sheet(target, int(which_year), depart, sub, t)
            db.session.add(sheet)
    db.session.commit()
def crawl_personal_page(session_url):
    soup = get_page(urljoin(session_url, '?func=BOR-INFO'))
    tables = soup.find_all('table', attrs={'cellspacing': '2'})

    # Circulation-status links
    resource_links = dict()
    # Library circulation status
    status = {}
    for row in tables[0].find_all('tr'):
        cols = get_cols(row)
        key = cols[0].text.strip()
        a_tag = cols[1].find('a')
        val = a_tag.text.strip()
        link = re.findall("'(.*?)'", a_tag.get('href'))[0]
        status[key] = val
        resource_links[key] = link

    # Contact information
    person = {}
    for row in tables[1].find_all('tr'):
        cols = get_cols(row)
        key = cols[0].text.strip() or '地址'  # blank-label rows continue the 地址 (address) field
        val = cols[1].text.strip()
        person[key] = person[key] + val if key in person else val

    # Administrative information
    manage = {}
    for row in tables[2].find_all('tr'):
        cols = get_cols(row)
        key = cols[0].text.strip()
        val = cols[1].text.strip()
        if key == '讀者權限資料':  # patron-privilege data
            val = re.findall("borstatus='(.*)'", val)[0]
        manage[key] = val

    result = dict()
    result['user'] = person
    result['status'] = status
    result['user']['manage'] = manage
    return resource_links, result
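# Usage sketch for the login + personal-page flow; obtaining the session URL
# from the login response via `resp.url` is an assumption here, the real
# flow may differ:
#
#     resp = login_action(account)              # `account` must provide to_dict()
#     resource_links, result = crawl_personal_page(resp.url)
#     print(result['status'])                   # circulation status per category
#     print(result['user']['manage'])           # administrative details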
def crawl_past_year_questions():
    soup = get_page(nthu_library_url.past_year_questions_url)
    table = soup.find_all('div', 'clearfix')
    blocks = table[0].find_all('div', '')

    after_graduate_exams = dict()
    for block in blocks[1:]:
        links = block.find_all('a')
        for link in links:
            text = link.text
            href = link.get('href', '')
            url = nthu_library_url.past_year_questions + href
            after_graduate_exams[text] = _crawl_detail(url)

    transfer_links = soup.find('ul', 'list02 clearfix').find_all('a')
    transfer_exams = dict()
    for transfer_link in transfer_links:
        text = transfer_link.text
        href = transfer_link.get('href', '')
        url = nthu_library_url.past_year_questions + href
        transfer_exams[text] = _crawl_transfer(url)

    # Keys: graduate-school past exams / transfer-exam past exams
    return {'研究所考古題': after_graduate_exams, '轉學考考古題': transfer_exams}
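# Usage sketch: `crawl_past_year_questions` returns a two-level mapping of
# exam category -> department/exam name -> crawled detail dict:
#
#     questions = crawl_past_year_questions()
#     for category, exams in questions.items():    # 研究所考古題 / 轉學考考古題
#         for name, detail in exams.items():
#             print(category, name, detail)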
def crawl_available_space():
    soup = get_page(nthu_library_url.available_space)
    info = soup.find('section', 'status').find_all('td')
    return [data.text for data in info]
def _crawl_transfer(url):
    soup = get_page(url)
    links = soup.find('div', 'clearfix').find_all('a')
    targets = []
    for link in links[1:]:
        href = link.get('href', '')
        targets.append(urljoin(url, href))
    return targets
def _crawl_detail(url):
    soup = get_page(url)
    links = soup.find('table', 'listview').find_all('a')
    targets = []
    for link in links:
        href = link.get('href', '')
        targets.append(urljoin(url, href))
    return targets