def gather_name_link_of_employees_mehmat(link):
    """Collect (name, profile-url) pairs for employees of a mehmat cathedra page.

    The page uses one of two layouts: the staff list is either embedded in
    an iframe (span.fio entries) or rendered as a table (first <tbody>).

    :param link: URL of the cathedra page.
    :return: list of (text, href) tuples taken from the staff <a> tags.
    """
    html = tools.get_html(link)
    soup = BeautifulSoup(html, 'lxml')
    employees = []
    iframe = soup.find('iframe')
    if iframe:
        # The staff list lives in a separate document loaded via the iframe.
        outer_src = iframe.get('src')
        html = tools.get_html(outer_src)
        soup = BeautifulSoup(html, 'lxml')
        for span in soup.find_all('span', class_='fio'):
            a = span.find('a')
            if a:
                employees.append((a.text, a.get('href')))
    else:
        # Fallback layout: staff rows inside the first table body.
        tbody = soup.find_all('tbody')[0]
        for tr in tbody.find_all('tr'):
            td = tr.find('td')
            p = td.find('p')
            # find() instead of find_all()[0]: returns None rather than
            # raising IndexError when a row has no link.
            a = p.find('a')
            if a:
                employees.append((a.text, a.get('href')))
    return employees
def gather_name_link_of_employees(link):
    """Return (name, href) pairs of employees found on the given page.

    Handles two layouts: an iframe whose inner document lists staff as
    span.fio entries, or a div.visit_link with one link per paragraph
    (institute-level links such as 'КФУ' / 'Институт ...' are skipped).
    """
    soup = BeautifulSoup(tools.get_html(link), 'lxml')
    employees = []
    frame = soup.find('iframe')
    staff_block = soup.find('div', class_='visit_link')
    if frame:
        inner = BeautifulSoup(tools.get_html(frame.get('src')), 'lxml')
        for fio_span in inner.find_all('span', class_='fio'):
            anchor = fio_span.find('a')
            if anchor:
                employees.append((anchor.text, anchor.get('href')))
    elif staff_block:
        for paragraph in staff_block.find_all('p'):
            anchor = paragraph.find('a')
            if anchor and 'КФУ' != anchor.text and 'Институт' not in anchor.text:
                employees.append((anchor.text, anchor.get('href')))
    return employees
def gather_name_link_of_employees_imo(link):
    """Collect (name, link) pairs for IMO employees.

    Two layouts: an iframe with span.fio anchors, or a div.visit_link whose
    first paragraph holds plain-text names separated by '\\r\\n' (those have
    no profile link, so href is None).

    :param link: URL of the employees page.
    :return: list of (name, href-or-None) tuples.
    """
    html = tools.get_html(link)
    soup = BeautifulSoup(html, 'lxml')
    employees = []
    iframe = soup.find('iframe')
    if iframe:
        outer_src = iframe.get('src')
        html = tools.get_html(outer_src)
        soup = BeautifulSoup(html, 'lxml')
        for span in soup.find_all('span', class_='fio'):
            a = span.find('a')
            if a:
                employees.append((a.text, a.get('href')))
    else:
        div = soup.find('div', class_='visit_link')
        # Guard: pages without the block (or without a <p>) previously
        # crashed with AttributeError; now they yield an empty list.
        p = div.find('p') if div else None
        if p:
            for row in p.text.split('\r\n'):
                employees.append((row, None))
    return employees
def urls_with_reviews(news_url, channel):
    """Return [comment_count, data_url] for a news article that has comments.

    Extracts the news id from the article URL path, builds the comment-data
    API url and queries it.

    :param news_url: article page URL (path ends with '...-i<id>.<ext>')
    :param channel: sina news channel code used by the comment API
    :return: [count, data_url] when count > 0, otherwise None
    :raises SinaException: when the API response lacks the expected fields
    """
    # Work on the reversed path: the id sits between the extension dot and
    # the last '-' (e.g. '/.../doc-i<id>.shtml').
    path = urlsplit(news_url).path[::-1]
    start_index = path.index(".") + 1
    second = path.index("-")
    news_id = path[start_index:second][::-1]
    news_id = news_id[1:]  # strip the leading 'i' marker of the news id
    middle = "channel=" + channel + "&newsid=comos-" + news_id
    # Human-readable comment page url (built but not returned).
    comments_url = "http://comment5.news.sina.com.cn/comment/skin/default.html?" + middle + "&group=0"
    data_url = url_with_reviews_data(channel=channel, id=news_id)  # comment-data API endpoint
    data = json.loads(get_html(data_url))
    try:
        num = data["result"]["count"]["show"]  # number of visible comments
    except (KeyError, TypeError) as e:
        # Unexpected API payload shape — surface the failing endpoint,
        # keeping the original error as the cause.
        raise SinaException(data_url) from e
    if int(num) > 0:
        return [num, data_url]
    return None
def get_data(page_num, url):
    """Fetch comment payloads for pages 1..page_num.

    The page number is substituted into the url by rewriting its
    'page=1' query fragment.

    :return: {"data": [payload for each page]}
    """
    pages = [
        get_html(url.replace("page=1", "page=" + str(page)))
        for page in range(1, page_num + 1)
    ]
    return {"data": pages}
def get_phys_teachers_rad_astr(link):
    """Return (name, href) for every linked teacher listed under
    .visit_link ul li on the radio-astronomy page."""
    soup = BeautifulSoup(tools.get_html(link), 'lxml')
    anchors = soup.select('.visit_link ul li a[href]')
    return [(anchor.text, anchor.get('href')) for anchor in anchors]
def get_phys_teachers(link):
    """Return (name, href) for every linked teacher found in a paragraph."""
    soup = BeautifulSoup(tools.get_html(link), 'lxml')
    return [
        (anchor.text, anchor.get('href'))
        for anchor in soup.select('p a[href]')
    ]
def gather_name_link_of_employees(link):
    """Collect (name, href) pairs from the staff iframe of the page.

    :param link: URL of the employees page.
    :return: list of (name, href) tuples; [] when the page has no iframe
        (the original raised AttributeError on iframe-less pages).
    """
    html = tools.get_html(link)
    soup = BeautifulSoup(html, 'lxml')
    employees = []
    iframe = soup.find('iframe')
    if iframe is None:
        # No embedded staff list on this page.
        return employees
    html = tools.get_html(iframe.get('src'))
    soup = BeautifulSoup(html, 'lxml')
    for span in soup.find_all('span', class_='fio'):
        a = span.find('a')
        if a:
            employees.append((a.text, a.get('href')))
    return employees
def get_info_from_html(link):
    """Return the text of every <p> inside div.visit_link on the page."""
    soup = BeautifulSoup(tools.get_html(link), 'lxml')
    block = soup.find('div', class_='visit_link')
    return [paragraph.text for paragraph in block.find_all('p')]
def get_link_from_menu_list_left(link, button_name: str):
    """Return the href of the ul.menu_list_left entry whose text equals
    button_name, or None when no entry matches."""
    soup = BeautifulSoup(tools.get_html(link), 'lxml')
    menu = soup.find('ul', class_='menu_list_left')
    for item in menu.find_all('li'):
        anchor = item.find('a')
        if anchor.text == button_name:
            return anchor.get('href')
    return None
def gather_name_link_of_cathedras_of_engineer(link):
    """Return (name, href) for every div.area_width link whose text starts
    with 'Кафедра'."""
    soup = BeautifulSoup(tools.get_html(link), 'lxml')
    container = soup.find('div', class_='area_width')
    return [
        (anchor.text, anchor.get('href'))
        for anchor in container.find_all('a')
        if anchor.text.startswith('Кафедра')
    ]
def gather_name_link_of_cathedras_of_imo(link):
    """Return (name, href) for every div.visit_link link whose text
    contains 'Кафедра'."""
    soup = BeautifulSoup(tools.get_html(link), 'lxml')
    container = soup.find('div', class_='visit_link')
    return [
        (anchor.text, anchor.get('href'))
        for anchor in container.find_all('a')
        if 'Кафедра' in anchor.text
    ]
def gather_name_link_of_employess_it_licey(link):
    """Return (name, href) for every ul.menu_list anchor that is not an
    e-mail entry."""
    soup = BeautifulSoup(tools.get_html(link), 'lxml')
    employees = []
    for menu in soup.find_all('ul', class_='menu_list'):
        for anchor in menu.find_all('a'):
            # Menu mixes staff links with e-mail addresses; skip the latter.
            if "@" not in anchor.text:
                employees.append((anchor.text, anchor.get('href')))
    return employees
def gather_name_link_of_cathedras_of_psychology(link):
    """Return (name, href) for the first anchor of every ul.menu_list_left item."""
    soup = BeautifulSoup(tools.get_html(link), 'lxml')
    menu = soup.find('ul', class_='menu_list_left')
    return [
        (item.find('a').text, item.find('a').get('href'))
        for item in menu.find_all('li')
    ]
def gather_name_link_of_cathedras_of_ipot(link):
    """Return (name, href) for every div.visit_link link whose text starts
    with 'Центр'."""
    soup = BeautifulSoup(tools.get_html(link), 'lxml')
    container = soup.find('div', class_='visit_link')
    return [
        (anchor.text, anchor.get('href'))
        for anchor in container.find_all('a')
        if anchor.text.startswith('Центр')
    ]
def gather_link_of_schools(link):
    """Return hrefs of ul.menu_list_left entries whose text starts with
    'Высшая школа' (names are not returned, links only)."""
    soup = BeautifulSoup(tools.get_html(link), 'lxml')
    menu = soup.find('ul', class_='menu_list_left')
    schools = []
    for item in menu.find_all('li'):
        anchor = item.find('a')
        if anchor.text.startswith('Высшая школа'):
            schools.append(anchor.get('href'))
    return schools
def get_links_from_menu_list_left(link, button_name: str):
    """Return hrefs of ALL ul.menu_list_left entries whose text starts with
    button_name (plural counterpart of get_link_from_menu_list_left).

    Bug fix: the original wrote `a.text.startswith == button_name`, which
    compares the bound method object to a string — always False — so the
    function unconditionally returned []. The prefix test is now actually
    called.
    """
    html = tools.get_html(link)
    soup = BeautifulSoup(html, 'lxml')
    ul = soup.find('ul', class_='menu_list_left')
    links_res = []
    for li in ul.find_all('li'):
        a = li.find('a')
        if a.text.startswith(button_name):
            links_res.append(a.get('href'))
    return links_res
def gather_name_link_of_cathedras_of_chill(link):
    """Return (name, href) for every ul.menu_list item whose link text
    starts with 'кафедра' (lowercase on this site)."""
    soup = BeautifulSoup(tools.get_html(link), 'lxml')
    cathedras = []
    for menu in soup.find_all('ul', class_='menu_list'):
        for item in menu.find_all('li'):
            anchor = item.find('a')
            if anchor and anchor.text.startswith('кафедра'):
                cathedras.append((anchor.text, anchor.get('href')))
    return cathedras
def gather_name_link_of_employees_engineer(link):
    """Collect (name, href) pairs from the staff table of an engineer page.

    :param link: URL of the employees page.
    :return: None when the page could not be fetched, [] when the staff
        table is missing (previously an AttributeError), otherwise a list
        of (text, href) tuples.
    """
    html = tools.get_html(link)
    if html is None:
        return
    soup = BeautifulSoup(html, 'lxml')
    table = soup.find('table', class_='cke_show_border')
    if table is None:
        # Page without the staff table: nothing to collect.
        return []
    # find_all yields only Tag objects (always truthy), so no per-item
    # truthiness check is needed.
    return [(a.text, a.get('href')) for a in table.find_all('a')]
def gather_name_link_of_psychology_employees(link):
    """Return (name, href) for every link inside div.visit_link."""
    soup = BeautifulSoup(tools.get_html(link), 'lxml')
    container = soup.find('div', class_='visit_link')
    return [
        (anchor.text, anchor.get('href'))
        for anchor in container.find_all('a')
        if anchor
    ]
def gather_name_link_of_cathedras_of_mehmat(link):
    """Return (name, href) for every cathedra link in any ul.menu_list.

    NOTE: the first prefix 'Кафедpа' contains a Latin 'p' amid Cyrillic —
    kept deliberately, since some pages use that mixed-alphabet spelling.

    Idiom fixes: the original built an intermediate list-of-lists and
    shadowed the builtin `list` as a loop variable; the nested iteration
    is now direct and the duplicated startswith check uses the tuple form.
    """
    html = tools.get_html(link)
    soup = BeautifulSoup(html, 'lxml')
    cathedras = []
    for ul in soup.find_all('ul', class_='menu_list'):
        for a in ul.find_all('a'):
            # str.startswith accepts a tuple of alternative prefixes.
            if a.text.startswith(('Кафедpа', 'Кафедра')):
                cathedras.append((a.text, a.get('href')))
    return cathedras
def gather_name_link_of_cathedras_of_phys(link):
    """Collect physics cathedras and a search-category map.

    :return: (cathedras, categories) where cathedras is a list of
        (name, href) tuples for links starting with 'Кафедра' and
        categories maps each such name to 'Сотрудники'.
    """
    soup = BeautifulSoup(tools.get_html(link), 'lxml')
    container = soup.find('div', class_='visit_link')
    cathedras = []
    categories = {}
    for anchor in container.find_all('a'):
        if anchor.text.startswith('Кафедра'):
            cathedras.append((anchor.text, anchor.get('href')))
            categories[anchor.text] = 'Сотрудники'
    return cathedras, categories
def gather_name_link_of_cathedras_of_ivmiit(link):
    """Return (name, href) for li.li_spec entries (inside div.visit_link
    lists) whose link text starts with 'Кафедра'."""
    soup = BeautifulSoup(tools.get_html(link), 'lxml')
    container = soup.find('div', class_='visit_link')
    cathedras = []
    for ul in container.find_all('ul'):
        for item in ul.find_all('li', class_='li_spec'):
            anchor = item.find('a')
            if anchor.text.startswith('Кафедра'):
                cathedras.append((anchor.text, anchor.get('href')))
    return cathedras
def main():
    """Entry point: crawl the KFU site, run the institute-specific parser
    for each known institute and visualize the collected data."""
    html = tools.get_html(constants.initial_url)
    institutes = gather_name_link_of_institutes_and_branches(html)
    print(f'институты: {institutes}')
    print(f'количество институтов: {len(institutes)}')
    # Institute name -> parser function. None marks institutes that have
    # no parser implemented yet; they are skipped below.
    parsing_dictionary = {
        'Институт экологии и природопользования': parse_geogr,
        'Институт геологии и нефтегазовых технологий': None,
        'Институт математики и механики им. Н.И. Лобачевского': parse_mehmat,
        'Институт физики': parse_phys,
        'Химический институт им. А.М. Бутлерова': parse_chem,
        'Юридический факультет': parse_law,
        'Институт вычислительной математики и информационных технологий': parse_ivmiit,
        'Институт филологии и межкультурной коммуникации': parse_philology,
        'Институт психологии и образования': parse_psychology,
        'Общеуниверситетская кафедра физического воспитания и спорта': parse_physical,
        'Институт информационных технологий и интеллектуальных систем': None,
        'Институт фундаментальной медицины и биологии': None,
        'Инженерный институт': parse_engineer,
        'Институт международных отношений': parse_imo,
        'Высшая школа бизнеса': parse_higher_school_buisness,
        'Институт социально-философских наук и массовых коммуникаций': None,
        'Институт управления, экономики и финансов': None,
        'Высшая школа государственного и муниципального управления': None,
        'Центр корпоративного обучения': None,
        'IT-лицей-интернат КФУ': parse_IT_licey,
        'Лицей имени Н.И.Лобачевского': parse_lobach_licey,
        'Подготовительный факультет для иностранных учащихся': None,
        'Приволжский центр повышения квалификации и профессиональной переподготовки работников образования': None,
        'Центр непрерывного повышения профессионального мастерства педагогических работников': None,
        'Медико-санитарная часть ФГАОУ ВО КФУ': None,
        'Центр цифровых трансформаций': None,
        'Институт передовых образовательных технологий': parse_ipot,
        'Набережночелнинский институт КФУ': parse_chill,
        'Елабужский институт КФУ': None}
    data = {}
    for name, link in institutes:
        func = parsing_dictionary.get(name)
        if func:
            # Each parser returns the parsed data for its institute.
            data[name] = func(link)
    pprint(data)
    create_visualization(data)
def get_news_detail_url(index_url, out_path=None):
    """Fetch a sina news index page and map news id -> (detail_url, channel).

    :param index_url: URL of the news-channel index page
    :param out_path: optional directory; when given, the raw html is saved
        as '<channel>_resource.txt' and the parsed rows as
        '<channel>_parsed.csv'
    :return: dict {str(news_id): (detail_url, channel)}
    """
    channel = get_channel(url=index_url)  # news channel derived from the url
    home_html = get_html(index_url).strip()  # raw html of the index page
    news_data = parse_home_data(home_html)  # parsed news rows
    if out_path is not None:  # idiom fix: was `out_path != None`
        mkdir(out_path)
        save_data_txt(out_path, channel + "_resource.txt", home_html)
        to_csv(out_path, channel + "_parsed.csv", news_data)
    result = dict()
    for news in news_data:
        # Rows are indexable: index 0 holds the news id, index 2 the
        # detail url (per parse_home_data's output — confirm there).
        news_id = news[0]
        url = news[2]
        result[str(news_id)] = (url, channel)
    return result
def gather_name_link_of_cathedras_of_law(link):
    """Collect law-faculty cathedras and a search-category map.

    Only the first two ul.menu_list blocks are scanned (the rest hold
    unrelated links).

    :return: (cathedras, search_cat) — list of (name, href) for links
        starting with 'Кафедра', plus {name: 'Сотрудники'}.
    """
    soup = BeautifulSoup(tools.get_html(link), 'lxml')
    menus = soup.find_all('ul', class_='menu_list')
    items = []
    for menu in menus[:2]:  # slice replaces the original manual counter
        items.extend(menu.find_all('li', class_='li_spec'))
    cathedras = []
    search_cat = {}
    for item in items:
        anchor = item.find('a')
        if anchor.text.startswith('Кафедра'):
            cathedras.append((anchor.text, anchor.get('href')))
            search_cat[anchor.text] = 'Сотрудники'
    return cathedras, search_cat
import tools
import nationwide_make_urls
from lxml import etree

'''
Main routine for crawling the recruiting site.
'''

# Links of java positions nationwide, keyed by province name.
nationwide_java_urls = nationwide_make_urls.get_nationwide_urls()
for province_name in nationwide_java_urls:
    # Result-list url of this province.
    province_url = nationwide_java_urls[province_name]
    # Determine how many result pages the province has.
    html = tools.get_html(province_url)
    content = etree.ElementTree(etree.HTML(html))
    page_numbers = content.xpath(
        r'//*[@class="p_in"]/span[@class="td"]/text()')
    if not len(page_numbers) == 0:
        # Strip the decoration around the page-count text
        # (drop everything after '页' and the leading character).
        page_number = page_numbers[0].split('页')[0][1:]
    else:
        page_number = 1
    # Collect the sub-page links of the province.
    i = 1
    urls = []
    while i <= int(page_number):
        # NOTE(review): `i` is never incremented in the visible code, so as
        # shown this loop would not terminate — the chunk appears truncated
        # here; confirm the loop body continues past this view.
        print(page_number)
        print(province_url)
sp = SpecialSpider.SpecialSpider() sp.spider(module, gconfig.special, logger, parser) logger.info('The [%s] process is completed.' % module) exit(0) ### end } conf = gconfig.settings[module] try: f = open(conf['data_path'], 'w', 0) except (IOError), e: logger.warn('Can NOT open file: %s. [Except]: %s' % (conf['data_path'], e)) exit(-1) contents = tools.get_html(module, conf, logger) for content in contents: #{ #content = contains[i] if ('iconv' in conf) and conf['iconv']: content = content.decode('gbk', 'ignore').encode('utf-8') logger.info('convert code success.') # backup if 'save' in conf: try: back_f = open(conf['save'], 'a', 0) back_f.write(content) logger.info('backup success.')
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import urllib
from urllib import request, error
from lxml import etree
import re

from tools import get_html

# Page whose download-button handler embeds the media url.
url = 'http://www.en8848.com.cn/kouyu/basic/yuanlai/218414.html'
html = get_html(url, 'utf-8')

key = html
# Regex matching the jQuery click handler that opens the download window.
# Raw string fix: the pattern relies on escapes (\$, \., \s, \S) that are
# invalid string escapes in a normal literal (SyntaxWarning on modern
# Python); behavior of the pattern itself is unchanged.
p = r'\$\("\.jp-download"\)\.click\(function\(\){\s*window\.open\(\S.*'
# Compile the pattern once, then search the page source for it.
pattern = re.compile(p)
matcher1 = re.search(pattern, key)
# Print the match object (None when the handler is not present).
print(matcher1)