def retrieve_pages():
    """Crawl the UQ undergraduate and postgraduate program listing pages."""
    listing_urls = (
        "https://future-students.uq.edu.au/study/find-a-program/listing/undergraduate",
        "https://future-students.uq.edu.au/study/find-a-program/listing/postgraduate",
    )
    # Fetch and process each listing in order (undergraduate first).
    for listing_url in listing_urls:
        retrieve_page(jquery(url=listing_url))
def parseComment(questionNum, commentItem):
    """Parse one comment element: extract the organization, agent name and
    phone number from the agent's first reply, then persist via saveHandle.
    """
    if commentItem is None:
        print(u'评论标签不存在')
        return
    item_jq = jquery(commentItem)
    # Organization and agent name from the comment header.
    organization = item_jq.find(".div1 .p1 span a").html()
    proxy = item_jq.find(".div1 .p1 a span").html()
    # Reply paragraphs of this comment.
    reply_nodes = item_jq.find(".div2 p")
    if reply_nodes is None:
        print(u'评论回复为空')
        return
    first_reply = reply_nodes[0]
    if first_reply is None:
        print(u'无代理人回复')
        return
    reply_html = jquery(first_reply).html()
    # Normalise digits written as Chinese characters, then strip the common
    # separators before extracting the phone number.
    reply_html = cnToNum(reply_html)
    for separator in (u"-", u"_", u"—"):
        reply_html = reply_html.replace(separator, "")
    phone = parsePhone(reply_html)
    saveHandle(questionNum, {
        "proxy": proxy,
        "organization": organization,
        "phone": phone
    })
    return
def GetBrandData(self):
    """Fetch the brand page at self._url and print the intro HTML of every
    '.super-mod' entry, plus a success/failure summary.
    """
    html = rq.get(self._url)
    doc = jquery(html)
    brand_nodes = doc.find('.super-mod')
    total = brand_nodes.length  # total number of brand entries found
    print('解析出总的品牌数据量:%s' % total)
    ok_count = 0  # entries that actually had a '.mod-intro' block
    for node in brand_nodes:
        intro = jquery(node).find('.mod-intro')
        if len(intro) == 0:
            print('解析没有mod-intro标签')
            continue
        ok_count += 1
        print(intro.html())
    print('成功获取的品牌数据数量:%s,解析失败数量:%s' % (ok_count, total - ok_count))
def GetBrandData(self):
    """Fetch the brand page at self._url via urllib2 and print the intro HTML
    of every '.super-mod' entry, plus a success/failure summary.
    """
    rs = urllib2.urlopen(self._url, timeout=10)
    try:
        html = rs.read().decode('utf-8')
    finally:
        # fix: the response object was never closed (resource leak)
        rs.close()
    doc = jquery(html)
    brandJqs = doc.find('.super-mod')
    allNum = brandJqs.length  # total number of brand entries found
    print('解析出总的品牌数据量:%s' % allNum)
    scNum = 0  # entries that actually had a '.mod-intro' block
    for brandItem in brandJqs:
        brandJq = jquery(brandItem)
        intro = brandJq.find('.mod-intro')
        if len(intro) == 0:
            print('解析没有mod-intro标签')
            continue
        scNum += 1
        # fix: was a Python 2 `print` statement; the rest of this block
        # (and file) uses the print() function form
        print(intro.html())
    print('成功获取的品牌数据数量:%s,解析失败数量:%s' % (scNum, allNum - scNum))
def retrieve_program_page(program_page):
    """Follow the program page's 'a.green' link to its course list, if any."""
    green_link = program_page.find("a.green")
    # fix: pyquery's .find() never returns None (it returns an empty
    # selection), so the old `is not None` check always passed; test
    # emptiness instead so pages without the link are skipped.
    if len(green_link) > 0:
        try:
            courselist_url = host + green_link.attr("href")
            # fix: modernised py2-only `print x` / `except Exception, err`
            # syntax to the form used elsewhere in the file
            print(courselist_url)
            courselist_page = jquery(url=courselist_url)
            retrieve_course_list_page(courselist_page)
        except Exception as err:
            print(err)
def retrieve_course_list_page(courselist_page):
    """Visit every course linked from the first column of the course table."""
    courses = courselist_page.find('tr>td:first>a')
    # fix: replaced the manual while/index loop with range(); hoisted the
    # repeated `courses.eq(index)` lookup (was evaluated three times per
    # iteration); converted the py2 `print` statement to print().
    for index in range(courses.size()):
        link = courses.eq(index)
        course_url = host + link.attr("href")
        course_page = jquery(course_url)
        print(link.text())
        retrieve_course_page(course_page, link.text())
def retrieve_page(qaPage):
    """Open every answer linked from the Q&A list page and process it."""
    baseurl = 'https://uqfuture.custhelp.com'
    content_holder = qaPage.find('.rn_Content')
    for anchor in content_holder.find('a'):
        # Skip anchors with no visible text.
        if not anchor.text:
            continue
        answer_url = baseurl + anchor.attrib['href']
        retrieve_answer_page(answerPage=jquery(url=answer_url))
def retrieve_pages():
    """Crawl all 43 pages of the answer list (category st/4)."""
    last_page = 43
    baseurl = "https://uqfuture.custhelp.com/app/answers/list/st/4/page/"
    for page_no in range(1, last_page + 1):
        print(page_no)
        retrieve_page(jquery(url=baseurl + str(page_no)))
def getArticleContent(url):
    '''Fetch an article page and return the HTML of its body (.show-content).

    Returns None (implicitly) when the url is falsy.
    '''
    if not url:
        print('非法地址')
        return
    page_html = rq.get(url)
    content_node = jquery(page_html).find('.show-content')
    return content_node.html()
def fetch_course(url):
    """Collect up to 50 course names from the third table column of the page.

    Returns a comma-joined string; when the 50-item cap is reached the string
    is suffixed with a pointer to the UQ website for the full list.
    """
    page = jquery(url=url)
    cells = page.find('tbody>tr>td:nth-child(3)')
    names = []
    cap = 50
    for position, cell in enumerate(cells.items(), start=1):
        text = cell.text()
        print(text)
        # Skip placeholder rows (spelling as it appears on the site).
        if 'Alrady' not in text:
            names.append(text)
        if position >= cap:
            return (','.join(names) +
                    ". If you want know more courses, please visit UQ website to check detail.")
    return ','.join(names)
def retrieve_page(page):
    """Visit every non-empty program entry on the listing page."""
    program_lists = page.find(".plan")
    print(program_lists.size())
    # NOTE(review): iteration starts at 1, so element 0 is skipped —
    # presumably a header entry; confirm before changing.
    index = 1
    while index < program_lists.size():
        program = program_lists.eq(index)
        index += 1
        if program.text() != "":
            print(program.text())
            program_url = host + program.find('a').attr("href")
            print(program_url)
            try:
                program_page = jquery(url=program_url)
                retrieve_program_page(program_page)
            except Exception as err:
                # fix: was a bare `except:` (also caught KeyboardInterrupt);
                # narrowed and now reports the actual error for diagnosis.
                print("error")
                print(err)
    # fix: removed the unused `first = True` local; converted py2 print
    # statements to print().
def getComment(questionNum):
    """Fetch the Q&A page for questionNum and parse each reply comment."""
    url = u'http://www.bxd365.com/qa/%s.html' % questionNum
    print(u"开始解析:%s" % url)
    try:
        html = rq.get(url)
    except Exception as e:
        print(e)
        return
    doc = jquery(html)
    replys = doc.find(".reply li")
    # fix: pyquery's .find() never returns None, so the old `is None` branch
    # was dead code; a single emptiness check covers both conditions.
    if len(replys) == 0:
        print(u'评论数量为0')
        return
    for item in replys:
        parseComment(questionNum, item)
def retrieve_program_page(url):
    """Scrape one UQ program page with selenium and store the details.

    Loads `url` in the module-level `driver`, switches to the international
    student view if the toggle link is present, extracts title, location,
    duration, commencement, fee, majors, summary fields, course list and
    entry requirements, then INSERTs one row into `program_international`
    via the module-level `connection`.

    Side effects: saves 'screenshot.png' in the working directory, prints
    each extracted field, performs a DB insert + commit.
    """
    driver.get(url=url)
    driver.save_screenshot('screenshot.png')
    try:
        # Click the "I'm an international student" toggle if it exists; the
        # bare except treats "not found" as "already clicked".
        if driver.find_element_by_xpath(
                "//a[text()=\"I'm an international student\"]"):
            driver.find_element_by_xpath(
                "//a[text()=\"I'm an international student\"]").click()
    except:
        print("Already click this button")
    try:
        # Wait up to 10s for the page title; TimeoutException means the
        # program has no international-student page (handled below).
        head = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//h1')))
        # Snapshot the rendered DOM into a pyquery object for CSS queries.
        # NOTE(review): replace('>', '>') is a no-op — it likely was meant to
        # unescape '&gt;' back to '>'; confirm against the original script.
        page = jquery(
            driver.find_element_by_xpath("//html").get_attribute(
                'innerHTML').replace('>', '>'))
        print(head.text)
        head = clean_text(head.text)
        print(head)
        location = clean_text(
            driver.find_element_by_xpath(
                "//span[@data-sinet='LOCATION']").text)
        duration = clean_text(
            driver.find_element_by_xpath(
                "//div[@class='program__duration-value']").text)
        commencing = clean_text(
            driver.find_element_by_xpath(
                "//div[@class='program__commencement-value']").text)
        print(location)
        print(duration)
        print(commencing)
        # Fee of the program (tab is clicked, but the fee is read from the
        # earlier DOM snapshot, not from the driver after the click).
        driver.find_element_by_xpath(
            "//a[text()='Fees and scholarships']").click()
        fee = page.find(
            'span[data-sinet="StudentInfo > Domestic > IndicativeFee > CSP"]'
        ).text()
        fee = clean_text(fee)
        print(fee)
        # Majors of the program, comma-joined.
        majors = []
        majorsElements = page.find('h3[data-sinet="[Plan] TITLE"]')
        for majorsElement in majorsElements.items():
            print(majorsElement.text())
            majors.append(clean_text(majorsElement.text()))
        majors = ','.join(majors)
        # Summary-table fields of the program.
        program_code = clean_text(
            page.find(
                "ul[class='program__table'] div[data-sinet='CODE']").text())
        program_unit = clean_text(
            page.find(
                "ul[class='program__table'] div[data-sinet='UNITS']").text())
        program_level = clean_text(
            page.find(
                "ul[class='program__table'] div[data-sinet='LEVEL_VALUE']").
            text())
        program_faculty = clean_text(
            page.find(
                "ul[class='program__table'] div[data-sinet='Faculty > FACULTY_KEY']"
            ).text())
        print(program_code)
        print(program_unit)
        print(program_level)
        print(program_faculty)
        # Course list link; fall back to a fixed message when absent.
        course_url = page.find('#program-structure > a:nth-child(2)')
        course_url = course_url.attr['href']
        print(course_url)
        courses = "The course list is still not available"
        if course_url is not None:
            courses = fetch_course(course_url)
        entry_requirements = page.find('#entry-requirements')
        entry_requirements = clean_text(entry_requirements.text().replace(
            'Entry requirements ', ''))
        print(entry_requirements)
        # Persist the scraped program row (parameterised query).
        connection.cursor().execute(
            '''INSERT into program_international (title, location, duration, commencing, fee, majors, program_code, program_unit, program_level, program_faculty, courses, entry_requirements) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)''',
            (head, location, duration, commencing, fee, majors, program_code,
             program_unit, program_level, program_faculty, courses,
             entry_requirements))
        connection.commit()
    except TimeoutException:
        print('No such program for international student')
def retrieve_pages():
    """Crawl the UQ department listing page (unit type 5)."""
    start_url = "http://www.uq.edu.au/departments/unit_types.html?type=5"
    retrieve_page(jquery(url=start_url))
def retrieve_pages():
    """Crawl the UQ events calendar page (category 16)."""
    start_url = "http://www.uq.edu.au/events/calendar_view.php?category_id=16"
    retrieve_page(jquery(url=start_url))
def retrieve_pages():
    """Crawl the UQ campus map index page."""
    start_url = "http://www.uq.edu.au/maps/mapindex.html?menu=1"
    retrieve_page(jquery(url=start_url))
# Make the parent directory importable so the shared `common` package resolves.
dir_path = os.path.split(os.path.realpath(__file__))[0]
sys.path.append(dir_path + "/..")
import common.request as rq

# Accumulator for scraped agent records (module-level state).
proxyDatas = []


def getComment(questionNum):
    """Fetch the Q&A page for questionNum and parse each reply comment.

    Python 2 variant (print statements, `except Exception, e`).
    """
    url = u'http://www.bxd365.com/qa/%s.html' % questionNum
    print u"开始解析:%s" % url
    try:
        html = rq.get(url)
    except Exception, e:
        print e
        return
    doc = jquery(html)
    replys = doc.find(".reply li")
    # NOTE(review): pyquery .find() returns an empty selection, never None,
    # so this branch looks unreachable — the len() check below is the one
    # that fires.
    if replys is None:
        print u'无评论数据'
        return
    if len(replys) <= 0:
        print u'评论数量为0'
        return
    for item in replys:
        parseComment(questionNum, item)


def parseComment(questionNum, commentItem):
    # NOTE(review): this definition appears truncated in this chunk — only
    # the None guard is visible; the remainder presumably follows elsewhere.
    if commentItem is None:
        print u'评论标签不存在'
        return
def get_agencys(city, page_index=1):
    '''
    Scrape 58.com full-time job search results for insurance agents.

    * 'city' - dict with at least 'id', 'province' and 'city' keys
    * 'page_index' - current result page (recurses to following pages)

    Appends matching jobs to the module-level `__jobs` list; recurses until
    a page contains no jobs posted today or the last page is reached.
    '''
    city_type = city['id']
    hp.print_partition(u'解析城市:%s-%s-%s,保险代理人的工作' %
                       (city['province'], city['city'], city['id']))
    # Build the search URL for this city/page.
    url = __agencys_url.format(city_type, page_index)
    print(u'工作地址:%s' % url)
    try:
        # html = rq.get_cookie(
        #     url,
        #     headers={
        #         "User-agent":
        #         "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        #         "Referer":
        #         url
        #     })
        html = rq.get(url)
    except Exception as e:
        print(e)
        return
    doc = jquery(html)
    if doc is None:
        print(u"解析html报错")
        return
    # Total page count (defaults to 0 when the pager is absent).
    page_nums_str = doc.find(".num_operate .total_page").html()
    if page_nums_str is None:
        page_nums_str = '0'
    page_nums = int(page_nums_str)
    print(u'总页数:%s,当前页码:%s' % (page_nums, page_index))
    # Job list entries on this page.
    list_jobs = doc.find("#list_con .job_item")
    if list_jobs is None:
        print(u"没有查询到工作列表信息")
        return
    print(u"工作总数:%s" % len(list_jobs))
    # Walk the jobs, keeping only insurance jobs posted today.
    today_nums = 0
    for job_item in list_jobs:
        job_item_jq = jquery(job_item)
        job_sign = job_item_jq.find(".sign").html()
        if not check_job_istoday(job_sign):
            print(u'状态:%s,不是今日发布,略过' % job_sign)
            continue
        today_nums = today_nums + 1
        job_name = job_item_jq.find(".name").html()
        # Keep only jobs whose title mentions insurance (保险).
        if job_name.find(u'保险') < 0:
            print(u'工作:%s,非保险类工作,略过~' % job_name)
            continue
        job_address = job_item_jq.find(".address").html()
        job_url = job_item_jq.find("a").attr("href")
        job_company = job_item_jq.find(".job_comp .comp_name .fl").attr(
            "title")
        job_company = analysis_job_company(job_company)
        print(u'%s|%s|%s' % (job_address, job_name, job_sign))
        # Random delay before fetching the detail page (rate limiting).
        hp.sleep(0.3, 0.6, content=u'获取工作详情=》')
        # Fetch detail data; jobs without a pagenum are skipped.
        job_data = analysis_job_data(job_url)
        if job_data is None or job_data['pagenum'] is None:
            print(u'无法获取pagenum,略过!')
            continue
        __jobs.append({
            "name": job_name,
            "address": job_address,
            "url": job_url,
            "pagenum": job_data['pagenum'],
            "contactPerson": job_data['contactPerson'],
            "sign": job_sign,
            "company": job_company
        })
    print(u'当前页码:%s,总页数:%s' % (page_index, page_nums))
    # Stop paging once a page has no jobs posted today.
    if today_nums <= 0:
        print('当前页码:%s,无今日工作,无需继续翻页' % (page_index))
        return
    page_index = page_index + 1
    # Recurse into the next page, with a short delay.
    if page_index <= page_nums:
        print(' ')
        hp.sleep(0, 1, content=u'翻页=》')
        get_agencys(city, page_index)
def retrieve_page(qaPage):
    """Follow every department link in the primary content area."""
    base_url = 'http://www.uq.edu.au/departments/'
    for anchor in qaPage.find('#content-primary>a').items():
        school_url = base_url + anchor.attr['href']
        retrieve_school_page(jquery(url=school_url))
# NOTE(review): fragment of a Python 2 variant of get_agencys — the
# enclosing `def` is outside this chunk, so `return` here belongs to it.
# Build the search URL for this city/page.
url = __agencys_url.format(city_type, page_index)
print u'工作地址:%s' % url
try:
    # html = rq.get_cookie(
    #     url,
    #     headers={
    #         "User-agent":
    #         "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
    #         "Referer":
    #         url
    #     })
    html = rq.get(url)
except Exception, e:
    print e
    return
doc = jquery(html)
if doc is None:
    print u"解析html报错"
    return
# Total page count (defaults to 0 when the pager is absent).
page_nums_str = doc.find(".num_operate .total_page").html()
if page_nums_str is None:
    page_nums_str = '0'
page_nums = int(page_nums_str)
print u'总页数:%s,当前页码:%s' % (page_nums, page_index)
# Job list entries on this page.
list_jobs = doc.find("#list_con .job_item")
if list_jobs is None:
    print u"没有查询到工作列表信息"
    return
print u"工作总数:%s" % len(list_jobs)
def retrieve_pages():
    """Crawl the UQ study browse page (undergraduate + postgraduate levels)."""
    start_url = "https://www.uq.edu.au/study/browse.html?level=ugpg"
    retrieve_page(jquery(url=start_url))
def specialArticles(key, source, page=1):
    '''
    Fetch the article list of a jianshu topic and return qualifying articles.

    * 'key' - topic key used to build the listing URL
    * 'source' - source tag copied into each returned article dict
    * 'page' - article page number (currently unused in the URL build)

    Returns a list of dicts with 'title', 'url', 'time', 'source' and
    'content' (markdown), or None when the listing cannot be fetched/parsed.
    '''
    url = __special_newlike_url.format(key)
    htmlStr = rq.get(url)
    if (not htmlStr):
        print(u'获取html失败')
        return
    jq_dom = jquery(htmlStr)
    if (not jq_dom):
        print(u'无法解析页面dom')
        return
    dom_contents = jq_dom.find('.content')
    if (not dom_contents):
        print(u'无法解析文章内容')
        return
    articles = []
    for item in dom_contents:
        jq_content_item = jquery(item)
        dom_title = jq_content_item.find('.title')
        dom_time = jq_content_item.find('.time')
        dom_read = jq_content_item.find('.ic-list-read')
        dom_comments = jq_content_item.find('.ic-list-comments')
        dom_like = jq_content_item.find('.ic-list-like')
        if (not dom_title):
            print(u'无法解析 title')
            continue
        if (not dom_time):
            print(u'无法解析 time')
            continue
        # Read/comment/like counters sit in the parent element's text.
        article_read = int(dom_read.parent().text())
        article_comments = int(dom_comments.parent().text())
        article_like = int(dom_like.parent().text())
        article_title = dom_title.html()
        artitle_href = dom_title.attr('href')
        # Normalise the ISO-ish timestamp: '2018-01-02T03:04:05+08:00'
        # becomes '2018 01 02 03:04:05'.
        artitle_time = dom_time.attr('data-shared-at').replace(
            '-', ' ').replace('+08:00', '').replace('T', ' ')
        # artitle_time = time.strptime(artitle_time, '%Y %m %d %H:%M:%S')
        article_url = '{host}{href}'.format(
            host=__jianshu_host, href=artitle_href)
        print(u'获得文章:', hp.remove_emoji(), article_title, article_url,
              artitle_time)
        # Popularity filters.
        # NOTE(review): the like/comment log messages claim thresholds of
        # 10 and 3, but the actual cut-offs are 1 — confirm which is intended.
        if (article_read < 100):
            print(u'文章阅读量<100,不爬取')
            continue
        if (article_like < 1):
            print(u'文章收藏量<10,不爬取')
            continue
        if (article_comments < 1):
            print(u'文章评论量<3,不爬取')
            continue
        # Fetch the article body and convert it to markdown.
        content_html = getArticleContent(article_url)
        if (not content_html):
            print(u'无法获取博文内容')
            continue
        # Strip lazy-load attribute prefixes so images resolve.
        content_html = content_html.replace('data-original-', '')
        content_markdown = getCotentMarkDown(content_html)
        # content_markdown = content_markdown.replace("|", "-")
        articles.append({
            'title': article_title,
            'url': article_url,
            'time': artitle_time,
            'source': source,
            'content': content_markdown
        })
    return articles