import time

import spider


def get_url_lists():
    """Collect links from the NJIT home page and the jwc (academic affairs) site."""
    result = set()
    njit = spider.get_html('http://www.njit.edu.cn')
    jwc = spider.get_html('http://jwc.njit.edu.cn')
    nc = spider.njit_catcher(njit)
    result.update(nc.get_all_link())
    time.sleep(1)  # pause between requests to avoid hammering the server
    jwcc = spider.jwc_catcher(jwc)
    result.update(jwcc.get_all_link())
    print(len(result), result)
    return result
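# `spider.get_html` is this project's fetch helper; its body is not part of
# this excerpt. Below is a minimal sketch of the contract the calls above and
# in get_content() assume (URL in, decoded HTML out, None on failure, optional
# proxy), written with `requests`; the real helper may differ:
import requests


def get_html_sketch(url, proxy=None):
    try:
        resp = requests.get(url, proxies=proxy, timeout=10)
        resp.encoding = resp.apparent_encoding  # these sites mix utf-8 and gb2312
        return resp.text
    except requests.RequestException:
        return None  # callers treat None as "cannot load"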
# GET handler of a Django class-based search view (`urlquote` is
# django.utils.http.urlquote, removed in Django 4.0+).
def get(self, request):
    query_value = request.GET.get('query', None)
    flag = request.GET.get('flag', None)
    pageNum = request.GET.get('pageNum', None)
    last_page = request.GET.get('last_page', None)
    if query_value:  # non-empty query string
        query_value_en = urlquote(query_value)
        print('query', query_value_en)
        if flag and pageNum and last_page:
            rh_list, page_num = get_html(query_value_en, flag=flag,
                                         pageNum=pageNum, last_page=last_page)
        else:
            rh_list, page_num = get_html(query_value_en)
        page_size, current_page, maxPageNum = paging_control(page_num, request)
        SearchHistory.save2SearchHistory(query_value)
        return render(request, 'search/result.html',
                      {'rh_list': rh_list,
                       'query_value': query_value,
                       'last_page': page_size,
                       'current_page': current_page})
    else:
        return render(request, 'search/index.html', {})
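# `paging_control` is defined elsewhere in the project. The sketch below is a
# hypothetical reconstruction from its call site above only: it takes the page
# count reported by get_html() plus the request and returns
# (page_size, current_page, maxPageNum). The names, the default page, and the
# clamping are assumptions, not the project's actual implementation:
def paging_control_sketch(page_num, request):
    max_page_num = int(page_num)                       # total pages available
    current_page = int(request.GET.get('pageNum', 1))  # default to page 1
    current_page = max(1, min(current_page, max_page_num))  # clamp into range
    page_size = max_page_num  # passed to the template as 'last_page'
    return page_size, current_page, max_page_num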
import re


def get_publisher():
    """Scrape the publisher (department) list from the SZU notice board."""
    clear_file()
    html = get_html('http://www.szu.edu.cn/board/userlist.asp', 'gb2312')
    needed_text_pattern = r'>\d+.*?</option>'
    all_needed_list = re.findall(needed_text_pattern, html)
    if not all_needed_list:
        raise Exception('cannot find the publisher list in the page')
    left_tag, right_tag = (r'>\d+', r'</option>')
    publisher_list = []
    for msg_mixed_with_tags in all_needed_list:
        row_text = fing_content_between_two_marks(left_tag, right_tag,
                                                  msg_mixed_with_tags)
        row_text = row_text[1:]  # drop the '.' left after the entry number
        publisher_list.append(row_text)
    return publisher_list
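# `fing_content_between_two_marks` (sic) is a project helper whose body is not
# shown here. A minimal sketch consistent with the call above, assuming both
# marks are regex patterns and the text between their first occurrences is
# wanted; the real helper may differ:
def fing_content_between_two_marks(left_mark, right_mark, text):
    match = re.search(left_mark + '(.*?)' + right_mark, text)
    return match.group(1) if match else ''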
from flask import request, render_template


def search():
    keywords = request.args.get('q').split(" ")  # assumes 'q' is always present
    if request.args.get("pn") is not None:
        page = int(request.args.get("pn"))
    else:
        page = 1
    if page == 1:
        # Already on the first page: "previous" links back to itself.
        pageup_url = "/search?q=" + "+".join(keywords) + "&pn=" + str(page)
    else:
        pageup_url = "/search?q=" + "+".join(keywords) + "&pn=" + str(page - 1)
    pagedown_url = "/search?q=" + "+".join(keywords) + "&pn=" + str(page + 1)
    keywords = [k for k in keywords if k != ""]  # drop empty tokens from repeated spaces
    html = get_html(keywords, page).encode("utf-8")
    results = get_results(html)
    return render_template("search.html", results=results,
                           pageup_url=pageup_url, pagedown_url=pagedown_url)
def get_content(mail, proxies=None):
    """Fetch every collected URL, parse pages updated today, and mail them."""
    send_list = []
    for url in get_url_lists():
        text = spider.get_html(url, proxy=proxies)
        if text is None:
            print(url + ' cannot load')
            continue
        if '无权访问' in text:  # "access denied": page is restricted to the local network
            print(url + ' can only be accessed from the local network')
            continue
        # Pick a parser by the host part of the URL.
        if 'jwc' in url:
            parser = spider.jwc_parser(text)
        elif 'xinghuo' in url:
            parser = spider.xh_parser(text)
        elif 'www.njit' in url:
            parser = spider.njit_parser(text)
        else:
            print(url, 'cannot find parser')
            continue
        page_time = parser.get_time()
        if now_time == page_time:  # `now_time` (today's date) is set elsewhere
            print(url, ' match time')
            title = parser.get_title()
            body = parser.get_body()
            send_list.append((page_time + " NJIT:" + title,
                              title + '\n' + body + '\n\n' + url))
        time.sleep(1)  # pause between requests
    print(len(send_list), send_list)
    for title, body in send_list:
        mail.send_mail_to(title, body)
        time.sleep(1)  # pace outgoing mail
        # Collect links from all three sources, then return the accumulated set.
        self.get_njit_link()
        self.get_info_link()
        self.get_xh_link()
        return self.set

    def get_njit_link(self):
        # jwc notices use relative links into content.jsp.
        p = 'href="content.jsp(.*?)"'
        return self.parse_link(p, 'http://jwc.njit.edu.cn/content.jsp')

    def get_info_link(self):
        # Main-site notices are absolute links under /info.
        p = 'href="http://www.njit.edu.cn/info(.*?)"'
        return self.parse_link(p, 'http://www.njit.edu.cn/info')

    def get_xh_link(self):
        # xinghuo campus notices are absolute links under /info.
        p = 'href="http://xinghuo.njit.edu.cn/info(.*?)"'
        return self.parse_link(p, 'http://xinghuo.njit.edu.cn/info')


if __name__ == '__main__':
    # r = requests.get('http://www.njit.edu.cn')
    # r.encoding = 'utf-8'
    # catcher = njit_catcher(r.text)
    # links = catcher.get_all_link()
    # print(len(links), links)
    import spider

    text = spider.get_html('http://jwc.njit.edu.cn/')
    catcher = jwc_catcher(text)
    link = catcher.get_all_link()
    print(len(link), link)
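# `parse_link` is defined elsewhere in the catcher class. The sketch below is
# a guess from the three call sites above: each pattern captures the tail of
# an href, and the prefix restores an absolute URL. `self.text` (the page
# HTML) and `self.set` (the accumulator returned above) are assumed names:
import re


def parse_link_sketch(self, pattern, prefix):
    links = {prefix + tail for tail in re.findall(pattern, self.text)}
    self.set.update(links)
    return links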
def baidu_search():
    """Proxy a Baidu search for the 'wd' query parameter."""
    keyword = request.args.get('wd')
    return get_html(keyword)
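# `get_html` here is defined elsewhere. A minimal sketch under the assumption
# that it simply forwards the keyword to Baidu's `wd` parameter and returns
# the raw HTML; the real helper's headers and error handling may differ:
import requests


def get_html(keyword):
    resp = requests.get('https://www.baidu.com/s',
                        params={'wd': keyword},
                        headers={'User-Agent': 'Mozilla/5.0'},
                        timeout=10)
    resp.encoding = 'utf-8'
    return resp.text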