def get_haha365_urls():
    """Fetch the haha365 joke index page and extract article URLs.

    Returns:
        tuple: (list of absolute article URLs, the encoding used — ``coding[1]``,
        a module-level name assumed to be defined elsewhere in this file).
    """
    list_url = r'http://www.haha365.com/xd_joke/index.htm'
    base_url = r'http://www.haha365.com'
    getter = HtmlSourceGetter()
    # Fetch the index page normalized to the expected encoding.
    html_data = getter.get_uniform_html_source(list_url, coding[1])
    # NOTE(review): pattern is tied to the site's exact markup (class/attr
    # spelling included) — brittle if the page layout changes.
    pattern = r'<img src="/Pic/02.gif"><a Class="" target="_blank" href="(.*?)" >'
    uris = re.findall(pattern, html_data['data'])
    # Relative URIs from the page become absolute URLs.
    result_urls = [base_url + uri for uri in uris]
    return (result_urls, coding[1])
def get_raw_html():
    """Step 1: fetch each listed URL as uniformly-encoded HTML and save it locally.

    Fetches every URL returned by ``get_url_list()``, then writes each
    non-empty page (request URL, blank line, then the HTML with CRLF pairs
    stripped) to a ``.ymg_html`` file under ``out_source_dir_``.

    Returns:
        int: the number of files actually written (pages with empty data
        are skipped and not counted).
    """
    getter = HtmlSourceGetter()
    (urls, coding) = get_url_list()

    # Fetch all pages, normalized to the detected encoding.
    html_data = {}
    for url in urls:
        html_data[url] = getter.get_uniform_html_source(url, coding)

    # Write the raw sources out, numbering files from 1.
    create_dir(out_source_dir_)
    i = 0
    for url, raw_data in html_data.items():
        if not raw_data['data']:
            # Skip failed/empty fetches; they don't consume a file number.
            continue
        file_path = out_source_dir_ + create_file_name(today, site_name, i + 1, '.ymg_html')
        # `with` guarantees the handle is closed even if a write raises
        # (the original leaked the handle on error).
        with open(file_path, 'w') as raw_file:
            raw_file.write(raw_data['request_url'])
            raw_file.write(u'\n\n')
            raw_file.write(raw_data['data'].replace('\r\n', ''))
        i += 1
    return i