import argparse

import parse_url


def main():
    list1 = []
    parser = argparse.ArgumentParser(description="WebCrawler")
    parser.add_argument('-url', help='Initial url', nargs='?',
                        default="https://www.python.org", type=str)
    parser.add_argument('-depth', help='Depth for crawl', nargs='?',
                        default=2, type=int)
    args = parser.parse_args()
    url = args.url
    depth = args.depth
    if url is None or depth is None:
        print("Arguments empty. Please enter a valid url and depth")
        exit(0)
    print("Url considered -", url)
    print("Going", depth, "levels deep")
    parse_url.parse_url(url, list1, depth)

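# The parse_url module that main() calls isn't shown in this collection. Below
# is a minimal sketch of what its parse_url(url, visited, depth) crawler might
# look like, assuming it fetches each page, collects absolute links, and
# recurses until the depth budget runs out. The requests/regex approach here is
# an illustrative assumption, not the original implementation.
import re

import requests


def parse_url(url, visited, depth):
    # stop when the depth budget is used up or the url was already seen
    if depth <= 0 or url in visited:
        return
    visited.append(url)
    try:
        html = requests.get(url, timeout=10).text
    except requests.RequestException:
        return
    # follow every absolute link found on the page
    for link in re.findall(r'href="(https?://[^"]+)"', html):
        parse_url(link, visited, depth - 1)
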
import io

import pycurl

from parse_url import parse_url


def fetch(url, proxy=None, ua=None, cookie=None, verbose=False):
    buf = io.BytesIO()
    c = pycurl.Curl()
    c.setopt(c.URL, url)
    if cookie:
        c.setopt(c.COOKIEFILE, cookie)
        c.setopt(c.COOKIEJAR, cookie)
    if ua:
        c.setopt(c.USERAGENT, ua)
    if proxy:
        # parse_url() splits the proxy string into host/port/user/pass
        proxy = parse_url(proxy)
        c.setopt(c.PROXY, proxy['host'])
        c.setopt(c.PROXYUSERPWD, "%s:%s" % (proxy['user'], proxy['pass']))
        if proxy['port']:
            c.setopt(c.PROXYPORT, int(proxy['port']))
    # c.setopt(c.CONNECTTIMEOUT, 10)
    # c.setopt(c.TIMEOUT, 20)
    c.setopt(c.FAILONERROR, True)
    c.setopt(c.VERBOSE, verbose)
    c.setopt(c.FOLLOWLOCATION, 1)
    c.setopt(c.WRITEFUNCTION, buf.write)
    c.setopt(c.HTTPHEADER, ['Accept: text/html', 'Accept-Charset: UTF-8'])
    c.perform()
    body = buf.getvalue()
    buf.close()
    return body

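# fetch() above expects parse_url(proxy) to return a dict with 'host', 'port',
# 'user' and 'pass' keys. A minimal sketch of such a helper built on the
# standard library; the real project's version may differ.
from urllib.parse import urlsplit


def parse_url(proxy):
    # urlsplit needs a '//' to recognise the authority part
    parts = urlsplit(proxy if '//' in proxy else '//' + proxy)
    return {
        'host': parts.hostname,
        'port': parts.port,
        'user': parts.username,
        'pass': parts.password,
    }

# e.g. parse_url('http://user:secret@10.0.0.1:8080')
# -> {'host': '10.0.0.1', 'port': 8080, 'user': 'user', 'pass': 'secret'}
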
def run(self):
    url_list = self.get_list_url()
    # enumerate instead of list.index() so duplicate urls still get the right page number
    for pageNum, url in enumerate(url_list, start=1):
        print('Requesting url', url)
        pythonJson = parse_url(url)
        print('Response json', pythonJson)
        self.save_json(pythonJson, pageNum)

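# Most of the scraper snippets in this collection import parse_url from a
# local parse_url module and treat it as "GET this url, return the body as a
# string". A minimal sketch of such a helper, assuming it is a thin requests
# wrapper with a default mobile User-Agent; the retry count and exact headers
# are assumptions.
import requests

DEFAULT_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/71.0.3578.98 Mobile Safari/537.36",
}


def parse_url(url, headers=None):
    # retry a few times before giving up
    for _ in range(3):
        try:
            response = requests.get(url, headers=headers or DEFAULT_HEADERS,
                                    timeout=10)
            return response.content.decode()
        except requests.RequestException:
            continue
    raise RuntimeError('failed to fetch ' + url)
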
def home():
    ha = None
    if request.method == "POST":
        url = request.form.get("search")
        limit = int(request.form.get("limit"))
        urlParsed = parse_url(url)
        if urlParsed == -1:
            ha = {"errors": "Not a valid amazon product url."}
        else:
            ha = get_data(urlParsed, limit)
    return render_template("index.html", text=ha)

def get_content_list(self, url):
    html_str = parse_url(url)
    html = etree.HTML(html_str)
    content_list = []
    for div in html.xpath("//div[@id='content-left']"):
        item = {}
        item['content'] = div.xpath('.//div[@class="content"]/span/text()')
        # the class attribute looks like "articleGender womenIcon";
        # the last token minus the "Icon" suffix is the gender
        item['author_gender'] = div.xpath(
            '..//div[contains(@class,"articleGender")]/@class')
        item['author_gender'] = (
            item['author_gender'][0].split(' ')[-1].replace('Icon', '')
            if len(item['author_gender']) > 0 else None)
        content_list.append(item)
    return content_list

def get_img_list(self, detail_url):
    # 3.1 Extract the url and title from the list page
    # 3.2 Request the list page url to get the first detail page
    if detail_url is not None:
        detail_html_str = parse_url(detail_url)
        detail_html = etree.HTML(detail_html_str)
        # 3.3 Extract the images from the first detail page and the address of the next page
        img_list = detail_html.xpath('//img[@class="BDE_Image"]/@src')
        print(detail_html_str)
        # 3.4 Get the details
    else:
        img_list = []
    return img_list

def run(self):
    num = 0
    total = 100
    while num < total + 10:  # pay close attention to this loop condition
        # 1. Build the request URL
        url = self.temp_url.format(num)
        # 2. Request the URL; it returns the page source as a JSON string
        html_str = parse_url(url)
        print(url)
        # 3. Convert the JSON string to a dict and extract the useful content
        content_list, total = self.get_content_list(html_str)
        # 4. Convert the wanted content back to a JSON string and save it
        #    locally (or write it back to the browser)
        self.save_content_list(content_list)
        # 5. Build the next URL to fetch the next batch of records
        num += 10

def run(self):
    # 1. start_url
    num = 0
    total = 100
    while num < total + 18:
        # 2. Send the request
        url = self.start_temp(num)
        json_str = parse_url(url)
        # 3. Extract the data
        content_list, total = self.get_content_list(json_str)
        # 4. Save
        self.save_content_list(content_list)
        # if len(content_list) < 18:
        #     break
        # 5. Build the next page url
        num += 18

def run(self):
    # 1. Iterate over the url list and send requests
    num = 0
    with open("AmericanTv.json", "w", encoding="utf-8") as f:
        while num < self.get_total_tvs():
            url = ("https://m.douban.com/rexxar/api/v2/subject_collection/"
                   "filter_tv_american_hot/items?start=" + str(num) +
                   "&count=18&loc_id=108288")
            # 2. Get the returned data
            data = parse_url(url)
            # 3. Save locally
            json.dump(data, f, indent=2, ensure_ascii=False)
            num += 18

def run(self):
    # 1. start_url
    pn = 0
    take = 50
    while pn <= 0:  # as written this only fetches the first page
        url = self.start_url.format(pn * take)
        # 2. Send the request, get the response
        html_str = parse_url(url)
        # 3. Extract the data and the url of the next page
        # 3.1 Extract the url and title from the list page
        # 3.2 Request the list page url to get the first detail page
        # 3.3 Extract the images from the first detail page and the address of the next page
        # 3.4 Get the details
        content_list = self.get_content_list(html_str)
        # 4. Save
        self.save_content_list(content_list)
        # 5. Request the next page
        pn += 1

import os
from itertools import count
from pathlib import Path
from shutil import copyfileobj

import requests

from parse_url import parse_url


def main(url1, url2, out_dir):
    image_url = parse_url(url1, url2)
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    file_ext = image_url[-4:]
    for i in count(1, 1):
        try:
            r = requests.get(image_url.format(i), stream=True)
            if r.status_code == 200:
                img_file = os.path.join(out_dir, str(i) + file_ext)
                with open(img_file, 'wb') as f:
                    r.raw.decode_content = True
                    copyfileobj(r.raw, f)
            else:
                print('image ' + str(i) + ' not found')
                break
        except requests.exceptions.RequestException as e:
            raise SystemExit(e)
        else:
            print('done image ' + str(i))

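# main() above relies on parse_url(url1, url2) turning two sample image urls
# into a format-string template with '{}' where the image number goes (see the
# image_url.format(i) call). A minimal sketch under that assumption: compare
# the numeric runs in the two urls and template the first one that differs.
# The algorithm is illustrative, not the original implementation.
import re


def parse_url(url1, url2):
    nums1 = list(re.finditer(r'\d+', url1))
    nums2 = re.findall(r'\d+', url2)
    for match, other in zip(nums1, nums2):
        if match.group() != other:
            return url1[:match.start()] + '{}' + url1[match.end():]
    raise ValueError('urls do not differ in a numeric segment')

# e.g. parse_url('https://example.com/img_1.png', 'https://example.com/img_2.png')
# -> 'https://example.com/img_{}.png'
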
import io

import pycurl

from parse_url import parse_url


def fetch(url, proxy=None, ua=None, cookie=None, verbose=True):
    buf = io.BytesIO()
    c = pycurl.Curl()
    c.setopt(c.URL, url)
    if cookie:
        c.setopt(c.COOKIEFILE, cookie)
        c.setopt(c.COOKIEJAR, cookie)
    if ua:
        c.setopt(c.USERAGENT, ua)
    if proxy:
        # parse_url() splits the proxy string into host/port/user/pass
        proxy = parse_url(proxy)
        c.setopt(c.PROXY, proxy['host'])
        c.setopt(c.PROXYPORT, int(proxy['port']))
        c.setopt(c.PROXYUSERPWD, "%s:%s" % (proxy['user'], proxy['pass']))
    # c.setopt(c.CONNECTTIMEOUT, 10)
    # c.setopt(c.TIMEOUT, 20)
    c.setopt(c.FAILONERROR, True)
    c.setopt(c.VERBOSE, verbose)
    c.setopt(c.FOLLOWLOCATION, 1)
    c.setopt(c.WRITEFUNCTION, buf.write)
    c.setopt(c.HTTPHEADER, ['Accept: text/html', 'Accept-Charset: UTF-8'])
    try:
        c.perform()
        status = c.getinfo(c.HTTP_CODE)
        body = buf.getvalue()
        buf.close()
        return {'status': status, 'response': body}
    except pycurl.error as error:
        errno, errstr = error.args
        return {'status': errno, 'response': errstr}

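# A short usage sketch for the dict-returning fetch() above. The proxy string
# format is an assumption; see the parse_url(proxy) sketch earlier.
if __name__ == '__main__':
    result = fetch('https://www.python.org',
                   proxy='http://user:secret@10.0.0.1:8080',
                   ua='Mozilla/5.0', verbose=False)
    print(result['status'])
    print(result['response'][:200])
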
def main():
    url = "http://127.0.0.1:8000/return_json"
    html_str = parse_url(url)
    ret1 = json.loads(html_str)
    # pprint(ret1)

    # with open("douban.json", "w", encoding="utf-8") as f:
    #     f.write(json.dumps(ret1, ensure_ascii=False, indent=2))
    #
    # with open("douban.json", "r", encoding="utf-8") as f:
    #     ret2 = f.read()
    #     ret3 = json.loads(ret2)
    #     print(ret3)

    # use json.load to extract data from a file-like object
    with open("douban.json", "r", encoding="utf-8") as f:
        ret4 = json.load(f)
        print(ret4)
        print(type(ret4))

    # use json.dump to write a Python object into a file-like object
    with open("douban1.json", "w", encoding="utf-8") as f:
        json.dump(ret1, f, ensure_ascii=False, indent=2)

df = get_reports.get_report(report, client, ids, downloader)
dictionary = parse_url.init_dictionary(dictfile)
elapsed = (time.time() - start) / 60
print("Run time: " + str(elapsed))

start = time.time()
print("Creating headers")
df, urlparams = parse_url.create_headers(df)
elapsed = (time.time() - start) / 60
print("Run time: " + str(elapsed))

start = time.time()
print("Parsing URL")
df = parse_url.parse_url(df)
elapsed = (time.time() - start) / 60
print("Run time: " + str(elapsed))

start = time.time()
print("Checking values")
df = parse_url.typo_correct(urlparams, df, dictionary)
today = "".join(l for l in str(date.today())
                if l not in string.punctuation and l not in " ")
filename = report['reportType'] + today + '.csv'
df.to_csv(filename)
elapsed = (time.time() - start) / 60
print("Run time: " + str(elapsed))

total_elapsed = (time.time() - total_start) / 60

import json
import re

from parse_url import parse_url

url = 'https://36kr.com/'
html_str = parse_url(url)
ret = re.findall('<script>var props=(.*?),locationnal=', html_str)[0]
with open('36kr.json', 'w', encoding='utf-8') as f:
    f.write(ret)
ret = json.loads(ret)
print(ret)

import json
from pprint import pprint

from parse_url import parse_url

url = "https://m.douban.com/rexxar/api/v2/subject_collection/movie_showing/items?start=0&count=18&loc_id=108288"
# url = "https://www.baidu.com"
html_str = parse_url(url)  # renamed from `str` to avoid shadowing the builtin

# json.loads converts a JSON string to a Python object
ret = json.loads(html_str)
# pprint(ret1)
# print(type(ret1))

# json.dumps converts a Python object to a JSON string
# (what you write to a file must be a string)
with open("douban.json", "w", encoding="utf-8") as f:
    # keep Chinese characters readable, add newlines and a two-space indent
    f.write(json.dumps(ret, ensure_ascii=False, indent=2))

with open("douban.json", "r") as f:
    ret2 = f.read()
    ret3 = json.loads(ret2)
    print(ret3)
    print(type(ret3))

# use json.load to extract data from a file-like object
with open("douban.json", "r", encoding="utf-8") as f:
    ret4 = json.load(f)
    print(ret4)
    print(type(ret4))

# use json.dump to write a Python object into a file-like object
with open("douban.json", "w", encoding="utf-8") as f:
    json.dump(ret4, f, ensure_ascii=False, indent=2)

def get_total(self):
    html_str = parse_url(self.url)
    # json.loads converts the JSON string to a Python type
    ret1 = json.loads(html_str)
    total = ret1["total"]
    return total

import urllib3

urllib3.disable_warnings()

import requests
import json
import re

from parse_url import parse_url

proxies = {
    'http': 'http://110.86.137.0:9999',
    'https': 'http://110.86.137.0:9999'
}
ex_url = 'https://movie.douban.com/j/search_subjects?type=movie&tag={}&page_limit=1000&page_start=0'
url = 'https://movie.douban.com/j/search_tags?type=movie&source=index'
response_dict = json.loads(parse_url(url))
move_name_list = response_dict["tags"]
print(move_name_list)
url_list = [ex_url.format(i) for i in move_name_list]
print(url_list)
for url in url_list:
    html_str = parse_url(url)
    print(html_str)
    # convert the JSON data into Python data
    response = json.loads(html_str)
    print(re.findall(r'tag=(.*?)&', url))

def data(self, name):
    for url in self.L_url:
        self.html_str = parse_url(url)
        ret = json.loads(self.html_str)
        with open(name, "a", encoding="utf-8") as f:
            f.write(json.dumps(ret, ensure_ascii=False, indent=4))

def get_total_tvs(self):
    """Get the total record count."""
    data = json.loads(parse_url(self.start_url))
    return int(data["total"])

# coding=utf-8
import json
import re

from parse_url import parse_url

url = "http://36kr.com/"
headers = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Mobile Safari/537.36",
}
htmlStr = parse_url(url, headers)
ret = re.findall("<script>var props=(.*?),locationnal=", htmlStr)[0]
with open("./res/36kr.json", "w", encoding='utf-8') as f:
    f.write(ret)
ret = json.loads(ret)
print(ret)