def crawling(self):
    curl = Curl()
    curl.set_url(self.url)
    with open(self.filname, "wb") as output:
        curl.set_option(pycurl.WRITEFUNCTION, output.write)
        curl.get()
    curl.close()
def get_zip(self, url, filename):
    # The wrapper's get() performs the transfer itself, so WRITEDATA must be
    # set before the request; the original called get() first (the option had
    # no effect) and then a perform() method the wrapper does not expose.
    # WRITEDATA lives on the pycurl module, not on the wrapper instance.
    with open(filename, "wb") as fp:
        c = Curl()
        c.set_option(pycurl.WRITEDATA, fp)
        c.get(url)
        c.close()
def saveFile2Local(self, url):
    self.getFileNameByUrl(url)
    if self.filename:
        with open(self.filename, "wb") as output:
            curl = Curl()
            curl.set_url(url)
            curl.set_option(pycurl.WRITEFUNCTION, output.write)
            curl.get()
            curl.close()
        Log4Spider.downLog(self, "downloaded a file:[[[", self.filename, "]]]")
def crawlCraigz(iterations):
    global cities
    global links
    # Compare with != rather than "is not": identity checks on ints are
    # an implementation detail and a SyntaxWarning on newer Pythons.
    if get_counter(0) != 0 and iterations * 120 != get_counter(0):
        return
    cl = Curl(base_url="", fakeheaders=[
        'Cookie: cl_b=5GEZ9Y0F6RGXNBZ5tq5GrwngXVs; cl_def_hp=dallas',
    ])
    page = cl.get("http://dallas.craigslist.org/search/roo", {
        's': get_counter(120),
        'search_distance': 13,
        'postal': 75214,
        'min_price': 400,
        'max_price': 600,
        'availabilityMode': 0,
    })
    doc = lhtml.document_fromstring(page)
    for l in doc.iterlinks():
        for c in cities:
            linktext = l[2]
            linktext = linktext[14:]
            if c in str(l[0].text) or c.lower() in linktext:
                links.append(l[2] + '\n')
                print(l[2])
    return crawlCraigz(iterations)
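# crawlCraigz above depends on module-level state (cities, links, get_counter)
# that is not shown in this snippet. The stand-ins below are assumptions made
# purely so the function can be exercised; the original module's definitions
# may differ.
cities = ['Dallas', 'Plano', 'Richardson']  # made-up search terms
links = []                                  # collected listing URLs

_counter = 0

def get_counter(step):
    # Toy stand-in: returns the current offset, then advances it by step.
    global _counter
    value = _counter
    _counter += step
    return value

# crawlCraigz(3) would then walk up to 3 pages of 120 results each,
# appending matching listing URLs to links.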
def check_projects(self, projects):
    # scrapyd's listprojects endpoint; the original URL string carried a
    # stray "curl " prefix left over from a shell command.
    url = 'http://localhost:6800/listprojects.json'
    curl = Curl(url)
    res = curl.get()
    rlt = json.loads(res)
    for r in rlt['projects']:
        if r == 'soufun_s2':
            return True
    return False
import io
import pycurl

# USER_AGENT is defined elsewhere in the original module.
def fetch_url(url, nobody=0, timeout=30, follow_redirect=0, agent=USER_AGENT):
    """Fetch url using curl.

    :param url: target URL
    :param nobody: 1 to skip the response body (HEAD-style request)
    :param timeout: request timeout in seconds
    :param follow_redirect: 1 to follow Location headers
    :param agent: User-Agent string
    """
    t = io.BytesIO()  # pycurl writes bytes, so BytesIO, not StringIO
    c = Curl()
    c.set_option(pycurl.USERAGENT, agent)
    c.set_option(pycurl.URL, url.encode('utf-8'))
    c.set_option(pycurl.NOBODY, nobody)
    c.set_option(pycurl.FOLLOWLOCATION, follow_redirect)
    c.set_option(pycurl.WRITEFUNCTION, t.write)
    c.set_option(pycurl.TIMEOUT, timeout)
    try:
        c.get()
    except pycurl.error:
        return (None, None)
    return (c, t)
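# A minimal usage sketch for fetch_url above (example.com is a placeholder
# URL; fetch_url returns (None, None) on any pycurl error):
c, buf = fetch_url("http://example.com/", timeout=10, follow_redirect=1)
if c is not None:
    print(c.get_info(pycurl.HTTP_CODE))  # status code of the fetch
    print(len(buf.getvalue()))           # size of the downloaded body in bytes
    c.close()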
        if self.infos == 5 and 'beliked' not in self.info.keys():
            self.info['beliked'] = int(data)

    def handle_endtag(self, tag):
        if tag == "h3":
            self.h3 = 0
        if self.clearfix and tag == "ul":
            self.clearfix = 0
            if hasattr(self, "infoHook"):
                self.infoHook(self.info)

    def handle_startendtag(self, tag, attrs):
        pass

    @property
    def urlList(self):
        return self.current_urlList()


if __name__ == "__main__":
    parser = JianShuUserInfo_HtmlParser()
    from curl import Curl
    import pycurl
    c = Curl()
    c.set_url("http://www.jianshu.com/users/d9edcb44e2f2/latest_articles")
    data = c.get()
    #parser.setParseFile("parse.txt")
    parser.setInfoHook(lambda info: print(str(info)))
    parser.feed(data.decode("utf-8"))
    parser.close()
    c.close()
from curl import Curl

url = 'http://www.baidu.com/'
curl = Curl(url)  # the original constructed Curl() but never passed the URL
res = curl.get()
print(res)
from curl import Curl
import pycurl  # needed for pycurl.OS_ERRNO below; missing in the original
import sys

if __name__ == "__main__":
    if len(sys.argv) < 2:
        url = 'http://curl.haxx.se'
    else:
        url = sys.argv[1]
    c = Curl()
    c.get(url)
    print(c.body())
    print('=' * 74 + '\n')
    import pprint
    pprint.pprint(c.info())
    print(c.get_info(pycurl.OS_ERRNO))
    print(c.info()['os-errno'])
    c.close()
def pycurl_detection(url, ip):
    '''Probe routine.

    :param url: request URL
    :param ip: IP address resolved via dig
    :return: status code, response time, download size, download speed, redirect count
    '''
    c = None
    try:
        domain = getDomain(url)
        path = getPath(url)  # was commented out, but the request line below needs it
        new_url = url.replace(domain, ip)
        header = [
            'GET %s HTTP/1.1' % path,
            'Host: %s' % domain,
            'Accept: */*',
            'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language: zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept-Encoding: gzip, deflate',
            'Connection: keep-alive',
            'Cache-Control: no-cache',
            'User-Agent: Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
        ]
        if url.find("baidu.com") > 0:
            header.pop(len(header) - 1)
        c = Curl(fakeheaders=header)
        c.get(new_url)
        http_code = c.get_info(pycurl.HTTP_CODE)                    # HTTP status code returned
        size_download = c.get_info(pycurl.SIZE_DOWNLOAD)            # bytes downloaded
        speed_download = c.get_info(pycurl.SPEED_DOWNLOAD)          # average download speed
        file_time = c.get_info(pycurl.INFO_FILETIME)                # remote time of the retrieved document
        namelookup_time = c.get_info(pycurl.NAMELOOKUP_TIME)        # time spent on DNS resolution
        content_time = c.get_info(pycurl.CONNECT_TIME)              # time to establish the connection
        pretransfer_time = c.get_info(pycurl.PRETRANSFER_TIME)      # time from connect until ready to transfer
        starttransfer_time = c.get_info(pycurl.STARTTRANSFER_TIME)  # time from connect until the first byte arrives
        total_time = c.get_info(pycurl.TOTAL_TIME)                  # total time for the whole transfer
        redirect_time = c.get_info(pycurl.REDIRECT_TIME)            # time spent on redirects
        redirect_url = c.get_info(pycurl.REDIRECT_URL)              # redirect URL
        redirect_count = c.get_info(pycurl.REDIRECT_COUNT)          # number of redirects
        primary_ip = ''    # c.get_info(pycurl.PRIMARY_IP)
        primary_port = ''  # c.get_info(pycurl.PRIMARY_PORT)
        local_ip = ''      # c.get_info(pycurl.LOCAL_IP)
        local_port = ''    # c.get_info(pycurl.LOCAL_PORT)
        info = c.info()
        header = c.header()
        # Renamed from "str", which shadowed the builtin.
        msg = '''
        url:%s, ip:%s, size_download:%s, speed_download:%s, file_time:%s, redirect_count:%s,
        namelookup_time:%s, content_time:%s, pretransfer_time:%s, starttransfer_time:%s, total_time:%s, redirect_time:%s
        redirect url:%s, count:%s
        primary ip:%s, port:%s
        local ip:%s, port:%s
        info:%s
        ''' % (url, ip, size_download, speed_download, file_time, redirect_count,
               namelookup_time, content_time, pretransfer_time, starttransfer_time,
               total_time, redirect_time, redirect_url, redirect_count,
               primary_ip, primary_port, local_ip, local_port, info)
        print(msg)
        '''
        # print("total time for the whole transfer: %s" % total_time)
        namelookup_time = c.get_info(pycurl.NAMELOOKUP_TIME)        # time spent on DNS resolution
        content_time = c.get_info(pycurl.CONNECT_TIME)              # time to establish the connection
        pretransfer_time = c.get_info(pycurl.PRETRANSFER_TIME)      # time from connect until ready to transfer
        starttransfer_time = c.get_info(pycurl.STARTTRANSFER_TIME)  # time from connect until the first byte arrives
        redirect_time = c.get_info(pycurl.REDIRECT_TIME)            # time spent on redirects
        size_upload = c.get_info(pycurl.SIZE_UPLOAD)                # bytes uploaded
        size_download = c.get_info(pycurl.SIZE_DOWNLOAD)            # bytes downloaded
        speed_download = c.get_info(pycurl.SPEED_DOWNLOAD)          # average download speed
        speed_upload = c.get_info(pycurl.SPEED_UPLOAD)              # average upload speed
        header_size = c.get_info(pycurl.HEADER_SIZE)                # HTTP header size
        print(c.body())
        print('=' * 74 + '\n')
        print(c.header())
        print('=' * 74 + '\n')
        import pprint
        pprint.pprint(c.info())
        print(c.get_info(pycurl.OS_ERRNO))
        print(c.info()['os-errno'])
        '''
    except Exception as e:
        msg = "def pycurl_detection(%s,%s) Exception %s" % (url, ip, e.args)
        print(msg)
        logging.exception(msg)
        return -1, -1, 0, 0, 0
    finally:
        if c is not None:  # Curl() may have failed before c was bound
            c.close()
    return http_code, "%.3f" % total_time, size_download, speed_download, redirect_count
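# Hypothetical call to pycurl_detection above: probe a URL through a specific
# resolved IP (both values below are made-up examples):
http_code, total_time, size, speed, redirects = pycurl_detection(
    "http://www.example.com/index.html", "93.184.216.34")
print("status=%s time=%ss size=%s redirects=%s" % (http_code, total_time, size, redirects))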
__author__ = 'zhangxa'

from curl import Curl
import pycurl
from html.parser import HTMLParser
from htmlParser.htmlParser import UrlHtmlParser
from download.downFile import DownFile
from urlHandler.urlHandler import UrlBaseHandler
from urlQueue.urlQueue import UrlQueue

start_url = "http://www.pcgames.com.cn/"
c = Curl()
c.set_url(start_url)
data = c.get()
info = c.info()
#print(info)

def get_charset(c_type):
    charset = None
    try:
        if c_type and 'charset' in c_type:
            start = c_type.find('charset=')
            charset_str = c_type[start:]
            end = charset_str.find(' ')
            if end > -1:
                charset = charset_str[len('charset='):end]
            else:
                charset = charset_str[len('charset='):]
    except:
        return 'UTF-8'
    return charset  # the original fell off the end and returned None
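# A small sketch of how get_charset might be combined with the page fetched
# above. The 'content-type' key is an assumption about the dict returned by
# the Curl wrapper's info(); the fallback to UTF-8 covers a missing charset.
c_type = info.get('content-type', '')
charset = get_charset(c_type) or 'UTF-8'
text = data.decode(charset, errors='replace')
print(text[:200])  # first 200 characters of the decoded page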