Example #1
 def crawling(self):
     # Stream the response body straight into the local file.
     curl = Curl()
     curl.set_url(self.url)
     with open(self.filename, "wb") as output:
         curl.set_option(pycurl.WRITEFUNCTION, output.write)
         curl.get()
         curl.close()
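These snippets use a high-level `Curl` class rather than `pycurl.Curl` directly; the `set_url`/`set_option`/`get` calls match the convenience wrapper shipped as `curl.py` among pycurl's examples, where `get()` performs the transfer and returns the response body. As a minimal sketch, here is a hypothetical driver for the `crawling()` method above (the `Downloader` class and the example URL are illustrative, not from the source):

import pycurl
from curl import Curl  # convenience wrapper from pycurl's examples

class Downloader:
    def __init__(self, url, filename):
        self.url = url            # remote resource to fetch
        self.filename = filename  # local path for the response body

Downloader.crawling = crawling  # attach the method shown above
Downloader("http://example.com/file.bin", "file.bin").crawling()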
Example #2
 def get_zip(self, url, filename):
     # Route the body to the file before the transfer starts;
     # the wrapper's get() both sets the URL and performs the request.
     with open(filename, "wb") as fp:
         c = Curl()
         c.set_option(pycurl.WRITEDATA, fp)
         c.get(url)
         c.close()
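For comparison, a sketch of the same download against the raw pycurl API, without the wrapper (the function name is illustrative):

import pycurl

def download(url, filename):
    # Plain pycurl: configure the handle, stream the body to a file, perform.
    c = pycurl.Curl()
    c.setopt(pycurl.URL, url)
    with open(filename, "wb") as fp:
        c.setopt(pycurl.WRITEDATA, fp)
        c.perform()
    c.close()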
Example #3
 def saveFile2Local(self, url):
     self.getFileNameByUrl(url)
     if self.filename:
         with open(self.filename, "wb") as output:
             curl = Curl()
             curl.set_url(url)
             curl.set_option(pycurl.WRITEFUNCTION, output.write)
             curl.get()
             curl.close()
             Log4Spider.downLog(self, "downloaded a file:[[[", self.filename, "]]]")
Example #4
def crawlCraigz(iterations):
    # Assumes lxml is imported as: from lxml import html as lhtml,
    # and that get_counter(), cities and links are defined in this module.
    global cities
    global links

    # Stop once the paging counter has walked past iterations * 120 results.
    if get_counter(0) != 0 and iterations * 120 != get_counter(0):
        return

    cl = Curl(base_url="", fakeheaders=[
        'Cookie: cl_b=5GEZ9Y0F6RGXNBZ5tq5GrwngXVs; cl_def_hp=dallas',
        ])

    page = cl.get("http://dallas.craigslist.org/search/roo", {
        's': get_counter(120),  # paging offset, advanced by 120 per call
        'search_distance': 13,
        'postal': 75214,
        'min_price': 400,
        'max_price': 600,
        'availabilityMode': 0,
        })

    doc = lhtml.document_fromstring(page)
    # iterlinks() yields (element, attribute, url, pos) tuples.
    for link in doc.iterlinks():
        for city in cities:
            linktext = link[2][14:]  # drop the leading URL prefix characters
            if city in str(link[0].text) or city.lower() in linktext:
                links.append(link[2] + '\n')
                print(link[2])

    return crawlCraigz(iterations)
Example #5
    def check_projects(self, projects):
        # Ask the scrapyd daemon which projects are deployed.
        url = 'http://localhost:6800/listprojects.json'
        curl = Curl(url)
        res = curl.get()
        rlt = json.loads(res)
        return 'soufun_s2' in rlt['projects']
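listprojects.json is part of scrapyd's JSON API (scrapyd listens on port 6800 by default); a successful response looks roughly like {"status": "ok", "projects": ["soufun_s2", ...]}, which is why a membership test against rlt['projects'] is enough here.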
Example #6
def fetch_url(url, nobody=0, timeout=30, follow_redirect=0, agent=USER_AGENT):
    """Fetch url using curl.

    :param url: address to fetch
    :param nobody: if 1, request headers only (pycurl.NOBODY)
    :param timeout: transfer timeout in seconds
    :param follow_redirect: if 1, follow Location redirects
    :param agent: User-Agent string to send
    """
    t = io.BytesIO()  # pycurl hands the write callback bytes, so BytesIO
    c = Curl()
    c.set_option(pycurl.USERAGENT, agent)
    c.set_option(pycurl.URL, url.encode('utf-8'))
    c.set_option(pycurl.NOBODY, nobody)
    c.set_option(pycurl.FOLLOWLOCATION, follow_redirect)
    c.set_option(pycurl.WRITEFUNCTION, t.write)
    c.set_option(pycurl.TIMEOUT, timeout)
    try:
        c.get()
    except pycurl.error:
        return (None, None)
    return (c, t)
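An illustrative call (the URL is a placeholder): a headers-only probe that follows redirects, then reads the final status code through the returned handle.

conn, buf = fetch_url("http://example.com/", nobody=1, follow_redirect=1)
if conn is not None:
    print(conn.get_info(pycurl.HTTP_CODE))  # status code after redirects
    conn.close()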
Example #7
            # (fragment: tail of a handle_data() method that fills self.info)
            if self.infos == 5 and 'beliked' not in self.info:
                self.info['beliked'] = int(data)

    def handle_endtag(self, tag):
        if tag == "h3":
            self.h3 = 0
        if self.clearfix and tag == "ul":
            self.clearfix = 0
            if hasattr(self, "infoHook"):
                self.infoHook(self.info)

    def handle_startendtag(self, tag, attrs):
        pass

    @property
    def urlList(self):
        return self.current_urlList()


if __name__ == "__main__":
    parser = JianShuUserInfo_HtmlParser()
    from curl import Curl
    import pycurl
    c = Curl()
    c.set_url("http://www.jianshu.com/users/d9edcb44e2f2/latest_articles")
    data = c.get()
    #parser.setParseFile("parse.txt")
    parser.setInfoHook(lambda info: print(str(info)))
    parser.feed(data.decode("utf-8"))
    parser.close()
    c.close()
Example #8
from curl import Curl

curl = Curl()
url = 'http://www.baidu.com/'
res = curl.get(url)  # pass the url; a bare get() would fetch the empty base_url
print(res)
Example #9
from curl import Curl
import pycurl
import sys

if __name__ == "__main__":
    if len(sys.argv) < 2:
        url = 'http://curl.haxx.se'
    else:
        url = sys.argv[1]
    c = Curl()
    c.get(url)
    print(c.body())
    print('=' * 74 + '\n')
    import pprint
    pprint.pprint(c.info())
    print(c.get_info(pycurl.OS_ERRNO))
    print(c.info()['os-errno'])
    c.close()
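Note that c.get_info(pycurl.OS_ERRNO) and c.info()['os-errno'] read the same value two ways: the wrapper's info() gathers the handle's getinfo() results into a dict keyed by lower-cased names, which is what the back-to-back prints demonstrate.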
Example #10
def pycurl_detection(url, ip):
    '''
        Probe routine.
        :param url: request URL
        :param ip: IP address obtained via dig
        :return: status code, response time and download statistics
    '''
    c = None
    try:
        domain = getDomain(url)
        path = getPath(url)  # needed below for the request line
        new_url = url.replace(domain, ip)  # probe the given IP directly

        header = [
            'GET %s HTTP/1.1' % path,
            'Host: %s' % domain,
            'Accept: */*',
            'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language: zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept-Encoding: gzip, deflate',
            'Connection: keep-alive',
            'Cache-Control: no-cache',
            'User-Agent: Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
        ]
        if url.find("baidu.com") > 0:
            header.pop()  # baidu gets no explicit User-Agent header
        c = Curl(fakeheaders=header)
        c.get(new_url)
        http_code = c.get_info(pycurl.HTTP_CODE)  # HTTP status code

        size_download = c.get_info(pycurl.SIZE_DOWNLOAD)  # bytes downloaded
        speed_download = c.get_info(pycurl.SPEED_DOWNLOAD)  # average download speed
        file_time = c.get_info(pycurl.INFO_FILETIME)  # remote time of the document

        namelookup_time = c.get_info(pycurl.NAMELOOKUP_TIME)  # DNS resolution time
        connect_time = c.get_info(pycurl.CONNECT_TIME)  # time until the connection was established
        pretransfer_time = c.get_info(
            pycurl.PRETRANSFER_TIME)  # time until ready to transfer
        starttransfer_time = c.get_info(
            pycurl.STARTTRANSFER_TIME)  # time until the first byte arrived
        total_time = c.get_info(pycurl.TOTAL_TIME)  # total transfer time
        redirect_time = c.get_info(pycurl.REDIRECT_TIME)  # time spent on redirects

        redirect_url = c.get_info(pycurl.REDIRECT_URL)  # redirect target URL
        redirect_count = c.get_info(pycurl.REDIRECT_COUNT)  # number of redirects

        primary_ip = ''  # c.get_info(pycurl.PRIMARY_IP)
        primary_port = ''  # c.get_info(pycurl.PRIMARY_PORT)
        local_ip = ''  # c.get_info(pycurl.LOCAL_IP)
        local_port = ''  # c.get_info(pycurl.LOCAL_PORT)

        info = c.info()

        resp_header = c.header()
        msg = '''
        url:%s, ip:%s, size_download:%s, speed_download:%s, file_time:%s, redirect_count:%s,
        namelookup_time:%s, connect_time:%s, pretransfer_time:%s, starttransfer_time:%s, total_time:%s, redirect_time:%s
        redirect url:%s, count:%s
        primary ip:%s, port:%s
        local ip:%s, port:%s
        info:%s
        ''' % (url, ip, size_download, speed_download, file_time,
               redirect_count, namelookup_time, connect_time, pretransfer_time,
               starttransfer_time, total_time, redirect_time, redirect_url,
               redirect_count, primary_ip, primary_port, local_ip, local_port,
               info)
        print(msg)

    except Exception as e:
        msg = "def pycurl_detection(%s,%s) Exception %s" % (url, ip, e.args)
        print(msg)
        logging.exception(msg)
        return -1, -1, 0, 0, 0
    finally:
        if c is not None:
            c.close()
    return http_code, "%.3f" % total_time, size_download, speed_download, redirect_count
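libcurl's timing counters are cumulative from the start of the request (NAMELOOKUP_TIME <= CONNECT_TIME <= PRETRANSFER_TIME <= STARTTRANSFER_TIME <= TOTAL_TIME), so per-phase durations fall out by subtracting neighbours. A sketch using the variables gathered above:

dns_time = namelookup_time                      # DNS resolution
tcp_time = connect_time - namelookup_time       # TCP handshake
setup_time = pretransfer_time - connect_time    # TLS / proxy negotiation
ttfb = starttransfer_time - pretransfer_time    # server think time
transfer = total_time - starttransfer_time      # body download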
Example #11
__author__ = 'zhangxa'

from curl import Curl
import pycurl

from html.parser import HTMLParser
from htmlParser.htmlParser import UrlHtmlParser
from download.downFile import DownFile
from urlHandler.urlHandler import UrlBaseHandler
from urlQueue.urlQueue import UrlQueue

start_url = "http://www.pcgames.com.cn/"
c = Curl()
c.set_url(start_url)
data = c.get()
info = c.info()
#print(info)

def get_charset(c_type):
    # Extract the charset token from a Content-Type header value,
    # e.g. "text/html; charset=gb2312" -> "gb2312".
    charset = None
    try:
        if c_type and 'charset' in c_type:
            start = c_type.find('charset=')
            charset_str = c_type[start:]
            end = charset_str.find(' ')
            if end > -1:
                charset = charset_str[len('charset='):end]
            else:
                charset = charset_str[len('charset='):]
    except Exception:
        return 'UTF-8'
    return charset
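A sketch of how get_charset might be applied to the page fetched above; this assumes info() exposes the Content-Type header under a 'content-type' key, the same dict-style lookup Example #9 uses for 'os-errno':

charset = get_charset(info.get('content-type')) or 'UTF-8'
text = data.decode(charset, errors='replace')  # decode the fetched bytes
print(text[:200])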