Exemplo n.º 1
0
 def crawling(self):
     """Fetch ``self.url`` and write the response body to ``self.filname``.

     The target file is opened in binary write mode and its ``write`` method
     is registered as the pycurl ``WRITEFUNCTION`` callback before
     ``curl.get()`` is called.  The curl handle is closed in a ``finally``
     clause so it is released even when opening the file or the transfer
     raises.
     """
     # NOTE(review): the attribute really is spelled ``filname``; it is set
     # outside this method, so the spelling is kept as-is.
     curl = Curl()
     try:
         curl.set_url(self.url)
         with open(self.filname, "wb") as output:
             curl.set_option(pycurl.WRITEFUNCTION, output.write)
             curl.get()
     finally:
         curl.close()
Exemplo n.º 2
0
 def crawling(self):
     """Download ``self.url`` into the local file ``self.filname``.

     ``output.write`` of the file (opened with mode ``"wb"``) is installed as
     the pycurl ``WRITEFUNCTION`` callback, then ``curl.get()`` is invoked.
     ``curl.close()`` runs in a ``finally`` block, so a failing ``open()`` or
     a failing transfer no longer leaves the handle open.
     """
     # NOTE(review): ``filname`` (sic) is the attribute name this method
     # reads; it is defined elsewhere and therefore left unchanged.
     curl = Curl()
     try:
         curl.set_url(self.url)
         with open(self.filname, "wb") as output:
             curl.set_option(pycurl.WRITEFUNCTION, output.write)
             curl.get()
     finally:
         curl.close()
Exemplo n.º 3
0
 def saveFile2Local(self, url):
     """Download ``url`` into the local file named by ``self.filename``.

     ``self.getFileNameByUrl(url)`` is called first; presumably it sets
     ``self.filename`` (verify against its definition).  Nothing is
     downloaded when ``self.filename`` is falsy.  After a successful
     transfer the download is reported through ``Log4Spider.downLog``.
     """
     self.getFileNameByUrl(url)
     if not self.filename:
         return
     with open(self.filename, "wb") as output:
         curl = Curl()
         try:
             curl.set_url(url)
             curl.set_option(pycurl.WRITEFUNCTION, output.write)
             curl.get()
         finally:
             # Release the handle even when the transfer raises.
             curl.close()
         Log4Spider.downLog(self, "downloaded a file:[[[", self.filename, "]]]")
Exemplo n.º 4
0
            if self.infos == 5 and 'beliked' not in self.info.keys():
                self.info['beliked'] = int(data)

    def handle_endtag(self, tag):
        """React to a closing tag.

        A closing ``h3`` resets ``self.h3`` to 0.  A closing ``ul`` seen
        while ``self.clearfix`` is truthy resets ``self.clearfix`` to 0 and,
        if the instance has an ``infoHook`` attribute, calls it with
        ``self.info``.
        """
        if tag == "h3":
            self.h3 = 0

        # Nothing more to do unless a tracked ``ul`` section is being closed.
        if not (self.clearfix and tag == "ul"):
            return

        self.clearfix = 0
        if not hasattr(self, "infoHook"):
            return
        self.infoHook(self.info)

    def handle_startendtag(self, tag, attrs):
        """Take no action for self-closing tags; always returns ``None``."""
        return None

    @property
    def urlList(self):
        """Return the result of calling ``self.current_urlList()``."""
        urls = self.current_urlList()
        return urls


if __name__ == "__main__":
    # Manual smoke test: fetch one jianshu.com user page, feed it to the
    # parser and print every info dict handed to the hook.
    parser = JianShuUserInfo_HtmlParser()
    from curl import Curl
    import pycurl
    c = Curl()
    try:
        c.set_url("http://www.jianshu.com/users/d9edcb44e2f2/latest_articles")
        data = c.get()
        #parser.setParseFile("parse.txt")
        parser.setInfoHook(lambda info: print(str(info)))
        parser.feed(data.decode("utf-8"))
        parser.close()
    finally:
        # Close the curl handle even if the fetch, decode or parse step raises.
        c.close()
Exemplo n.º 5
0
__author__ = 'zhangxa'

from curl import Curl
import pycurl

from html.parser import HTMLParser
from htmlParser.htmlParser import UrlHtmlParser
from download.downFile import DownFile
from urlHandler.urlHandler import UrlBaseHandler
from urlQueue.urlQueue import UrlQueue

# Seed URL requested below.
start_url = "http://www.pcgames.com.cn/"
# NOTE(review): the request is issued at module level, so it runs as soon as
# this file is executed or imported.
c = Curl()
c.set_url(start_url)
# presumably the response body -- confirm against curl.Curl.get
data = c.get()
# presumably metadata about the finished transfer -- confirm against curl.Curl.info
info = c.info()
#print(info)


def get_charset(c_type):
    charset = None
    try:
        if c_type and 'charset' in c_type:
            start = c_type.find('charset=')
            charset_str = c_type[start:]
            end = charset_str.find(' ')
            if end > -1:
                charset = charset_str[len('charset='):end]
            else:
                charset = charset_str[len('charset='):]
    except:
Exemplo n.º 6
0
__author__ = 'zhangxa'

from curl import Curl
import pycurl

from html.parser import HTMLParser
from htmlParser.htmlParser import UrlHtmlParser
from download.downFile import DownFile
from urlHandler.urlHandler import UrlBaseHandler
from urlQueue.urlQueue import UrlQueue

# Seed URL requested below.
start_url = "http://www.pcgames.com.cn/"
# NOTE(review): this fetch happens at module level, i.e. whenever the file is
# executed or imported.
c = Curl()
c.set_url(start_url)
# presumably the response body -- confirm against curl.Curl.get
data = c.get()
# presumably metadata about the finished transfer -- confirm against curl.Curl.info
info = c.info()
#print(info)

def get_charset(c_type):
    charset=None
    try:
        if c_type and 'charset' in c_type:
            start = c_type.find('charset=')
            charset_str = c_type[start:]
            end = charset_str.find(' ')
            if end > -1:
                charset = charset_str[len('charset='):end]
            else:
                charset = charset_str[len('charset='):]
    except:
        return 'UTF-8'