def run(self):
    """Drain ``self.workQueue`` and dispatch each item to a downloader.

    Every work item is expected to be a list laid out as
    ``[url, download_type, method, filename]``.  ``"picture"`` items go to
    ``download_picture`` and ``"anything"`` items to ``download_anything``;
    ``"custom"`` (and any other type) is currently a no-op.

    A malformed item logs a warning and stops this worker, leaving the
    remaining items in the queue.
    """
    while not self.workQueue.empty():
        item = self.workQueue.get()
        if not isinstance(item, list):
            WARNING("[Download] error type, return!!")
            return
        try:
            url = item[0]
        except IndexError:
            WARNING("[Download] url isn't exists, return !!")
            return
        try:
            download_type = item[1]
        except IndexError:
            WARNING("[Download] error in download_type!!, return!!")
            return
        # item[2] (the method) is not needed for dispatch.  The file name is
        # optional, so a short item falls back to None instead of raising.
        filename = item[3] if len(item) > 3 else None

        if download_type == "picture":
            self.download_picture(image_url=url, filename=filename)
        elif download_type == "anything":
            # The URL is passed positionally: download_anything names its
            # first parameter ``down_url``, so ``image_url=`` was a TypeError.
            self.download_anything(url, filename=filename)
        elif download_type == "custom":
            # Placeholder: custom downloads are not implemented.
            pass
def Putresult_Get(self, response):
    """Store one GET result on ``self.result_get``.

    ``response`` must be a list; anything else is logged with a warning and
    dropped.  The leftover debug output (``print len(response)`` /
    ``print "mm"``) was removed: it only wrote noise to stdout and used
    Python-2-only statement syntax.
    """
    if not isinstance(response, list):
        WARNING("[Schedule] [result] the wrong params!!")
        return
    self.result_get.append(response)
def AddTodo_Get(self, object):
    """Append a GET work item built from a ``Request`` to ``self.Works_get``.

    The stored item is ``[url, method, headers, callback]``.  A warning is
    logged and nothing is queued when the argument is not a ``Request`` or
    when its ``url`` is ``None``.
    """
    # NOTE: the parameter name shadows the builtin ``object``; it is kept
    # because it is part of the caller-visible signature.
    if not isinstance(object, Request):
        WARNING("[Schedule] [GET]the wrong params!!")
        return

    req = object
    if req.url is None:
        WARNING("[Schedule] [GET] without url ??")
        return

    entry = [req.url, req.method, req.headers, req.callback]
    summary = "[Schedule] [GET] url = {}, method = {}, hearders = {}".format(
        req.url, req.method, req.headers)
    INFO(summary)
    self.Works_get.append(entry)
def AddTodo_Post(self, object):
    """Append a POST work item built from a ``Request`` to ``self.Works_post``.

    The stored item is ``[url, method, request, headers, func]``.  A warning
    is logged and nothing is queued when the argument is not a ``Request``
    or when its ``url`` or ``request`` is ``None``.
    """
    # The parameter shadows the builtin ``object``, so the original
    # ``isinstance(object, object)`` used the argument itself as the class and
    # raised TypeError for any instance.  Check against ``Request`` instead,
    # matching AddTodo_Get.
    if not isinstance(object, Request):
        WARNING("[Schedule] [POST] [POST]the wrong params!!")
        return

    obj = object
    if obj.url is None or obj.request is None:
        WARNING("[Schedule] [POST] without url or request??")
        return

    # NOTE(review): this reads ``obj.request`` and ``obj.func``, while the GET
    # path reads ``obj.callback`` -- confirm these attribute names against the
    # Request class.
    item = [obj.url, obj.method, obj.request, obj.headers, obj.func]

    # Format the message up front, as every other INFO call in this file
    # does, instead of relying on INFO accepting logging-style extra args.
    INFO("[Schedule] [POST] url = {}, method = {}, request = {}, headers = {}".format(
        obj.url, obj.method, obj.request, obj.headers))
    self.Works_post.append(item)
def PutToDownload(self, object):
    """Validate a download ``Request`` and append it to ``self.download_list``.

    The stored item is ``[url, download_type, method, filename]``.  Nothing is
    queued (a warning is logged instead) when the argument is not a
    ``Request``, when its ``url`` is ``None`` or when its ``download_type`` is
    missing or ``None``.  A method other than ``"DOWNLOAD"`` is coerced to
    ``"DOWNLOAD"`` with a warning.
    """
    if not isinstance(object, Request):
        WARNING("[Schedule] input incorrect download params, stop!!")
        # The message says "stop": previously execution fell through and then
        # dereferenced the wrong type.
        return

    obj = object
    url = obj.url
    if url is None:
        WARNING("[Schedule] url is empty ?? stop schedule!!!")
        return
    INFO("[Schedule] The url to download is {}".format(url))

    # getattr: tolerate a request object that never had download_type set.
    download_type = getattr(obj, "download_type", None)
    if download_type is None:
        WARNING("[Schedule] download_type is None, stop schedule")
        return

    method = obj.method
    if method != "DOWNLOAD":
        WARNING(
            "[Schedule] method is not 'DOWNLOAD' , switch method to download"
        )
        # Was ``method == "DOWNLOAD"``: a comparison whose result was dropped.
        method = "DOWNLOAD"

    filename = obj.filename
    if filename is None:
        INFO("[Schedule] there is not exists filename")
    else:
        INFO("[Schedule] [download] filename is {}".format(filename))

    self.download_list.append([url, download_type, method, filename])
def _callback_func(self, lis): if not isinstance(lis, list): return response = lis[0] callback = lis[1] request_next = callback(response) if request_next is None: WARNING("[engine] [callback_func] nothing to call back") return self.request_buf.append(request_next) if schedule.Judge_empty_get(): while len(self.request_buf): tmp = self.request_buf.pop() self.engine_start(tmp)
def download_picture(self, image_url, path = "", filename = ""):
    """Download ``image_url`` and save it as ``<name>.jpg``.

    Args:
        image_url: URL of the picture to fetch.
        path: Target directory.  When empty or ``None``,
            ``Config_paser.locate`` is used, or the current working
            directory if that is ``None``.
        filename: File name without extension.  When empty or ``None``, the
            last path segment of ``image_url`` is used.

    Download and write failures are logged through ``WARNING`` and not
    raised.
    """
    # The defaults are "", so the original ``is None`` test never picked a
    # directory; treat every falsy value as "not given".
    if not path:
        if Config_paser.locate is None:
            path = os.getcwd()  # os.path has no getcwd()
        else:
            path = Config_paser.locate
    INFO("[Download] the picture will be downloaded in {}".format(path))

    # Use only the last URL segment: the full URL contains "/" and cannot be
    # a single file name.
    name = filename or image_url.rstrip('/').rsplit('/', 1)[-1] or 'download'
    # The joined path was previously discarded, so open() was handed the
    # directory itself.
    target = os.path.join(path, '{}.jpg'.format(name))

    try:
        image = requests.get(image_url, stream = True)
        with open(target, 'wb') as img:
            img.write(image.content)
    except Exception as e:
        # Best-effort download: report the failure and carry on.
        WARNING(e)
def download_anything(self, down_url, path = "", filename = ""):
    """Download ``down_url`` and save it under ``path``.

    Args:
        down_url: URL of the resource to fetch.
        path: Target directory.  When empty or ``None``,
            ``Config_paser.locate`` is used, or the current working
            directory if that is ``None``.
        filename: Name of the file to write.  When empty or ``None``, the
            last path segment of ``down_url`` is used.  A name without an
            extension gets ``.dat`` appended.

    Download and write failures are logged through ``WARNING`` and not
    raised.
    """
    # The defaults are "", so the original ``is None`` test never picked a
    # directory; treat every falsy value as "not given".
    if not path:
        if Config_paser.locate is None:
            path = os.getcwd()  # os.path has no getcwd()
        else:
            path = Config_paser.locate
    INFO("[Download] [download_anthing] the file will be downloaded in {}".format(path))

    # Use only the last URL segment: the full URL contains "/" and cannot be
    # a single file name.
    name = filename or down_url.rstrip('/').rsplit('/', 1)[-1] or 'download'
    # ``filename.split('.')[1]`` raised IndexError for names without a dot
    # and was never None; splitext expresses the intended "has no extension".
    if not os.path.splitext(name)[1]:
        name += '.dat'
    # The joined path was previously discarded, so open() was handed the
    # directory itself.
    target = os.path.join(path, name)

    try:
        result = requests.get(down_url, stream = True)
        with open(target, 'wb') as things:
            things.write(result.content)
    except Exception as e:
        # Best-effort download: report the failure and carry on.
        WARNING(e)
def run(self):
    """Drain ``self.workQueue``, fetching each request and caching the result.

    For every queued item the URL is fetched through ``self.get_url``, the
    body is passed through ``self.deal_response_with_xpath``, and the
    resulting ``response_obj`` is stored on ``item.cache.Response_Cache``.
    Only the ``GET`` and ``POST`` methods are handled; items with any other
    method are skipped.  A falsy ``method`` defaults to ``'GET'``.
    """
    while not self.workQueue.empty():
        item_obj = self.workQueue.get()
        url = item_obj.url
        method = item_obj.method or 'GET'
        cache = item_obj.cache
        headers = item_obj.headers or {}

        # The cookie is looked up under the 'cookieJar' key of the dict held
        # in ``item_obj.cookieJar``.
        meta = item_obj.cookieJar or None
        cookie = None
        if isinstance(meta, dict):
            cookie = meta.get('cookieJar', None)
        else:
            WARNING('meta should be a dict')

        # A request may have no ``formdata`` attribute at all.  getattr covers
        # that case without the bare ``except:`` that also swallowed
        # KeyboardInterrupt/SystemExit.
        formdata = getattr(item_obj, 'formdata', None) or None

        if method == 'GET':
            fetch_args = (url, headers, cookie)
        elif method == 'POST':
            # NOTE(review): POST also goes through get_url, with formdata as
            # an extra positional argument -- confirm get_url accepts this
            # form or whether a dedicated POST helper was intended.
            fetch_args = (url, formdata, headers, cookie)
        else:
            continue

        response = response_obj()
        http_response, http_cookie = self.get_url(*fetch_args)
        response.cookie = http_cookie
        response.response_string = self.deal_response_with_xpath(
            http_response)
        cache.Response_Cache = response
def Putresult_Post(self, response):
    """Store one POST result on ``self.result_post``.

    ``response`` must be a ``str``; anything else is logged with a warning
    and dropped.
    """
    # The original tested ``isinstance(object, str)``, i.e. the builtin
    # ``object`` type, which is never a str, so every response was rejected
    # and nothing was ever stored.
    # NOTE(review): Putresult_Get requires a list while this requires a str;
    # confirm which type callers actually pass here.
    if not isinstance(response, str):
        WARNING("[Schedule] [result] the wrong params!!")
        return
    self.result_post.append(response)
def __init__(self, *args, **kwargs):
    """Build a request description from keyword arguments.

    Recognised keywords: ``url``, ``method``, ``formdata``, ``headers``,
    ``callback``, ``filename``, ``download_type`` and ``meta``.  Positional
    arguments are accepted but ignored.

    * Without ``url`` a warning is logged and initialisation stops.
    * A missing ``method`` defaults to ``'GET'``.
    * Passing ``formdata`` turns ``'GET'`` into ``'POST'``; omitting it turns
      ``'POST'`` into ``'GET'``.
    * Non-empty ``formdata`` (a dict or an iterable of pairs) is stored as a
      list of ``(key, value)`` tuples converted with ``str_to_unicode``.
    * With method ``"DOWNLOAD"`` and no ``download_type`` a warning is logged
      and initialisation stops.

    Every attribute exists afterwards (``None`` when not supplied), including
    on the early-return paths.
    """
    # Define every attribute up front so early returns and omitted keywords
    # never leave the instance with missing attributes.
    self.url = None
    self.method = None
    self.formdata = None
    self.headers = None
    self.callback = None
    self.filename = None
    self.download_type = None
    self.meta = None

    if 'url' not in kwargs:
        WARNING("[request] without url???, this request will be stopped!!")
        return
    self.url = kwargs.pop('url')
    INFO("[request] url to Crawer = {}".format(self.url))

    if 'method' in kwargs:
        self.method = kwargs.pop('method')
    else:
        WARNING("[request] without method auto set method to 'GET'")
        self.method = 'GET'

    # The method switch depends on whether the keyword was passed at all,
    # not on whether its value is truthy.
    if 'formdata' in kwargs:
        formdata = kwargs.pop('formdata')
        if self.method == "GET":
            WARNING(
                "[request] ...there exists dict of formdata turn 'GET' to 'POST'"
            )
            self.method = 'POST'
    else:
        formdata = None
        if self.method == 'POST':
            WARNING(
                "[request]...formdata is empty auto turn 'POST' to 'GET'")
            self.method = 'GET'

    if formdata:
        # items() behaves the same here on Python 2 and 3; iteritems() is
        # Python-2-only.
        items = formdata.items() if isinstance(formdata, dict) else formdata
        self.formdata = [(str_to_unicode(k, encoding = 'utf-8'),
                          str_to_unicode(v, encoding = 'utf-8'))
                         for k, v in items]

    if 'headers' in kwargs:
        self.headers = kwargs.pop('headers')
    else:
        INFO("[request] without header, set header to None")

    if 'callback' in kwargs:
        self.callback = kwargs.pop('callback')
    else:
        INFO("[request] nothing to callback!!")

    if 'filename' in kwargs:
        self.filename = kwargs.pop('filename')
    else:
        INFO("[request] without filename.......")

    if 'download_type' in kwargs:
        self.download_type = kwargs.pop('download_type')
    elif self.method == "DOWNLOAD":
        WARNING(
            "[request] method is download,but there isn't exists and method??, stop it!!"
        )
        return

    # The original popped 'headers' a second time here.  The key was already
    # consumed, so that pop always failed and reset self.headers to None.

    self.meta = kwargs.pop('meta', None)