示例#1
0
    def Get_result_Get(self):
        """Pop one entry off ``self.result_get`` and return a deep copy of it."""
        snapshot = copy.deepcopy(self.result_get.pop())

        INFO("[Schedule] [result] get result !!")
        return snapshot
示例#2
0
def start_parser():
    """Load the ``system`` section of ``config.ini`` into module globals.

    Sets the globals ``num_threading``, ``locate`` and ``headers`` to the
    values returned by the parser's ``get`` and logs each one right after
    it has been read.
    """
    global num_threading, locate, headers

    section = 'system'
    config = SafeConfigParser()
    config.read("config.ini")

    num_threading = config.get(section, 'num_threading')
    INFO("[Config_paser] num_threading = {}".format(num_threading))

    locate = config.get(section, 'locate')
    INFO("[Config_paser] locate path = {}".format(locate))

    headers = config.get(section, 'headers')
    INFO("[Config_paser] headers = {}".format(headers))
示例#3
0
    def engine_start(self, res=None):
        """Hand every request from an iterator to the schedule, then run it.

        Args:
            res: Optional iterator of request objects. When ``None`` the
                iterator returned by ``spider.start_request()`` is used.

        Each request is passed to the schedule according to its ``method``
        ("GET", "POST" or "DOWNLOAD"); requests with any other method are
        skipped. After the iterator is exhausted, the method of the *last*
        request decides what is started and waited on: ``self.thread`` for
        GET/POST or ``self.download`` for DOWNLOAD. Finally
        ``self._GetfromSchedule()`` is called.

        When the iterator yields nothing the method returns right after
        logging, without starting anything.
        """
        request_object = spider.start_request() if res is None else res

        # Stays None when the iterator is empty. The previous implementation
        # left this name unbound in that case and crashed on the dispatch
        # below with UnboundLocalError.
        res_object = None
        for res_object in request_object:
            if res_object.method == "GET":
                schedule.AddTodo_Get(res_object)
                INFO("[engine_Manager] send http for get!!")

            elif res_object.method == "POST":
                schedule.AddTodo_Post(res_object)
                INFO("[engine_Manager] send http for post!!")

            elif res_object.method == "DOWNLOAD":
                schedule.PutToDownload(res_object)
                INFO("[engine_Manager] send download message!!")

        INFO("[engine] [senToSchedule] generation is empty")

        if res_object is None:
            # Nothing was queued, so there is nothing to run or collect.
            return

        if res_object.method == "GET":
            self.thread.add_func_get()
            self.thread.start()
            self.thread.waitForallThreadcompelete()

        elif res_object.method == "POST":
            self.thread.add_func_post()
            self.thread.start()
            self.thread.waitForallThreadcompelete()

        elif res_object.method == "DOWNLOAD":
            self.download.GetTodown()
            self.download.start()
            self.download.waitForallThreadcompelete()

        self._GetfromSchedule()
示例#4
0
    def PutToDownload(self, object):
        """Validate a download request and append it to ``self.download_list``.

        Args:
            object: The ``Request`` describing the download. Its ``url``,
                ``download_type``, ``method`` and ``filename`` attributes
                are read.

        The request is dropped with a warning when it is not a ``Request``
        or when its ``url`` or ``download_type`` is ``None``. Otherwise the
        list ``[url, download_type, "DOWNLOAD", filename]`` is appended to
        ``self.download_list``.
        """
        if not isinstance(object, Request):
            WARNING("[Schedule] input incorrect download params, stop!!")
            return
        obj = object

        url = obj.url
        if url is None:
            WARNING("[Schedule] url is empty ??   stop schedule!!!")
            return
        INFO("[Schedule] The url to download is {}".format(url))

        download_type = obj.download_type
        if download_type is None:
            WARNING("[Schedule] download_type is None, stop schedule")
            return

        method = obj.method
        if method != "DOWNLOAD":
            WARNING(
                "[Schedule] method is not 'DOWNLOAD' , switch method to download"
            )
            # Must be an assignment; a bare comparison here has no effect.
            method = "DOWNLOAD"

        filename = obj.filename
        if filename is None:
            INFO("[Schedule] there is not exists filename")
        else:
            INFO("[Schedule] [download] filename is {}".format(filename))

        self.download_list.append([url, download_type, method, filename])
示例#5
0
    def AddTodo_Get(self, object):
        """Append a GET work item to ``self.Works_get``.

        The work item is ``[url, method, headers, callback]`` taken from the
        given ``Request``. Nothing is queued (a warning is logged instead)
        when the argument is not a ``Request`` or its ``url`` is ``None``.
        """
        request = object
        if not isinstance(request, Request):
            WARNING("[Schedule] [GET]the wrong params!!")
            return

        if request.url is None:
            WARNING("[Schedule] [GET] without url ??")
            return

        entry = [
            request.url,
            request.method,
            request.headers,
            request.callback,
        ]
        summary = "[Schedule] [GET] url = {}, method = {}, hearders = {}".format(
            request.url, request.method, request.headers)
        INFO(summary)
        self.Works_get.append(entry)
示例#6
0
    def run(self):
        """Drain ``self.workQueue`` and perform every queued HTTP request.

        Each entry of ``self.workQueue`` is itself a queue of task lists.
        A GET task is ``[url, method, headers, callback]`` and a POST task is
        ``[url, method, request, headers, callback]``. The response body is
        lower-cased, decoded as UTF-8 and parsed with ``etree.HTML``; the
        pair ``[parsed, callback]`` is then handed to
        ``schedule.Putresult_Get`` or ``schedule.Putresult_Post``.
        Tasks with any other method are discarded.
        """
        while not self.workQueue.empty():
            tasks = self.workQueue.get(False)
            # Single-argument call form prints the same on Python 2 and 3.
            print(tasks.qsize())

            while not tasks.empty():
                task = tasks.get()
                method = task[1]

                if method == "GET":
                    url, method, headers, callback = task[:4]

                    response = self.get(url=url,
                                        method=method,
                                        headers=headers).content
                    final_res = etree.HTML(response.lower().decode("utf-8"))
                    schedule.Putresult_Get([final_res, callback])

                elif method == "POST":
                    url, method, request, headers, callback = task[:5]

                    # Mirrors the GET branch: read the body via ``.content``
                    # and actually call ``lower()``. Assumes ``self.post``
                    # returns the same kind of response object as
                    # ``self.get`` -- TODO confirm.
                    response = self.post(url=url,
                                         method=method,
                                         request=request,
                                         headers=headers).content
                    final_res = etree.HTML(response.lower().decode("utf-8"))
                    INFO('final_res = {}'.format(final_res))
                    schedule.Putresult_Post([final_res, callback])
示例#7
0
    def download_picture(self, image_url, path = "", filename = ""):
        """Download an image and save it into a directory.

        Args:
            image_url: URL of the image to fetch.
            path: Target directory. When empty or ``None``,
                ``Config_paser.locate`` is used, falling back to the current
                working directory.
            filename: Base name of the saved file; ``.jpg`` is appended.
                When empty or ``None`` the name is taken from the last path
                segment of ``image_url`` (``.jpg`` is appended only if that
                segment has no extension).

        Request and file errors are logged through ``WARNING`` and not
        raised.
        """
        if not path:
            path = getattr(Config_paser, 'locate', None) or os.getcwd()

        INFO("[Download] the picture will be downloaded in {}".format(path))

        if filename:
            name = '{}.jpg'.format(filename)
        else:
            # A raw URL contains '/' and cannot serve as a file name: keep
            # only its last path segment, without any query string.
            name = image_url.split('?', 1)[0].rstrip('/').rsplit('/', 1)[-1]
            if not os.path.splitext(name)[1]:
                name = '{}.jpg'.format(name)

        # The joined path must be kept; opening ``path`` itself would target
        # the directory.
        target = os.path.join(path, name)

        try:
            image = requests.get(image_url, stream = True)
            # Do not store an HTTP error page as if it were the image.
            image.raise_for_status()
            with open(target, 'wb') as img:
                for chunk in image.iter_content(chunk_size=8192):
                    img.write(chunk)
        except Exception as e:
            WARNING(e)
示例#8
0
    def AddTodo_Post(self, object):
        """Append a POST work item to ``self.Works_post``.

        Args:
            object: The ``Request`` to queue. Its ``url``, ``method``,
                ``request``, ``headers`` and ``callback`` attributes are
                read.

        The work item is ``[url, method, request, headers, callback]``.
        Nothing is queued (a warning is logged instead) when the argument is
        not a ``Request`` or when its ``url`` or ``request`` is ``None``.
        """
        # Checking against the builtin ``object`` is always true; validate
        # against ``Request`` like AddTodo_Get does.
        if not isinstance(object, Request):
            WARNING("[Schedule] [POST] [POST]the wrong params!!")
            return

        obj = object

        # NOTE(review): ``request`` is read as-is from the Request object;
        # confirm the class really exposes the POST payload under that name.
        if obj.url is None or obj.request is None:
            WARNING("[Schedule] [POST] without url or request??")
            return

        item = [
            obj.url,
            obj.method,
            obj.request,
            obj.headers,
            # The consumer treats the fifth slot as the callback, and the
            # GET path reads ``obj.callback``; ``obj.func`` did not match.
            obj.callback,
        ]
        # Format eagerly so INFO receives one string, as in every other call.
        INFO(
            "[Schedule] [POST] url = %s, method = %s, request = %s, headers = %s"
            % (obj.url, obj.method, obj.request, obj.headers))
        self.Works_post.append(item)
示例#9
0
    def download_anything(self, down_url, path = "", filename = ""):
        """Download an arbitrary file and save it into a directory.

        Args:
            down_url: URL of the resource to fetch.
            path: Target directory. When empty or ``None``,
                ``Config_paser.locate`` is used, falling back to the current
                working directory.
            filename: Name of the saved file. When empty or ``None`` the
                name is taken from the last path segment of ``down_url``.
                In both cases ``.dat`` is appended if the name has no
                extension.

        Request and file errors are logged through ``WARNING`` and not
        raised.
        """
        if not path:
            path = getattr(Config_paser, 'locate', None) or os.getcwd()

        INFO("[Download] [download_anthing] the file will be downloaded in {}".format(path))

        if not filename:
            # A raw URL contains '/' and cannot serve as a file name: keep
            # only its last path segment, without any query string.
            filename = down_url.split('?', 1)[0].rstrip('/').rsplit('/', 1)[-1]

        # splitext copes with names that contain no dot at all, where
        # indexing the result of split('.') raised IndexError.
        if not os.path.splitext(filename)[1]:
            filename += '.dat'

        # The joined path must be kept; opening ``path`` itself would target
        # the directory.
        target = os.path.join(path, filename)

        try:
            result = requests.get(down_url, stream = True)
            # Do not store an HTTP error page as if it were the payload.
            result.raise_for_status()
            with open(target, 'wb') as things:
                for chunk in result.iter_content(chunk_size=8192):
                    things.write(chunk)
        except Exception as e:
            WARNING(e)
示例#10
0
    def Get_result_Post(self):
        """Return a deep copy of ``self.result_post``; the original is untouched."""
        snapshot = copy.deepcopy(self.result_post)

        INFO("[Schedule] [result] get post result !!")
        return snapshot
示例#11
0
    def __init__(self, *args, **kwargs):
        """Build a request description from keyword arguments.

        Recognised keywords are ``url``, ``method``, ``formdata``,
        ``headers``, ``callback``, ``filename``, ``download_type`` and
        ``meta``. Each is stored on the instance under the same name and is
        ``None`` when not supplied. Positional arguments are ignored.

        Rules applied while reading the keywords:
            * Without ``url`` a warning is logged and the remaining keywords
              are not processed.
            * ``method`` defaults to ``'GET'``.
            * Supplying ``formdata`` turns a ``'GET'`` into ``'POST'``;
              a ``'POST'`` without ``formdata`` is turned into ``'GET'``.
            * Non-empty ``formdata`` (a dict or an iterable of pairs) is
              stored as a list of ``(key, value)`` tuples passed through
              ``str_to_unicode``.
        """
        # Define every attribute up front so that an early return or an
        # omitted keyword never leaves the instance without it.
        self.method = None
        self.formdata = None
        self.headers = None
        self.callback = None
        self.filename = None
        self.download_type = None
        self.meta = None

        self.url = kwargs.pop('url', None)
        if self.url is None:
            WARNING("[request] without url???, this request will be stopped!!")
            return
        INFO("[request] url to Crawer = {}".format(self.url))

        self.method = kwargs.pop('method', None)
        if self.method is None:
            WARNING("[request] without method auto set method to 'GET'")
            self.method = 'GET'

        formdata = kwargs.pop('formdata', None)
        if formdata is not None:
            if self.method == "GET":
                WARNING(
                    "[request] ...there exists dict of formdata turn 'GET' to 'POST'"
                )
                self.method = 'POST'
        elif self.method == 'POST':
            WARNING(
                "[request]...formdata is empty auto turn 'POST' to 'GET'")
            self.method = 'GET'

        if formdata:
            # ``items()`` exists on both Python 2 and 3 dicts.
            items = formdata.items() if isinstance(formdata, dict) else formdata
            self.formdata = [
                (str_to_unicode(k, encoding='utf-8'),
                 str_to_unicode(v, encoding='utf-8'))
                for k, v in items
            ]

        # Read ``headers`` exactly once: a second pop would find the key
        # already consumed and wipe the value back to None.
        self.headers = kwargs.pop('headers', None)
        if self.headers is None:
            INFO("[request] without header, set header to None")

        self.callback = kwargs.pop('callback', None)
        if self.callback is None:
            INFO("[request] nothing to callback!!")

        self.filename = kwargs.pop('filename', None)
        if self.filename is None:
            INFO("[request] without filename.......")

        self.download_type = kwargs.pop('download_type', None)
        if self.download_type is None and self.method == "DOWNLOAD":
            WARNING(
                "[request] method is download,but there isn't exists and method??, stop it!!"
            )

        self.meta = kwargs.pop('meta', None)