Example #1
 def __init__(self):
     self.filehandle = FileHandler()
     self.request_handle = Request()
     self.calculate = Calculation()
     self.url = None 
     self.range_left = None
     self.range_right = None
     self.proxy = None 
     self.temp_dir = None 
     self.threads = None 
     self.filepath = None 
     logging.getLogger("urllib3").setLevel(logging.WARNING)
Example #2
 def __init__(self, start_url, subdomains):
     self.start_req = Request('get', start_url, '/')
     self.scheduler = Scheduler(subdomains)
     self.spider = Spider()
     self.downloader = Downloader()
     # enqueue the initial request
     self.scheduler.put_request(self.start_req)
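The engine's crawl loop itself never appears in this listing; the sketch below is a minimal, hypothetical version of how the components initialized above could cooperate. Only Scheduler.put_request/get_request appear elsewhere here (Example #19); Downloader.download and Spider.parse are assumed names.

 def run(self):
     # drain the scheduler until no unseen requests remain
     while not self.scheduler.req_queue.empty():
         req = self.scheduler.get_request()
         response = self.downloader.download(req)      # assumed API
         for new_req in self.spider.parse(response):   # assumed API
             self.scheduler.put_request(new_req)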
Example #3
 async def get_request(self):
     while True:
         data = await self.client.rpop('EngineQueue')
         if data:
             req = json.loads(data)
             return Request(method=req['method'],
                            target=req['target'],
                            path=req['path'],
                            headers=req['headers'],
                            body=req['body'])
         # yield briefly so an empty queue doesn't busy-spin the event loop
         await asyncio.sleep(0.1)
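For context, a minimal producer for the queue this coroutine consumes might look like the sketch below. It assumes redis-py's asyncio client (redis >= 4.2); the queue name 'EngineQueue' and the JSON field names mirror the example, everything else is illustrative.

import asyncio
import json
import redis.asyncio as redis

async def put_request(client: redis.Redis) -> None:
    req = {"method": "GET", "target": "example.com", "path": "/",
           "headers": {}, "body": ""}
    # lpush pairs with the consumer's rpop to form a FIFO queue
    await client.lpush("EngineQueue", json.dumps(req))

asyncio.run(put_request(redis.Redis()))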
Example #4
def logLocationRequest():
    apiKey = returnAPIKey("ip_stack")
    ipStackEndpoint = "http://api.ipstack.com/81.180.208.125?access_key=" + str(
        apiKey)
    ipStackEndPointToLog = "http://api.ipstack.com/81.180.208.125?access_key="

    detailedResponse = requests.get(ipStackEndpoint)
    request = Request("IP_Stack", ipStackEndPointToLog,
                      detailedResponse.encoding, detailedResponse.status_code,
                      detailedResponse.elapsed.total_seconds())

    writeJsonToFile(request)
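The Request record and writeJsonToFile helper are project-specific and not included in this listing; the sketch below is one hypothetical shape for them, inferred only from the call sites above.

import json
from dataclasses import asdict, dataclass

@dataclass
class Request:
    api_name: str
    endpoint: str
    encoding: str
    status_code: int
    elapsed_seconds: float

def writeJsonToFile(request: Request, path: str = "requests.log") -> None:
    # append one JSON record per line
    with open(path, "a") as f:
        f.write(json.dumps(asdict(request)) + "\n")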
Example #5
def before_request():
    g.headers = {}
    g.pagination = Pager(request.args)
    g.request = Request(request)
    g.auth = None
    g.perms = {}

    g.im_rds = rds

    cnf = MYSQL
    g._db = Mysql(*cnf)
    g._imdb = g._db
Example #6
    def run(self):
        size = 1024
        # receive {"url":"", "range-left":"", "range-right":""} from client
        msg = self.client_conn.recv(size)
        if msg:
            msg = msg.decode()
            print("[+] Received Message: {}".format(msg))
            msg = json.loads(msg)

            # generate a random name for file
            filename = Calculation().generate_random_string(12)
            filepath = self.temp_dir + filename

            # use request to download
            url = msg['url']
            range_left = msg['range-left']
            range_right = msg['range-right']
            response = Request().make_request(url, self.proxy)

            # use Multiprocess to download using multithreading
            print("starting new process to download {}".format(filename))
            process = multiprocessing.Process(
                target=MultithreadedDownloader().download,
                args=(
                    url,
                    range_left,
                    range_right,
                    filepath,
                    self.temp_dir,
                    response,
                    self.threads,
                    self.proxy,
                )
            )
            process.start()
            process.join()
            print('Out of process for file {}'.format(filename))

            # send the downloaded file part to peer-client
            self.send_file_part(filepath)

            # let peer-client know that file sending is done
            self.client_conn.shutdown(socket.SHUT_RDWR)

            # close connection with peer-client
            self.client_conn.close()
            print("[-] Client Disconnected: {}".format(self.client_addr))

            # delete temp file
            FileHandler().delete_file(filepath)
            print("[-] Temp File Deleted.")
Example #7
def logGetCityBasedOnCoordinates():
    apiKey = returnAPIKey("google_geo")
    latitude = str(getLocation()["latitude"])
    longitude = str(getLocation()["longitude"])
    googleMapsEndPoint = "https://maps.googleapis.com/maps/api/geocode/json?latlng=" + latitude + "," + longitude + \
                         "&key=" + str(apiKey)

    googleMapsEndPointToLog = "https://maps.googleapis.com/maps/api/geocode/json?latlng=" + latitude + "," + longitude + \
                              "&key="

    detailedResponse = requests.get(googleMapsEndPoint)
    request = Request("Google_GEO", googleMapsEndPointToLog,
                      detailedResponse.encoding, detailedResponse.status_code,
                      detailedResponse.elapsed.total_seconds())

    writeJsonToFile(request)
Example #8
def logGetWeatherBasedOnCity():
    apiKey = returnAPIKey("open_weather")
    plusCode = str(getCityBasedOnCoordinates()["plus_code"])
    city = plusCode.split()[2].replace(",", "")

    openWeatherEndPoint = "https://samples.openweathermap.org/data/2.5/weather?q=" + str(
        city) + "&appid=" + str(apiKey)
    openWeatherEndPointToLog = "https://samples.openweathermap.org/data/2.5/weather?q=" + str(
        city)

    detailedResponse = requests.get(openWeatherEndPoint)

    request = Request("OpenWeather", openWeatherEndPointToLog,
                      detailedResponse.encoding, detailedResponse.status_code,
                      detailedResponse.elapsed.total_seconds())

    writeJsonToFile(request)
Example #9
    def login(self) -> bool:
        if self._is_logged():
            return True

        data = {
            "grant_type": "password",
            "username": self._credentials.username,
            "password": self._credentials.password
        }
        response = Request.run(self._token_url, "POST", data,
                               self._header(False), self._auth())
        if response['statusCode'] == 200:
            self._token = response['body']
            self._token['expires_date'] = datetime.fromtimestamp(
                time.mktime(datetime.now().timetuple())) + timedelta(
                    seconds=int(self._token['expires_in']))
            return True
        return False
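_is_logged() is referenced but not shown; a plausible sketch, assuming self._token starts out as None and carries the expires_date computed in login():

    def _is_logged(self) -> bool:
        # a session is valid while an unexpired token is held
        return (self._token is not None
                and "expires_date" in self._token
                and datetime.now() < self._token["expires_date"])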
Example #10
def main(opt: Options):
    conn = opt.get_conn()
    num = opt.get_number()
    vehicle_data = {
        "vehicle": True,
        "vehicle_name": "",
        "end": True,
    }

    if START_MODE == "by-name":
        conn.set_kanban(SERVICE_NAME, num)
    elif START_MODE == "from-kanban":
        kanban = conn.get_one_kanban(SERVICE_NAME, num)
        metadata = kanban.get_metadata()
        vehicle_data = metadata['args']

    lprint(f"vehicle_data: {vehicle_data}")
    templates = []
    vehicle = vehicle_data['vehicle']
    vehicle_name = vehicle_data['vehicle_name']
    end = vehicle_data['end']

    try:
        ms = MySQLClient()
        if vehicle and not vehicle_name:
            templates += ms.get_all_vehicles()
            lprint("set_template all")
        elif vehicle and vehicle_name:
            templates += ms.get_by_vehicle_name(vehicle_name)
            lprint(f"set_template {vehicle_name}")

        if end:
            templates += ms.get_end()
            lprint("set_template end")

        if templates:
            with Request() as r:
                r.set_templates(templates)

    except Exception as e:
        print(str(e))
Example #11
class Crawler():

    HOST = "www.nbiquge.com"
    SCHEMA = "https://"

    LIST_URL_KEY = "nbiquge_list_url"
    CHAPTER_URL_KEY = "nbiquge_chapter_url"
    # record the chapter order
    INCR_KEY = "nbiquge_incr_key"

    def __init__(self, start_url="https://www.nbiquge.com/7_7295/"):
        logging.info("crawler init...")
        self.start_url = start_url
        self.request = Request()
        redisClient.lpush(self.LIST_URL_KEY, start_url)

    def run(self):
        self.consume_list()

    def consume_list(self):
        '''
        Consume serially: each time, pop one list URL from redis and crawl
        its chapters, then fetch the link to the next page; if no new link
        can be fetched, the program ends.
        '''
        list_url = redisClient.rpop(self.LIST_URL_KEY)
        if list_url is None:
            return
        logging.info("get url %s", list_url)
        self.parse_list(list_url)

    def parse_list(self, url):
        logging.info("parse_list get url: %s" % url.decode("utf-8"))
        rsp = self.request.get(url.decode("utf-8"))
        rsp.encoding = "gbk"  # set the response encoding explicitly (otherwise the decoded text is garbled)
        root = fromstring(rsp.text)
        list_div = root.xpath('//div[@id="list"]')[0]
        chapters = list_div.xpath("//dd//a")
        logging.info("got %s chapters" % len(chapters))
        for chapter in chapters:
            chapter_id = redisClient.incr(self.INCR_KEY)
            task = {
                "href": self.SCHEMA + self.HOST + chapter.xpath(".//@href")[0],
                "name": chapter.xpath("string(.)"),
                "id": chapter_id
            }
            redisClient.lpush(self.CHAPTER_URL_KEY, umsgpack.packb(task))
            if chapter_id % 10 == 0:
                '''
                Every 10 chapters, drain the chapter queue.
                '''
                logging.info("now chapter_id: %s" % chapter_id)
                self.consume_chapter()

    def consume_chapter(self):
        while True:
            task = redisClient.lpop(self.CHAPTER_URL_KEY)
            if task is None:
                return
            self.consume_single_chapter(task)

    def consume_single_chapter(self, task):
        logging.info("consume_single_chapter get a task")
        task = umsgpack.unpackb(task)
        rsp = self.request.get(task["href"])
        rsp.encoding = "gbk"
        root = fromstring(rsp.text)
        content_div = root.xpath('//div[@id="content"]')[0]

        content = tostring(content_div,
                           method="html",
                           pretty_print=True,
                           encoding="utf-8")
        content = content.decode("utf-8")
        content = content.replace("<br>", "")
        content = content.replace("\xa0", "")
        content = content.replace('<div id="content">', '')
        content = content.replace('</div>', "")
        sqlSession = sqlalchemyConn.DBSession()
        chapter = Chapter(chapter_id=task["id"],
                          title=task["name"],
                          content=content,
                          book_id=book_id,
                          site=self.HOST)
        sqlSession.add(chapter)
        sqlSession.commit()

    @classmethod
    def restart(cls):
        '''
        clear the crawl state stored in redis
        '''
        redisClient.delete(cls.LIST_URL_KEY)
        redisClient.delete(cls.CHAPTER_URL_KEY)
        redisClient.delete(cls.INCR_KEY)
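The Chapter model is imported from elsewhere in the project; a hypothetical SQLAlchemy sketch matching the fields used in consume_single_chapter might be:

from sqlalchemy import Column, Integer, String, Text
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class Chapter(Base):
    __tablename__ = "chapter"

    id = Column(Integer, primary_key=True, autoincrement=True)
    chapter_id = Column(Integer)     # ordering key produced by redis INCR
    title = Column(String(255))
    content = Column(Text)
    book_id = Column(Integer)
    site = Column(String(64))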
Example #12
async def worker_job(client_socket: socket, worker_name: str):
    if Config.log_worker_verbose:
        logging.debug(f'WORKER_{worker_name}: spawned')

    # GET REQUEST

    loop = asyncio.get_event_loop()
    request_raw = ""
    while True:
        request_part = (await loop.sock_recv(client_socket,
                                             Config.bytes_per_recv)).decode()
        request_raw += request_part
        if '\r\n' in request_raw or len(request_part) == 0:
            break

    request = Request(request_raw)

    # GET FILENAME

    filepath: str
    search_folder = request.url.endswith('/')
    if search_folder:
        filepath = Config.base_dir + request.url + Config.index_filename
    else:
        filepath = Config.base_dir + request.url
    file_exists = os.path.exists(filepath)

    # CREATE RESPONSE

    response: Response
    if request.method not in ['GET', 'HEAD']:
        response = Response(method=request.method,
                            protocol=request.protocol,
                            status=405)
    elif '/..' in request.url or (search_folder and not file_exists):
        response = Response(method=request.method,
                            protocol=request.protocol,
                            status=403)
    elif (not file_exists) or (not request.is_valid):
        response = Response(method=request.method,
                            protocol=request.protocol,
                            status=404)
    else:
        response = Response(method=request.method,
                            protocol=request.protocol,
                            status=200,
                            filepath=filepath)

    logging.info(
        f'WORKER_{worker_name}: {response.status} {request.method} {request.url}'
    )

    # SEND RESPONSE

    await response.send(client_socket)

    # END WORKER

    client_socket.close()

    if Config.log_worker_verbose:
        logging.debug(f'WORKER_{worker_name}: closed client socket')

    if Config.log_worker_verbose:
        logging.debug(f'WORKER_{worker_name}: done')
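worker_job expects an accepted, non-blocking client socket; a minimal accept loop that could drive it is sketched below. Only worker_job itself comes from the example; the host, port, and task spawning are assumptions.

import asyncio
import socket

async def serve(host: str = "0.0.0.0", port: int = 8080) -> None:
    server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    server_socket.bind((host, port))
    server_socket.listen()
    server_socket.setblocking(False)

    loop = asyncio.get_event_loop()
    worker_id = 0
    while True:
        # sock_accept returns the client socket already set non-blocking
        client_socket, _ = await loop.sock_accept(server_socket)
        worker_id += 1
        asyncio.create_task(worker_job(client_socket, str(worker_id)))

# asyncio.run(serve())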
Example #13
def before_request():
    g.headers = {}
    g.pagination = Pager(request.args)
    g.request = Request(request)
    g.auth = None
    g.perms = {}
Example #14
        # check if download url supplied
        if (len(sys.argv) < 2):
            print("No Download URL! Exiting ...")
            sys.exit(0)
        url = sys.argv[1]
        client = ThreadedPeerClient(url)
        # port used by peer-client to communicate with tracker
        client_tracker_bind_port = peer_client_config.client_tracker_bind_port

        # fetch the list of active servers
        client.fetch_peers_list(tracker_server_address,
                                client_tracker_bind_port)

        # make request to url to get information about file
        req = Request()
        response = req.make_request(url, proxy=proxy)
        req.close_connection(response)

        # get the filesize
        filesize = int(response.headers['Content-Length'])
        filename = os.path.basename(url.replace("%20", "_"))
        filepath = download_dir + '/' + filename

        # if range-download is not supported, use simple download
        if response.headers['Accept-Ranges'] != 'bytes':
            print(
                "URL doesn't support range download! Using default download..."
            )
        MultithreadedDownloader().download(url, 0, filesize - 1, filepath,
                                           temp_dir, response, threads,
                                           proxy)
Example #15
 def _request(self,
              url: str,
              data: dict = None,
              method: str = "GET") -> Dict[str, Any]:
     return Request.run(url, method, data, self._header(), self._auth())
Example #16
 async def test(self):
     # insert the initial data
     await self.client.lpush(
         'DownloaderQueue',
         pickle.dumps(Request('get', 'https://www.baidu.com', '/')))
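The matching consumer would unpickle what this test pushes; a sketch under the same assumptions (an async redis client on self.client, pickle already imported):

 async def get_request(self):
     data = await self.client.rpop('DownloaderQueue')
     if data:
         return pickle.loads(data)
     return None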
Example #17
class MultithreadedDownloader:
    """Main class providing interface of the software"""
    def __init__(self):
        self.filehandle = FileHandler()
        self.request_handle = Request()
        self.calculate = Calculation()
        self.url = None 
        self.range_left = None
        self.range_right = None
        self.proxy = None 
        self.temp_dir = None 
        self.threads = None 
        self.filepath = None 
        logging.getLogger("urllib3").setLevel(logging.WARNING)

    def range_download_support(self, resp):
        """ returns boolean value indicating support for range downloading """
        try:
            supported = (resp.headers['Accept-Ranges'] == 'bytes')
        except KeyError:
            supported = False

        return supported

    def multithreaded_download(self, ranges_list):
        """ function to perform multithreaded download """
        # downloading each segment
        for f in range(self.threads):
            # calling Downloader.download_range() for each thread
            t = threading.Thread(target=self.request_handle.download_range,
                kwargs={
                'url': self.url,
                'filepath': self.temp_dir + "/temp" + str(f), 
                'range_left': ranges_list[f][0],
                'range_right': ranges_list[f][1],
                'proxy': self.proxy
                })
            t.daemon = True
            t.start()

        # call join() on every thread except the main thread;
        # this ensures parts are merged only after every thread has finished downloading
        main_thread = threading.current_thread()
        for t in threading.enumerate():
            if t is main_thread:
                continue
            t.join()    

    def merge_multithreaded_download_parts(self):
        """ function to perform merging of parts performed by multiple threads on single system """
        # merging parts
        with open(self.filepath,'wb') as wfd:
            for f in range(self.threads):
                tempfilepath = self.temp_dir + "/temp" + str(f)
                with open(tempfilepath, "rb") as fd:
                    shutil.copyfileobj(fd, wfd)     
                # delete copied segment
                self.filehandle.delete_file(tempfilepath)

    def download(self, url, range_left, range_right, filepath, 
                 temp_dir, response, threads, proxy=None):
        """ function to perform file download """

        self.url = url
        self.range_right = range_right
        self.range_left = range_left
        self.filepath = filepath        
        self.temp_dir = temp_dir
        self.threads = threads
        self.proxy = proxy

        # if server supports segmented download
        if self.range_download_support(response):
            # get ranges for download for each thread
            ranges_list = self.calculate.get_download_ranges_list(self.range_left, 
                                                            self.range_right,
                                                            self.threads)
            # perform multithreaded download on single system
            self.multithreaded_download(ranges_list)
            # merge multithreaded download parts
            self.merge_multithreaded_download_parts()
        else:
            print("Server doesn't support multithreaded downloads! "
                  "Download will be performed using a single thread, on the master system.")
            self.request_handle.download_range(self.url,
                                        self.filepath,
                                        self.range_left, 
                                        self.range_right,
                                        self.proxy)
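Calculation.get_download_ranges_list is project-specific; one hypothetical implementation that splits the byte range evenly across threads, matching how download() uses it:

def get_download_ranges_list(range_left, range_right, threads):
    total = range_right - range_left + 1
    chunk = total // threads
    ranges_list = []
    for i in range(threads):
        left = range_left + i * chunk
        # the last thread absorbs any remainder bytes
        right = range_right if i == threads - 1 else left + chunk - 1
        ranges_list.append((left, right))
    return ranges_list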
Example #18
 def __init__(self, start_url="https://www.nbiquge.com/7_7295/"):
     logging.info("crawler init...")
     self.start_url = start_url
     self.request = Request()
     redisClient.lpush(self.LIST_URL_KEY, start_url)
Example #19
        norm_uri = ''.join([req.target, path_split[0]])
        query = ''
        if len(path_split) > 1:
            query = path_split[1]
        # normalize the query parameters
        new_query = []
        for k, v in map(self.split_query, query.split('&')):
            if self.is_data(k, v) and not self.is_action(k, v):
                new_query.append('='.join([k, '[data]']))
            else:
                new_query.append('='.join([k, v]))
        new_query = '&'.join(new_query)
        return '?'.join([norm_uri, new_query])

    def put_request(self, req):
        now_req_hash = self.get_request_hash(req)
        if now_req_hash not in self.req_hash and self.in_subdomains(
                req.target) and not self.is_static(req.path):
            self.req_queue.put(req)
        self.req_hash.add(now_req_hash)

    def get_request(self):
        return self.req_queue.get()


if __name__ == "__main__":
    req = Request('get', 'http://s:80', '/')
    scheduler = Scheduler()
    a = scheduler.get_request_hash(req)
    print(a)
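split_query is referenced above but not included; a small sketch that splits one 'key=value' fragment, tolerating fragments with no '=':

    def split_query(self, fragment):
        key, _, value = fragment.partition('=')
        return key, value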