def _preprocess(self, image):
        if requests('gamma_correct', self._preprocesses):
            image = gamma_to_intensity(image, **self._preprocesses)

        image = hist_adjust(image, **self._preprocesses)

        if requests('resize_font', self._preprocesses):
            if 'height' not in self._preprocesses:
                # resize_font needs the source text height to compute a scale.
                raise ValueError("'resize_font' requested without a 'height'")
            image = resize_font(image, **self._preprocesses)
            # Target height defaults to 32 px when 'new_height' is absent.
            new_height = self._preprocesses.get('new_height', 32)
            self.scale_factor = new_height / self._preprocesses['height']
        else:
            self.scale_factor = 1.0

        if requests('add_padding', self._preprocesses):
            pad = self._preprocesses.get('padding', 2)

            # Estimate the background color from the two outermost rows on the
            # top and bottom edges: whichever side of the edge mean holds more
            # pixels is treated as background, and its mean is the pad color.
            edges = np.concatenate([image[:2], image[-2:]], axis=0)
            below = edges[edges < edges.mean() + 0.001]
            above = edges[edges > edges.mean() - 0.001]
            pad_color = below.mean() if below.size > above.size else above.mean()

            image = np.pad(image, ((pad, pad), (0, 0)),
                           mode='constant',
                           constant_values=pad_color)

        if requests('gamma_correct', self._preprocesses):
            image = gamma_to_rgb(image, **self._preprocesses)

        return image
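A minimal usage sketch for the pipeline above, assuming the helpers it calls (requests, gamma_to_intensity, hist_adjust, resize_font, gamma_to_rgb) come from the surrounding project. The object name and the exact keys are inferred from the lookups inside _preprocess and are illustrative only:

    # Hypothetical configuration; each key below is one that _preprocess checks.
    preprocessor._preprocesses = {
        'gamma_correct': True,   # enables gamma_to_intensity / gamma_to_rgb
        'resize_font': True,     # enables resize_font; requires 'height'
        'height': 48,            # source text height in pixels
        'new_height': 32,        # target height (defaults to 32 when absent)
        'add_padding': True,     # pads 2 rows top and bottom by default
    }
    processed = preprocessor._preprocess(raw_image)  # raw_image: 2-D numpy array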
Example #2
def get_all_books(self, source="ZhangBook", limit=1, **kwargs):
    spider = Cluster.get_spider()
    params = {
        "host": spider['host'],
        "port": spider['port'],
        "source": source,
        "path": "get_all_books"
    }
    path = get_path(**params)
    data = kwargs
    data["limit"] = limit
    # Note: this method JSON-encodes the body, unlike the sibling methods
    # below, which pass the dict to the requests helper directly.
    return requests(path, data=json.dumps(data))
Example #3
def get_chapter_content(self, book_id, number_id, source="ZhangBook", **kwargs):
    spider = Cluster.get_spider()
    params = {
        "host": spider['host'],
        "port": spider['port'],
        "source": source,
        "path": "get_chapter_content"
    }
    path = get_path(**params)
    data = kwargs
    data["book_id"] = book_id
    data["number_id"] = number_id
    return requests(path, data=data)
Example #4
def get_chapters(self, book_id, start=1, source="ZhangBook", **kwargs):
    spider = Cluster.get_spider()
    params = {
        "host": spider['host'],
        "port": spider['port'],
        "source": source,
        "path": "get_chapters"
    }
    path = get_path(**params)
    data = kwargs
    data["book_id"] = book_id
    data["start"] = start
    return requests(path, data=data)
Example #5
def get_book_detail(self, book_id, source="ZhangBook", cache=False, **kwargs):
    spider = Cluster.get_spider()
    params = {
        "host": spider['host'],
        "port": spider['port'],
        "source": source,
        "path": "get_book_detail"
    }
    path = get_path(**params)
    data = kwargs
    data["book_id"] = book_id
    data["cache"] = cache
    return requests(path, data=data)
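Examples #2-#5 together form a thin client for a book-scraping service: each method resolves a spider endpoint via Cluster.get_spider() and get_path(), then posts its arguments through the project's requests helper. A usage sketch, with BookClient standing in for whatever class actually owns these methods (a placeholder name, not from the source):

    client = BookClient()                           # hypothetical owner class
    books = client.get_all_books(limit=10)          # body sent JSON-encoded
    detail = client.get_book_detail(book_id=1)
    chapters = client.get_chapters(book_id=1, start=1)
    text = client.get_chapter_content(book_id=1, number_id=1)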
Example #6
def crawl_zhandaye():
    """
    Zhandaye proxies: http://ip.zdaye.com/dayProxy.html
    """
    url = 'http://ip.zdaye.com/dayProxy.html'
    html = requests(url)
    hrefs = re.findall('<H3 class="title"><a href="(.*?)">', html, re.S)
    for href in hrefs:
        # Splice the per-day path onto the site prefix ('http://ip.zdaye.com/dayProxy').
        new_url = url[:28] + href[9:]
        new_html = requests_other(new_url)
        get_div = re.search(r'<div class="cont">(.*?)</div>', new_html, re.S).group(1)
        results = re.findall(r'<br>(.*?)@(.*?)#\[(.*?)\]', get_div, re.S)
        for result in results:
            yield "{}://{}".format(result[1].lower(), result[0])
Example #7
def crawl_66ip():
    """
    66ip proxies: http://www.66ip.cn
    Verified working 2019-04-30.
    """
    url = (
        "http://www.66ip.cn/nmtq.php?getnum=100&isp=0"
        "&anonymoustype=0&area=0&proxytype={}&api=66ip"
    )
    # Raw string so the backslashes reach the regex engine intact.
    pattern = r"\d+\.\d+\.\d+\.\d+:\d+"

    items = [(0, "http://{}"), (1, "https://{}")]
    for proxy_type, host in items:
        html = requests(url.format(proxy_type))
        if html:
            for proxy in re.findall(pattern, html):
                yield host.format(proxy)
Example #8
def crawl_swei360():
    """
    360 proxies: http://www.swei360.com
    Expired.
    """
    url = "http://www.swei360.com/free/?stype={}"

    for proxy_type in range(1, 5):
        html = requests(url.format(proxy_type))
        if html:
            doc = pyquery.PyQuery(html)
            for item in doc(".table-bordered tr").items():
                ip = item("td:nth-child(1)").text()
                port = item("td:nth-child(2)").text()
                schema = item("td:nth-child(4)").text()
                if ip and port and schema:
                    yield "{}://{}:{}".format(schema.lower(), ip, port)
Example #9
def crawl_data5u():
    """
    Wuyou proxies: http://www.data5u.com/
    14 per fetch, with relatively fresh verification times.
    Verified working 2019-04-30.
    """
    url = "http://www.data5u.com/free/index.html"

    html = requests(url)
    if html:
        doc = pyquery.PyQuery(html)
        for index, item in enumerate(doc(".wlist li .l2").items()):
            # The first .l2 element is a header row, so skip it.
            if index > 0:
                ip = item("span:nth-child(1)").text()
                port = item("span:nth-child(2)").text()
                schema = item("span:nth-child(4)").text()
                if ip and port and schema:
                    yield "{}://{}:{}".format(schema, ip, port)
Example #10
def crawl_kuaidaili():
    """
    Kuaidaili proxies: https://www.kuaidaili.com
    30 per fetch.
    Verified working 2019-04-13.
    """
    url = "https://www.kuaidaili.com/free/inha/{}/"

    for page in range(1, 3):
        html = requests(url.format(page))
        if html:
            doc = pyquery.PyQuery(html)
            for proxy in doc(".table-bordered tr").items():
                ip = proxy("[data-title=IP]").text()
                port = proxy("[data-title=PORT]").text()
                if ip and port:
                    yield "http://{}:{}".format(ip, port)
Example #11
def crawl_iphai():
    """
    IPhai proxies: http://www.iphai.com
    Fetches 10 each of domestic high-anonymity, foreign high-anonymity,
    and foreign regular proxies.
    Verified working 2019-04-30.
    """
    url = "http://www.iphai.com/free/{}"

    items = ["ng", "np", "wg", "wp"]
    for proxy_type in items:
        html = requests(url.format(proxy_type))
        if html:
            doc = pyquery.PyQuery(html)
            for item in doc(".table-bordered tr").items():
                ip = item("td:nth-child(1)").text()
                port = item("td:nth-child(2)").text()
                schema = item("td:nth-child(4)").text().split(",")[0]
                if ip and port and schema:
                    yield "{}://{}:{}".format(schema.lower(), ip, port)
Example #12
def crawl_ip3366():
    """
    Yun proxies: http://www.ip3366.net
    10 per page, relatively quick to verify.
    Verified working 2019-04-30.
    """
    url = "http://www.ip3366.net/?stype=1&page={}"

    for page in range(1, 8):
        html = requests(url.format(page))
        if html:
            doc = pyquery.PyQuery(html)
            for proxy in doc(".table-bordered tr").items():
                ip = proxy("td:nth-child(1)").text()
                port = proxy("td:nth-child(2)").text()
                schema = proxy("td:nth-child(4)").text()
                if ip and port and schema:
                    yield "{}://{}:{}".format(schema.lower(), ip, port)
Example #13
def crawl_xici():
    """
    Xici proxies: http://www.xicidaili.com
    """
    url = "http://www.xicidaili.com/{}"

    items = []
    for page in range(1, 21):
        items.append(("wt/{}".format(page), "http://{}:{}"))
        items.append(("wn/{}".format(page), "https://{}:{}"))

    for proxy_type, host in items:
        html = requests(url.format(proxy_type))
        if html:
            doc = pyquery.PyQuery(html)
            for proxy in doc("table tr").items():
                ip = proxy("td:nth-child(2)").text()
                port = proxy("td:nth-child(3)").text()
                if ip and port:
                    yield host.format(ip, port)
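Every crawl_* function above is a generator that yields proxies as scheme://ip:port strings, so the sources can be pooled behind a single driver. A minimal sketch of such a collector; the function name and the crawler list are illustrative, not part of the original project:

    def collect_proxies():
        crawlers = [crawl_66ip, crawl_data5u, crawl_kuaidaili,
                    crawl_iphai, crawl_ip3366, crawl_xici]
        proxies = set()
        for crawler in crawlers:
            try:
                # Each generator yields "scheme://ip:port" strings.
                proxies.update(crawler())
            except Exception:
                continue  # one failing source should not abort the whole run
        return proxies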