예제 #1
0
class Crawl_Proxy(object):
    """Crawl a proxy list from ``CYBERSYNDROME`` and cache it in MongoDB.

    The page embeds a script holding two bracketed integer arrays (the first
    is used as IP octets, the second as ports) plus an arithmetic expression
    of the form ``(...)%N`` over ``ps[...]`` that yields the rotation offset
    to apply to the octet array before it is grouped into addresses.
    """

    def __init__(self):
        self.source_url = CYBERSYNDROME
        self.mongo = MongodbAPI()

    def start(self):
        """Refresh the cached proxy table unless a fresh, large one exists.

        The cached record is reused when it is less than three hours old and
        its ``iptable`` holds more than 250 entries.  Otherwise the source
        page is crawled and the ``proxy`` collection is replaced.  When the
        crawl yields nothing the existing collection is left untouched.
        """
        data = self.mongo.Get_Data_From("proxy", {'_id': 0})
        if (data is not None
                and datetime.now() - timedelta(hours=3) < data["update_date"]
                and len(data["iptable"]) > 250):
            logging.info("Use old proxies")
            return
        logging.info("start crawl proxy")
        # Crawl BEFORE dropping: a failed crawl must not wipe the old table.
        proxy_ip = self.paresrHTML()
        if not proxy_ip:
            logging.warning("crawl proxy failed, keep existing proxies")
            return
        self.mongo.DropAll("proxy")
        self.mongo.Insert_Data_To("proxy", {
            "_id": 0,
            "iptable": proxy_ip,
            "update_date": datetime.now()
        })
        logging.info("add %04d ip", len(proxy_ip))

    def paresrHTML(self):
        """Download the source page and decode its proxy list.

        Returns:
            The list built by :meth:`getproxy`, or an empty list when the
            page could not be fetched or does not contain the expected
            script (two bracketed arrays and a ``(...)%N`` expression).
        """
        p = HtmlRequests()
        tree = p.get_html_noproxy(self.source_url)
        if tree is None:
            return []

        scripts = tree.xpath('//div[@id="content"]/script/text()')
        if not scripts:
            return []
        # Only the first script node is inspected.
        script = scripts[0]
        arrays = re.findall(r'\[[0-9 ,]*\]', script)
        arithmetic = re.findall(r'\(.*?\)%\d*', script)
        if len(arrays) < 2 or not arithmetic:
            return []

        def _items(bracketed):
            # "[1, 2,3]" -> ["1", "2", "3"]; the pattern allows blanks, which
            # must not leak into the dotted address.
            inner = bracketed.replace("[", '').replace("]", '')
            return [x.strip() for x in inner.split(',') if x.strip()]

        _as_list = _items(arrays[0])
        _ps_list = _items(arrays[1])
        n = self.decode(_ps_list, arithmetic[0])
        # Rotate the octet array left by the decoded offset.
        _as = _as_list[n:] + _as_list[0:n]

        headerlist = []
        for row in tree.xpath('//tr'):
            headers = {}
            for cell_text in row.xpath('td[6]/text()'):
                tmp = cell_text.split(":")
                if len(tmp) < 2:
                    # Not a "name:value" line; nothing to record.
                    continue
                headers[tmp[0]] = tmp[1]
            headerlist.append(headers)
        return self.getproxy(_as, _ps_list, headerlist)

    def decode(self, ps, string):
        """Evaluate a ``(a*b+c+...)%N`` expression over the port list.

        Args:
            ps: Sequence of integer-like strings; ``ps[k]`` tokens in the
                expression are replaced by ``int(ps[k])``.
            string: Expression text such as ``"(ps[2]*3+ps[0]+1)%4"``.  Only
                ``+`` and ``*`` are understood inside the parentheses.

        Returns:
            The value of the sum of products modulo ``N``.

        Raises:
            ValueError, IndexError, AttributeError: if the expression does
                not have the expected shape.
        """
        pieces = string.split(')')
        divisor = pieces[1].replace('%', '')
        dividend = pieces[0].replace('(', '')

        def _value(token):
            # A token is either a "ps[k]" lookup or an integer literal.
            if "ps" in token:
                index = int(re.search(r'\d+', token).group(0))
                return int(ps[index])
            return int(token)

        total = 0
        for term in dividend.split('+'):
            product = 1
            for factor in term.split('*'):
                product *= _value(factor)
            total += product
        return total % int(divisor)

    def getproxy(self, _as, _ps, headerlist):
        """Group octets into addresses and pair each with a port and headers.

        Args:
            _as: Flat sequence of octet strings; every four consecutive
                items form one dotted address.  A trailing incomplete group
                is ignored.
            _ps: Sequence of port strings; the j-th port belongs to the j-th
                address.
            headerlist: Sequence of header dicts; the j-th entry is attached
                to the j-th address.

        Returns:
            A list of ``{'ip': {'http': 'a.b.c.d:port'}, 'headers': {...}}``
            dicts.  Pairing stops when the ports run out; an address without
            a matching header entry gets an empty dict.
        """
        proxy_ip = []
        # range(...) yields the start index of every COMPLETE group of four.
        for j, first in enumerate(range(0, len(_as) - 3, 4)):
            if j >= len(_ps):
                break
            ip = '.'.join(_as[first:first + 4])
            headers = headerlist[j] if j < len(headerlist) else {}
            proxy_ip.append({
                'ip': {
                    'http': ip + ':' + _ps[j]
                },
                'headers': headers
            })
        return proxy_ip
예제 #2
0
class TWSE_realtime():
    """Poll the realtime quote endpoint for one stock and store each tick.

    ``crawl`` re-arms itself every five seconds through ``threading.Timer``
    until ``stop_date`` (13:30:10 local time on the day of construction).
    """

    def __init__(self, stock_num):
        self.mongo = MongodbAPI()
        self.stock_num = stock_num
        self.htmlreq = HtmlRequests()
        self.req = self.htmlreq.get_session(SESSIONURL)
        now = datetime.now()
        # Polling is no longer rescheduled after this moment.
        self.stop_date = datetime(now.year, now.month, now.day, 13, 30, 10)

    def start(self):
        """Begin polling (runs the first crawl on the calling thread)."""
        self.crawl()

    def crawl(self):
        """Fetch one quote, parse it and insert it if it is not stored yet.

        The next run is scheduled before any work is done, so a failure in
        this run does not stop the polling chain.  An insert is attempted up
        to five times before an error is logged.
        """
        if datetime.now() < self.stop_date:
            threading.Timer(5.0, self.crawl).start()
        now_time = int(time.time()) * 1000
        source_url = TWSEREALTIMEURL.format(
            stock_num=self.stock_num, time=now_time)
        json_data = self.htmlreq.get_json(self.req, source_url)
        data = self.parser(json_data)
        if data is None:
            return
        e = self.mongo.CheckExists('Realtime_data', data.get('_id', None))
        # CheckExists' return type is not visible here, so the original
        # equality test is kept rather than a truthiness check.
        if e == False:  # noqa: E712
            for _ in range(5):
                if self.mongo.Insert_Data_To("Realtime_data", data):
                    logging.info("Insert realtime data to mongo, id:%s",
                                 data.get("_id"))
                    return
            logging.error(
                "Fail to insert realtime data to mongo, id:%s",
                data.get("_id"))

    def parser(self, j: dict):
        """Convert a decoded quote payload into the document to be stored.

        Args:
            j: Decoded JSON payload; the first element of ``j['msgArray']``
                is the quote.  ``tlong`` is read as epoch milliseconds.

        Returns:
            A dict keyed by ``"<stock_num>@<YYYY-mm-dd HH:MM:SS>"`` holding
            the parsed fields, or ``None`` when the payload is empty, missing
            or has no ``msgArray`` entries.  Scalar fields that are absent or
            hold a blank/``-`` placeholder become ``None``; best-five lists
            that are absent become ``[]``.

        Raises:
            KeyError: if the quote has no ``tlong`` field.
            ValueError: if a present field is not numeric.
        """
        if not j or not j.get('msgArray'):
            return None
        data = j['msgArray'][0]

        # NOTE(review): the feed appears to send "-" for fields without a
        # value at this tick; confirm against the endpoint's behaviour.
        placeholders = ('', '-')

        def _to_float(value):
            if value is None:
                return None
            if isinstance(value, str) and value.strip() in placeholders:
                return None
            return float(value)

        def _split_best(d):
            # "1.0_2.0_" -> [1.0, 2.0]; missing field -> [].
            if not d:
                return []
            return [float(x) for x in d.strip('_').split('_')
                    if x not in placeholders]

        # Whole seconds only: the id and the stored time carry no fraction.
        date = datetime.fromtimestamp(int(data['tlong']) // 1000)
        # Do NOT name this local ``time``: that would shadow the ``time``
        # module needed for ``time.mktime`` below.
        stamp = date.strftime('%Y-%m-%d %H:%M:%S')
        return {
            "_id": str(self.stock_num) + "@" + stamp,
            "code": self.stock_num,
            'ts': int(time.mktime(date.timetuple())),
            "time": date,
            "latest_trade_price": _to_float(data.get('z', None)),
            "trade_volume": _to_float(data.get('tv', None)),
            "accumulate_trade_volume": _to_float(data.get('v', None)),
            "best_bid_price": _split_best(data.get('b', None)),
            "best_bid_volume": _split_best(data.get('g', None)),
            "best_ask_price": _split_best(data.get('a', None)),
            "best_ask_volume": _split_best(data.get('f', None)),
            "open": _to_float(data.get('o', None)),
            "high": _to_float(data.get('h', None)),
            "low": _to_float(data.get('l', None))
        }