def __init__(self, stock_num):
    """Set up mongo/HTTP helpers for realtime crawling of *stock_num*.

    stop_date is today at 13:30:10 — presumably just after the TWSE
    market close; confirm against the exchange schedule.
    """
    self.mongo = MongodbAPI()
    self.stock_num = stock_num
    self.htmlreq = HtmlRequests()
    self.req = self.htmlreq.get_session(SESSIONURL)
    today = datetime.now()
    self.stop_date = datetime(today.year, today.month, today.day, 13, 30, 10)
def __init__(self, stock_num: str, start_year: int, start_month: int):
    """Prepare mongo/HTTP state for crawling *stock_num*'s daily data
    starting at *start_year*/*start_month*."""
    self.mongo = MongodbAPI()
    self.htmlreq = HtmlRequests()
    self.req = self.htmlreq.get_session(SESSIONURL)
    # NOTE(review): presumably disables connection reuse on the session —
    # confirm the session object honours this attribute.
    self.req.keep_alive = False
    self.stock_num = stock_num
    self.start_year = start_year
    self.start_month = start_month
    self.now_date = datetime.now()
    self.retry = 0  # consecutive failed fetches for the current month
class Money_link():
    """Crawls per-tick transaction details for one stock from MoneyLink."""

    def __init__(self):
        self.mongo = MongodbAPI()

    def start(self, stock_num: str) -> list:
        """Build the MoneyLink URL for *stock_num* and return parsed rows."""
        source_url = MONEYLINKURL % (stock_num)
        return self.parser(stock_num, source_url)

    def parser(self, stock_num, url) -> list:
        """Parse the tick table at *url* into mongo-ready dicts.

        Rows without a bid/ask ('--') and rows already stored in the
        ``Transaction_details`` collection are skipped.  Returns an empty
        list when the page cannot be fetched.
        """
        daily = []
        htmlparser = HtmlRequests()
        tree = htmlparser.get_html(url)
        if tree is None:  # fetch failed
            return daily
        now = datetime.now()
        for row in tree.xpath('//div[@id="TickHeight"]/table/tr'):
            tick_time = row.xpath('td[1]/text()')[0]
            buying = row.xpath('td[2]/text()')[0]
            selling = row.xpath('td[3]/text()')[0]
            if buying == '--' or selling == '--':
                continue  # no price on this tick
            transaction = row.xpath('td[4]/text()')[0]
            tmp_ups_and_downs = row.xpath('td[5]/text()')[0].split(" ")
            # Column 5 is "<arrow> <amount>": "▼" is a drop, "▲" a rise.
            # BUG FIX: default to "0.0" so an unexpected marker cannot
            # leave the string empty and crash float() below.
            ups_and_downs = "0.0"
            if len(tmp_ups_and_downs) >= 2:
                if tmp_ups_and_downs[0] == "▼":
                    ups_and_downs = "-" + tmp_ups_and_downs[1]
                elif tmp_ups_and_downs[0] == "▲":
                    ups_and_downs = tmp_ups_and_downs[1]
            stock_volume = row.xpath('td[6]/text()')[0]
            time_tmp = tick_time.split(':')
            date = datetime(now.year, now.month, now.day, int(time_tmp[0]),
                            int(time_tmp[1]), int(time_tmp[2]))
            if self.mongo.CheckExists("Transaction_details",
                                      str(stock_num) + "@" + date.isoformat()):
                continue  # already stored
            daily.append({
                '_id': stock_num + "@" + date.isoformat(),
                'ts': int(date.timestamp()),
                'stock': stock_num,
                'date': date,
                'buying': float(buying),
                'selling': float(selling),
                'transaction': float(transaction),
                'ups_and_downs': float(ups_and_downs),
                'stock_volume': int(stock_volume)
            })
        return daily
def main():
    """CLI entry point: refresh the proxy pool, then run whichever crawl
    jobs were selected on the command line (-d: daily transaction
    details, -ep: daily price/earning)."""
    # NOTE(review): '2337' appears twice, so two threads will crawl the
    # same stock — confirm whether that is intentional.
    stocks = [
        '3455', '5443', '8064', '2409', '1504', '3535', '2397', '2316',
        '2392', '2888', '2385', '2337', '3406', '2492', '2478', '6182',
        '8163', '2337', '2481', '3016', '6153', '3630', '4190'
    ]
    m = MongodbAPI()
    cp = Crawl_Proxy()
    cp.start()  # make sure a usable proxy pool exists before crawling
    cli_parser = _build_parser()
    args = cli_parser.parse_args()
    if args.d:
        # 每日交易明細 — daily transaction details, one thread per stock.
        logging.info("start daily detail stock")
        threads = []
        thread_num = len(stocks)
        ml = Money_link()
        for i in range(thread_num):
            threads.append(
                threading.Thread(target=money_link, args=(
                    m,
                    ml,
                    stocks[i],
                )))
            threads[i].start()
        for i in range(thread_num):
            threads[i].join()
        logging.info("Thread Done")
    if args.ep:
        # Daily price/earning crawl: a fixed pool of 16 workers sharing
        # the date list produced by create_random_date().
        date_list = create_random_date()
        threads = []
        thread_num = 16
        for i in range(thread_num):
            threads.append(
                threading.Thread(target=tse_daily_price_earning,
                                 args=(date_list, )))
            threads[i].start()
        for i in range(thread_num):
            threads[i].join()
        logging.info("Thread Done")
    close_proxy()
def __init__(self):
    """Bind a mongo handle and the proxy-list source URL."""
    self.mongo = MongodbAPI()
    self.source_url = CYBERSYNDROME
class Crawl_Proxy(object):
    """Scrapes the CyberSyndrome proxy list and caches it in mongo."""

    def __init__(self):
        self.source_url = CYBERSYNDROME
        self.mongo = MongodbAPI()

    def start(self):
        """Refresh the 'proxy' collection unless a recent, large-enough
        cache already exists (younger than 3h and more than 250 entries)."""
        data = self.mongo.Get_Data_From("proxy", {'_id': 0})
        if data is not None and datetime.now() - timedelta(
                hours=3) < data["update_date"] and len(data["iptable"]) > 250:
            logging.info("Use old proxies")
            return
        logging.info("start crawl proxy")
        self.mongo.DropAll("proxy")
        proxy_ip = self.paresrHTML()
        self.mongo.Insert_Data_To("proxy", {
            "_id": 0,
            "iptable": proxy_ip,
            "update_date": datetime.now()
        })
        logging.info("add %04d ip" % (len(proxy_ip)))

    # Name kept (typo and all) for backward compatibility with callers.
    def paresrHTML(self):
        """Decode the obfuscated IP table embedded in the page script.

        Returns a list of proxy dicts.  BUG FIX: returns [] when the page
        cannot be fetched — the original returned None, which crashed
        len(proxy_ip) in start().
        """
        p = HtmlRequests()
        tree = p.get_html_noproxy(self.source_url)
        if tree is None:
            return []
        _as = []
        # BUG FIX: initialized up front so a page without the expected
        # script block cannot raise NameError below.
        _ps_list = []
        for script in tree.xpath('//div[@id="content"]/script/text()'):
            result = re.findall(r'\[[0-9 ,]*\]', script)
            raw_as = result[0].replace("[", '').replace("]", '')
            raw_ps = result[1].replace("[", '').replace("]", '')
            _as_list = raw_as.split(',')
            _ps_list = raw_ps.split(',')
            # The page hides the rotation offset in a small arithmetic
            # expression such as "(ps3*2+ps1)%40"; decode() evaluates it.
            arithmetic = re.findall(r'\(.*?\)%\d*', script)
            n = self.decode(_ps_list, arithmetic[0])
            _as = _as_list[n:] + _as_list[0:n]
            break
        headerlist = []
        for row in tree.xpath('//tr'):
            headers = {}
            for cell in row.xpath('td[6]/text()'):
                tmp = cell.split(":")
                headers[tmp[0]] = tmp[1]
            headerlist.append(headers)
        return self.getproxy(_as, _ps_list, headerlist)

    def decode(self, ps, string):
        """Evaluate '(term+term+...)%m' where a term is an int literal,
        'ps<i>' (an index into *ps*), or products of those; returns the
        sum modulo m."""
        divisor = string.split(')')[1].replace('%', '')
        dividend = string.split(')')[0].replace('(', '')
        num = 0
        for term in dividend.split('+'):
            if "*" in term:
                mult = 1
                for factor in term.split('*'):
                    if "ps" in factor:
                        count = int(re.search(r'\d+', factor).group(0))
                        mult *= int(ps[count])
                    else:
                        mult *= int(factor)
                num += mult
            else:
                if "ps" in term:
                    count = int(re.search(r'\d+', term).group(0))
                    num += int(ps[count])
                else:
                    num += int(term)
        return num % int(divisor)

    def getproxy(self, _as, _ps, headerlist):
        """Reassemble the flat octet list *_as* (4 octets per address)
        into dotted IPs, pairing each with its port from *_ps* and its
        request headers from *headerlist*."""
        proxy_ip = []
        j = 0
        ip = ""
        for i in range(len(_as)):
            if i % 4 == 3:  # fourth octet completes an address
                ip += _as[i]
                proxy_ip.append({
                    'ip': {
                        'http': ip + ':' + _ps[j]
                    },
                    'headers': headerlist[j]
                })
                j += 1
                ip = ""
                continue
            ip += _as[i] + '.'
        return proxy_ip
def __init__(self, date: datetime):
    """Remember *date* (the report day) and set up mongo/HTTP helpers."""
    self.__date = date
    self.mongo = MongodbAPI()
    self.htmlreq = HtmlRequests()
class Institutional_investors():
    """Crawls the TWSE institutional-investors daily report and stores
    it in the ``stock_information`` collection."""

    def __init__(self, date: datetime):
        self.mongo = MongodbAPI()
        self.htmlreq = HtmlRequests()
        self.__date = date

    def start(self):
        """Fetch and store the report for the date given at construction."""
        date = self.__date.strftime("%Y%m%d")
        source_url = TSELEGALPERSON.format(date=date)
        self.__crawl(source_url, self.__date.strftime("%Y/%m/%d"))

    def __crawl(self, url, date):
        json_data = self.htmlreq.get_json(requests, source_url=url)
        if json_data.get('stat', None) != "OK":
            # Non-trading day (or fetch failure) — nothing to store.
            logging.debug("This day not Opening :%s" % (date))
            return
        data = self.__parser(json_data, date)
        err = self.mongo.Insert_Many_Data_To("stock_information", data)
        if err:
            logging.info("Insert Institutional investors to mongo , date: %s",
                         date)
        else:
            # logging.warn is a deprecated alias of logging.warning
            logging.warning(
                "Fail to Insert Institutional investors to mongo , url: %s",
                url)

    def __parser(self, j, date) -> list:
        """Map report rows to mongo documents.

        TWSE changed the column layout over the years, hence the three
        layouts keyed on the number of 'fields' (12, <18, exactly 18).
        """
        data = []
        parsed_date = datetime.strptime(date, "%Y/%m/%d")  # hoisted out of the loop
        for i in j['data']:
            i = [x.replace(',', '') for x in i]
            if len(j['fields']) == 12:
                data.append({
                    '_id': str(i[0]) + "@" + date,
                    'date': parsed_date,
                    'stock_num': str(i[0]),
                    'foreign_investment_dealer_buy': float(i[2]),
                    'foreign_investment_dealer_sell': float(i[3]),
                    'foreign_investment_dealer_net_buy_sell': float(i[4]),
                    'institutional_investors_net_buy_sell': float(i[5]),
                    'investment_trust_buy': float(i[6]),
                    'investment_trust_sell': float(i[7]),
                    'investment_trust_net_buy_sell': float(i[8]),
                    'dealer_buy(Self-purchase)': float(i[9]),
                    'dealer_sell(Self-purchase)': float(i[10]),
                    'dealer_net_buy_sell': float(i[11]),
                })
            elif len(j['fields']) < 18:
                data.append({
                    '_id': str(i[0]) + "@" + date,
                    'date': parsed_date,
                    'stock_num': str(i[0]),
                    'foreign_investment_buy': float(i[2]),
                    'foreign_investment_sell': float(i[3]),
                    'foreign_investment_net_buy_sell': float(i[4]),
                    'foreign_investment_dealer_buy': float(i[5]),
                    'foreign_investment_dealer_sell': float(i[6]),
                    'foreign_investment_dealer_net_buy_sell': float(i[7]),
                    'investment_trust_buy': float(i[8]),
                    'investment_trust_sell': float(i[9]),
                    'investment_trust_net_buy_sell': float(i[10]),
                    'dealer_net_buy_sell': float(i[11]),
                    'dealer_buy': float(i[12]),
                    'dealer_sell': float(i[13]),
                    'institutional_investors_net_buy_sell': float(i[14]),
                })
            elif len(j['fields']) == 18:
                data.append({
                    '_id': str(i[0]) + "@" + date,
                    'date': parsed_date,
                    'stock_num': str(i[0]),
                    'foreign_investment_buy': float(i[2]),
                    'foreign_investment_sell': float(i[3]),
                    'foreign_investment_net_buy_sell': float(i[4]),
                    'foreign_investment_dealer_buy': float(i[5]),
                    'foreign_investment_dealer_sell': float(i[6]),
                    'foreign_investment_dealer_net_buy_sell': float(i[7]),
                    'investment_trust_buy': float(i[8]),
                    'investment_trust_sell': float(i[9]),
                    'investment_trust_net_buy_sell': float(i[10]),
                    'dealer_net_buy_sell': float(i[11]),
                    'dealer_buy(Self-purchase)': float(i[12]),
                    'dealer_sell(Self-purchase)': float(i[13]),
                    'dealer_net_buy_sell(Self-purchase)': float(i[14]),
                    'dealer_buy(Hedging)': float(i[15]),
                    'dealer_sell(Hedging)': float(i[16]),
                    'dealer_net_buy_sell(Hedging)': float(i[17]),
                    'institutional_investors_net_buy_sell': float(i[18]),
                })
        return data
class Daily_stock_info(object):
    """Crawls one day's TWSE summary table (prices, volume, P/E) and
    stores it in the ``stock_daily_info`` collection."""

    def __init__(self, date):
        self.__mongo = MongodbAPI()
        self.__htmlreq = HtmlRequests()
        self.__date = date

    def start(self):
        date = self.__date.strftime("%Y%m%d")
        source_url = DAILYSTOCKINFO.format(date=date)
        data = self.__crawl(source_url, self.__date.strftime("%Y/%m/%d"))
        if data is not None:
            err = self.__mongo.Insert_Many_Data_To('stock_daily_info', data)
            if err:
                logging.info("Insert stock daily info to mongo , date: %s",
                             date)
            else:
                # logging.warn is a deprecated alias of logging.warning
                logging.warning(
                    "Fail to Insert stock daily info to mongo , url: %s",
                    source_url)
        return

    def __crawl(self, url, date):
        """Fetch and parse the summary JSON; None when unavailable.

        BUG FIX: the original returned on the first iteration no matter
        what, so the 10-try loop and its else clause were dead code.  Now
        an empty response (fetch failure) is retried; a non-'OK' stat
        (market closed) still returns immediately.
        """
        for _ in range(10):
            j = self.__htmlreq.get_json(requests, url)
            if j == {}:
                continue  # fetch failed — retry
            if j.get('stat') != 'OK':  # .get avoids KeyError on odd payloads
                return None
            if 'data5' in j:
                rows = [
                    x for x in j['data5'] if len(x[0]) == 4 and x[-1] != '0.00'
                ]
            elif 'data4' in j:
                rows = [
                    x for x in j['data4'] if len(x[0]) == 4 and x[-1] != '0.00'
                ]
            else:
                logging.warning(
                    "The daily info not have data5 or data4 url: %s", url)
                return None
            return self.__parser(date, rows)
        logging.error("Fail to parser daily stock info , url: %s", url)
        return None

    def __parser(self, date, rows: list) -> list:
        """Map filtered table rows to mongo documents for *date*."""
        day = datetime.strptime(date, "%Y/%m/%d")  # hoisted: same for every row
        ts = int(datetime.timestamp(day))
        data = []
        for i in rows:
            data.append({
                '_id': i[0] + "@" + date,
                'stock': i[0],
                'date': day,
                'ts': ts,
                'transaction': float(i[3].replace(',', '')),
                'open': self.__get_float(i[5]),
                'high': self.__get_float(i[6]),
                'low': self.__get_float(i[7]),
                'close': self.__get_float(i[8]),
                'change': self.__get_sign_float(i[9], i[10]),
                'price_earning': float(i[-1].replace(',', '')),
            })
        return data

    def __get_sign_float(self, sign, num) -> float:
        """Combine the sign cell with the amount cell; 0.0 when unsigned."""
        if "-" in sign:
            return float("-" + num)
        elif "+" in sign:
            return float(num)
        else:
            return 0.0

    def __get_float(self, num) -> float:
        """Parse a price cell; 'X0.00' means 0.0 and '--' means missing
        (returned as None, despite the float annotation)."""
        if num.replace(',', '') == 'X0.00':
            return 0.0
        elif num == '--':
            return None
        else:
            return float(num.replace(',', ''))
def __init__(self):
    """Open a mongo handle; all other state is per-call."""
    self.mongo = MongodbAPI()
class TWSE_realtime():
    """Polls the TWSE realtime quote endpoint for one stock every 5s
    until stop_date (today 13:30:10) and stores each tick in mongo."""

    def __init__(self, stock_num):
        self.mongo = MongodbAPI()
        self.stock_num = stock_num
        self.htmlreq = HtmlRequests()
        self.req = self.htmlreq.get_session(SESSIONURL)
        now = datetime.now()
        # Stop re-scheduling shortly after 13:30.
        self.stop_date = datetime(now.year, now.month, now.day, 13, 30, 10)

    def start(self):
        self.crawl()

    def crawl(self):
        """Fetch one quote, schedule the next poll, and insert the tick
        into 'Realtime_data' if it is not already there."""
        now = datetime.now()
        if now < self.stop_date:
            # Re-schedule before doing this poll so the cadence holds.
            threading.Timer(5.0, self.crawl).start()
        now_time = int(time.time()) * 1000
        source_url = TWSEREALTIMEURL.format(
            stock_num=self.stock_num, time=now_time)
        json_data = self.htmlreq.get_json(self.req, source_url)
        data = self.parser(json_data)
        if data is None:
            return
        if not self.mongo.CheckExists('Realtime_data', data.get('_id', None)):
            for _ in range(5):  # a few attempts in case the insert fails
                if self.mongo.Insert_Data_To("Realtime_data", data):
                    logging.info("Insert realtime data to mongo, id:%s" %
                                 (data.get("_id")))
                    return
                else:
                    logging.error(
                        "Fail to insert realtime data to mongo, id:%s" %
                        (data.get("_id")))

    def parser(self, j: json):
        """Convert one realtime payload into a mongo document, or None
        when the payload carries no quote."""
        if len(j['msgArray']) == 0:
            return None
        data = j['msgArray'][0]

        def _split_best(d):
            # Best bid/ask arrive as '12.3_12.4_...'; strip trailing '_'.
            if d:
                return d.strip('_').split('_')
            return d

        # BUG FIX: the original bound this local as `time`, shadowing the
        # `time` module and making time.mktime() below raise
        # AttributeError on every call.
        time_str = datetime.fromtimestamp(
            int(data['tlong']) / 1000).strftime('%Y-%m-%d %H:%M:%S')
        date = datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S')
        return {
            "_id": str(self.stock_num) + "@" + time_str,
            "code": self.stock_num,
            'ts': int(time.mktime(date.timetuple())),
            "time": date,
            "latest_trade_price": float(data.get('z', None)),
            "trade_volume": float(data.get('tv', None)),
            "accumulate_trade_volume": float(data.get('v', None)),
            "best_bid_price": [float(x) for x in _split_best(data.get('b', None))],
            "best_bid_volume": [float(x) for x in _split_best(data.get('g', None))],
            "best_ask_price": [float(x) for x in _split_best(data.get('a', None))],
            "best_ask_volume": [float(x) for x in _split_best(data.get('f', None))],
            "open": float(data.get('o', None)),
            "high": float(data.get('h', None)),
            "low": float(data.get('l', None))
        }
class TWSE_daily():
    """Crawls TWSE daily prices for one stock, month by month, from a
    start year/month up to the current month."""

    def __init__(self, stock_num: str, start_year: int, start_month: int):
        self.mongo = MongodbAPI()
        self.stock_num = stock_num
        self.htmlreq = HtmlRequests()
        self.req = self.htmlreq.get_session(SESSIONURL)
        self.req.keep_alive = False
        self.start_year = start_year
        self.start_month = start_month
        self.now_date = datetime.now()
        self.retry = 0  # consecutive failed fetches for the current month

    def start(self):
        self.crawl(self.start_year, self.start_month)

    def crawl(self, year, month):
        """Fetch, parse and store one month, then recurse to the next."""
        logging.debug("%s/%s" % (year, month))
        source_url = TWSEREALTIMEURL.format(
            stock_num=self.stock_num, time="%d%02d01" % (year, month))
        json_data = self.htmlreq.get_json(self.req, source_url)
        if json_data == {}:
            if self.retry < 5:
                # BUG FIX: the original fell through after this recursive
                # retry, crawling the same month twice — and its `else`
                # logged the failure message even on success.
                self.retry += 1
                self.crawl(year, month)
                return
            logging.error("Can't get old daily stock %s@%s-%s ,url : %s " %
                          (self.stock_num, year, month, source_url))
        self.retry = 0
        data = self.parser(json_data.get('data', None))
        if data:
            for _ in range(5):  # a few attempts in case the insert fails
                if self.mongo.Insert_Many_Data_To("Daily_data", data):
                    logging.info("Insert Daily data %s@%s-%s" %
                                 (self.stock_num, year, month))
                    break
                else:
                    logging.error(
                        "Fail, Insert Daily data to Mongo,id: %s@%s-%s" %
                        (self.stock_num, year, month))
        date = self._get_next_date(year, month)
        # BUG FIX: compare (year, month) as a pair; the original
        # `year >= now.year and month > now.month` never stopped when the
        # next month's number was <= the current one (e.g. January).
        if (date['year'], date['month']) > (self.now_date.year,
                                            self.now_date.month):
            logging.info("Done crawl Daily data , %s@%s/%s" %
                         (self.stock_num, date['year'], date['month']))
            return
        # Move on to the next month.
        self.crawl(date["year"], date["month"])

    def _convert_date(self, date):
        """Convert a ROC-calendar date '106/05/01' to '2017/05/01'."""
        return '/'.join([str(int(date.split('/')[0]) + 1911)] +
                        date.split('/')[1:])

    def parser(self, j: json) -> list:
        """Map the month's rows to mongo documents, skipping rows whose
        _id already exists in 'Daily_data'."""
        data = []
        if j is None:
            return data
        for item in j:
            date = datetime.strptime(self._convert_date(item[0]), '%Y/%m/%d')
            _id = self.stock_num + "@" + date.strftime("%Y/%m/%d")
            if self.mongo.CheckExists("Daily_data", _id):
                logging.debug("Insert Daily data ,id :%s exists" % (_id))
                continue
            try:
                data.append({
                    '_id': _id,
                    'stock': self.stock_num,
                    'date': date,
                    'ts': int(time.mktime(date.timetuple())),
                    'capacity': int(item[1].replace(',', '')),
                    'turnover': int(item[2].replace(',', '')),
                    'open': self._get_float(item[3]),
                    'high': self._get_float(item[4]),
                    'low': self._get_float(item[5]),
                    'close': self._get_float(item[6]),
                    'change': self._get_float(item[7]),
                    'transaction': int(item[8].replace(',', ''))
                })
            except Exception as e:
                # Best effort: log the malformed row and keep going.
                logging.error("daily data fail :%s %s" % (item, e))
                continue
        return data

    def _get_next_date(self, year, month) -> dict:
        """Return the month after (year, month) as {'year': y, 'month': m}."""
        if month < 12:
            month += 1
        else:
            year += 1
            month = 1
        return {'year': year, 'month': month}

    def _get_float(self, number: str):
        """Parse a price cell; 'X0.00' means 0.0 and '--' means missing
        (returned as None, despite the numeric use elsewhere)."""
        if number.replace(',', '') == 'X0.00':
            return 0.0
        elif number == '--':
            return None
        else:
            return float(number.replace(',', ''))