class Task(object):
    """A unit of work executed by the task-runner pool.

    Wraps an async callable plus its keyword arguments; ``run`` injects the
    owning TaskPool/Pool handles into the kwargs before invoking the callable.
    """

    def __init__(self, id: str, func: Callable, param: Any = None) -> None:
        """
        Args:
            id: unique task id (taskUniqueId).
            func: async callable invoked as ``func(**param)``.
            param: keyword arguments for ``func``. BUGFIX: the original used a
                mutable default (``{}``), so every Task created without params
                shared one dict — and ``run()`` mutates it ("taskPool"/"pool"
                keys), leaking state between tasks.
        """
        super().__init__()
        self.id = id
        self.func = func
        # Fresh dict per instance when no params are given.
        self.param = param if param is not None else {}
        self.logger = Logger("Task")
        # Event loop assigned by the runner before run() is awaited.
        self.loop: Optional[AbstractEventLoop] = None

    async def run(self, taskPool: TaskPool, pool: Pool) -> None:
        """Execute the wrapped callable on ``self.loop``.

        Injects the pool handles through kwargs so the worker can release its
        slot when done. Does nothing when no loop has been assigned yet.
        """
        self.logger.info("run", "task run")
        if self.loop:
            self.param["taskPool"] = taskPool
            self.param["pool"] = pool
            await self.loop.create_task(self.func(**self.param))
class Pool(object):
    """A single execution slot in the TaskPool; runs one Task as an asyncio task."""

    def __init__(self) -> None:
        super().__init__()
        self.isRun = False
        self.logger = Logger("Pool")
        self.task: Optional[Task] = None
        self.taskId = ""
        # BUGFIX: previously only assigned inside run(), so calling cancel()
        # before run() raised AttributeError on self.poolTask.
        self.poolTask: Optional[asyncio.Future] = None

    def setTask(self, task: Task) -> None:
        """Attach the task this slot will execute."""
        self.task = task
        self.taskId = task.id

    def run(self, taskPool: TaskPool) -> None:
        """Schedule the attached task on the running event loop."""
        self.isRun = True
        self.logger.info("run", "task pool run")
        if self.task is not None:
            self.poolTask = asyncio.ensure_future(self.task.run(
                taskPool, self))

    def cancel(self) -> None:
        """Cancel the scheduled task, if one exists and isn't already cancelled."""
        self.isRun = False
        self.logger.info("cancel", "task pool cancel")
        if self.poolTask and not self.poolTask.cancelled():
            self.poolTask.cancel()
def marcapJob(marcapDtos: List[StockRunCrawling]) -> None:
    """Scheduled job entry point for market-cap crawling.

    Stamps every crawl request with a fresh unique id (built from the
    original date range, before any "now" override), rewrites the date range
    to today when ``isNow`` is set, then hands the whole batch to StockService.
    """
    stock_service: StockService = Locator.getInstance().get(StockService)
    job_logger = Logger("TaskService_marcapJob")
    for crawl_dto in marcapDtos:
        job_logger.info("#### schedule job start ####")
        job_logger.info(f"command{crawl_dto.startDateStr}~{crawl_dto.endDateStr}")
        crawl_dto.taskUniqueId = (
            f"{crawl_dto.taskId}{crawl_dto.market}"
            f"{crawl_dto.startDateStr}{crawl_dto.endDateStr}{uuid.uuid4()}"
        )
        if crawl_dto.isNow:
            today = getNowDateStr()
            crawl_dto.startDateStr = today
            crawl_dto.endDateStr = today
            job_logger.info(f"real:{crawl_dto.startDateStr}~{crawl_dto.endDateStr}")
    stock_service.crawlingMarcapStockData(marcapDtos)
class MarcapCrawler(object):
    """Crawls daily market-capitalization CSV files from the KRX data portal
    via a Selenium WebDriver, and reports progress through an EventEmitter
    (connect / start / download / parse / result events)."""

    def __init__(self) -> None:
        super().__init__()
        # Event bus; StockService attaches its listeners to this emitter.
        self.ee = EventEmitter()
        self.logger = Logger("MarcapCrawler")

    def createUUID(self) -> str:
        # Unique id used to isolate each crawl's download directory.
        return str(uuid.uuid4())

    async def connectWebDriver(self, addr: str, uuid: str) -> WebDriver:
        """Connect to a remote Selenium hub at ``addr``.

        Downloads land in a per-crawl directory keyed by ``uuid``
        (path inside the Selenium container).
        """
        chrome_options = webdriver.ChromeOptions()
        prefs = {
            # Allow repeated automatic downloads without a browser prompt.
            'profile.default_content_setting_values.automatic_downloads': 1,
            'download.default_directory': f"/home/seluser/Downloads/{uuid}"
        }
        chrome_options.add_experimental_option("prefs", prefs)
        driver = webdriver.Remote(
            command_executor=addr,
            options=chrome_options,
        )
        driver.set_page_load_timeout(60)
        driver.set_script_timeout(60)
        self.logger.info("connectWebDriver", "create driver")
        return driver

    def connectLocalDriver(self, addr: str, uuid: str) -> WebDriver:
        """Local development variant of connectWebDriver (hard-coded macOS
        chromedriver path); ``addr`` is unused here."""
        chrome_options = webdriver.ChromeOptions()
        prefs = {
            'profile.default_content_setting_values.automatic_downloads': 1,
            'download.default_directory': f"/Users/iseongjae/Documents/PersonalProjects/fin-web/fin-crawling-server/server/downloads/{uuid}"
        }
        chrome_options.add_experimental_option("prefs", prefs)
        driver = webdriver.Chrome(executable_path="/Users/iseongjae/Downloads/chromedriver", chrome_options=chrome_options)
        return driver

    async def crawling(self, dto: StockRunCrawling) -> None:
        """Main crawl loop: one CSV download per day in [startDateStr, endDateStr].

        Sets up a filesystem observer for the download directory, connects the
        webdriver, opens the KRX market-cap page, then iterates day by day.
        Observer and driver are always torn down in ``finally``.

        Raises: any exception from setup or a failed (retried) download.
        """
        driver = None
        downloadObserver = None
        try:
            uuid = self.createUUID()  # NOTE: shadows the stdlib `uuid` module locally
            self.logger.info("crawling", uuid)
            self.ee.emit(EVENT_MARCAP_CRAWLING_ON_CONNECTING_WEBDRIVER, dto)
            downloadObserver = DownloadObserver()
            # Retry transient failures: 5 attempts, 1s apart.
            path = await asyncRetryNonBlock(5, 1, downloadObserver.makePath, uuid)
            downloadObserver.startObserver(path, self.ee)
            self.logger.info("crawling", "create observer and start")
            print("startObserver")
            driver = await asyncRetryNonBlock(5, 1, self.connectWebDriver, dto.driverAddr, uuid)
            print("connectWebDriver")
            # KRX market-capitalization statistics page.
            driver.get("http://data.krx.co.kr/contents/MDC/MDI/mdiLoader/index.cmd?menuId=MDC0201020101")
            try:
                # Dismiss a possible popup alert; absence is not an error.
                alert = WebDriverWait(driver, timeout=3).until(EC.alert_is_present())
                alert.accept()
            except Exception as e:
                print("예외발생:"+str(e))
            print("start:"+dto.startDateStr)
            self.ee.emit(EVENT_MARCAP_CRAWLING_ON_START_CRAWLING, dto)
            # Wait until the market radio button is interactable (page ready).
            WebDriverWait(driver, timeout=20, poll_frequency=1).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#mktId_0_1")))
            date = datetime.strptime(dto.startDateStr, "%Y%m%d")
            endDate = datetime.strptime(dto.endDateStr, "%Y%m%d")
            while date <= endDate:
                dateStr = date.strftime("%Y%m%d")
                downloadTask = StockCrawlingDownloadTask(**{
                    "dateStr": dateStr,
                    "market": dto.market,
                    "uuid": uuid,
                    "taskId": dto.taskId,
                    "taskUniqueId": dto.taskUniqueId
                })
                self.logger.info("crawling", f"create downloadTask taskId: {dto.taskId} market: {dto.market} date: {dateStr} taskUniqueId: {dto.taskUniqueId}")
                print(downloadTask.json())
                # Tell the filesystem handler which task the next file belongs to.
                downloadObserver.event_handler.setDownloadTask(downloadTask)
                self.ee.emit(EVENT_MARCAP_CRAWLING_ON_DOWNLOAD_START, downloadTask)
                await asyncRetryNonBlock(5, 1, self.downloadData, downloadTask, downloadObserver, driver)
                # await self.downloadData(downloadTask, downloadObserver, driver)
                date = date + timedelta(days=1)
        except Exception as e:
            raise e
        finally:
            if downloadObserver:
                downloadObserver.stopObserver()
            if driver:
                driver.quit()

    async def downloadData(self, downloadTask: StockCrawlingDownloadTask, downloadObserver: DownloadObserver, driver: WebDriver) -> None:
        """Drive the page's jQuery to select the market and date, trigger the
        CSV download, then wait (via the file-created event) and parse it.

        Raises: asyncio.TimeoutError if no file event arrives within 30s, or
        whatever makeMarcapData re-raises.
        """
        self.logger.info("downloadData")
        if driver is None:
            return
        # pymitter
        # Timestamp shown on the page; used below to detect when the search
        # results have actually refreshed.
        before = driver.execute_script("return $('.CI-MDI-UNIT-TIME').text()")
        if downloadTask.market == "kospi":
            driver.execute_script('$("#mktId_0_1").click()')
        elif downloadTask.market == "kosdaq":
            driver.execute_script('$("#mktId_0_2").click()')
        elif downloadTask.market == "konex":
            driver.execute_script('$("#mktId_0_3").click()')
        # driver.implicitly_wait(1)
        driver.execute_script(f'$("#trdDd")[0].value = "{downloadTask.dateStr}"')
        # driver.implicitly_wait(1)
        driver.execute_script('$(".btn_component_search").click()')
        # driver.implicitly_wait(1)
        # Poll until the page timestamp changes, i.e. new data was loaded.
        after = before
        while before == after:
            after = driver.execute_script('return $(".CI-MDI-UNIT-TIME").text()')
            await sleepNonBlock(0.5)
            # driver.implicitly_wait(1)
        print("before:"+before)
        print("after:"+after)
        await sleepNonBlock(3)
        WebDriverWait(driver, timeout=10, poll_frequency=2).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "*[class='CI-MDI-UNIT-DOWNLOAD']")))
        driver.execute_script("$('[class=\"CI-MDI-UNIT-DOWNLOAD\"]').click()")
        WebDriverWait(driver, timeout=10, poll_frequency=2).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "*[data-type='csv']")))
        driver.execute_script("$(\"[data-type='csv']\").click()")
        print("wait:"+downloadTask.dateStr)
        loop = asyncio.get_running_loop()
        # Single-slot handoff between the (sync) file event and this coroutine.
        # NOTE(review): Queue(loop=...) was removed in Python 3.10 — confirm
        # the runtime Python version.
        queue: asyncio.Queue = asyncio.Queue(maxsize=1, loop=loop)

        async def fileResultOfData(event: FileCreatedEvent, downloadTask: StockCrawlingDownloadTask) -> None:
            # Bridge the emitter callback into the queue.
            result = {}
            result["event"] = event
            result["downloadTask"] = downloadTask
            await queue.put(result)

        @self.ee.once(FILE_SYSTEM_HANDLER(downloadTask.uuid))
        def downloadComplete(event: FileCreatedEvent, downloadTask: StockCrawlingDownloadTask) -> None:
            # Fired once when the watchdog observer sees the downloaded file.
            loop.create_task(fileResultOfData(event, downloadTask))

        try:
            result = await asyncio.wait_for(queue.get(), timeout=30)
            self.ee.emit(EVENT_MARCAP_CRAWLING_ON_DOWNLOAD_COMPLETE, downloadTask)
            await asyncio.create_task(self.makeMarcapData(result["event"], result["downloadTask"]))
        except Exception as e:
            raise e
        finally:
            queue.task_done()

    def convertFileToDto(self, path: str, dto: StockMarketCapitalResult) -> None:
        """Parse the downloaded CSV at ``path`` into StockMarketCapital rows
        appended to ``dto.data``. Skips the header line (index 0)."""
        lines = []
        with open(path, "r", encoding="utf-8") as f:
            # p = Path(f.name)
            # dto.date = p.stem
            lines = f.readlines()
        for i in range(1, len(lines)):
            data = lines[i].replace('"', '').split(",")
            if dto.market == "kospi":
                marcap = StockMarketCapital(**{
                    "date": dto.date,
                    "market": dto.market,
                    "code": data[0].strip(),
                    "name": data[1].strip(),
                    "close": data[2].strip(),
                    "diff": data[3].strip(),
                    "percent": data[4].strip(),
                    "open": data[5].strip(),
                    "high": data[6].strip(),
                    "low": data[7].strip(),
                    "volume": data[8].strip(),
                    "price": data[9].strip(),
                    "marcap": data[10].strip(),
                    "number": data[11].strip()
                })
            else:
                # Non-KOSPI files carry one extra column at index 2, so all
                # subsequent fields shift by one — presumably a market-section
                # column; TODO confirm against an actual kosdaq/konex CSV.
                marcap = StockMarketCapital(**{
                    "date": dto.date,
                    "market": dto.market,
                    "code": data[0].strip(),
                    "name": data[1].strip(),
                    "close": data[3].strip(),
                    "diff": data[4].strip(),
                    "percent": data[5].strip(),
                    "open": data[6].strip(),
                    "high": data[7].strip(),
                    "low": data[8].strip(),
                    "volume": data[9].strip(),
                    "price": data[10].strip(),
                    "marcap": data[11].strip(),
                    "number": data[12].strip()
                })
            # print("append marcap: " + str(marcap))
            dto.data.append(marcap)

    async def isExistFile(self, path: str, ext: str = ".csv") -> bool:
        """Return True if ``path`` has extension ``ext``, retrying a few times.

        NOTE(review): ``path`` never changes between retries, so the sleep
        loop cannot change the outcome — looks like it was meant to re-check
        the filesystem; confirm intent before relying on the retries.
        """
        isExist = path.endswith(ext)
        restTimes = 3
        while not isExist and restTimes >= 0:
            await sleepNonBlock(1)
            isExist = path.endswith(ext)
            restTimes -= 1
        return isExist

    async def parseReceivedFile(self, event: FileCreatedEvent, downloadTask: StockCrawlingDownloadTask) -> None:
        """Handle a freshly downloaded CSV: rename it to ``<market>-<date>.csv``,
        convert euc-kr → utf-8, parse rows, and emit parsing/result events.

        Returns silently when the file isn't a CSV or was already processed.
        """
        retdto = StockMarketCapitalResult()
        date = downloadTask.dateStr
        market = downloadTask.market
        retdto.date = date
        retdto.market = market
        isExist = await self.isExistFile(event.src_path)
        if not isExist:
            return
        print("created: " + date)
        await sleepNonBlock(0.5)
        dest_path = f'{os.path.dirname(event.src_path)}/{market+"-"+date}.csv'
        if os.path.isfile(dest_path):
            # Already renamed/processed by a previous attempt — skip.
            return
        self.changeCharSet(event.src_path)
        os.rename(event.src_path, dest_path)
        self.convertFileToDto(dest_path, retdto)
        retdto.result = "success"
        self.ee.emit(EVENT_MARCAP_CRAWLING_ON_PARSING_COMPLETE, True, retdto, downloadTask)
        self.ee.emit(EVENT_MARCAP_CRAWLING_ON_RESULT_OF_STOCK_DATA, downloadTask, retdto)
        self.logger.info("parseFile", f"success, {downloadTask.taskUniqueId}")

    async def makeMarcapData(self, event: FileCreatedEvent, downloadTask: StockCrawlingDownloadTask) -> None:
        """Parse the received file with retries; on final failure emit a
        fail-parsing event carrying the traceback."""
        try:
            await asyncRetry(3, 1, self.parseReceivedFile, event, downloadTask)
        except Exception:
            retdto = StockMarketCapitalResult()
            retdto.result = "fail"
            retdto.errorMsg = traceback.format_exc()
            self.ee.emit(EVENT_MARCAP_CRAWLING_ON_PARSING_COMPLETE, False, retdto, downloadTask)
            self.logger.error("parseFile", f"fail, {downloadTask.taskUniqueId} error: {traceback.format_exc()}")
        finally:
            self.logger.info("parseFile...")

    def changeCharSet(self, path: str) -> None:
        """Rewrite the file at ``path`` from euc-kr to utf-8 in place."""
        lines = None
        with open(path, "r", encoding="euc-kr") as f:
            lines = f.readlines()
        with open(path, 'w', encoding="utf-8") as f:
            f.writelines(lines)
class StockService:
    """Orchestrates market-cap crawling tasks: registers ProcessTask records,
    queues crawl workers on the task runner, and reacts to crawler events."""

    def __init__(self, stockRepository: StockRepository, tasksRepository: TasksRepository, crawlerRepository: CrawlerRepository) -> None:
        self.stockRepository = stockRepository
        self.tasksRepository = tasksRepository
        self.crawlerRepository = crawlerRepository
        self.logger = Logger("StockService")

    async def getStockData(self, market: str, startDate: str, endDate: str) -> List[StockMarketCapital]:
        """Read stored market-cap rows for a market and date range."""
        return await self.stockRepository.getStockData(market, startDate, endDate)

    def crawlingMarcapStockData(self, dtoList: List[StockRunCrawling]) -> None:
        """For each "marcap" request: build a worker coroutine, register a
        ProcessTask covering every day in the range, and queue the worker.

        NOTE(review): the early ``return`` on an already-existing task aborts
        the remaining dtos in the list, not just the duplicate — confirm this
        is intended.
        """
        self.logger.info("crawlingMarcapStockData", str(len(dtoList)))
        for dto in dtoList:
            if dto.taskId == "marcap":
                async def marcapTaskWorker(runDto: StockRunCrawling, pool: Pool, taskPool: TaskPool) -> None:
                    # Worker body executed by the task runner; runDto is bound
                    # explicitly via the Task param dict (no late-binding issue).
                    try:
                        self.logger.info("runCrawling&marcapTaskWorker", "start")
                        marcapCrawler = MarcapCrawler()
                        taskUniqueId = runDto.taskUniqueId
                        self.crawlerRepository.addCrawler(taskUniqueId, marcapCrawler)
                        self.createListners(marcapCrawler.ee)
                        self.logger.info("runCrawling&marcapTaskWorker", f"taskWorker:{taskUniqueId}")
                        await marcapCrawler.crawling(runDto)
                        taskPool.removeTaskPool(pool)
                        self.crawlerRepository.removeCrawler(taskUniqueId)
                    except asyncio.CancelledError:
                        # NOTE(review): log tag "convertFactorFileToDbTask" looks
                        # copy-pasted from FactorService; message text left as-is.
                        self.logger.info("convertFactorFileToDbTask", "cancel")
                    except Exception:
                        self.logger.error("convertFactorFileToDbTask", f"error: {traceback.format_exc()}")
                        self.tasksRepository.errorTask(runDto, traceback.format_exc())

                workerTask = Task(dto.taskUniqueId, marcapTaskWorker, {"runDto": dto})
                if self.tasksRepository.taskRunner:
                    if self.tasksRepository.isExistTask(dto.taskId, dto.taskUniqueId):
                        return
                    startDate = datetime.strptime(dto.startDateStr, "%Y%m%d")
                    endDate = datetime.strptime(dto.endDateStr, "%Y%m%d")
                    # One sub-task entry per calendar day, inclusive.
                    taskDates = [
                        (startDate + timedelta(days=x)).strftime("%Y%m%d")
                        for x in range((endDate - startDate).days + 1)
                    ]
                    task = ProcessTask(**{
                        "market": dto.market,
                        "startDateStr": dto.startDateStr,
                        "endDateStr": dto.endDateStr,
                        "taskUniqueId": dto.taskUniqueId,
                        "taskId": dto.taskId,
                        "count": len(taskDates),
                        "tasks": deque(taskDates),
                        "restCount": len(taskDates),
                        "tasksRet": deque(([0] * len(taskDates))),
                    })
                    task.state = "find worker"
                    self.tasksRepository.addTask(task)
                    self.tasksRepository.runTask(workerTask)
                    self.logger.info("runMarcapTask", f"runTask {task.json()}")

    def createListners(self, ee: EventEmitter) -> None:
        """Wire this service's handlers onto a crawler's event emitter."""
        ee.on(EVENT_MARCAP_CRAWLING_ON_RESULT_OF_STOCK_DATA, self.onResultOfStockData)
        ee.on(EVENT_MARCAP_CRAWLING_ON_CONNECTING_WEBDRIVER, self.onConnectingWebDriver)
        ee.on(EVENT_MARCAP_CRAWLING_ON_START_CRAWLING, self.onStartCrawling)
        ee.on(EVENT_MARCAP_CRAWLING_ON_DOWNLOAD_START, self.onDownloadStart)
        ee.on(EVENT_MARCAP_CRAWLING_ON_DOWNLOAD_COMPLETE, self.onDownloadComplete)
        ee.on(EVENT_MARCAP_CRAWLING_ON_PARSING_COMPLETE, self.onParsingComplete)
        ee.on(EVENT_MARCAP_CRAWLING_ON_ERROR, self.onError)
        ee.on(EVENT_MARCAP_CRAWLING_ON_CANCEL, self.onCancelled)

    # Persist the crawled stock data result to the DB.
    def onResultOfStockData(self, dto: StockCrawlingDownloadTask, retDto: StockMarketCapitalResult) -> None:
        task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
        task.state = "insert to database"
        self.tasksRepository.updateTask(task)

        async def completeMarcapTask() -> None:
            await self.stockRepository.insertMarcap(retDto)
            self.tasksRepository.completeStockCrawlingTask(True, retDto, dto)
        # Fire-and-forget; completion is recorded via the tasks repository.
        asyncio.create_task(completeMarcapTask())

    # Event: webdriver connected during crawling.
    def onConnectingWebDriver(self, dto: StockRunCrawling) -> None:
        task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
        task.state = "connecting webdriver"
        self.tasksRepository.updateTask(task)
        self.logger.info("onConnectingWebDriver", task.taskUniqueId)

    # Event: crawling started.
    def onStartCrawling(self, dto: StockRunCrawling) -> None:
        task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
        task.state = "start crawling"
        self.tasksRepository.updateTask(task)
        self.logger.info("onStartCrawling", task.taskUniqueId)

    # Event: data download started.
    def onDownloadStart(self, dto: StockCrawlingDownloadTask) -> None:
        # self.logger.info("onDownloadStart: "+dto.json())
        task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
        task.state = "download start"
        self.tasksRepository.updateTask(task)
        self.logger.info("onDownloadStart", task.taskUniqueId)

    # Event: data download completed.
    def onDownloadComplete(self, dto: StockCrawlingDownloadTask) -> None:
        task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
        task.state = "download complete"
        self.tasksRepository.updateTask(task)
        self.logger.info("onDownloadComplete", task.taskUniqueId)

    # Event: parsing/conversion of the downloaded data finished.
    def onParsingComplete(self, isSuccess: bool, retdto: StockMarketCapitalResult, dto: StockCrawlingDownloadTask) -> None:
        self.logger.info("onParsingComplete")
        self.logger.info(f"taskId:{dto.taskId} taskUniqueId{dto.taskUniqueId}")
        tar = self.tasksRepository.tasksdto.tasks[dto.taskId]["list"]
        self.logger.info(f"taskDTO: {tar}")
        # Success completion is recorded by onResultOfStockData; only failures
        # are finalized here.
        if not isSuccess:
            self.tasksRepository.completeStockCrawlingTask(isSuccess, retdto, dto)

    # Event: crawling cancelled.
    def onCancelled(self, dto: StockRunCrawling) -> None:
        self.logger.info("onCancelled")
        # self.tasksRepository.updateAllTask()
        # task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
        # self.tasksRepository.fail(task, task.restCount)
        # task.state = "cancelled"
        # self.tasksRepository.updateTask(task)
        # self.logger.info("onCancelled", task.taskUniqueId)

    # Event: crawling raised an error.
    def onError(self, dto: StockRunCrawling, errorMsg: str) -> None:
        task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
        # Mark every remaining sub-task as failed.
        self.tasksRepository.fail(task, task.restCount)
        task.state = "error"
        task.errMsg = errorMsg
        self.tasksRepository.updateTask(task)
        self.logger.error("onError", task.taskUniqueId)
class FactorService:
    """Runs factor-data background tasks: crawling the DART open API and
    converting on-disk factor files into the database."""

    def __init__(self, manager: ConnectionManager, factorRepository: FactorRepository, tasksRepository: TasksRepository, crawlerRepository: CrawlerRepository, taskService: 'TaskService') -> None:
        self.manager = manager
        self.factorRepository = factorRepository
        self.tasksRepository = tasksRepository
        self.crawlerRepository = crawlerRepository
        self.taskService = taskService
        self.logger = Logger("FactorService")

    async def getFactor(self, code: str, year: str, month: str, source: str) -> List[FactorData]:
        """Read factor rows from the repository for one stock/period/source."""
        return await self.factorRepository.getFactor(code, year, month, source)

    def crawlingFactorDartData(self, dto: DartApiCrawling) -> None:
        """Register a ProcessTask covering [startYear, endYear] and queue a
        worker that crawls DART factor data year by year."""
        async def crawlingFactorDartDataTask(pool: Pool, taskPool: TaskPool) -> None:
            # task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
            try:
                crawler = DartApiCrawler()
                self.crawlerRepository.addCrawler(dto.taskUniqueId, crawler)
                self.createFactorDartListener(crawler.ee)
                await crawler.crawling(dto)
                self.crawlerRepository.removeCrawler(dto.taskUniqueId)
            except asyncio.CancelledError:
                self.logger.info("crawlingFactorDartDataTask", "cancel")
            except Exception:
                self.logger.error("crawlingFactorDartDataTask", f"error: {traceback.format_exc()}")
                self.tasksRepository.errorTask(dto, traceback.format_exc())
            finally:
                # Always release the pool slot, even on cancel/error.
                taskPool.removeTaskPool(pool)

        count = dto.endYear - dto.startYear + 1
        task = ProcessTask(**{
            "market": "",
            "startDateStr": dto.startYear,
            "endDateStr": dto.endYear,
            "taskUniqueId": dto.taskUniqueId,
            "taskId": dto.taskId,
            "count": count,
            "tasks": list(range(dto.startYear, dto.endYear + 1)),
            "restCount": count,
            "tasksRet": [0] * count,
            "state": "find worker"
        })
        self.tasksRepository.addTask(task)
        workerTask = Task(dto.taskUniqueId, crawlingFactorDartDataTask)
        self.tasksRepository.runTask(workerTask)

    # Convert factor data stored in files into DB rows.
    def convertFactorFileToDb(self, dto: RunFactorFileConvert) -> None:
        """Queue a single-step task that loads factor rows from file, maps
        them to DAOs (in batches of 100), and bulk-inserts them."""
        self.logger.info("convertFactorFileToDb")

        async def convertFactorFileToDbTask(pool: Pool, taskPool: TaskPool) -> None:
            try:
                task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
                data = await asyncio.create_task(
                    self.factorRepository.getFactorsInFile())
                task.state = "make Factor Object"
                self.tasksRepository.updateTask(task)
                # Map file rows to DAOs 100 at a time to avoid long blocking.
                daoList = await batchFunction(100, data, self.makeFactorDaoList)
                task.state = "start insert db"
                self.tasksRepository.updateTask(task)
                self.logger.info("convertFactorFileToDbTask", f"insertCount: {str(len(daoList))}")
                await self.factorRepository.insertFactor(daoList)
                task.state = "complete"
                self.tasksRepository.completeFactorConvertFileToDbTask(task)
            except asyncio.CancelledError:
                self.logger.info("convertFactorFileToDbTask", "cancel")
            except Exception:
                self.logger.error("convertFactorFileToDbTask", f"error: {traceback.format_exc()}")
                self.tasksRepository.errorTask(dto, traceback.format_exc())
            finally:
                taskPool.removeTaskPool(pool)

        task = ProcessTask(**{
            "market": "",
            "startDateStr": "20070101",
            "endDateStr": "20191231",
            "taskUniqueId": dto.taskUniqueId,
            "taskId": dto.taskId,
            "count": 1,
            "tasks": ["convert"],
            "restCount": 1,
            "tasksRet": [0],
            "state": "start get file"
        })
        self.tasksRepository.addTask(task)
        workerTask = Task(dto.taskUniqueId, convertFactorFileToDbTask)
        self.tasksRepository.runTask(workerTask)

    async def makeFactorDaoList(self, data: List[Dict]) -> List[FactorDao]:
        """Map raw file rows (Korean column names) to FactorDao objects.

        Values in units of 천원 (thousand KRW) are scaled to KRW.
        """
        daoList = []
        for one in data:
            dao = FactorDao(**{
                "code": one["종목코드"],  # stock code
                "name": one["종목명"],  # stock name
                "dataYear": one["년"],  # fiscal year
                "dataMonth": one["결산월"],  # settlement month
                "dataName": one["데이터명"],  # data item name
                "dataValue": (one["데이터값"] * 1000) if one["단위"] == "천원" else one["데이터값"]  # value, scaled if unit is thousand KRW
            })
            daoList.append(dao)
        return daoList

    def createFactorDartListener(self, ee: EventEmitter) -> None:
        """Wire this service's handlers onto a DART crawler's event emitter."""
        ee.on(EVENT_DART_API_CRAWLING_ON_DOWNLOADING_CODES, self.onDownloadingCodes)
        ee.on(EVENT_DART_API_CRAWLING_ON_CRAWLING_FACTOR_DATA, self.onCrawlingFactorData)
        ee.on(EVENT_DART_API_CRAWLING_ON_COMPLETE_YEAR, self.onCompleteYear)
        ee.on(EVENT_DART_API_CRAWLING_ON_RESULT_OF_FACTOR, self.onResultOfFactor)
        ee.on(EVENT_DART_API_CRAWLING_ON_CANCEL, self.onCancelled)

    # Event: corp-code download started.
    def onDownloadingCodes(self, dto: DartApiCrawling) -> None:
        self.logger.info("onDownloadingCodes", dto.taskUniqueId)
        task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
        task.state = "download Codes"
        self.tasksRepository.updateTask(task)

    # Event: factor crawling in progress.
    def onCrawlingFactorData(self, dto: DartApiCrawling) -> None:
        self.logger.info("onCrawlingFactorData", dto.taskUniqueId)
        task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
        task.state = "crawling factor data"
        self.tasksRepository.updateTask(task)

    # Event: one year of crawling finished.
    def onCompleteYear(self, dto: DartApiCrawling, year: int) -> None:
        self.logger.info("onCompleteYear", dto.taskUniqueId)
        task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
        self.tasksRepository.completeFactorDart(task, year)

    # Event: one company's factor records arrived — persist them.
    def onResultOfFactor(self, dto: DartApiCrawling, year: int, obj: List) -> None:
        self.logger.info("onResultOfFactor", dto.taskUniqueId)
        listOfFactorDao = list(
            map(
                lambda one: FactorDao(**{
                    "code": one["crawling_code"],
                    "name": one["crawling_name"],
                    "dataYear": one["bsns_year"],
                    "dataMonth": getMonthFromReprtCode(one["reprt_code"]),
                    "dataName": one["account_nm"],
                    "dataValue": one["thstrm_amount"],
                    "dataId": one["account_id"]
                }), obj))
        # Fire-and-forget insert.
        asyncio.create_task(
            self.factorRepository.insertFactorDart(listOfFactorDao))

    # Event: crawling cancelled.
    def onCancelled(self, dto: DartApiCrawling) -> None:
        self.logger.info("onCancelled")
class DartApiCrawler(object):
    """Crawls yearly financial-statement (factor) data from the DART open API
    and emits progress/result events on an internal EventEmitter."""

    def __init__(self) -> None:
        super().__init__()
        self.ee = EventEmitter()
        self.isLock = False
        # Set externally (TaskService.cancelTask) to request a stop.
        self.isCancelled = False
        self.logger = Logger("DartApiCrawler")

    def createUUID(self) -> str:
        return str(uuid.uuid4())

    async def downloadCodes(self, isCodeNew: bool, apiKey: str) -> Dict:
        """Download (or reuse the cached) DART corpCode archive and build a
        mapping of 6-digit stock code -> {corp_code, corp_name}.

        Re-downloads when ``isCodeNew`` is True or the cached XML is missing.
        Path differs under pytest so tests don't touch app/static.
        """
        if "pytest" in sys.modules:
            # savepath = Path('factors/codes.zip')
            loadpath = Path('factors/codes')
            datapath = Path("factors/codes/CORPCODE.xml")
        else:
            # savepath = Path('app/static/factors/codes.zip')
            loadpath = Path('app/static/factors/codes')
            datapath = Path("app/static/factors/codes/CORPCODE.xml")
        if isCodeNew or not os.path.exists(datapath.resolve()):
            # user_agent = UserAgent(cache=False, use_cache_server=True)
            headers = {
                'User-Agent': "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.2 (KHTML, like Gecko) Chrome/22.0.1216.0 Safari/537.2'",
                'accept-language': 'ko'
            }
            params = {"crtfc_key": apiKey}
            url = "https://opendart.fss.or.kr/api/corpCode.xml"
            async with aiohttp.ClientSession() as session:
                async with session.get(url, params=params, headers=headers) as response:
                    data = await response.read()
                    # The API returns a zip containing CORPCODE.xml.
                    ZipFile(io.BytesIO(data)).extractall(loadpath.resolve())
        tree = ET.parse(datapath.resolve())
        codes: Dict[str, Any] = {}
        for li in tree.findall("list"):
            el = li.find("stock_code")
            if el is not None:
                stockCode = el.text
                # Only listed companies have a 6-digit stock code.
                if isinstance(stockCode, str) and len(stockCode) == 6:
                    codeEl = li.find("corp_code")
                    nameEl = li.find("corp_name")
                    if codeEl is not None:
                        codes[stockCode] = {}
                        codes[stockCode]["corp_code"] = codeEl.text
                    if nameEl is not None:
                        codes[stockCode]["corp_name"] = nameEl.text
        return codes

    async def crawling(self, dto: DartApiCrawling) -> None:
        """Crawl factor data for every listed code, year by year.

        Emits RESULT_OF_FACTOR per company/year and COMPLETE_YEAR per year.
        NOTE(review): the isCancelled check only emits the cancel event — it
        does not break out of the loops, so crawling continues afterwards;
        confirm whether a return/break was intended.
        """
        # CPU-bound work.
        try:
            # DART provides this dataset from 2015 onward — presumably; the
            # clamp below enforces it.
            if dto.startYear < 2015:
                dto.startYear = 2015
            self.ee.emit(EVENT_DART_API_CRAWLING_ON_DOWNLOADING_CODES, dto)
            codes = await asyncRetryNonBlock(5, 1, self.downloadCodes, isCodeNew=dto.isCodeNew, apiKey=dto.apiKey)
            # codes = self.downloadCodes(dto.isCodeNew, dto.apiKey)
            self.ee.emit(EVENT_DART_API_CRAWLING_ON_CRAWLING_FACTOR_DATA, dto)
            for year in range(dto.startYear, dto.endYear + 1):
                self.ee.emit(EVENT_DART_API_CRAWLING_ON_CRAWLING_FACTOR_DATA, dto)
                self.logger.info("crawling", str(len(codes)))
                for code in codes:
                    # newDf = self.getYearDf(dart, code, codes, year)
                    newDf = await asyncRetryNonBlock(5, 1, self.getYearDf, dto.apiKey, code, codes, year)
                    if self.isCancelled:
                        self.ee.emit(EVENT_DART_API_CRAWLING_ON_CANCEL, dto)
                    if newDf is not None:
                        self.logger.info("crawling", code)
                        self.ee.emit(
                            EVENT_DART_API_CRAWLING_ON_RESULT_OF_FACTOR,
                            dto, year, newDf.to_dict("records"))
                    # yearDf = await self.getYearDf(dart, code, codes, year, yearDf)
                self.ee.emit(EVENT_DART_API_CRAWLING_ON_COMPLETE_YEAR, dto, year)
                self.logger.info("crawling", str(year))
        except Exception as e:
            raise e

    async def getYearDf(self, apiKey: str, code: str, codes: Dict, year: int) -> pd.DataFrame:
        """Fetch one company's full financial statement for ``year`` from the
        DART fnlttSinglAcntAll endpoint as a DataFrame.

        Returns None when the API response has no 'list' payload (e.g. no
        report for that year). Raises on network/HTTP errors after logging.
        Adds crawling_year/crawling_code/crawling_name columns for later joins.
        """
        self.logger.info("getYearDf", f"crawling: {code}")
        df = None
        try:
            url = 'https://opendart.fss.or.kr/api/fnlttSinglAcntAll.json'
            # user_agent = UserAgent(cache=False, use_cache_server=True)
            headers = {
                'User-Agent': "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.2 (KHTML, like Gecko) Chrome/22.0.1216.0 Safari/537.2'",
                'accept-language': 'ko',
            }
            params = {
                'crtfc_key': apiKey,
                'corp_code': codes[code]["corp_code"],
                'bsns_year': year,  # business year
                'reprt_code': "11011",  # "11011": annual report
                'fs_div': "CFS",  # "CFS": consolidated, "OFS": separate statements
            }
            connector = aiohttp.TCPConnector(limit=50, force_close=True)
            async with aiohttp.ClientSession(connector=connector) as session:
                timeout = aiohttp.ClientTimeout(total=15)
                # async with session.get(url, params=params, headers=headers) as response:
                async with session.get(url, params=params, timeout=timeout, headers=headers) as response:
                    data = await response.json()
                    if 'list' not in data:
                        return None
                    df = pd.json_normalize(data, 'list')
            # df = dart.finstate_all(code, year)
            # df = await asyncio.create_task(dart.finstate_all(code, year))
            # df = await loop.run_in_executor(self.pool, dart.finstate_all, code, year)
        except Exception as e:
            self.logger.error("getYearDf", traceback.format_exc())
            raise e
        self.logger.info("df", str(df))
        if df is not None:
            df["crawling_year"] = year
            df["crawling_code"] = code
            df["crawling_name"] = codes[code]["corp_name"]
            name = codes[code]["corp_name"]
            self.logger.info("getYearDf", f"{str(year)} {str(code)} {str(name)}")
            return df
        # allCodeDf = pd.concat([allCodeDf, df])
        # return allCodeDf
        return None
class TaskService:
    """Front controller for task management: websocket fan-out of task state,
    schedule management, and dispatching new tasks to Stock/Factor services."""

    def __init__(
        self,
        manager: ConnectionManager,
        tasksRepository: TasksRepository,
        taskScheduler: TaskScheduler,
        factorService: FactorService,
        stockService: StockService,
        crawlerRepository: CrawlerRepository
    ) -> None:
        self.tasksRepository = tasksRepository
        self.crawlerRepository = crawlerRepository
        self.manager = manager
        self.taskScheduler = taskScheduler
        self.factorService = factorService
        self.stockService = stockService
        self.logger = Logger("TaskService")
        # Repository's emitter; this service relays repo events to websockets.
        self.ee = self.tasksRepository.taskEventEmitter
        self.setupEvents()

    def setupEvents(self) -> None:
        """Subscribe to task-repository events for websocket relaying."""
        self.ee.on(EVENT_TASK_REPO_UPDATE_TASKS, self.fetchTasks)
        self.ee.on(EVENT_TASK_REPO_TASK_COMPLETE, self.updateTaskState)
        self.ee.on(EVENT_TASK_REPO_UPDATE_POOL_INFO, self.updateTaskPoolInfo)

    def getTaskSchedule(self, webSocket: WebSocket, isBroadCast: bool = False) -> None:
        """Send the current scheduler jobs to one socket or broadcast them."""
        jobs = self.taskScheduler.getJobs()
        stockTaskScheduleList = StockTaskScheduleList(**{"list": []})
        for i in range(len(jobs)):
            # Cron trigger fields — indices follow apscheduler's order
            # (year, month, day, week, day_of_week, hour, minute, second),
            # presumably; index 3 (week) is intentionally skipped. TODO confirm.
            fields = jobs[i].trigger.fields
            id = jobs[i].id
            self.logger.info(f"jobargs: {str(jobs[i].args[0])}")
            stockTaskScheduleList.list.append(StockTaskScheduleInfo(**{
                "id": id,
                "year": str(fields[0]),
                "month": str(fields[1]),
                "day": str(fields[2]),
                "dayOfWeek": str(fields[4]),
                "hour": str(fields[5]),
                "minute": str(fields[6]),
                "second": str(fields[7]),
                "taskList": list(jobs[i].args[0])
            }))
        if isBroadCast:
            self.manager.sendBroadCast(RES_SOCKET_TASK_FETCH_TASK_SCHEDULE, stockTaskScheduleList.dict())
        else:
            self.manager.send(RES_SOCKET_TASK_FETCH_TASK_SCHEDULE, stockTaskScheduleList.dict(), webSocket)

    @staticmethod
    def marcapJob(marcapDtos: List[StockRunCrawling]) -> None:
        """Scheduled entry point: stamp ids / today's dates on each request,
        then hand the batch to StockService (resolved via the Locator, since
        scheduler jobs run without an instance)."""
        service: StockService = Locator.getInstance().get(StockService)
        logger = Logger("TaskService_marcapJob")
        for dto in marcapDtos:
            logger.info("#### schedule job start ####")
            logger.info("command" + dto.startDateStr + "~" + dto.endDateStr)
            # Unique id uses the pre-override date range.
            dto.taskUniqueId = dto.taskId + dto.market+dto.startDateStr + dto.endDateStr + str(uuid.uuid4())
            if dto.isNow:
                dto.startDateStr = getNowDateStr()
                dto.endDateStr = getNowDateStr()
                logger.info("real:" + dto.startDateStr + "~" + dto.endDateStr)
        service.crawlingMarcapStockData(marcapDtos)

    def addTaskSchedule(self, scheduleDto: StockTaskSchedule, runCrawlingDto: List[StockRunCrawling], webSocket: WebSocket) -> None:
        """Register a cron-style schedule that runs marcapJob on the "marcap"
        subset of the requests, then broadcast the updated schedule list."""
        marcapDtos = []
        for dto in runCrawlingDto:
            if dto.taskId == "marcap":
                marcapDtos.append(dto)
        # NOTE(review): argument order passes dayOfWeek before day — verify it
        # matches TaskScheduler.addJob's signature.
        self.taskScheduler.addJob(
            self.marcapJob,
            scheduleDto.year,
            scheduleDto.month,
            scheduleDto.dayOfWeek,
            scheduleDto.day,
            scheduleDto.hour,
            scheduleDto.minute,
            scheduleDto.second,
            "marcap",
            args=[marcapDtos])
        self.getTaskSchedule(webSocket, True)

    def removeTaskSchedule(self, id: str, webSocket: WebSocket) -> None:
        """Delete a scheduled job and broadcast the updated schedule list."""
        self.taskScheduler.removeJob(id)
        self.getTaskSchedule(webSocket, True)

    def fetchTasks(self, data: ProcessTasks = None, websocket: WebSocket = None) -> None:
        """Push the task list to one socket, or broadcast when none is given.

        Defaults to the repository's current tasks dto when ``data`` is None.
        """
        if data is None:
            data = self.tasksRepository.tasksdto
        self.logger.info("fetchTasks", data.json())
        if websocket is None:
            self.manager.sendBroadCast(RES_SOCKET_TASK_FETCH_TASKS, data.dict())
        else:
            self.manager.send(RES_SOCKET_TASK_FETCH_TASKS, data.dict(), websocket)

    def getTaskState(self, taskId: str, webSocket: WebSocket) -> None:
        """Send the aggregated per-year state for one task id."""
        data: YearData = self.tasksRepository.getAllTaskState(taskId)
        self.manager.send(RES_SOCKET_TASK_FETCH_TASK_STATE, data.dict(), webSocket)

    def updateTaskState(self, taskId: str, stockUpdateState: StockUpdateState = None) -> None:
        """Broadcast a single task-state change, then refresh everyone's list."""
        if stockUpdateState is not None:
            self.manager.sendBroadCast(RES_SOCKET_TASK_UPDATE_TASK_STATE, stockUpdateState.dict())
        self.fetchTasks()

    def getTaskPoolInfo(self, webSocket: WebSocket) -> None:
        """Send current task-pool occupancy to one socket."""
        taskPoolInfo: TaskPoolInfo = self.tasksRepository.getPoolInfo()
        self.manager.send(RES_SOCKET_TASK_FETCH_TASK_POOL_INFO, taskPoolInfo.dict(), webSocket)

    def updateTaskPoolInfo(self, poolInfo: TaskPoolInfo) -> None:
        """Broadcast task-pool occupancy to all sockets."""
        # logger.info(f"updateTaskPoolInfo:{poolInfo.json()}")
        self.manager.sendBroadCast(RES_SOCKET_TASK_FETCH_TASK_POOL_INFO, poolInfo.dict())

    def addTask(self, taskName: str, dto: Any) -> None:
        """Create the request object for ``taskName`` from a raw dict (or pass
        an already-built dto through) and dispatch it to the owning service."""
        if isinstance(dto, dict):
            if taskName == "crawlingMarcapStockData":
                # One request per market, each with its own unique id.
                data = []
                for market in dto["market"]:
                    taskUniqueId = dto["taskId"]+market+dto["startDate"]+dto["endDate"]+str(uuid.uuid4())
                    dtoOne = StockRunCrawling(**{
                        # NOTE(review): host name spelled "fin-carwling-webdriver"
                        # — presumably the actual service name; confirm.
                        "driverAddr": "http://fin-carwling-webdriver:4444",
                        "market": market,
                        "startDateStr": dto["startDate"],
                        "endDateStr": dto["endDate"],
                        "taskId": dto["taskId"],
                        "taskUniqueId": taskUniqueId
                    })
                    data.append(dtoOne)
            elif taskName == "convertFactorFileToDb":
                data = RunFactorFileConvert(**{
                    "taskId": dto["taskId"],
                    "taskUniqueId": dto["taskId"] + str(uuid.uuid4())
                })
            elif taskName == "crawlingFactorDartData":
                # NOTE(review): startYear/endYear are concatenated here, so
                # they're assumed to arrive as strings — confirm at the caller.
                data = DartApiCrawling(**{
                    "apiKey": dto["apiKey"],
                    "isCodeNew": dto["isCodeNew"],
                    "startYear": dto["startYear"],
                    "endYear": dto["endYear"],
                    "taskId": dto["taskId"],
                    "taskUniqueId": dto["taskId"] + dto["startYear"] + dto["endYear"] + str(uuid.uuid4())
                })
        else:
            data = dto
        if taskName == "convertFactorFileToDb":
            self.factorService.convertFactorFileToDb(data)
        elif taskName == "crawlingMarcapStockData":
            self.stockService.crawlingMarcapStockData(data)
        elif taskName == "crawlingFactorDartData":
            self.factorService.crawlingFactorDartData(data)

    def cancelTask(self, taskId: str, taskUniqueId: str) -> None:
        """Cancel a running task: flag its crawler, cancel its runner slot,
        and update/delete the ProcessTask record depending on its state."""
        if taskUniqueId in self.crawlerRepository.getCrawlers():
            self.crawlerRepository.getCrawler(taskUniqueId).isCancelled = True
        self.tasksRepository.taskRunner.cancel(taskUniqueId)
        task = self.tasksRepository.getTask(taskId, taskUniqueId)
        if task is not None:
            if task.state == "cancel":
                # Second cancel on an already-cancelled task removes it.
                self.tasksRepository.deleteTask(task)
                self.tasksRepository.updateAllTask()
            elif task.state == "error":
                self.tasksRepository.deleteTask(task)
                self.tasksRepository.updateAllTask()
            else:
                task.state = "cancel"
                self.tasksRepository.updateTask(task)
        else:
            self.tasksRepository.updateAllTask()

    def fetchCompletedTask(self, dto: ListLimitData, webSocket: WebSocket) -> None:
        """Send one page of completed (success/fail) tasks to the socket."""
        dao = ListLimitDao(**{
            "limit": dto.limit,
            "offset": dto.offset,
            "taskId": dto.taskId
        })
        tasks = self.tasksRepository.getCompletedTask(dao)
        self.manager.send(RES_SOCKET_TASK_FETCH_COMPLETED_TASK, tasks.dict(), webSocket)
class FactorMongoDataSource(MongoDataSource):
    """MongoDB-backed access to factor data and completed-task listings."""

    def __init__(self) -> None:
        super().__init__()
        self.logger = Logger("FactorMongoDataSource")

    async def getFactor(self, year: str = "*", month: str = "*", code: str = "*") -> list:
        """Fetch FactorData rows filtered by year/month/code; "*" = no filter.

        Returns an empty list on any error.
        """
        try:
            findObj: Dict[str, Any] = {}
            # BUGFIX: the argument was previously computed as str(float(year))
            # unconditionally, so the default year "*" raised ValueError before
            # the wildcard could be skipped — every default call hit the except
            # branch and returned []. Convert only concrete years. (Years are
            # stored as float strings, e.g. "2019.0", hence the float round-trip.)
            yearFilter = year if year == "*" else str(float(year))
            self.mergeFindObj(findObj, "dataYear", yearFilter)
            self.mergeFindObj(findObj, "dataMonth", month)
            self.mergeFindObj(findObj, "code", code)
            self.logger.info("getFactor", str(findObj))
            cursor = self.factor.find(findObj)
            fields = [
                "code", "dataMonth", "dataName", "dataYear", "dataId",
                "dataValue", "name"
            ]
            return list(
                map(
                    lambda data: FactorData(
                        **{field: data[field] for field in fields}),
                    list(cursor)))
        except Exception:
            self.logger.error("getFactor", traceback.format_exc())
            return list()

    async def insertFactor(self, li: List[FactorDao]) -> None:
        """Upsert each FactorDao, stamping updatedAt (createdAt on insert)."""
        try:
            if not self.isSetupMarcap():
                self.setupMarcap()
            for one in li:
                data = one.dict()
                data["updatedAt"] = getNow()
                await asyncio.create_task(self.insertFactorOne(data))
        except Exception:
            self.logger.error("insertFactor", traceback.format_exc())

    async def insertFactorOne(self, data: Dict) -> None:
        """Upsert one factor row keyed by (code, dataYear, dataMonth, dataName)."""
        self.factor.update_one(
            {
                "code": data["code"],
                "dataYear": data["dataYear"],
                "dataMonth": data["dataMonth"],
                "dataName": data["dataName"],
            }, {
                "$set": data,
                "$setOnInsert": {
                    "createdAt": getNow()
                }
            },
            upsert=True)

    def getCompletedTask(self, dto: ListLimitData) -> ListLimitResponse:
        """Page through finished (success/fail) tasks, newest first."""
        try:
            data = dto.dict()
            cursor = self.task.find({"$or": [
                {"state": "success"},
                {"state": "fail"}
            ]}).sort("createdAt", DESCENDING)\
                .skip(data["offset"])\
                .limit(data["limit"])
            count = self.task.find({
                "$or": [{
                    "state": "success"
                }, {
                    "state": "fail"
                }]
            }).count()
            res = ListLimitResponse(**{
                "count": count,
                "offset": data["offset"],
                "limit": data["limit"],
                "data": self.exceptId(list(cursor))
            })
            return res
        except Exception:
            # BUGFIX: this error was previously logged under "getFactor",
            # which made failures here look like getFactor failures.
            self.logger.error("getCompletedTask", traceback.format_exc())
            # NOTE(review): returns a bare list on failure although annotated
            # ListLimitResponse — kept to preserve the existing error contract.
            return []
class TaskRunner(object):
    """Pulls Task objects from an asyncio queue and runs them on a TaskPool.

    put() enqueues a task; whenever the queue grows or a pool slot frees up,
    the runner tries to move one queued task into a free slot via
    notifyToPool().
    """

    def __init__(self) -> None:
        super().__init__()
        self.logger = Logger("TaskRunner")
        # FIFO of tasks waiting for a pool slot.
        self.queue: asyncio.Queue = asyncio.Queue()
        # NOTE: requires an already-running event loop at construction time.
        self.loop = asyncio.get_running_loop()
        # The pool calls notifyRmOnPool whenever a slot is released.
        self.pool = TaskPool(notifyCallback=self.notifyRmOnPool)
        # Optional observer invoked with a TaskPoolInfo snapshot on changes.
        self.notifyCallback: Optional[Callable] = None

    def getPoolInfo(self) -> TaskPoolInfo:
        """Snapshot of pool capacity/usage plus the current queue length."""
        return TaskPoolInfo(
            **{
                "poolSize": self.pool.poolSize,
                "poolCount": self.pool.poolCount(),
                "runCount": self.pool.runCount(),
                "queueCount": self.queue.qsize()
            })

    def updatePoolInfo(self) -> None:
        """Log the current pool state and push it to notifyCallback, if set."""
        self.logger.info(
            "updatePoolInfo",
            f"runCount:{self.pool.runCount()}, queueCount:{self.queue.qsize()}"
        )
        if self.notifyCallback:
            self.notifyCallback(
                TaskPoolInfo(
                    **{
                        "poolSize": self.pool.poolSize,
                        "poolCount": self.pool.poolCount(),
                        "runCount": self.pool.runCount(),
                        "queueCount": self.queue.qsize()
                    }))

    def notifyPutOnQueue(self) -> None:
        # A task was enqueued: try to dispatch it to the pool.
        self.loop.create_task(self.notifyToPool())

    def notifyRmOnPool(self) -> None:
        # A pool slot freed up: dispatch the next queued task, or just
        # broadcast the new pool state when nothing is waiting.
        if self.queue.qsize() > 0:
            self.loop.create_task(self.notifyToPool())
        else:
            self.updatePoolInfo()

    def cancel(self, id: str) -> None:
        """Cancel the running task with this id and release its pool slot."""
        pool: Optional[Pool] = self.pool.findPool(id)
        if pool is not None:
            self.logger.info("cancel", id)
            pool.cancel()
            self.pool.removeTaskPool(pool)
        else:
            self.logger.info("cancel", "pool is not exist")

    def isExist(self, id: str) -> bool:
        """True if a task with this id currently occupies a pool slot."""
        return self.pool.findPool(id) is not None

    async def notifyToPool(self) -> None:
        """Move one queued task into a free pool slot, if both exist.

        A slot is reserved *before* pulling from the queue; if queue.get()
        times out (or yields a falsy task), the reserved slot is released
        again so capacity is not leaked.
        """
        try:
            if self.queue.qsize() > 0 and (self.pool.poolSize - self.pool.poolCount()) > 0:
                # Reserve a slot first, then pull the task.
                pool = self.pool.addTaskPool(Pool(), False)
                # With a timeout this behaves non-blocking.
                task: Task = await asyncio.wait_for(self.queue.get(), timeout=1)
                if task:
                    pool.setTask(task)
                    pool.run(self.pool)
                else:
                    self.pool.removeTaskPool(pool, False)
        except asyncio.TimeoutError as e:
            # queue.get() timed out; give the reserved slot back.
            # (pool is always bound here: wait_for is the only statement that
            # can raise TimeoutError, and it runs after the assignment.)
            self.logger.info("notifyToPool", f"timeout:{str(e)}")
            self.pool.removeTaskPool(pool, False)
        finally:
            self.updatePoolInfo()

    def put(self, task: Task) -> None:
        """Enqueue *task* for execution (non-blocking; schedules _put)."""
        task.loop = self.loop
        self.loop.create_task(self._put(task))

    async def _put(self, task: Task) -> None:
        # Actual enqueue + dispatch trigger, run on the event loop.
        self.logger.info("_put", "task put")
        await self.queue.put(task)
        self.notifyPutOnQueue()
class TasksRepository(object):
    """In-memory registry of running ProcessTasks, mirrored to Mongo.

    Tasks are bucketed per taskId as {"list": {taskUniqueId: task},
    "ids": [taskUniqueId, ...]}; every mutation is broadcast over
    taskEventEmitter and persisted through TaskMongoDataSource.
    """

    def __init__(self, mongod: TaskMongoDataSource) -> None:
        super().__init__()
        self.mongod = mongod
        self.logger = Logger("TasksRepository")
        self.taskEventEmitter = EventEmitter()
        self.tasksdto = ProcessTasks()
        self.taskRunner: Optional[TaskRunner] = None
        self.createTaskRunner()

    # Create the task runner (idempotent: only if none exists yet).
    def createTaskRunner(self) -> None:
        if self.taskRunner is None:
            self.taskRunner = TaskRunner()
            self.taskRunner.notifyCallback = self.onUpdatePoolInfo
            self.logger.info("createTaskRunner", "created taskrunner")

    # Emit an event whenever the task-pool info is updated.
    def onUpdatePoolInfo(self, poolInfo: TaskPoolInfo) -> None:
        self.taskEventEmitter.emit(EVENT_TASK_REPO_UPDATE_POOL_INFO, poolInfo)
        self.logger.info("updatePoolInfo", f"{poolInfo.json()}")

    # Fetch the current task-pool info and broadcast it.
    def getPoolInfo(self) -> None:
        if self.taskRunner:
            poolInfo = self.taskRunner.getPoolInfo()
            self.taskEventEmitter.emit(EVENT_TASK_REPO_UPDATE_POOL_INFO,
                                       poolInfo)

    # Submit a task to the runner's queue.
    def runTask(self, task: Task) -> None:
        if self.taskRunner:
            self.taskRunner.put(task)

    # Register a newly added task, creating the per-taskId bucket on demand.
    def addTask(self, task: ProcessTask) -> None:
        if task.taskId not in self.tasksdto.tasks:
            self.tasksdto.tasks[task.taskId] = dict()
            self.tasksdto.tasks[task.taskId]["list"] = dict()
            self.tasksdto.tasks[task.taskId]["ids"] = []
            self.tasksdto.taskIds.append(task.taskId)
        self.tasksdto.tasks[task.taskId]["list"][task.taskUniqueId] = task
        self.tasksdto.tasks[task.taskId]["ids"].append(task.taskUniqueId)
        self.taskEventEmitter.emit(EVENT_TASK_REPO_UPDATE_TASKS, self.tasksdto)
        self.logger.info("addTask", f"{task.taskUniqueId}")

    # Persist an updated task to Mongo and broadcast the change.
    def updateTask(self, task: ProcessTask) -> None:
        self.tasksdto.tasks[task.taskId]["list"][task.taskUniqueId] = task
        self.logger.info("updateTask", f"{task.taskUniqueId}")
        self.mongod.upsertTask(task.dict())
        self.taskEventEmitter.emit(EVENT_TASK_REPO_UPDATE_TASKS, self.tasksdto)

    def updateAllTask(self) -> None:
        # Re-broadcast the whole task map (no persistence).
        self.taskEventEmitter.emit(EVENT_TASK_REPO_UPDATE_TASKS, self.tasksdto)

    # Return the stored task, or None when it is unknown.
    def getTask(self, taskId: str, taskUniqueId: str) -> Optional[ProcessTask]:
        if self.isExistTask(taskId, taskUniqueId):
            return self.tasksdto.tasks[taskId]["list"][taskUniqueId]
        return None

    # True when the (taskId, taskUniqueId) pair is registered.
    def isExistTask(self, taskId: str, taskUniqueId: str) -> bool:
        return taskId in self.tasksdto.tasks and taskUniqueId in self.tasksdto.tasks[
            taskId]["list"]

    # Remove a stored task from the in-memory registry.
    def deleteTask(self, task: ProcessTask) -> None:
        if task.taskId in self.tasksdto.tasks:
            if task.taskUniqueId in self.tasksdto.tasks[task.taskId]["list"]:
                del self.tasksdto.tasks[task.taskId]["list"][task.taskUniqueId]
                self.tasksdto.tasks[task.taskId]["ids"].remove(
                    task.taskUniqueId)
                self.logger.info("deleteTask", f"{task.taskUniqueId}")

    # Mark a task as errored and persist the message.
    # NOTE(review): getTask may return None for an unknown id, which would
    # raise AttributeError below — confirm callers only pass known tasks.
    def errorTask(self, dto: TaskModel, errMsg: str) -> None:
        task = self.getTask(dto.taskId, dto.taskUniqueId)
        task.state = "error"
        task.errMsg = errMsg
        self.updateTask(task)

    # Complete a factor file-to-DB conversion task (single-step task).
    def completeFactorConvertFileToDbTask(self, task: ProcessTask) -> None:
        self.success(task, 1)
        self.updateTask(task)
        self.deleteTask(task)
        self.taskEventEmitter.emit(EVENT_TASK_REPO_TASK_COMPLETE, "factorFile",
                                   None)

    # Record one completed DART sub-task; finalize when none remain.
    def completeFactorDart(self, task: ProcessTask, year: int) -> None:
        self.success(task, 1)
        self.updateTask(task)
        if task.restCount <= 0:
            self.deleteTask(task)
            task.state = "complete"
            self.updateTask(task)
            self.logger.info("completeFactorDart", "complete")
            self.taskEventEmitter.emit(
                EVENT_TASK_REPO_TASK_COMPLETE, "factorDart",
                StockUpdateState(
                    **{
                        "taskId": task.taskId,
                        "market": task.market,
                        # "date" carries the year here (int), not a date string.
                        "ret": 1,
                        "date": year
                    }) if False else StockUpdateState(
                    **{
                        "taskId": task.taskId,
                        "market": task.market,
                        "date": year,
                        "ret": 1
                    }))

    # Record one completed (or failed) crawling sub-task; finalize when
    # no sub-tasks remain.
    def completeStockCrawlingTask(self, isSuccess: bool,
                                  retdto: StockMarketCapitalResult,
                                  dto: StockCrawlingDownloadTask) -> None:
        self.logger.info("##############completeStockCrawlingTask",
                         str(isSuccess))
        task = self.getTask(dto.taskId, dto.taskUniqueId)
        if isSuccess:
            self.success(task, 1)
        else:
            self.fail(task, 1)
        if task.restCount <= 0:
            self.deleteTask(task)
            if retdto:
                task.errMsg = retdto.errorMsg
            # NOTE(review): this forces state to "success" even when the last
            # sub-task failed (fail() above set state to "fail") — confirm
            # this overwrite is intended.
            task.state = "success"
            self.updateTask(task)
            self.logger.info("completeStockCrawlingTask", "complete")
            self.taskEventEmitter.emit(
                EVENT_TASK_REPO_TASK_COMPLETE, "marcap",
                StockUpdateState(
                    **{
                        "taskId": dto.taskId,
                        "market": dto.market,
                        "date": dto.dateStr,
                        "ret": 1 if isSuccess else 2
                    }))

    # Account for *count* successful sub-tasks: bump counters, mark the
    # corresponding tasksRet slots, and update percent/state.
    def success(self, task: ProcessTask, count: int) -> None:
        task.successCount = task.successCount + count
        task.restCount = task.restCount - count
        i = 0
        for _ in range(count):
            task.tasksRet[task.index + i] = SUCCESS
            i = i + 1
        task.index = task.index + count
        task.percent = (task.successCount + task.failCount) / task.count * 100
        if task.restCount <= 0:
            task.state = "success"
        else:
            task.state = "waiting next task"
        self.logger.info("success", f"{task.taskUniqueId}")

    # Account for *count* failed sub-tasks: bump counters, remember the
    # failed entries in failTasks, and update percent/state.
    def fail(self, task: ProcessTask, count: int) -> None:
        task.failCount = task.failCount + count
        task.restCount = task.restCount - count
        i = 0
        for _ in range(count):
            left = task.tasks[task.index + i]
            task.failTasks.append(left)
            task.tasksRet[task.index + i] = FAIL
            i = i + 1
        task.index = task.index + count
        task.percent = (task.successCount + task.failCount) / task.count * 100
        if task.restCount <= 0:
            task.state = "fail"
        else:
            task.state = "waiting next task"
        self.logger.info("fail", f"{task.taskUniqueId}")

    # Return completed tasks from Mongo, regrouped into per-taskId buckets.
    def getCompletedTask(self, dto: ListLimitDao) -> ListLimitDataDao:
        taskData = self.mongod.getCompletedTask(dto)
        # NOTE(review): debug print left in place; consider removing.
        print(taskData)
        tasks: Dict = dict()
        taskIds = []
        for task in taskData.data:
            if task["taskId"] not in tasks:
                tasks[task["taskId"]] = dict()
                tasks[task["taskId"]]["list"] = dict()
                tasks[task["taskId"]]["ids"] = []
                taskIds.append(task["taskId"])
            tasks[task["taskId"]]["list"][task["taskUniqueId"]] = task
            tasks[task["taskId"]]["ids"].append(task["taskUniqueId"])
        stockCrawlingCompletedTasksDTO = StockCrawlingCompletedTasks(
            **{
                "history": tasks,
                "historyIds": taskIds
            })
        taskData.data = stockCrawlingCompletedTasksDTO
        self.logger.info("getCompletedTask", f"count: {len(taskIds)}")
        return taskData

    # Aggregate per-date task results for both markets into a YearData:
    # a date's ret becomes 1 if ANY run of that date succeeded.
    def getAllTaskState(self, taskId: str) -> YearData:
        markets = ["kospi", "kosdaq"]
        resultDict: YearData = YearData(**{"yearData": dict()})
        resultDict.yearData[taskId] = dict()
        for market in markets:
            data = self.mongod.getAllTaskState(taskId, market)
            compDict: Dict = {}
            count: Dict = {}
            for one in data:
                for idx, taskDate in enumerate(one["tasks"]):
                    if taskDate in compDict.keys():
                        # Seen before: success (ret == 1) is sticky.
                        if compDict[taskDate]["ret"] == 1 or one["tasksRet"][
                                idx] == 1:
                            compDict[taskDate] = {"date": taskDate, "ret": 1}
                    else:
                        # First sighting: tally its year and record as-is.
                        year = taskDate[0:4]
                        if year in count.keys():
                            count[year] = count[year] + 1
                        else:
                            count[year] = 1
                        compDict[taskDate] = {
                            "date": taskDate,
                            "ret": one["tasksRet"][idx]
                        }
            collect: List = list(compDict.values())
            collect = sorted(collect, key=lambda x: x["date"])
            resultDict.yearData[taskId][market] = StockTaskState(
                **{
                    "taskStates": compDict,
                    "taskKeys": compDict.keys(),
                    "stocks": collect,
                    "years": count,
                    "market": market,
                    "taskId": taskId
                })
        return resultDict