Exemplo n.º 1
0
 def __init__(self, id: str, func: Callable, param: Any = None) -> None:
     """Store the task id, its callable and the parameters kept with it.

     Args:
         id: Identifier stored on ``self.id``.
         func: Callable stored on ``self.func``.
         param: Object stored on ``self.param``. When omitted, a fresh
             empty ``dict`` is created for this instance.
     """
     super().__init__()
     self.id = id
     self.func = func
     # A literal ``{}`` default would be a single dict shared by every
     # instance created without ``param``.
     self.param = {} if param is None else param
     self.logger = Logger("Task")
     # No event loop is attached at construction time.
     self.loop: Optional[AbstractEventLoop] = None
Exemplo n.º 2
0
 def __init__(self, stockRepository: StockRepository,
              tasksRepository: TasksRepository,
              crawlerRepository: CrawlerRepository) -> None:
     """Keep references to the three repositories and create the logger."""
     self.logger = Logger("StockService")
     self.crawlerRepository = crawlerRepository
     self.tasksRepository = tasksRepository
     self.stockRepository = stockRepository
Exemplo n.º 3
0
 def __init__(self) -> None:
     """Create the logger, task queue, loop handle and task pool.

     Calls ``asyncio.get_running_loop()``, which raises ``RuntimeError``
     when no event loop is running.
     """
     super().__init__()
     self.logger = Logger("TaskRunner")
     self.queue: asyncio.Queue = asyncio.Queue()
     self.loop = asyncio.get_running_loop()
     # The pool receives this instance's notifyRmOnPool as its callback.
     self.pool = TaskPool(notifyCallback=self.notifyRmOnPool)
     # No callback is set at construction time.
     self.notifyCallback = None
Exemplo n.º 4
0
 def __init__(self, mongod: TaskMongoDataSource) -> None:
     """Store the data source, create helper objects and the task runner.

     Args:
         mongod: Data source kept on ``self.mongod``.
     """
     super().__init__()
     self.mongod = mongod
     self.logger = Logger("TasksRepository")
     self.taskEventEmitter = EventEmitter()
     self.tasksdto = ProcessTasks()
     self.taskRunner: Optional[TaskRunner] = None
     # Called last: it runs after the attributes above have been assigned.
     self.createTaskRunner()
Exemplo n.º 5
0
 def __init__(self, manager: ConnectionManager,
              factorRepository: FactorRepository,
              tasksRepository: TasksRepository,
              crawlerRepository: CrawlerRepository,
              taskService: 'TaskService') -> None:
     """Keep references to the injected collaborators and create the logger."""
     self.logger = Logger("FactorService")
     self.taskService = taskService
     self.crawlerRepository = crawlerRepository
     self.tasksRepository = tasksRepository
     self.factorRepository = factorRepository
     self.manager = manager
Exemplo n.º 6
0
class Task(object):
    """A callable together with the keyword arguments it is invoked with."""

    def __init__(self, id: str, func: Callable, param: Any = None) -> None:
        """Store the task id, its callable and its keyword arguments.

        Args:
            id: Identifier stored on ``self.id``.
            func: Callable that ``run`` invokes with ``**self.param``.
            param: Mapping of keyword arguments for ``func``. When omitted,
                a fresh empty ``dict`` is created for this instance.
        """
        super().__init__()
        self.id = id
        self.func = func
        # ``run`` writes into ``self.param``; a literal ``{}`` default would
        # be one dict shared (and mutated) by every Task created without it.
        self.param = {} if param is None else param
        self.logger = Logger("Task")
        # No loop is attached at construction; ``run`` does nothing until
        # one has been assigned.
        self.loop: Optional[AbstractEventLoop] = None

    async def run(self, taskPool: TaskPool, pool: Pool) -> None:
        """Invoke ``func`` on ``self.loop`` and wait for it to finish.

        ``taskPool`` and ``pool`` are added to ``self.param`` under the keys
        ``"taskPool"`` and ``"pool"`` before the call. Nothing is executed
        when ``self.loop`` is not set.
        """
        self.logger.info("run", "task run")
        if not self.loop:
            return
        self.param["taskPool"] = taskPool
        self.param["pool"] = pool
        await self.loop.create_task(self.func(**self.param))
Exemplo n.º 7
0
 def __init__(self, mongod: StockMongoDataSource,
              tasksRepository: TasksRepository) -> None:
     """Keep the data source and task repository; create logger and emitter."""
     super().__init__()
     self.ee = EventEmitter()
     self.logger = Logger("StockRepository")
     self.tasksRepository = tasksRepository
     self.mongod = mongod
Exemplo n.º 8
0
 def __init__(self, factorMongod: FactorMongoDataSource,
              factorDartMongod: FactorDartMongoDataSource,
              filed: FactorFileDataSource) -> None:
     """Keep references to the three data sources and create the logger."""
     super().__init__()
     self.logger = Logger("FactorRepository")
     self.filed = filed
     self.factorDartMongod = factorDartMongod
     self.factorMongod = factorMongod
Exemplo n.º 9
0
 def __init__(
         self,
         manager: ConnectionManager,
         tasksRepository: TasksRepository,
         taskScheduler: TaskScheduler,
         factorService: FactorService,
         stockService: StockService,
         crawlerRepository: CrawlerRepository
         ) -> None:
     """Keep the injected collaborators, then register event handlers."""
     self.logger = Logger("TaskService")
     self.manager = manager
     self.taskScheduler = taskScheduler
     self.stockService = stockService
     self.factorService = factorService
     self.crawlerRepository = crawlerRepository
     self.tasksRepository = tasksRepository
     # The service shares the task repository's emitter.
     self.ee = tasksRepository.taskEventEmitter
     # Runs last, once every attribute above is in place.
     self.setupEvents()
Exemplo n.º 10
0
 def marcapJob(marcapDtos: List[StockRunCrawling]) -> None:
     """Prepare each crawl request and hand the list to ``StockService``.

     For every dto a ``taskUniqueId`` is built from its task id, market,
     the requested start/end dates and a random UUID. When ``dto.isNow`` is
     set, both dates are then replaced by the current date string.

     Args:
         marcapDtos: Crawl requests; they are modified in place.
     """
     service: StockService = Locator.getInstance().get(StockService)
     logger = Logger("TaskService_marcapJob")
     for dto in marcapDtos:
         logger.info("#### schedule job start ####")
         logger.info(f"command{dto.startDateStr}~{dto.endDateStr}")
         # The id is derived from the dates as requested, before any
         # ``isNow`` override below.
         dto.taskUniqueId = (
             f"{dto.taskId}{dto.market}{dto.startDateStr}"
             f"{dto.endDateStr}{uuid.uuid4()}"
         )
         if dto.isNow:
             # Read the clock once so start and end cannot end up on
             # different days when the job runs across midnight.
             today = getNowDateStr()
             dto.startDateStr = today
             dto.endDateStr = today
         logger.info(f"real:{dto.startDateStr}~{dto.endDateStr}")
     service.crawlingMarcapStockData(marcapDtos)
Exemplo n.º 11
0
class Pool(object):
    """Holds one Task and the asyncio future created when it is started."""

    def __init__(self) -> None:
        super().__init__()
        self.isRun = False
        self.logger = Logger("Pool")
        self.task: Optional[Task] = None
        self.taskId = ""
        # Set by run(); initialised here so cancel() can be called on a
        # pool that was never started without raising AttributeError.
        self.poolTask: Optional[asyncio.Future] = None

    def setTask(self, task: Task) -> None:
        """Assign ``task`` to this pool and remember its id."""
        self.task = task
        self.taskId = task.id

    def run(self, taskPool: TaskPool) -> None:
        """Mark the pool as running and schedule the assigned task.

        The task's ``run(taskPool, self)`` coroutine is scheduled with
        ``asyncio.ensure_future``; nothing is scheduled when no task is set.
        """
        self.isRun = True
        self.logger.info("run", "task pool run")
        if self.task is not None:
            self.poolTask = asyncio.ensure_future(
                self.task.run(taskPool, self))

    def cancel(self) -> None:
        """Mark the pool as stopped and cancel the scheduled future, if any."""
        self.isRun = False
        self.logger.info("cancel", "task pool cancel")
        if self.poolTask and not self.poolTask.cancelled():
            self.poolTask.cancel()
Exemplo n.º 12
0
class MarcapCrawler(object):
    """Selenium-driven crawler that downloads per-day CSV files from data.krx.co.kr.

    Progress and results are published as events on ``self.ee``.
    """

    # Fields that follow code/name in a downloaded row, in column order.
    _VALUE_FIELDS = ("close", "diff", "percent", "open", "high", "low",
                     "volume", "price", "marcap", "number")

    def __init__(self) -> None:
        super().__init__()
        self.ee = EventEmitter()
        self.logger = Logger("MarcapCrawler")

    def createUUID(self) -> str:
        """Return a new random UUID4 as a string."""
        return str(uuid.uuid4())

    async def connectWebDriver(self, addr: str, uuid: str) -> WebDriver:
        """Create a remote Chrome driver at ``addr``.

        Downloads are directed to ``/home/seluser/Downloads/<uuid>`` and the
        page-load and script timeouts are set to 60 seconds.
        """
        chrome_options = webdriver.ChromeOptions()
        prefs = {
            'profile.default_content_setting_values.automatic_downloads': 1,
            'download.default_directory': f"/home/seluser/Downloads/{uuid}"
        }
        chrome_options.add_experimental_option("prefs", prefs)
        driver = webdriver.Remote(
            command_executor=addr,
            options=chrome_options,
        )
        driver.set_page_load_timeout(60)
        driver.set_script_timeout(60)
        self.logger.info("connectWebDriver", "create driver")
        return driver

    def connectLocalDriver(self, addr: str, uuid: str) -> WebDriver:
        """Create a local Chrome driver; ``addr`` is not used.

        NOTE(review): the chromedriver and download paths are hard-coded
        absolute paths; looks like a developer-machine helper -- confirm.
        """
        chrome_options = webdriver.ChromeOptions()
        prefs = {
            'profile.default_content_setting_values.automatic_downloads': 1,
            'download.default_directory': f"/Users/iseongjae/Documents/PersonalProjects/fin-web/fin-crawling-server/server/downloads/{uuid}"
        }
        chrome_options.add_experimental_option("prefs", prefs)
        driver = webdriver.Chrome(executable_path="/Users/iseongjae/Downloads/chromedriver", chrome_options=chrome_options)
        return driver

    async def crawling(self, dto: StockRunCrawling) -> None:
        """Download one file per day from ``dto.startDateStr`` to ``dto.endDateStr``.

        Starts a download observer, connects the web driver at
        ``dto.driverAddr``, opens the KRX page and calls ``downloadData``
        for every date (inclusive). The observer is stopped and the driver
        quit on every exit path; exceptions propagate to the caller.
        """
        driver = None
        downloadObserver = None
        try:
            runUuid = self.createUUID()
            self.logger.info("crawling", runUuid)
            self.ee.emit(EVENT_MARCAP_CRAWLING_ON_CONNECTING_WEBDRIVER, dto)

            downloadObserver = DownloadObserver()
            path = await asyncRetryNonBlock(5, 1, downloadObserver.makePath, runUuid)
            downloadObserver.startObserver(path, self.ee)
            self.logger.info("crawling", "create observer and start")
            print("startObserver")

            driver = await asyncRetryNonBlock(5, 1, self.connectWebDriver, dto.driverAddr, runUuid)
            print("connectWebDriver")
            driver.get("http://data.krx.co.kr/contents/MDC/MDI/mdiLoader/index.cmd?menuId=MDC0201020101")
            try:
                # Accept an alert if one shows up within 3 seconds; any
                # failure here (including "no alert") is only printed.
                alert = WebDriverWait(driver, timeout=3).until(EC.alert_is_present())
                alert.accept()
            except Exception as e:
                print("예외발생:"+str(e))
            print("start:"+dto.startDateStr)

            self.ee.emit(EVENT_MARCAP_CRAWLING_ON_START_CRAWLING, dto)
            WebDriverWait(driver, timeout=20, poll_frequency=1).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#mktId_0_1")))
            date = datetime.strptime(dto.startDateStr, "%Y%m%d")
            endDate = datetime.strptime(dto.endDateStr, "%Y%m%d")

            while date <= endDate:
                dateStr = date.strftime("%Y%m%d")
                downloadTask = StockCrawlingDownloadTask(**{
                    "dateStr": dateStr,
                    "market": dto.market,
                    "uuid": runUuid,
                    "taskId": dto.taskId,
                    "taskUniqueId": dto.taskUniqueId
                })
                self.logger.info("crawling", f"create downloadTask taskId: {dto.taskId} market: {dto.market} date: {dateStr} taskUniqueId: {dto.taskUniqueId}")
                print(downloadTask.json())
                downloadObserver.event_handler.setDownloadTask(downloadTask)
                self.ee.emit(EVENT_MARCAP_CRAWLING_ON_DOWNLOAD_START, downloadTask)
                await asyncRetryNonBlock(5, 1, self.downloadData, downloadTask, downloadObserver, driver)
                date = date + timedelta(days=1)
        finally:
            if downloadObserver:
                downloadObserver.stopObserver()
            if driver:
                driver.quit()

    async def downloadData(self, downloadTask: StockCrawlingDownloadTask, downloadObserver: DownloadObserver, driver: WebDriver) -> None:
        """Request the CSV for one market/date and wait for the file event.

        Selects the market, sets the date, runs the search, triggers the CSV
        download, then waits up to 30 seconds for the
        ``FILE_SYSTEM_HANDLER(downloadTask.uuid)`` event before parsing the
        file via ``makeMarcapData``.

        Raises:
            asyncio.TimeoutError: no file event arrived within 30 seconds.
        """
        self.logger.info("downloadData")
        if driver is None:
            return
        before = driver.execute_script("return $('.CI-MDI-UNIT-TIME').text()")
        marketSelectors = {
            "kospi": "#mktId_0_1",
            "kosdaq": "#mktId_0_2",
            "konex": "#mktId_0_3",
        }
        selector = marketSelectors.get(downloadTask.market)
        if selector is not None:
            driver.execute_script(f'$("{selector}").click()')
        driver.execute_script(f'$("#trdDd")[0].value = "{downloadTask.dateStr}"')
        driver.execute_script('$(".btn_component_search").click()')
        # Poll until the text of .CI-MDI-UNIT-TIME differs from its value
        # before the search was submitted.
        # NOTE(review): this loop has no upper bound -- confirm the text
        # always changes after a search.
        after = before
        while before == after:
            after = driver.execute_script('return $(".CI-MDI-UNIT-TIME").text()')
            await sleepNonBlock(0.5)
        print("before:"+before)
        print("after:"+after)
        await sleepNonBlock(3)
        WebDriverWait(driver, timeout=10, poll_frequency=2).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "*[class='CI-MDI-UNIT-DOWNLOAD']")))
        driver.execute_script("$('[class=\"CI-MDI-UNIT-DOWNLOAD\"]').click()")
        WebDriverWait(driver, timeout=10, poll_frequency=2).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "*[data-type='csv']")))
        driver.execute_script("$(\"[data-type='csv']\").click()")
        print("wait:"+downloadTask.dateStr)

        loop = asyncio.get_running_loop()
        # asyncio.Queue no longer accepts a ``loop`` argument (removed in
        # Python 3.10); it binds to the running loop on first use.
        queue: asyncio.Queue = asyncio.Queue(maxsize=1)

        @self.ee.once(FILE_SYSTEM_HANDLER(downloadTask.uuid))
        def downloadComplete(event: FileCreatedEvent, downloadTask: StockCrawlingDownloadTask) -> None:
            # Presumably emitted from the file observer's thread -- TODO
            # confirm. call_soon_threadsafe is valid from any thread,
            # including the loop's own.
            loop.call_soon_threadsafe(
                queue.put_nowait,
                {"event": event, "downloadTask": downloadTask})

        # NOTE(review): when the wait below times out, the ``once`` listener
        # stays registered; confirm whether it should be removed on failure.
        # No ``task_done()`` call: nothing joins this queue, and calling it
        # after a timeout would raise ValueError and hide the timeout.
        result = await asyncio.wait_for(queue.get(), timeout=30)
        self.ee.emit(EVENT_MARCAP_CRAWLING_ON_DOWNLOAD_COMPLETE, downloadTask)
        await asyncio.create_task(self.makeMarcapData(result["event"], result["downloadTask"]))

    @staticmethod
    def _rowToFields(row: list, market: str) -> dict:
        """Map one CSV row to field values keyed by name (code, name, close, ...).

        For ``"kospi"`` the value columns start at index 2; for any other
        market they start at index 3 (column 2 is skipped).
        """
        first = 2 if market == "kospi" else 3
        fields = {"code": row[0].strip(), "name": row[1].strip()}
        for offset, key in enumerate(MarcapCrawler._VALUE_FIELDS):
            fields[key] = row[first + offset].strip()
        return fields

    def convertFileToDto(self, path: str, dto: StockMarketCapitalResult) -> None:
        """Parse the UTF-8 CSV at ``path`` and append its rows to ``dto.data``.

        The first line (header) is skipped and empty rows are ignored. Each
        remaining row becomes a ``StockMarketCapital`` carrying ``dto.date``
        and ``dto.market``.
        """
        import csv  # local import: only this method needs it

        with open(path, "r", encoding="utf-8", newline="") as f:
            # csv.reader handles quoted fields, including quoted commas.
            reader = csv.reader(f, skipinitialspace=True)
            next(reader, None)
            for row in reader:
                if not row:
                    continue
                dto.data.append(StockMarketCapital(**{
                    "date": dto.date,
                    "market": dto.market,
                    **self._rowToFields(row, dto.market),
                }))

    async def isExistFile(self, path: str, ext: str = ".csv") -> bool:
        """Return whether ``path`` ends with ``ext``.

        Only the path string is inspected, so the answer cannot change over
        time and no waiting is performed.
        """
        return path.endswith(ext)

    async def parseReceivedFile(self, event: FileCreatedEvent, downloadTask: StockCrawlingDownloadTask) -> None:
        """Convert a downloaded file to a result dto and emit result events.

        The file at ``event.src_path`` is re-encoded to UTF-8, renamed to
        ``<market>-<date>.csv`` in the same directory and parsed. Returns
        without emitting anything when the path does not end in ``.csv`` or
        the destination file already exists.
        """
        date = downloadTask.dateStr
        market = downloadTask.market
        retdto = StockMarketCapitalResult()
        retdto.date = date
        retdto.market = market
        if not await self.isExistFile(event.src_path):
            return
        print("created: " + date)
        await sleepNonBlock(0.5)
        dest_path = f'{os.path.dirname(event.src_path)}/{market+"-"+date}.csv'
        if os.path.isfile(dest_path):
            return
        self.changeCharSet(event.src_path)
        os.rename(event.src_path, dest_path)
        self.convertFileToDto(dest_path, retdto)
        retdto.result = "success"
        self.ee.emit(EVENT_MARCAP_CRAWLING_ON_PARSING_COMPLETE, True, retdto, downloadTask)
        self.ee.emit(EVENT_MARCAP_CRAWLING_ON_RESULT_OF_STOCK_DATA, downloadTask, retdto)
        self.logger.info("parseFile", f"success, {downloadTask.taskUniqueId}")

    async def makeMarcapData(self, event: FileCreatedEvent, downloadTask: StockCrawlingDownloadTask) -> None:
        """Run ``parseReceivedFile`` through ``asyncRetry``; report failure as an event.

        On failure a result dto with ``result == "fail"`` and the traceback
        as ``errorMsg`` is emitted; the exception is not re-raised.
        """
        try:
            await asyncRetry(3, 1, self.parseReceivedFile, event, downloadTask)
        except Exception:
            errorText = traceback.format_exc()
            retdto = StockMarketCapitalResult()
            retdto.result = "fail"
            retdto.errorMsg = errorText
            self.ee.emit(EVENT_MARCAP_CRAWLING_ON_PARSING_COMPLETE, False, retdto, downloadTask)
            self.logger.error("parseFile", f"fail, {downloadTask.taskUniqueId} error: {errorText}")
        finally:
            self.logger.info("parseFile...")

    def changeCharSet(self, path: str) -> None:
        """Rewrite the file at ``path`` in place from EUC-KR to UTF-8."""
        with open(path, "r", encoding="euc-kr") as f:
            lines = f.readlines()
        with open(path, 'w', encoding="utf-8") as f:
            f.writelines(lines)
Exemplo n.º 13
0
class TasksRepository(object):
    """In-memory registry of running tasks, with persistence and change events.

    Task records live in ``self.tasksdto.tasks`` as
    ``{taskId: {"list": {taskUniqueId: task}, "ids": [taskUniqueId, ...]}}``.
    Changes are announced on ``self.taskEventEmitter``.
    """

    def __init__(self, mongod: TaskMongoDataSource) -> None:
        super().__init__()
        self.mongod = mongod
        self.logger = Logger("TasksRepository")
        self.taskEventEmitter = EventEmitter()
        self.tasksdto = ProcessTasks()
        self.taskRunner: Optional[TaskRunner] = None
        self.createTaskRunner()

    def createTaskRunner(self) -> None:
        """Create the task runner, unless one already exists."""
        if self.taskRunner is None:
            self.taskRunner = TaskRunner()
            self.taskRunner.notifyCallback = self.onUpdatePoolInfo
            self.logger.info("createTaskRunner", "created taskrunner")

    def onUpdatePoolInfo(self, poolInfo: TaskPoolInfo) -> None:
        """Emit an event when the task pool info has been updated."""
        self.taskEventEmitter.emit(EVENT_TASK_REPO_UPDATE_POOL_INFO, poolInfo)
        self.logger.info("updatePoolInfo", f"{poolInfo.json()}")

    def getPoolInfo(self) -> None:
        """Fetch the task pool info and emit it as an event (returns nothing)."""
        if self.taskRunner:
            poolInfo = self.taskRunner.getPoolInfo()
            self.taskEventEmitter.emit(EVENT_TASK_REPO_UPDATE_POOL_INFO,
                                       poolInfo)

    def runTask(self, task: Task) -> None:
        """Hand a task to the task runner, if there is one."""
        if self.taskRunner:
            self.taskRunner.put(task)

    def addTask(self, task: ProcessTask) -> None:
        """Store the record of a newly added task and emit an update event."""
        if task.taskId not in self.tasksdto.tasks:
            self.tasksdto.tasks[task.taskId] = {"list": dict(), "ids": []}
            self.tasksdto.taskIds.append(task.taskId)

        bucket = self.tasksdto.tasks[task.taskId]
        bucket["list"][task.taskUniqueId] = task
        bucket["ids"].append(task.taskUniqueId)
        self.taskEventEmitter.emit(EVENT_TASK_REPO_UPDATE_TASKS, self.tasksdto)
        self.logger.info("addTask", f"{task.taskUniqueId}")

    def updateTask(self, task: ProcessTask) -> None:
        """Store an updated task record, persist it and emit an update event.

        The record is (re)inserted into the in-memory ``"list"`` mapping, so
        calling this after ``deleteTask`` brings the task back.
        """
        self.tasksdto.tasks[task.taskId]["list"][task.taskUniqueId] = task
        self.logger.info("updateTask", f"{task.taskUniqueId}")
        self.mongod.upsertTask(task.dict())
        self.taskEventEmitter.emit(EVENT_TASK_REPO_UPDATE_TASKS, self.tasksdto)

    def updateAllTask(self) -> None:
        """Emit an update event carrying all stored tasks."""
        self.taskEventEmitter.emit(EVENT_TASK_REPO_UPDATE_TASKS, self.tasksdto)

    def getTask(self, taskId: str, taskUniqueId: str) -> Optional[ProcessTask]:
        """Return the stored task record, or ``None`` when it is not stored."""
        if self.isExistTask(taskId, taskUniqueId):
            return self.tasksdto.tasks[taskId]["list"][taskUniqueId]
        return None

    def isExistTask(self, taskId: str, taskUniqueId: str) -> bool:
        """Check whether a task record is stored."""
        tasks = self.tasksdto.tasks
        return taskId in tasks and taskUniqueId in tasks[taskId]["list"]

    def deleteTask(self, task: ProcessTask) -> None:
        """Delete a stored task record (no-op when it is not stored)."""
        if not self.isExistTask(task.taskId, task.taskUniqueId):
            return
        bucket = self.tasksdto.tasks[task.taskId]
        del bucket["list"][task.taskUniqueId]
        bucket["ids"].remove(task.taskUniqueId)
        self.logger.info("deleteTask", f"{task.taskUniqueId}")

    def errorTask(self, dto: TaskModel, errMsg: str) -> None:
        """Mark the task referenced by ``dto`` as failed with ``errMsg``.

        An unknown task is logged and otherwise ignored, so this is safe to
        call from an exception handler.
        """
        task = self.getTask(dto.taskId, dto.taskUniqueId)
        if task is None:
            self.logger.error("errorTask", f"unknown task: {dto.taskUniqueId}")
            return
        task.state = "error"
        task.errMsg = errMsg
        self.updateTask(task)

    def completeFactorConvertFileToDbTask(self, task: ProcessTask) -> None:
        """Record one successful step, persist, delete the task and emit completion."""
        self.success(task, 1)
        self.updateTask(task)
        self.deleteTask(task)
        self.taskEventEmitter.emit(EVENT_TASK_REPO_TASK_COMPLETE, "factorFile",
                                   None)

    def completeFactorDart(self, task: ProcessTask, year: int) -> None:
        """Record one successful step for ``year`` and emit a completion event."""
        self.success(task, 1)
        self.updateTask(task)
        task.state = "complete"
        self.updateTask(task)
        # Delete only after the final updateTask: updateTask re-inserts the
        # record, which would undo a deletion made before it.
        if task.restCount <= 0:
            self.deleteTask(task)
        self.logger.info("completeFactorDart", "complete")
        self.taskEventEmitter.emit(
            EVENT_TASK_REPO_TASK_COMPLETE, "factorDart",
            StockUpdateState(
                **{
                    "taskId": task.taskId,
                    "market": task.market,
                    "date": year,
                    "ret": 1
                }))

    def completeStockCrawlingTask(self, isSuccess: bool,
                                  retdto: StockMarketCapitalResult,
                                  dto: StockCrawlingDownloadTask) -> None:
        """Handle a finished crawling step and emit a completion event.

        The emitted ``StockUpdateState`` has ``ret`` 1 on success and 2 on
        failure. An unknown task is logged and otherwise ignored.
        """
        self.logger.info("##############completeStockCrawlingTask",
                         str(isSuccess))
        task = self.getTask(dto.taskId, dto.taskUniqueId)
        if task is None:
            self.logger.error("completeStockCrawlingTask",
                              f"unknown task: {dto.taskUniqueId}")
            return
        if isSuccess:
            self.success(task, 1)
        else:
            self.fail(task, 1)
        if retdto:
            task.errMsg = retdto.errorMsg
        # NOTE(review): the state becomes "success" even when isSuccess is
        # False, overriding what fail() just set -- confirm this is intended.
        task.state = "success"
        self.updateTask(task)
        # Delete only after updateTask: updateTask re-inserts the record,
        # which would undo a deletion made before it.
        if task.restCount <= 0:
            self.deleteTask(task)
        self.logger.info("completeStockCrawlingTask", "complete")
        self.taskEventEmitter.emit(
            EVENT_TASK_REPO_TASK_COMPLETE, "marcap",
            StockUpdateState(
                **{
                    "taskId": dto.taskId,
                    "market": dto.market,
                    "date": dto.dateStr,
                    "ret": 1 if isSuccess else 2
                }))

    def success(self, task: ProcessTask, count: int) -> None:
        """Record ``count`` successful steps on ``task``, starting at ``task.index``."""
        task.successCount = task.successCount + count
        task.restCount = task.restCount - count
        for offset in range(count):
            task.tasksRet[task.index + offset] = SUCCESS
        task.index = task.index + count
        task.percent = (task.successCount + task.failCount) / task.count * 100
        task.state = "success" if task.restCount <= 0 else "waiting next task"
        self.logger.info("success", f"{task.taskUniqueId}")

    def fail(self, task: ProcessTask, count: int) -> None:
        """Record ``count`` failed steps on ``task``, starting at ``task.index``.

        Each failed entry of ``task.tasks`` is also appended to
        ``task.failTasks``.
        """
        task.failCount = task.failCount + count
        task.restCount = task.restCount - count
        for offset in range(count):
            position = task.index + offset
            task.failTasks.append(task.tasks[position])
            task.tasksRet[position] = FAIL
        task.index = task.index + count
        task.percent = (task.successCount + task.failCount) / task.count * 100
        task.state = "fail" if task.restCount <= 0 else "waiting next task"
        self.logger.info("fail", f"{task.taskUniqueId}")

    def getCompletedTask(self, dto: ListLimitDao) -> ListLimitDataDao:
        """Return completed tasks from the data source, grouped by task id.

        ``data`` of the returned object is replaced by a
        ``StockCrawlingCompletedTasks`` built from the grouped records.
        """
        taskData = self.mongod.getCompletedTask(dto)
        tasks: Dict = dict()
        taskIds = []
        for task in taskData.data:
            taskId = task["taskId"]
            if taskId not in tasks:
                tasks[taskId] = {"list": dict(), "ids": []}
                taskIds.append(taskId)
            tasks[taskId]["list"][task["taskUniqueId"]] = task
            tasks[taskId]["ids"].append(task["taskUniqueId"])

        taskData.data = StockCrawlingCompletedTasks(
            **{
                "history": tasks,
                "historyIds": taskIds
            })
        self.logger.info("getCompletedTask", f"count: {len(taskIds)}")
        return taskData

    def getAllTaskState(self, taskId: str) -> YearData:
        """Return the per-date state of ``taskId`` for kospi and kosdaq.

        For each market the stored records are merged per date: a date is
        reported with ``ret == 1`` as soon as any record has 1 for it,
        otherwise the value from the first record seen is kept. ``years``
        counts distinct dates per year (first four characters of the date).
        """
        resultDict: YearData = YearData(**{"yearData": dict()})
        resultDict.yearData[taskId] = dict()
        for market in ("kospi", "kosdaq"):
            data = self.mongod.getAllTaskState(taskId, market)
            compDict: Dict = {}
            count: Dict = {}
            for one in data:
                for idx, taskDate in enumerate(one["tasks"]):
                    if taskDate in compDict:
                        if (compDict[taskDate]["ret"] == 1
                                or one["tasksRet"][idx] == 1):
                            compDict[taskDate] = {"date": taskDate, "ret": 1}
                    else:
                        year = taskDate[0:4]
                        count[year] = count.get(year, 0) + 1
                        compDict[taskDate] = {
                            "date": taskDate,
                            "ret": one["tasksRet"][idx]
                        }
            collect: List = sorted(compDict.values(), key=lambda x: x["date"])
            resultDict.yearData[taskId][market] = StockTaskState(
                **{
                    "taskStates": compDict,
                    "taskKeys": compDict.keys(),
                    "stocks": collect,
                    "years": count,
                    "market": market,
                    "taskId": taskId
                })
        return resultDict
Exemplo n.º 14
0
class StockService:
    """Schedules marcap crawling tasks and mirrors crawler events into task state."""

    def __init__(self, stockRepository: StockRepository,
                 tasksRepository: TasksRepository,
                 crawlerRepository: CrawlerRepository) -> None:
        self.stockRepository = stockRepository
        self.tasksRepository = tasksRepository
        self.crawlerRepository = crawlerRepository
        self.logger = Logger("StockService")
        # Strong references to fire-and-forget asyncio tasks: the event loop
        # itself only keeps weak references to tasks.
        self._backgroundTasks: set = set()

    async def getStockData(self, market: str, startDate: str,
                           endDate: str) -> List[StockMarketCapital]:
        """Return stock data from the stock repository for the given range."""
        return await self.stockRepository.getStockData(market, startDate,
                                                       endDate)

    async def _marcapTaskWorker(self, runDto: StockRunCrawling,
                                pool: Pool,
                                taskPool: TaskPool) -> None:
        """Run one marcap crawl; invoked by Task.run with these keyword arguments.

        NOTE(review): on cancellation or error the pool is not removed from
        ``taskPool`` and the crawler stays registered -- confirm whether the
        caller cleans these up.
        """
        try:
            self.logger.info("runCrawling&marcapTaskWorker", "start")
            marcapCrawler = MarcapCrawler()
            taskUniqueId = runDto.taskUniqueId
            self.crawlerRepository.addCrawler(taskUniqueId, marcapCrawler)
            self.createListners(marcapCrawler.ee)
            self.logger.info("runCrawling&marcapTaskWorker",
                             f"taskWorker:{taskUniqueId}")
            await marcapCrawler.crawling(runDto)
            taskPool.removeTaskPool(pool)
            self.crawlerRepository.removeCrawler(taskUniqueId)
        except asyncio.CancelledError:
            self.logger.info("runCrawling&marcapTaskWorker", "cancel")
        except Exception:
            self.logger.error("runCrawling&marcapTaskWorker",
                              f"error: {traceback.format_exc()}")
            self.tasksRepository.errorTask(runDto, traceback.format_exc())

    def crawlingMarcapStockData(self, dtoList: List[StockRunCrawling]) -> None:
        """Register and start one crawling task per ``"marcap"`` dto.

        A dto is skipped when its ``taskId`` is not ``"marcap"``, when there
        is no task runner, or when a task with the same ids is already
        stored; the remaining dtos are still processed.
        """
        self.logger.info("crawlingMarcapStockData", str(len(dtoList)))
        for dto in dtoList:
            if dto.taskId != "marcap":
                continue
            if not self.tasksRepository.taskRunner:
                continue
            if self.tasksRepository.isExistTask(dto.taskId, dto.taskUniqueId):
                # Skip only this dto; returning here would silently drop
                # every dto after it in the list.
                continue
            startDate = datetime.strptime(dto.startDateStr, "%Y%m%d")
            endDate = datetime.strptime(dto.endDateStr, "%Y%m%d")
            # One entry per calendar day, both ends included.
            taskDates = [
                (startDate + timedelta(days=x)).strftime("%Y%m%d")
                for x in range((endDate - startDate).days + 1)
            ]
            task = ProcessTask(
                **{
                    "market": dto.market,
                    "startDateStr": dto.startDateStr,
                    "endDateStr": dto.endDateStr,
                    "taskUniqueId": dto.taskUniqueId,
                    "taskId": dto.taskId,
                    "count": len(taskDates),
                    "tasks": deque(taskDates),
                    "restCount": len(taskDates),
                    "tasksRet": deque(([0] * len(taskDates))),
                })
            task.state = "find worker"
            workerTask = Task(dto.taskUniqueId, self._marcapTaskWorker,
                              {"runDto": dto})
            self.tasksRepository.addTask(task)
            self.tasksRepository.runTask(workerTask)
            self.logger.info("runMarcapTask", f"runTask {task.json()}")

    def createListners(self, ee: EventEmitter) -> None:
        """Subscribe this service's handlers to a crawler's event emitter."""
        handlers = (
            (EVENT_MARCAP_CRAWLING_ON_RESULT_OF_STOCK_DATA,
             self.onResultOfStockData),
            (EVENT_MARCAP_CRAWLING_ON_CONNECTING_WEBDRIVER,
             self.onConnectingWebDriver),
            (EVENT_MARCAP_CRAWLING_ON_START_CRAWLING, self.onStartCrawling),
            (EVENT_MARCAP_CRAWLING_ON_DOWNLOAD_START, self.onDownloadStart),
            (EVENT_MARCAP_CRAWLING_ON_DOWNLOAD_COMPLETE,
             self.onDownloadComplete),
            (EVENT_MARCAP_CRAWLING_ON_PARSING_COMPLETE,
             self.onParsingComplete),
            (EVENT_MARCAP_CRAWLING_ON_ERROR, self.onError),
            (EVENT_MARCAP_CRAWLING_ON_CANCEL, self.onCancelled),
        )
        for eventName, handler in handlers:
            ee.on(eventName, handler)

    def _setTaskState(self, dto, state: str, tag: str = "") -> bool:
        """Set the state of the task referenced by ``dto`` and persist it.

        Returns ``True`` when the task was found and updated. An unknown
        task is logged as an error and ``False`` is returned. When ``tag``
        is given, the task's unique id is logged under it after the update.
        """
        task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
        if task is None:
            self.logger.error(tag or "setTaskState",
                              f"unknown task: {dto.taskUniqueId}")
            return False
        task.state = state
        self.tasksRepository.updateTask(task)
        if tag:
            self.logger.info(tag, task.taskUniqueId)
        return True

    def onResultOfStockData(self, dto: StockCrawlingDownloadTask,
                            retDto: StockMarketCapitalResult) -> None:
        """Store crawled stock data in the database in a background task.

        When the insert raises, the step is recorded as failed instead of
        leaving the task in the "insert to database" state.
        """
        if not self._setTaskState(dto, "insert to database"):
            return

        async def completeMarcapTask() -> None:
            try:
                await self.stockRepository.insertMarcap(retDto)
            except Exception:
                errorText = traceback.format_exc()
                self.logger.error("onResultOfStockData", f"error: {errorText}")
                retDto.result = "fail"
                retDto.errorMsg = errorText
                self.tasksRepository.completeStockCrawlingTask(
                    False, retDto, dto)
                return
            self.tasksRepository.completeStockCrawlingTask(True, retDto, dto)

        background = asyncio.create_task(completeMarcapTask())
        self._backgroundTasks.add(background)
        background.add_done_callback(self._backgroundTasks.discard)

    def onConnectingWebDriver(self, dto: StockRunCrawling) -> None:
        """Event: the crawler is connecting to the web driver."""
        self._setTaskState(dto, "connecting webdriver",
                           "onConnectingWebDriver")

    def onStartCrawling(self, dto: StockRunCrawling) -> None:
        """Event: crawling has started."""
        self._setTaskState(dto, "start crawling", "onStartCrawling")

    def onDownloadStart(self, dto: StockCrawlingDownloadTask) -> None:
        """Event: download of the crawled data has started."""
        self._setTaskState(dto, "download start", "onDownloadStart")

    def onDownloadComplete(self, dto: StockCrawlingDownloadTask) -> None:
        """Event: download of the crawled data has completed."""
        self._setTaskState(dto, "download complete", "onDownloadComplete")

    def onParsingComplete(self, isSuccess: bool,
                          retdto: StockMarketCapitalResult,
                          dto: StockCrawlingDownloadTask) -> None:
        """Event: conversion of the crawled data finished.

        Only a failure is forwarded to the task repository here.
        """
        self.logger.info("onParsingComplete")
        self.logger.info(f"taskId:{dto.taskId} taskUniqueId{dto.taskUniqueId}")
        # Looked up for logging only; an unknown task id must not prevent
        # the failure from being recorded below.
        tar = self.tasksRepository.tasksdto.tasks.get(dto.taskId,
                                                      {}).get("list")
        self.logger.info(f"taskDTO: {tar}")
        if not isSuccess:
            self.tasksRepository.completeStockCrawlingTask(
                isSuccess, retdto, dto)

    def onCancelled(self, dto: StockRunCrawling) -> None:
        """Event: crawling was cancelled; currently this is only logged."""
        self.logger.info("onCancelled")

    def onError(self, dto: StockRunCrawling, errorMsg: str) -> None:
        """Event: crawling failed; all remaining steps are recorded as failed."""
        task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
        if task is None:
            self.logger.error("onError", f"unknown task: {dto.taskUniqueId}")
            return
        self.tasksRepository.fail(task, task.restCount)
        task.state = "error"
        task.errMsg = errorMsg
        self.tasksRepository.updateTask(task)
        self.logger.error("onError", task.taskUniqueId)
Exemplo n.º 15
0
 def __init__(self) -> None:
     """Run the parent initialiser, then create this class's logger."""
     super().__init__()
     self.logger = Logger("FactorDartMongoDataSource")
Exemplo n.º 16
0
class FactorDartMongoDataSource(MongoDataSource):
    """MongoDB data source for DART factor rows and completed-task listings.

    The methods use ``self.factorDart`` and ``self.task`` collections and the
    helpers ``mergeFindObj``, ``isSetupMarcap``, ``setupMarcap`` and
    ``exceptId``; none of them is defined here (presumably they are provided
    by ``MongoDataSource`` -- confirm).
    """

    # Task states that getCompletedTask treats as finished.
    _COMPLETED_STATES = ("success", "fail")
    # Document keys copied into every FactorData built by getFactor.
    _FACTOR_FIELDS = ("code", "dataMonth", "dataName", "dataYear", "dataId",
                      "dataValue", "name")
    # Keys that identify one factor row for the upsert in insertFactor.
    _FACTOR_KEY_FIELDS = ("code", "dataYear", "dataMonth", "dataName")

    def __init__(self) -> None:
        """Initialise the base class and attach a class-specific logger."""
        super().__init__()
        self.logger = Logger("FactorDartMongoDataSource")

    async def getFactor(self,
                        year: str = "*",
                        month: str = "*",
                        code: str = "*") -> list:
        """Return the stored factors matching ``year``, ``month`` and ``code``.

        Each argument is merged into the query through ``mergeFindObj``.
        NOTE(review): the ``"*"`` defaults presumably mean "no filter" --
        confirm in ``mergeFindObj``.

        Returns:
            A list of ``FactorData``; an empty list when anything fails (the
            traceback is logged instead of being raised).
        """
        try:
            findObj: Dict[str, Any] = {}
            self.mergeFindObj(findObj, "dataYear", year)
            self.mergeFindObj(findObj, "dataMonth", month)
            self.mergeFindObj(findObj, "code", code)
            cursor = self.factorDart.find(findObj)
            # Iterate the cursor directly; no intermediate list is needed.
            return [
                FactorData(
                    **{field: doc[field]
                       for field in self._FACTOR_FIELDS}) for doc in cursor
            ]
        except Exception:
            self.logger.error("getFactor", traceback.format_exc())
            return []

    async def insertFactor(self, li: List[FactorDao]) -> None:
        """Upsert every factor in ``li``, keyed by code/year/month/name.

        ``updatedAt`` is refreshed on every write and ``createdAt`` is only
        set when the document is inserted. Failures are logged, not raised.
        """
        try:
            # NOTE(review): the marcap set-up helpers run before factor
            # writes; presumably they prepare shared collections -- confirm.
            if not self.isSetupMarcap():
                self.setupMarcap()
            for one in li:
                data = one.dict()
                data["updatedAt"] = getNow()
                key = {name: data[name] for name in self._FACTOR_KEY_FIELDS}
                self.factorDart.update_one(
                    key, {
                        "$set": data,
                        "$setOnInsert": {
                            "createdAt": getNow()
                        }
                    },
                    upsert=True)
        except Exception:
            self.logger.error("insertFactor", traceback.format_exc())

    def getCompletedTask(self, dto: ListLimitData) -> ListLimitResponse:
        """Return one page of finished tasks, newest first.

        ``dto`` supplies ``offset`` and ``limit``. ``count`` in the response
        is the total number of finished tasks, independent of paging.

        Returns:
            A ``ListLimitResponse``; on failure the traceback is logged and
            an empty list is returned instead (kept for compatibility with
            existing callers even though it does not match the annotation).
        """
        try:
            data = dto.dict()
            # Build the filter once so the page and the total always agree.
            query = {
                "$or": [{"state": state} for state in self._COMPLETED_STATES]
            }
            cursor = self.task.find(query)\
                .sort("createdAt", DESCENDING)\
                .skip(data["offset"])\
                .limit(data["limit"])

            # Cursor.count() was removed in PyMongo 4; count_documents is
            # the supported replacement.
            count = self.task.count_documents(query)

            return ListLimitResponse(
                **{
                    "count": count,
                    "offset": data["offset"],
                    "limit": data["limit"],
                    "data": self.exceptId(list(cursor))
                })
        except Exception:
            self.logger.error("getCompletedTask", traceback.format_exc())
        return []
Exemplo n.º 17
0
class TaskMongoDataSource(MongoDataSource):
    """MongoDB data source for task records.

    Uses the ``self.task`` collection and the ``exceptId`` helper, neither of
    which is defined here (presumably provided by ``MongoDataSource`` --
    confirm).
    """

    # Task states that getCompletedTask treats as finished.
    _COMPLETED_STATES = ("success", "fail", "complete", "error", "cancelled")

    def __init__(self) -> None:
        """Initialise the base class and attach a class-specific logger."""
        super().__init__()
        self.logger = Logger("TaskMongoDataSource")

    def getCompletedTask(self, dto: ListLimitDao) -> ListLimitDataDao:
        """Return one page of finished tasks, newest first.

        ``dto`` supplies ``offset``, ``limit`` and ``taskId``. ``taskId`` is
        echoed in the response but is not part of the query filter. ``count``
        is the total number of finished tasks, independent of paging.

        Returns:
            A ``ListLimitDataDao``; on failure the traceback is logged and an
            empty list is returned instead (kept for compatibility with
            existing callers even though it does not match the annotation).
        """
        try:
            data = dto.dict()
            # Build the filter once so the page and the total always agree.
            query = {
                "$or": [{"state": state} for state in self._COMPLETED_STATES]
            }
            cursor = self.task.find(query)\
                .sort("createdAt", DESCENDING)\
                .skip(data["offset"])\
                .limit(data["limit"])

            # Cursor.count() was removed in PyMongo 4; count_documents is
            # the supported replacement.
            count = self.task.count_documents(query)

            return ListLimitDataDao(
                **{
                    "taskId": data["taskId"],
                    "count": count,
                    "offset": data["offset"],
                    "limit": data["limit"],
                    "data": self.exceptId(list(cursor))
                })
        except Exception:
            self.logger.error("getCompletedTask", traceback.format_exc())
        return []

    def getAllTaskState(self, taskId: str, market: str) -> list:
        """Return the ``tasks`` / ``tasksRet`` fields of matching task documents.

        Documents are matched on ``taskId`` and ``market``. Failures are
        logged and yield an empty list.
        """
        try:
            query = {
                "taskId": taskId,
                "market": market
                # "$or": [{"state": "success"}, {"state": "fail"}, {"state": "error"}]
            }
            return list(
                self.task.find(query, projection=["tasks", "tasksRet"]))
        except Exception:
            self.logger.error("getAllTaskState", traceback.format_exc())
        return []

    def upsertTask(self, value: dict) -> None:
        """Insert or update the task document identified by ``taskUniqueId``.

        ``value`` is modified in place: ``updatedAt`` is set before writing.
        ``createdAt`` is only stored when the document is inserted. Failures
        are logged, not raised.
        """
        try:
            value["updatedAt"] = getNow()
            self.task.update_one(
                {"taskUniqueId": value["taskUniqueId"]},
                {
                    "$set": value,
                    "$setOnInsert": {
                        "createdAt": getNow()
                    }
                },
                upsert=True)
        except Exception:
            self.logger.error("upsertTask", traceback.format_exc())
Exemplo n.º 18
0
 def __init__(self) -> None:
     """Create an idle pool slot: not running, no task assigned, empty task id."""
     super().__init__()
     self.isRun = False
     self.logger = Logger("Pool")
     self.task: Optional[Task] = None
     self.taskId = ""
Exemplo n.º 19
0
 def __init__(self) -> None:
     """Create the crawler's event emitter, clear its lock/cancel flags and attach a logger."""
     super().__init__()
     self.ee = EventEmitter()
     self.isLock = False
     self.isCancelled = False
     self.logger = Logger("DartApiCrawler")
Exemplo n.º 20
0
class DartApiCrawler(object):
    """Crawls financial-statement data from the OpenDART HTTP API.

    Progress is reported through events emitted on ``self.ee``; setting
    ``self.isCancelled`` asks a running ``crawling`` call to stop.
    """

    # Headers sent with every OpenDART request.
    _REQUEST_HEADERS = {
        'User-Agent':
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.2 (KHTML, like Gecko) Chrome/22.0.1216.0 Safari/537.2'",
        'accept-language': 'ko',
    }

    def __init__(self) -> None:
        """Create the event emitter, clear the lock/cancel flags, attach a logger."""
        super().__init__()
        self.ee = EventEmitter()
        self.isLock = False
        self.isCancelled = False
        self.logger = Logger("DartApiCrawler")

    def createUUID(self) -> str:
        """Return a new random UUID4 as a string."""
        return str(uuid.uuid4())

    async def downloadCodes(self, isCodeNew: bool, apiKey: str) -> Dict:
        """Return the stock-code to corporation mapping from CORPCODE.xml.

        The archive is downloaded and extracted first when ``isCodeNew`` is
        true or the XML file does not exist yet. Under pytest the files live
        in ``factors/codes``, otherwise in ``app/static/factors/codes``.

        Returns:
            ``{stock_code: {"corp_code": ..., "corp_name": ...}}`` for every
            entry with a six-character stock code and a ``corp_code``
            element; ``corp_name`` is omitted when the element is missing.
        """
        if "pytest" in sys.modules:
            loadpath = Path('factors/codes')
        else:
            loadpath = Path('app/static/factors/codes')
        datapath = loadpath / "CORPCODE.xml"

        if isCodeNew or not os.path.exists(datapath.resolve()):
            params = {"crtfc_key": apiKey}
            url = "https://opendart.fss.or.kr/api/corpCode.xml"
            async with aiohttp.ClientSession() as session:
                async with session.get(
                        url, params=params,
                        headers=dict(self._REQUEST_HEADERS)) as response:
                    data = await response.read()
            # Close the archive deterministically once it is extracted.
            with ZipFile(io.BytesIO(data)) as archive:
                archive.extractall(loadpath.resolve())

        tree = ET.parse(datapath.resolve())
        codes: Dict[str, Any] = {}
        for entry in tree.findall("list"):
            stockEl = entry.find("stock_code")
            if stockEl is None:
                continue
            stockCode = stockEl.text
            # Entries without a six-character stock code are ignored.
            if not isinstance(stockCode, str) or len(stockCode) != 6:
                continue
            codeEl = entry.find("corp_code")
            if codeEl is None:
                continue
            info: Dict[str, Any] = {"corp_code": codeEl.text}
            nameEl = entry.find("corp_name")
            if nameEl is not None:
                info["corp_name"] = nameEl.text
            codes[stockCode] = info
        return codes

    async def crawling(self, dto: DartApiCrawling) -> None:
        """Crawl every company for each year in ``dto.startYear..dto.endYear``.

        ``dto.startYear`` is raised to 2015 when it is lower. Events are
        emitted for: code download, start of crawling (once up front and once
        per year), each company result (with the rows as a list of dicts) and
        each completed year. When ``self.isCancelled`` is set, the cancel
        event is emitted once and crawling stops.

        Exceptions from the retried helpers propagate to the caller.
        """
        # Originally annotated as a CPU-bound job.
        if dto.startYear < 2015:
            dto.startYear = 2015
        self.ee.emit(EVENT_DART_API_CRAWLING_ON_DOWNLOADING_CODES, dto)
        codes = await asyncRetryNonBlock(5,
                                         1,
                                         self.downloadCodes,
                                         isCodeNew=dto.isCodeNew,
                                         apiKey=dto.apiKey)
        self.ee.emit(EVENT_DART_API_CRAWLING_ON_CRAWLING_FACTOR_DATA, dto)
        for year in range(dto.startYear, dto.endYear + 1):
            self.ee.emit(EVENT_DART_API_CRAWLING_ON_CRAWLING_FACTOR_DATA,
                         dto)
            self.logger.info("crawling", str(len(codes)))
            for code in codes:
                newDf = await asyncRetryNonBlock(5, 1, self.getYearDf,
                                                 dto.apiKey, code, codes,
                                                 year)
                if self.isCancelled:
                    # Stop here: continuing would keep hitting the API and
                    # re-emit the cancel event for every remaining company.
                    self.ee.emit(EVENT_DART_API_CRAWLING_ON_CANCEL, dto)
                    return
                if newDf is not None:
                    self.logger.info("crawling", code)
                    self.ee.emit(
                        EVENT_DART_API_CRAWLING_ON_RESULT_OF_FACTOR, dto,
                        year, newDf.to_dict("records"))
            self.ee.emit(EVENT_DART_API_CRAWLING_ON_COMPLETE_YEAR, dto,
                         year)
            self.logger.info("crawling", str(year))

    async def getYearDf(self, apiKey: str, code: str, codes: Dict,
                        year: int) -> pd.DataFrame:
        """Fetch one company's statement rows for ``year``.

        Args:
            apiKey: OpenDART API key.
            code: Stock code; must be a key of ``codes``.
            codes: Mapping as returned by ``downloadCodes``.
            year: Business year to request.

        Returns:
            A DataFrame built from the response's ``list`` entries with the
            extra columns ``crawling_year``, ``crawling_code`` and
            ``crawling_name``, or ``None`` when the response has no ``list``
            key.

        Raises:
            Any exception raised while requesting or parsing; it is logged
            and re-raised.
        """
        self.logger.info("getYearDf", f"crawling: {code}")
        url = 'https://opendart.fss.or.kr/api/fnlttSinglAcntAll.json'
        try:
            params = {
                'crtfc_key': apiKey,
                'corp_code': codes[code]["corp_code"],
                'bsns_year': year,  # business year
                'reprt_code': "11011",  # "11011": annual business report
                # "CFS": consolidated statements, "OFS": separate statements
                'fs_div': "CFS",
            }
            connector = aiohttp.TCPConnector(limit=50, force_close=True)
            async with aiohttp.ClientSession(connector=connector) as session:
                timeout = aiohttp.ClientTimeout(total=15)
                async with session.get(
                        url,
                        params=params,
                        timeout=timeout,
                        headers=dict(self._REQUEST_HEADERS)) as response:
                    data = await response.json()
                    if 'list' not in data:
                        return None
                    df = pd.json_normalize(data, 'list')
        except Exception:
            self.logger.error("getYearDf", traceback.format_exc())
            raise
        self.logger.info("df", str(df))
        # downloadCodes omits "corp_name" when the XML has no such element,
        # so fall back to an empty name instead of raising KeyError.
        name = codes[code].get("corp_name", "")
        df["crawling_year"] = year
        df["crawling_code"] = code
        df["crawling_name"] = name
        self.logger.info("getYearDf",
                         f"{str(year)} {str(code)} {str(name)}")
        return df
Exemplo n.º 21
0
class StockMongoDataSource(MongoDataSource):
    """MongoDB data source for market-capitalisation rows and task records.

    Uses the ``self.marcap`` and ``self.task`` collections and the helpers
    ``isSetupMarcap``, ``setupMarcap`` and ``exceptId``, none of which is
    defined here (presumably provided by ``MongoDataSource`` -- confirm).
    """

    # Task states that getCompletedTask treats as finished.
    _COMPLETED_STATES = ("complete", "error", "cancelled")
    # Document keys copied into every StockMarketCapital built by getMarcap.
    _MARCAP_FIELDS = ("date", "market", "code", "name", "close", "diff",
                      "percent", "open", "high", "low", "volume", "price",
                      "marcap", "number")

    def __init__(self) -> None:
        """Initialise the base class and attach a class-specific logger."""
        super().__init__()
        self.logger = Logger("StockMongoDataSource")

    async def insertMarcap(self, li: List[StockMarketCapital]) -> None:
        """Upsert every row in ``li`` and return once all writes finished.

        Each row is written by ``insertMarpcapOne``, which logs its own
        failures, so one bad row does not stop the others.
        """
        try:
            if not self.isSetupMarcap():
                self.setupMarcap()
            # Await the writes instead of spawning unreferenced tasks: the
            # event loop only keeps weak references to tasks, so a
            # fire-and-forget task may be collected before it runs.
            await asyncio.gather(*(self.insertMarpcapOne(one) for one in li))
        except Exception:
            self.logger.error("insertMarcap", traceback.format_exc())

    async def insertMarpcapOne(self, one: StockMarketCapital) -> None:
        """Upsert a single row keyed by ``code``, ``date`` and ``market``.

        ``updatedAt`` is refreshed on every write; ``createdAt`` is only set
        on insert. Failures are logged, not raised.
        """
        try:
            data = one.dict()
            data["updatedAt"] = getNow()
            key = {name: data[name] for name in ("code", "date", "market")}
            self.marcap.update_one(key, {
                "$set": data,
                "$setOnInsert": {"createdAt": getNow()}
            }, upsert=True)
        except Exception:
            self.logger.error("insertMarpcapOne", traceback.format_exc())

    async def getMarcap(self, market: str, startDate: str, endDate: str) -> List[StockMarketCapital]:
        """Return the rows of ``market`` with ``startDate <= date <= endDate``.

        Returns an empty list when anything fails (the traceback is logged).
        """
        try:
            if not self.isSetupMarcap():
                self.setupMarcap()
            cursor = self.marcap.find({"$and": [{"date": {"$gte": startDate, "$lte": endDate}}, {"market": market}]})
            # Iterate the cursor directly; no intermediate list is needed.
            return [
                StockMarketCapital(**{field: doc[field] for field in self._MARCAP_FIELDS})
                for doc in cursor
            ]
        except Exception:
            self.logger.error("getMarcap", traceback.format_exc())
            return []

    def getCompletedTask(self, dto: ListLimitData) -> ListLimitResponse:
        """Return one page of finished tasks, newest first.

        ``dto`` supplies ``offset`` and ``limit``. ``count`` in the response
        is the total number of finished tasks, independent of paging.

        Returns:
            A ``ListLimitResponse``; on failure the traceback is logged and
            an empty list is returned instead (kept for compatibility with
            existing callers even though it does not match the annotation).
        """
        try:
            data = dto.dict()
            # Build the filter once so the page and the total always agree.
            query = {"$or": [{"state": state} for state in self._COMPLETED_STATES]}
            cursor = self.task.find(query)\
                .sort("createdAt", DESCENDING)\
                .skip(data["offset"])\
                .limit(data["limit"])

            # Cursor.count() was removed in PyMongo 4; count_documents is
            # the supported replacement.
            count = self.task.count_documents(query)

            return ListLimitResponse(**{
                "count": count,
                "offset": data["offset"],
                "limit": data["limit"],
                "data": self.exceptId(list(cursor))
            })
        except Exception:
            self.logger.error("getCompletedTask", traceback.format_exc())
        return []

    def getAllTaskState(self, taskId: str, market: str) -> list:
        """Return the ``tasks`` / ``tasksRet`` fields of matching task documents.

        Documents are matched on ``taskId`` and ``market``. Failures are
        logged and yield an empty list.
        """
        try:
            query = {
                "taskId": taskId,
                "market": market
                # "$or": [{"state": "success"}, {"state": "fail"}, {"state": "error"}]
            }
            return list(self.task.find(query, projection=["tasks", "tasksRet"]))
        except Exception:
            self.logger.error("getAllTaskState", traceback.format_exc())
        return []

    def upsertTask(self, value: dict) -> None:
        """Insert or update the task document identified by ``taskUniqueId``.

        ``value`` is modified in place: ``updatedAt`` is set before writing.
        ``createdAt`` is only stored when the document is inserted. Failures
        are logged, not raised.
        """
        try:
            value["updatedAt"] = getNow()
            self.task.update_one({
                "taskUniqueId": value["taskUniqueId"]
            }, {
                "$set": value,
                "$setOnInsert": {"createdAt": getNow()}
            }, upsert=True)
        except Exception:
            self.logger.error("upsertTask", traceback.format_exc())
Exemplo n.º 22
0
class TaskService:
    """Entry point for task management requested over websockets.

    Starts, schedules and cancels tasks through the repositories and other
    services it is given, and pushes task state to clients through the
    connection manager. Repository events are wired up in ``setupEvents``.
    """

    def __init__(
            self,
            manager: ConnectionManager,
            tasksRepository: TasksRepository,
            taskScheduler: TaskScheduler,
            factorService: FactorService,
            stockService: StockService,
            crawlerRepository: CrawlerRepository
            ) -> None:
        """Store the collaborators and subscribe to the repository's events."""
        self.tasksRepository = tasksRepository
        self.crawlerRepository = crawlerRepository
        self.manager = manager
        self.taskScheduler = taskScheduler
        self.factorService = factorService
        self.stockService = stockService
        self.logger = Logger("TaskService")
        self.ee = self.tasksRepository.taskEventEmitter
        self.setupEvents()

    def setupEvents(self) -> None:
        """Register this service's handlers on the task repository emitter."""
        self.ee.on(EVENT_TASK_REPO_UPDATE_TASKS, self.fetchTasks)
        self.ee.on(EVENT_TASK_REPO_TASK_COMPLETE, self.updateTaskState)
        self.ee.on(EVENT_TASK_REPO_UPDATE_POOL_INFO, self.updateTaskPoolInfo)

    def getTaskSchedule(self, webSocket: WebSocket, isBroadCast: bool = False) -> None:
        """Send the list of scheduled jobs to one client or to all clients.

        Each job contributes its id, its trigger fields and the list found in
        its first positional argument. With ``isBroadCast`` the result goes
        to every connection, otherwise only to ``webSocket``.
        """
        jobs = self.taskScheduler.getJobs()
        stockTaskScheduleList = StockTaskScheduleList(**{"list": []})
        for job in jobs:
            fields = job.trigger.fields
            self.logger.info(f"jobargs: {str(job.args[0])}")
            # Index 3 of the trigger fields is not reported.
            # NOTE(review): presumably the cron "week" field -- confirm.
            stockTaskScheduleList.list.append(StockTaskScheduleInfo(**{
                "id": job.id,
                "year": str(fields[0]),
                "month": str(fields[1]),
                "day": str(fields[2]),
                "dayOfWeek": str(fields[4]),
                "hour": str(fields[5]),
                "minute": str(fields[6]),
                "second": str(fields[7]),
                "taskList": list(job.args[0])
            }))
        payload = stockTaskScheduleList.dict()
        if isBroadCast:
            self.manager.sendBroadCast(RES_SOCKET_TASK_FETCH_TASK_SCHEDULE, payload)
        else:
            self.manager.send(RES_SOCKET_TASK_FETCH_TASK_SCHEDULE, payload, webSocket)

    @staticmethod
    def marcapJob(marcapDtos: List[StockRunCrawling]) -> None:
        """Scheduled job: start a market-cap crawl for ``marcapDtos``.

        Every dto gets a fresh ``taskUniqueId`` (built from its current task
        id, market and dates plus a random UUID). When ``isNow`` is set the
        start and end dates are then replaced by today's date. The dtos are
        modified in place before being handed to the stock service.
        """
        service: StockService = Locator.getInstance().get(StockService)
        logger = Logger("TaskService_marcapJob")
        for dto in marcapDtos:
            logger.info("#### schedule job start ####")
            logger.info("command" + dto.startDateStr + "~" + dto.endDateStr)
            dto.taskUniqueId = dto.taskId + dto.market + dto.startDateStr + dto.endDateStr + str(uuid.uuid4())
            if dto.isNow:
                dto.startDateStr = getNowDateStr()
                dto.endDateStr = getNowDateStr()
            logger.info("real:" + dto.startDateStr + "~" + dto.endDateStr)
        service.crawlingMarcapStockData(marcapDtos)

    def addTaskSchedule(self, scheduleDto: StockTaskSchedule, runCrawlingDto: List[StockRunCrawling], webSocket: WebSocket) -> None:
        """Schedule ``marcapJob`` for the "marcap" dtos and broadcast the schedule.

        Dtos whose ``taskId`` is not "marcap" are ignored.
        """
        marcapDtos = [dto for dto in runCrawlingDto if dto.taskId == "marcap"]

        self.taskScheduler.addJob(
            self.marcapJob,
            scheduleDto.year,
            scheduleDto.month,
            scheduleDto.dayOfWeek,
            scheduleDto.day,
            scheduleDto.hour,
            scheduleDto.minute,
            scheduleDto.second,
            "marcap",
            args=[marcapDtos])
        self.getTaskSchedule(webSocket, True)

    def removeTaskSchedule(self, id: str, webSocket: WebSocket) -> None:
        """Remove the scheduled job ``id`` and broadcast the new schedule."""
        self.taskScheduler.removeJob(id)
        self.getTaskSchedule(webSocket, True)

    def fetchTasks(self, data: ProcessTasks = None, websocket: WebSocket = None) -> None:
        """Send the task overview to ``websocket`` or, without one, to everyone.

        When ``data`` is omitted the repository's current ``tasksdto`` is
        used.
        """
        if data is None:
            data = self.tasksRepository.tasksdto
        self.logger.info("fetchTasks", data.json())
        if websocket is None:
            self.manager.sendBroadCast(RES_SOCKET_TASK_FETCH_TASKS, data.dict())
        else:
            self.manager.send(RES_SOCKET_TASK_FETCH_TASKS, data.dict(), websocket)

    def getTaskState(self, taskId: str, webSocket: WebSocket) -> None:
        """Send the repository's state for ``taskId`` to ``webSocket``."""
        data: YearData = self.tasksRepository.getAllTaskState(taskId)
        self.manager.send(RES_SOCKET_TASK_FETCH_TASK_STATE, data.dict(), webSocket)

    def updateTaskState(self, taskId: str, stockUpdateState: StockUpdateState = None) -> None:
        """Broadcast ``stockUpdateState`` when given, then the task overview.

        ``taskId`` is accepted for the event signature but not used here.
        """
        if stockUpdateState is not None:
            self.manager.sendBroadCast(RES_SOCKET_TASK_UPDATE_TASK_STATE, stockUpdateState.dict())
        self.fetchTasks()

    def getTaskPoolInfo(self, webSocket: WebSocket) -> None:
        """Send the current task-pool information to ``webSocket``."""
        taskPoolInfo: TaskPoolInfo = self.tasksRepository.getPoolInfo()
        self.manager.send(RES_SOCKET_TASK_FETCH_TASK_POOL_INFO, taskPoolInfo.dict(), webSocket)

    def updateTaskPoolInfo(self, poolInfo: TaskPoolInfo) -> None:
        """Broadcast ``poolInfo`` to every connected client."""
        # logger.info(f"updateTaskPoolInfo:{poolInfo.json()}")
        self.manager.sendBroadCast(RES_SOCKET_TASK_FETCH_TASK_POOL_INFO, poolInfo.dict())

    def addTask(self, taskName: str, dto: Any) -> None:
        """Start the task called ``taskName``.

        ``dto`` is either a ready-made request object, which is passed on
        unchanged, or a plain dict from which the request object(s) are
        built. For "crawlingMarcapStockData" a dict yields one request per
        entry in ``dto["market"]``. Unknown task names are ignored.
        """
        fromDict = isinstance(dto, dict)
        # Unique ids are built with f-strings so that numeric fields (for
        # example years sent as JSON numbers) do not raise TypeError.
        if taskName == "crawlingMarcapStockData":
            data = dto
            if fromDict:
                data = []
                for market in dto["market"]:
                    taskUniqueId = f'{dto["taskId"]}{market}{dto["startDate"]}{dto["endDate"]}{uuid.uuid4()}'
                    data.append(StockRunCrawling(**{
                        "driverAddr": "http://fin-carwling-webdriver:4444",
                        "market": market,
                        "startDateStr": dto["startDate"],
                        "endDateStr": dto["endDate"],
                        "taskId": dto["taskId"],
                        "taskUniqueId": taskUniqueId
                    }))
            self.stockService.crawlingMarcapStockData(data)
        elif taskName == "convertFactorFileToDb":
            data = dto
            if fromDict:
                data = RunFactorFileConvert(**{
                    "taskId": dto["taskId"],
                    "taskUniqueId": f'{dto["taskId"]}{uuid.uuid4()}'
                })
            self.factorService.convertFactorFileToDb(data)
        elif taskName == "crawlingFactorDartData":
            data = dto
            if fromDict:
                data = DartApiCrawling(**{
                    "apiKey": dto["apiKey"],
                    "isCodeNew": dto["isCodeNew"],
                    "startYear": dto["startYear"],
                    "endYear": dto["endYear"],
                    "taskId": dto["taskId"],
                    "taskUniqueId": f'{dto["taskId"]}{dto["startYear"]}{dto["endYear"]}{uuid.uuid4()}'
                })
            self.factorService.crawlingFactorDartData(data)

    def cancelTask(self, taskId: str, taskUniqueId: str) -> None:
        """Cancel the task ``taskUniqueId`` and update its stored state.

        A registered crawler is flagged as cancelled and the task runner is
        asked to cancel the task. Afterwards: a task already in state
        "cancel" or "error" is deleted, any other known task is marked
        "cancel", and an unknown task only triggers a refresh of all tasks.
        """
        if taskUniqueId in self.crawlerRepository.getCrawlers():
            self.crawlerRepository.getCrawler(taskUniqueId).isCancelled = True
        self.tasksRepository.taskRunner.cancel(taskUniqueId)
        task = self.tasksRepository.getTask(taskId, taskUniqueId)
        if task is None:
            self.tasksRepository.updateAllTask()
        elif task.state in ("cancel", "error"):
            # A second cancel (or cancelling a failed task) removes it.
            self.tasksRepository.deleteTask(task)
            self.tasksRepository.updateAllTask()
        else:
            task.state = "cancel"
            self.tasksRepository.updateTask(task)

    def fetchCompletedTask(self, dto: ListLimitData, webSocket: WebSocket) -> None:
        """Send one page of completed tasks, described by ``dto``, to ``webSocket``."""
        dao = ListLimitDao(**{
            "limit": dto.limit,
            "offset": dto.offset,
            "taskId": dto.taskId
        })
        tasks = self.tasksRepository.getCompletedTask(dao)
        self.manager.send(RES_SOCKET_TASK_FETCH_COMPLETED_TASK, tasks.dict(), webSocket)
Exemplo n.º 23
0
 def __init__(self) -> None:
     """Initialise the base class and attach a logger named "TaskMongoDataSource"."""
     super().__init__()
     self.logger = Logger("TaskMongoDataSource")
Exemplo n.º 24
0
 def __init__(self) -> None:
     """Create the crawler's event emitter and attach a logger named "MarcapCrawler"."""
     super().__init__()
     self.ee = EventEmitter()
     self.logger = Logger("MarcapCrawler")
Exemplo n.º 25
0
class FactorService:
    """Factor-related use cases: queries, DART crawling and file import.

    Long-running work is wrapped in a ``Task`` and handed to the tasks
    repository; progress reported by the crawler's events is written back to
    the matching task.
    """

    def __init__(self, manager: ConnectionManager,
                 factorRepository: FactorRepository,
                 tasksRepository: TasksRepository,
                 crawlerRepository: CrawlerRepository,
                 taskService: 'TaskService') -> None:
        """Store the collaborators and attach a class-specific logger."""
        self.manager = manager
        self.factorRepository = factorRepository
        self.tasksRepository = tasksRepository
        self.crawlerRepository = crawlerRepository
        self.taskService = taskService
        self.logger = Logger("FactorService")

    async def getFactor(self, code: str, year: str, month: str,
                        source: str) -> List[FactorData]:
        """Return the repository's factors for the given code/year/month/source."""
        return await self.factorRepository.getFactor(code, year, month, source)

    def crawlingFactorDartData(self, dto: DartApiCrawling) -> None:
        """Register and start a DART crawling task for ``dto``'s year range.

        A ``ProcessTask`` with one sub-task per year is added to the tasks
        repository, then a worker ``Task`` running the crawl is started.
        """
        async def crawlingFactorDartDataTask(pool: Pool,
                                             taskPool: TaskPool) -> None:
            try:
                crawler = DartApiCrawler()
                self.crawlerRepository.addCrawler(dto.taskUniqueId, crawler)
                try:
                    self.createFactorDartListener(crawler.ee)
                    await crawler.crawling(dto)
                finally:
                    # Unregister on every exit path; otherwise a failed or
                    # cancelled crawl leaves a stale crawler behind.
                    self.crawlerRepository.removeCrawler(dto.taskUniqueId)
            except asyncio.CancelledError:
                self.logger.info("crawlingFactorDartDataTask", "cancel")
            except Exception:
                self.logger.error("crawlingFactorDartDataTask",
                                  f"error: {traceback.format_exc()}")
                self.tasksRepository.errorTask(dto, traceback.format_exc())
            finally:
                taskPool.removeTaskPool(pool)

        years = list(range(dto.startYear, dto.endYear + 1))
        count = dto.endYear - dto.startYear + 1
        task = ProcessTask(
            **{
                "market": "",
                "startDateStr": dto.startYear,
                "endDateStr": dto.endYear,
                "taskUniqueId": dto.taskUniqueId,
                "taskId": dto.taskId,
                "count": count,
                "tasks": years,
                "restCount": count,
                "tasksRet": [0] * count,
                "state": "find worker"
            })
        self.tasksRepository.addTask(task)
        workerTask = Task(dto.taskUniqueId, crawlingFactorDartDataTask)
        self.tasksRepository.runTask(workerTask)

    # Stores the factors found in the file into the database.
    def convertFactorFileToDb(self, dto: RunFactorFileConvert) -> None:
        """Register and start a task that imports file-based factors into the DB.

        The worker reads the factors from file, converts them to
        ``FactorDao`` objects in batches of 100, inserts them and marks the
        task complete, updating the task state between the steps.
        """
        self.logger.info("convertFactorFileToDb")

        async def convertFactorFileToDbTask(pool: Pool,
                                            taskPool: TaskPool) -> None:
            try:
                task = self.tasksRepository.getTask(dto.taskId,
                                                    dto.taskUniqueId)
                data = await asyncio.create_task(
                    self.factorRepository.getFactorsInFile())
                task.state = "make Factor Object"
                self.tasksRepository.updateTask(task)
                daoList = await batchFunction(100, data,
                                              self.makeFactorDaoList)
                task.state = "start insert db"
                self.tasksRepository.updateTask(task)
                self.logger.info("convertFactorFileToDbTask",
                                 f"insertCount: {str(len(daoList))}")
                await self.factorRepository.insertFactor(daoList)
                task.state = "complete"
                self.tasksRepository.completeFactorConvertFileToDbTask(task)
            except asyncio.CancelledError:
                self.logger.info("convertFactorFileToDbTask", "cancel")
            except Exception:
                self.logger.error("convertFactorFileToDbTask",
                                  f"error: {traceback.format_exc()}")
                self.tasksRepository.errorTask(dto, traceback.format_exc())
            finally:
                taskPool.removeTaskPool(pool)

        task = ProcessTask(
            **{
                "market": "",
                "startDateStr": "20070101",
                "endDateStr": "20191231",
                "taskUniqueId": dto.taskUniqueId,
                "taskId": dto.taskId,
                "count": 1,
                "tasks": ["convert"],
                "restCount": 1,
                "tasksRet": [0],
                "state": "start get file"
            })
        self.tasksRepository.addTask(task)
        workerTask = Task(dto.taskUniqueId, convertFactorFileToDbTask)
        self.tasksRepository.runTask(workerTask)

    async def makeFactorDaoList(self, data: List[Dict]) -> List[FactorDao]:
        """Convert raw file rows (Korean column names) into ``FactorDao`` objects.

        A value whose unit column is "천원" (thousand won) is multiplied by
        1000; every other value is taken as is.
        """
        return [
            FactorDao(
                **{
                    "code": one["종목코드"],  # stock code
                    "name": one["종목명"],  # stock name
                    "dataYear": one["년"],  # settlement year
                    "dataMonth": one["결산월"],  # settlement month
                    "dataName": one["데이터명"],  # data name
                    # data value, scaled when the unit is thousand won
                    "dataValue": (one["데이터값"] * 1000)
                    if one["단위"] == "천원" else one["데이터값"]
                }) for one in data
        ]

    def createFactorDartListener(self, ee: EventEmitter) -> None:
        """Subscribe this service's handlers to a DART crawler's events."""
        ee.on(EVENT_DART_API_CRAWLING_ON_DOWNLOADING_CODES,
              self.onDownloadingCodes)
        ee.on(EVENT_DART_API_CRAWLING_ON_CRAWLING_FACTOR_DATA,
              self.onCrawlingFactorData)
        ee.on(EVENT_DART_API_CRAWLING_ON_COMPLETE_YEAR, self.onCompleteYear)
        ee.on(EVENT_DART_API_CRAWLING_ON_RESULT_OF_FACTOR,
              self.onResultOfFactor)
        ee.on(EVENT_DART_API_CRAWLING_ON_CANCEL, self.onCancelled)

    def onDownloadingCodes(self, dto: DartApiCrawling) -> None:
        """Set the task's state to "download Codes"."""
        self.logger.info("onDownloadingCodes", dto.taskUniqueId)
        task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
        task.state = "download Codes"
        self.tasksRepository.updateTask(task)

    def onCrawlingFactorData(self, dto: DartApiCrawling) -> None:
        """Set the task's state to "crawling factor data"."""
        self.logger.info("onCrawlingFactorData", dto.taskUniqueId)
        task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
        task.state = "crawling factor data"
        self.tasksRepository.updateTask(task)

    def onCompleteYear(self, dto: DartApiCrawling, year: int) -> None:
        """Report ``year`` as finished for the task through the repository."""
        self.logger.info("onCompleteYear", dto.taskUniqueId)
        task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
        self.tasksRepository.completeFactorDart(task, year)

    def onResultOfFactor(self, dto: DartApiCrawling, year: int,
                         obj: List) -> None:
        """Convert one company's crawled rows and schedule their insertion.

        ``obj`` is a list of row dicts; each becomes a ``FactorDao``. The
        insert runs as a separate asyncio task and is not awaited here.
        ``year`` is not used by this handler.
        """
        self.logger.info("onResultOfFactor", dto.taskUniqueId)
        listOfFactorDao = [
            FactorDao(
                **{
                    "code": one["crawling_code"],
                    "name": one["crawling_name"],
                    "dataYear": one["bsns_year"],
                    "dataMonth": getMonthFromReprtCode(one["reprt_code"]),
                    "dataName": one["account_nm"],
                    "dataValue": one["thstrm_amount"],
                    "dataId": one["account_id"]
                }) for one in obj
        ]
        asyncio.create_task(
            self.factorRepository.insertFactorDart(listOfFactorDao))

    def onCancelled(self, dto: DartApiCrawling) -> None:
        """Log that the crawl was cancelled; ``dto`` is not used."""
        self.logger.info("onCancelled")
Exemplo n.º 26
0
from typing import Dict
from app.module.logger import Logger
from pymongo import ASCENDING, MongoClient, monitoring
from pymongo.collection import Collection
from pymongo.database import Database
from pymongo.monitoring import (CommandFailedEvent, CommandStartedEvent,
                                CommandSucceededEvent)

from dotenv import dotenv_values

# Module-level logger, created with the names "MongoDataSource" and "mongoDb".
log = Logger("MongoDataSource", "mongoDb")
# Settings parsed from the `.env` file at import time (python-dotenv).
config = dotenv_values('.env')


class CommandLogger(monitoring.CommandListener):
    """pymongo command listener whose hooks intentionally do nothing.

    Logging is currently switched off. To trace commands, log
    ``event.command_name``, ``event.request_id`` and ``event.connection_id``
    (plus ``event.duration_micros`` for succeeded/failed events) through the
    module-level ``log`` from the hooks below.
    """

    def started(self, event: CommandStartedEvent) -> None:
        """Ignore the "command started" notification."""

    def succeeded(self, event: CommandSucceededEvent) -> None:
        """Ignore the "command succeeded" notification."""

    def failed(self, event: CommandFailedEvent) -> None:
        """Ignore the "command failed" notification."""


# Import-time side effect: register a CommandLogger instance with pymongo's
# monitoring module.
monitoring.register(CommandLogger())

Exemplo n.º 27
0
class TaskRunner(object):
    def __init__(self) -> None:
        """Create the runner's logger, task queue and pool on the running loop.

        Raises:
            RuntimeError: If no asyncio event loop is running, because
                ``asyncio.get_running_loop()`` requires one.
        """
        super().__init__()
        self.logger = Logger("TaskRunner")
        # Tasks wait here until notifyToPool() moves them into the pool.
        self.queue: asyncio.Queue = asyncio.Queue()
        # Loop on which put()/notify* schedule their coroutines.
        self.loop = asyncio.get_running_loop()
        # notifyRmOnPool is handed to the pool as its notifyCallback.
        self.pool = TaskPool(notifyCallback=self.notifyRmOnPool)
        # Optional observer; updatePoolInfo() calls it with a TaskPoolInfo.
        self.notifyCallback = None
        # self.loop: asyncio.AbstractEventLoop = asyncio.new_event_loop()

    def getPoolInfo(self) -> TaskPoolInfo:
        """Return a snapshot of the pool and queue counters.

        Returns:
            A ``TaskPoolInfo`` built from the pool's ``poolSize``,
            ``poolCount()`` and ``runCount()`` and the queue's ``qsize()``.
        """
        workers = self.pool
        return TaskPoolInfo(
            poolSize=workers.poolSize,
            poolCount=workers.poolCount(),
            runCount=workers.runCount(),
            queueCount=self.queue.qsize(),
        )

    def updatePoolInfo(self) -> None:
        self.logger.info(
            "updatePoolInfo",
            f"runCount:{self.pool.runCount()}, queueCount:{self.queue.qsize()}"
        )
        if self.notifyCallback:
            self.notifyCallback(
                TaskPoolInfo(
                    **{
                        "poolSize": self.pool.poolSize,
                        "poolCount": self.pool.poolCount(),
                        "runCount": self.pool.runCount(),
                        "queueCount": self.queue.qsize()
                    }))

    def notifyPutOnQueue(self) -> None:
        self.loop.create_task(self.notifyToPool())

    def notifyRmOnPool(self) -> None:
        if self.queue.qsize() > 0:
            self.loop.create_task(self.notifyToPool())
        else:
            self.updatePoolInfo()

    def cancel(self, id: str) -> None:
        pool: Optional[Pool] = self.pool.findPool(id)
        if pool is not None:
            self.logger.info("cancel", id)
            pool.cancel()
            self.pool.removeTaskPool(pool)
        else:
            self.logger.info("cancel", "pool is not exist")

    def isExist(self, id: str) -> bool:
        return self.pool.findPool(id) is not None

    async def notifyToPool(self) -> None:
        try:
            if self.queue.qsize() > 0 and (self.pool.poolSize -
                                           self.pool.poolCount()) > 0:
                pool = self.pool.addTaskPool(Pool(), False)
                # timeout이 있으면 nonblocking으로 움직임
                task: Task = await asyncio.wait_for(self.queue.get(),
                                                    timeout=1)
                if task:
                    pool.setTask(task)
                    pool.run(self.pool)
                else:
                    self.pool.removeTaskPool(pool, False)
            # if self.pool.poolSize > self.queue.qsize() and self.pool.poolCount() >= self.queue.qsize():
            #     print("exit")
            # elif self.pool.poolSize > self.pool.poolCount() and self.queue.qsize() > 0:
            #     pool = self.pool.addTaskPool(Pool(), False)
            #     print(f"before qsize:{self.queue.qsize()}")
            #     task: Task = await asyncio.wait_for(self.queue.get(), timeout=1)
            #     print(f"after qsize:{self.queue.qsize()}")
            #     if task:
            #         pool.setTask(task)
            #         pool.run(self.pool)
            #     else:
            #         self.pool.removeTaskPool(pool, False)
        except asyncio.TimeoutError as e:
            self.logger.info("notifyToPool", f"timeout:{str(e)}")
            self.pool.removeTaskPool(pool, False)
        finally:
            self.updatePoolInfo()

    def put(self, task: Task) -> None:
        task.loop = self.loop
        self.loop.create_task(self._put(task))

    async def _put(self, task: Task) -> None:
        self.logger.info("_put", "task put")
        await self.queue.put(task)
        self.notifyPutOnQueue()