def first_run_history2(self):
    """First full history fetch for this fetcher's share of dataset ids.

    Walks a strided slice of ``self.dataid`` (start ``self.fetcher_id``,
    step ``config.fetcher_num``), queries metadata for each dataset id,
    marks the dataset processed, and inserts one ResourceURL row per
    metadata entry.

    NOTE(review): this name is redefined later in the file; Python keeps
    only the later definition, so this version is dead code — confirm
    and remove one of them.
    """
    logger.debug(str(self.fetcher_id) + " _ " + str(len(self.dataid)))
    self.timeStr = self.buildQueryTimeStr()
    data_count = 0
    # Bug fix: the original opened a new DB connection on every loop
    # iteration but closed only the last one, leaking a connection per
    # dataset. Open a single connection and guarantee it is closed.
    conn = DBUtil.createConnection()
    try:
        for index in range(self.fetcher_id, len(self.dataid), config.fetcher_num):
            meta = Metadata.getMetaData(self.dataid[index], self.timeStr)
            DBUtil.UpdateDataSetToProcessed(conn, self.dataid[index])
            data_count += 1
            logger.debug("Fetcher [{}] query [{}] dataset [{}] @ dataid[{}] + has [{}] resource"
                         .format(str(self.fetcher_id), str(data_count),
                                 self.dataid[index], index, str(len(meta))))
            for m in meta:
                DBUtil.InsertResourceURL(conn, m.getDataSetID(), m.getFileID(),
                                         m.getDownloadURL(), m.getFormat())
                logger.debug("Fetcher {" + str(self.fetcher_id) + "} "
                             + m.getDownloadURL() + " " + m.getFormat() + " "
                             + m.getDataSetID() + " " + m.getFileID() + " "
                             + m.getResourceID())
    finally:
        DBUtil.closeConnection(conn)
def fetchNewMetadata(self):
    """Fetch metadata for newly updated dataset ids and enqueue each entry.

    For every id returned by ``findUpdateDataID()``, retrieves its
    metadata and puts each metadata record on ``self.queue``.
    """
    dataid = self.findUpdateDataID()
    # Bug fix: the original had a leftover `print(dataid)` inside the
    # loop, dumping the entire id list to stdout once per dataset id.
    # Log it once, via the module logger like the rest of the file.
    logger.debug(dataid)
    for s in dataid:
        meta = Metadata.getMetaData(s)
        for md in meta:
            logger.debug(md.getResourceID() + " put to queue")
            self.queue.put(md)
def process_history(self):
    """Process history data since the last fetch.

    Reads the last-update epoch from the DB, finds dataset ids updated
    since then, fetches each id's metadata, and puts every metadata
    record on ``self.queue``. Progress counters are logged as it goes.
    """
    # Typo fix: "hisotry" -> "history" in the log message.
    logger.info(str(threading.get_ident()) + " process history")
    conn = DBUtil.createConnection()
    try:
        latestTime = DBUtil.getLastUpdateEpoch(conn)
    finally:
        # Close even if getLastUpdateEpoch raises, so the connection
        # is never leaked.
        DBUtil.closeConnection(conn)
    dataid = self.findUpdateDataID(latestTime)
    logger.debug(len(dataid))
    dataid_count = 0
    meta_count = 0
    for s in dataid:
        dataid_count += 1
        logger.info("data count = " + str(dataid_count))
        meta = Metadata.getMetaData(s, self.timeStr)
        for md in meta:
            meta_count += 1
            logger.info("meta_count = " + str(meta_count))
            logger.debug(md.getResourceID() + " put to queue")
            self.queue.put(md)
def first_run_history2(self):
    """First full history fetch: record new resource URLs and enqueue
    download work for this fetcher's share of dataset ids.

    Walks a strided slice of ``self.dataid`` (start ``self.fetcher_id``,
    step ``config.fetcher_num``). For each metadata entry whose resource
    URL is not yet in the DB, inserts a ResourceURL row and puts a
    ``downloadData`` record on ``self.queue``.

    NOTE(review): this redefines ``first_run_history2`` declared earlier
    in the file; Python keeps only this later definition — confirm the
    earlier one is dead code and remove it.
    """
    logger.debug(str(self.fetcher_id) + " _ " + str(len(self.dataid)))
    self.timeStr = self.buildQueryTimeStr()
    data_count = 0
    # Bug fix: the original opened a new DB connection on every loop
    # iteration but closed only the last one, leaking a connection per
    # dataset. Open a single connection and guarantee it is closed.
    conn = DBUtil.createConnection()
    try:
        for index in range(self.fetcher_id, len(self.dataid), config.fetcher_num):
            meta = Metadata.getMetaData(self.dataid[index], self.timeStr)
            # To solve the restart problem, the processed flag is set
            # only after download completes (not here):
            # DBUtil.UpdateDataSetToProcessed(conn, self.dataid[index])
            data_count += 1
            logger.debug("Fetcher [{}] query [{}] dataset [{}] @ dataid[{}] + has [{}] resource"
                         .format(str(self.fetcher_id), str(data_count),
                                 self.dataid[index], index, str(len(meta))))
            for m in meta:
                # Insert and enqueue only URLs not already recorded
                # (idiom fix: `not ...` instead of `... is False`).
                if not DBUtil.isResourceURLExist(conn, m.getDataSetID(), m.getResourceID(),
                                                 m.getDownloadURL(), m.getFormat()):
                    DBUtil.InsertResourceURL(conn, m.getDataSetID(), m.getResourceID(),
                                             m.getDownloadURL(), m.getFormat())
                    # Build a downloadData record and hand it to the
                    # download workers via the queue.
                    row = downloadData(m.getDownloadURL(), m.getFormat(),
                                       m.getDataSetID(), m.getResourceID())
                    self.queue.put(row)
                    logger.debug("Fetcher {" + str(self.fetcher_id) + "} "
                                 + m.getDownloadURL() + " " + m.getFormat() + " "
                                 + m.getDataSetID() + " " + m.getResourceID())
    finally:
        DBUtil.closeConnection(conn)