def loadFlatLcDataset(params: dict, dbParams: dict, table: str, limit: float):
    """Loads and aggregates light curves from a single csv file of individual
    data points, storing the results in a database."""
    dataPath = joinRoot(params["relativePath"])
    logger.info("Loading from: %s", dataPath)
    skiprows = params["skiprows"]
    commitFrequency = dbParams["commitFrequency"]

    dataName = params["dataName"]
    logger.info("Using %s LC adapter", dataName)
    if dataName == "ogle3":
        adapter = Ogle3Adapter
    elif dataName == "macho":
        adapter = MachoAdapter
    elif dataName == "k2":
        adapter = K2Adapter
    else:
        raise ValueError("Unsupported dataName: %s" % dataName)

    conn = connFromParams(dbParams)
    cursor = conn.cursor()
    reportTableCount(cursor, table, msg="before loading")
    insertOrReplaceQuery = INSERT_REPLACE_INTO_LCS % table
    with open(dataPath, "r") as f:
        reader = csv.reader(f, delimiter=",")
        for _ in range(skiprows):
            next(reader)

        completedLcs = 0
        uid = label = times = mags = errors = None
        for row in reader:
            if adapter.rowEquals(row, uid):
                # continue building the current LC
                adapter.appendRow(times, mags, errors, row)
            else:
                if uid is not None:
                    # finish the current LC (skipped on the very first row)
                    args = (uid, label) + serLc(times, mags, errors)
                    cursor.execute(insertOrReplaceQuery, args)
                    completedLcs += 1
                    if logger.isEnabledFor(logging.DEBUG):
                        logger.debug("completed lc with len: %s", len(times))

                    if not completedLcs % commitFrequency:
                        logger.info("committing progress: %s", completedLcs)
                        conn.commit()

                    if completedLcs >= limit:
                        break

                # initialize a new LC
                uid, label, times, mags, errors = adapter.initLcFrom(row)

        if uid is not None and completedLcs < limit:
            # flush the final LC; an LC is only inserted when the next one
            # begins, so the last one in the file would otherwise be dropped
            args = (uid, label) + serLc(times, mags, errors)
            cursor.execute(insertOrReplaceQuery, args)
            completedLcs += 1

    logger.info("committing progress: %s", completedLcs)
    conn.commit()
    reportTableCount(cursor, table, msg="after loading")
    conn.close()
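# A minimal usage sketch for loadFlatLcDataset. The dict keys follow the ones
# read in the function body; the path, table name, and db values below are
# hypothetical placeholders, not values from this project.
_loadParams = {
    "relativePath": "data/ogle3/lcs.csv",  # resolved via joinRoot
    "skiprows": 1,                         # header rows to skip in the csv
    "dataName": "ogle3",                   # selects Ogle3Adapter
}
_loadDbParams = {
    "dbPath": "lcs.db",      # assumed key, mirroring main() below
    "timeout": 60,           # assumed key, mirroring main() below
    "commitFrequency": 500,  # commit every N completed LCs
}
# loadFlatLcDataset(_loadParams, _loadDbParams, table="raw_lcs",
#                   limit=float("inf"))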
def main():
    start = time.time()
    args = _clargs()
    dbParams = {"dbPath": args.dbPath, "timeout": DB_TIMEOUT}
    conn = connFromParams(dbParams)
    cursor = conn.cursor()
    # bind the id as a query parameter instead of interpolating it into SQL
    _SELECT_SQL = "SELECT * FROM %s WHERE id=?" % TABLE_NAME
    cursor.execute(_SELECT_SQL, (args.id,))
    try:
        row = next(cursor)
    except StopIteration:
        print("Found no LCs!")
        return

    conn.close()
    times, mag, err = deserLc(*row[2:])
    times = times[START_SLICE:END_SLICE]
    mag = mag[START_SLICE:END_SLICE]
    err = err[START_SLICE:END_SLICE]

    i = 0
    skipped = list()
    # fts = registered_extractors()
    fts = featuresByData(STANDARD_INPUT_DATA_TYPES)
    if args.feature:
        # Option to test a specific feature only
        assert args.feature in fts
        fts = [args.feature]

    for i, featureName in enumerate(fts):
        fs = FeatureSpace(data=STANDARD_INPUT_DATA_TYPES, only=[featureName])
        try:
            # keep the iterated 'fts' list intact; bind results to new names
            ftNames, values = fs.extract(times, mag, err)
        except BaseException as e:
            traceback.print_exc()
            print("failed for feature: %s with exception: %s" % (featureName,
                                                                 e))
            break

        if len(ftNames) and len(values):
            # flag features that produce non-finite values
            msg = "OK" if np.all(np.isfinite(values)) else "NOT FINITE!"
            print("%s %s: %s" % (msg, ftNames[0], values[0]))
            if len(values) != 1:
                print("lengths: features: %s values: %s" % (len(ftNames),
                                                            len(values)))
        else:
            skipped.append(featureName)

        time.sleep(0.2)

    print("total %s skipped: %s" % (i, len(skipped)))
    print("skipped: %s" % skipped)
    print("elapsed: %s" % timedelta(seconds=time.time() - start))
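# Standard entry-point guard so the debug routine above runs when this module
# is executed directly.
if __name__ == "__main__":
    main()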
def feetsExtractFeatures(extractParams: dict, dbParams: dict, lcTable: str,
                         featuresTable: str, limit: int):
    """Runs light curves through the 'feets' library obtaining feature
    vectors. Performs the extraction using multiprocessing. The output order
    of jobs will not necessarily correspond to the input order, therefore,
    class labels are returned with corresponding feature vectors to avoid
    confusion.

    :param extractParams: extract parameters
    :param dbParams: db parameters
    :param lcTable: name of LC table
    :param featuresTable: name of features table
    :param limit: upper limit on the number of LCs processed
    :returns: feature vectors for each LC and a list of corresponding class
        labels
    """
    # recommended excludes (slow): "CAR_mean", "CAR_sigma", "CAR_tau"
    # also produces nan's: "ls_fap"
    logger.info("Excluded features: %s", extractParams["excludedFeatures"])
    fs = getFeatureSpace(extractParams)
    ciFreq = dbParams["commitFrequency"]
    conn = connFromParams(dbParams)
    cursor = conn.cursor()
    insertOrReplQry = INSERT_REPLACE_INTO_FEATURES % featuresTable
    reportTableCount(cursor, featuresTable, msg="before extracting")

    offset = extractParams.get("offset", 0)
    logger.info("Beginning extraction at offset: %s in LC table", offset)
    jobs = feetsJobGenerator(fs, dbParams, lcTable, offset=offset)
    lcCount = 0
    dbExceptions = 0
    for uid, label, ftNames, features in reportingImapUnordered(feetsExtract,
                                                                jobs):
        # loop variables come from lcml.utils.multiprocess._feetsExtract
        args = (uid, label, serArray(features))
        try:
            cursor.execute(insertOrReplQry, args)
            if lcCount % ciFreq == 0:
                logger.info("commit progress: %s", lcCount)
                conn.commit()
        except OperationalError:
            logger.exception("Failed to insert %s", args)
            dbExceptions += 1

        if lcCount > limit:
            break

        lcCount += 1

    reportTableCount(cursor, featuresTable, msg="after extracting")
    conn.commit()
    conn.close()
    if dbExceptions:
        logger.warning("Db exception count: %s", dbExceptions)
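# A minimal usage sketch for feetsExtractFeatures. Keys follow those read in
# the function body (plus "pageSize", required by feetsJobGenerator below);
# the table names and values are hypothetical placeholders.
_extractParams = {
    "excludedFeatures": ["CAR_mean", "CAR_sigma", "CAR_tau", "ls_fap"],
    "offset": 0,  # resume mid-table after an interrupted run
}
_extractDbParams = {"dbPath": "lcs.db", "timeout": 60,
                    "commitFrequency": 500, "pageSize": 1000}
# feetsExtractFeatures(_extractParams, _extractDbParams, lcTable="clean_lcs",
#                      featuresTable="features", limit=10000)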
def feetsJobGenerator(fs: FeatureSpace, dbParams: dict, tableName: str,
                      selRows: str = "*", offset: int = 0):
    """Returns a generator of tuples of the form:
    (featureSpace (feets.FeatureSpace), id (str), label (str),
     times (ndarray), mags (ndarray), errors (ndarray))

    Each tuple is used to perform a 'feets' feature extraction job.

    :param fs: feets.FeatureSpace object required to perform extraction
    :param dbParams: additional params
    :param tableName: table containing light curves
    :param selRows: which rows to select from the clean LC table
    :param offset: number of light curves to skip in the db table before
        processing
    """
    pageSize = dbParams["pageSize"]
    conn = connFromParams(dbParams)
    cursor = conn.cursor()
    column = "id"  # PK
    previousId = ""  # low-precedence text value
    rows = True
    while rows:
        _fmtPrevId = "\"{}\"".format(previousId)
        q = SINGLE_COL_PAGED_SELECT_QRY.format(selRows, tableName, column,
                                               _fmtPrevId, pageSize, offset)
        cursor.execute(q)
        rows = cursor.fetchall()
        for r in rows:
            times, mags, errors = deserLc(*r[2:])
            # intended args for lcml.utils.multiprocess._feetsExtract
            yield (fs, r[0], r[1], times, mags, errors)

        if rows:
            previousId = rows[-1][0]

        # apply the requested offset only on the first page; later pages
        # already resume from previousId, so a constant offset would skip
        # rows on every page
        offset = 0

    conn.close()
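# The paging above assumes SINGLE_COL_PAGED_SELECT_QRY (defined elsewhere in
# the project) is a keyset-pagination template roughly of this shape; this is
# an inferred sketch, not the project's actual definition:
# SINGLE_COL_PAGED_SELECT_QRY = ("SELECT {0} FROM {1} WHERE {2} > {3} "
#                                "ORDER BY {2} LIMIT {4} OFFSET {5}")
#
# Consuming the generator directly, e.g. to smoke-test paging without
# multiprocessing (table name is a hypothetical placeholder):
# for _, uid, label, times, mags, errors in feetsJobGenerator(
#         fs, _extractDbParams, "clean_lcs"):
#     print(uid, label, len(times))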
def cleanLightCurves(params: dict, dbParams: dict, rawTable: str,
                     cleanTable: str, limit: float):
    """Clean light curves and report details on discards."""
    removes = set(params["filter"]) if "filter" in params else set()
    removes = removes.union(NON_FINITE_VALUES)
    stdLimit = params.get("stdLimit", DEFAULT_STD_LIMIT)
    errorLimit = params.get("errorLimit", DEFAULT_ERROR_LIMIT)
    commitFrequency = dbParams["commitFrequency"]
    conn = connFromParams(dbParams)
    cursor = conn.cursor()
    reportTableCount(cursor, cleanTable, msg="before cleaning")
    insertOrReplace = INSERT_REPLACE_INTO_LCS % cleanTable
    totalLcs = tableCount(cursor, rawTable)
    if limit != float("inf"):
        # at most 'limit' LCs are processed, so cap the rate denominator
        totalLcs = min(totalLcs, limit)

    shortIssueCount = 0
    bogusIssueCount = 0
    outlierIssueCount = 0
    insertCount = 0
    scaler = StandardScaler(copy=False)
    standardize = params.get("standardize", False)
    itr = singleColPagingItr(cursor, rawTable, columnName="id", columnIndex=0,
                             columnEscaped=True)
    for i, r in enumerate(itr):
        times, mags, errors = deserLc(*r[2:])
        lc, issue, _ = preprocessLc(times, mags, errors, removes=removes,
                                    stdLimit=stdLimit, errorLimit=errorLimit)
        if lc:
            if standardize:
                lc[1] = _standardizeArray(scaler, lc[1])
                lc[2] = _standardizeArray(scaler, lc[2])

            args = (r[0], r[1]) + serLc(*lc)
            cursor.execute(insertOrReplace, args)
            insertCount += 1
            if insertCount % commitFrequency == 0:
                logger.info("progress: %s", insertCount)
                conn.commit()
        elif issue == INSUFFICIENT_DATA_REASON:
            shortIssueCount += 1
        elif issue == BOGUS_DATA_REASON:
            bogusIssueCount += 1
        elif issue == OUTLIERS_REASON:
            outlierIssueCount += 1
        else:
            raise ValueError("Bad reason: %s" % issue)

        if i >= limit:
            break

    reportTableCount(cursor, cleanTable, msg="after cleaning")
    conn.commit()
    conn.close()

    passRate = fmtPct(insertCount, totalLcs)
    shortRate = fmtPct(shortIssueCount, totalLcs)
    bogusRate = fmtPct(bogusIssueCount, totalLcs)
    outlierRate = fmtPct(outlierIssueCount, totalLcs)
    logger.info("Dataset size: %d Pass rate: %s", totalLcs, passRate)
    logger.info("Discard rates: short: %s bogus: %s outlier: %s", shortRate,
                bogusRate, outlierRate)
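# A minimal usage sketch for cleanLightCurves; keys follow those read in the
# function body, and the values/table names are hypothetical placeholders:
_cleanParams = {
    "filter": [-99.0],  # extra sentinel values to remove, merged with
                        # NON_FINITE_VALUES
    "stdLimit": DEFAULT_STD_LIMIT,      # forwarded to preprocessLc
    "errorLimit": DEFAULT_ERROR_LIMIT,  # forwarded to preprocessLc
    "standardize": False,  # scale mags/errors with StandardScaler
}
# cleanLightCurves(_cleanParams, _extractDbParams, rawTable="raw_lcs",
#                  cleanTable="clean_lcs", limit=float("inf"))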