示例#1
0
def loadFlatLcDataset(params: dict, dbParams: dict, table: str, limit: float):
    """Loads and aggregates light curves from a single csv file of individual
    data points, storing the assembled light curves in a database.

    :param params: dataset params: "relativePath", "skiprows", "dataName"
    :param dbParams: db params, must include "commitFrequency"
    :param table: name of the destination light-curve table
    :param limit: upper bound on the number of light curves loaded (may be inf)
    :raises ValueError: if params["dataName"] has no registered adapter
    """
    dataPath = joinRoot(params["relativePath"])
    logger.info("Loading from: %s", dataPath)
    skiprows = params["skiprows"]
    commitFrequency = dbParams["commitFrequency"]

    dataName = params["dataName"]
    logger.info("Using %s LC adapter", dataName)
    if dataName == "ogle3":
        adapter = Ogle3Adapter
    elif dataName == "macho":
        adapter = MachoAdapter
    elif dataName == "k2":
        adapter = K2Adapter
    else:
        raise ValueError("Unsupported dataName: %s" % dataName)

    conn = connFromParams(dbParams)
    cursor = conn.cursor()
    reportTableCount(cursor, table, msg="before loading")
    insertOrReplaceQuery = INSERT_REPLACE_INTO_LCS % table

    completedLcs = 0
    limitReached = False
    uid = label = times = mags = errors = None
    with open(dataPath, "r") as f:
        reader = csv.reader(f, delimiter=",")
        for _ in range(skiprows):
            next(reader)

        for row in reader:
            if adapter.rowEquals(row, uid):
                # continue building current LC
                adapter.appendRow(times, mags, errors, row)
            else:
                if uid is not None:
                    # finish current LC, except for first time
                    args = (uid, label) + serLc(times, mags, errors)
                    cursor.execute(insertOrReplaceQuery, args)
                    completedLcs += 1
                    if logger.isEnabledFor(logging.DEBUG):
                        logger.debug("completed lc with len: %s", len(times))

                    if not completedLcs % commitFrequency:
                        logger.info("committing progress: %s", completedLcs)
                        conn.commit()

                    if completedLcs >= limit:
                        limitReached = True
                        break

                # initialize new LC
                uid, label, times, mags, errors = adapter.initLcFrom(row)

    # BUGFIX: an LC was only flushed when a row with a *new* uid appeared, so
    # the final light curve in the file was silently dropped. Flush it here
    # unless the limit cut the loop short.
    if uid is not None and not limitReached:
        args = (uid, label) + serLc(times, mags, errors)
        cursor.execute(insertOrReplaceQuery, args)
        completedLcs += 1

    logger.info("committing progress: %s", completedLcs)
    conn.commit()
    reportTableCount(cursor, table, msg="after loading")
    conn.close()
示例#2
0
def main():
    """Extracts each 'feets' feature individually from a single light curve
    (selected by id via CLI args) and reports whether the computed values are
    finite, which features were skipped, and the elapsed time."""
    start = time.time()
    args = _clargs()

    dbParams = {"dbPath": args.dbPath, "timeout": DB_TIMEOUT}
    conn = connFromParams(dbParams)
    cursor = conn.cursor()

    # BUGFIX: the id was string-interpolated into the SQL (injection risk);
    # bind it as a parameter instead. Table names cannot be bound, so
    # TABLE_NAME is still interpolated.
    cursor.execute("SELECT * FROM %s WHERE id=?" % TABLE_NAME, (args.id,))
    try:
        row = next(cursor)
    except StopIteration:
        print("Found no LCs!")
        return
    finally:
        # BUGFIX: previously the connection leaked on the no-result path
        conn.close()

    times, mag, err = deserLc(*row[2:])

    times = times[START_SLICE:END_SLICE]
    mag = mag[START_SLICE:END_SLICE]
    err = err[START_SLICE:END_SLICE]

    processed = 0
    skipped = list()
    # fts = registered_extractors()
    featureNames = featuresByData(STANDARD_INPUT_DATA_TYPES)
    if args.feature:
        # Option to test specific feature only
        assert args.feature in featureNames
        featureNames = [args.feature]

    for featureName in featureNames:
        fs = FeatureSpace(data=STANDARD_INPUT_DATA_TYPES, only=[featureName])
        try:
            # BUGFIX: the result was assigned to `fts`, shadowing the sequence
            # being iterated; use a distinct name
            names, values = fs.extract(times, mag, err)
        except BaseException as e:
            traceback.print_exc()
            print("failed for feature: %s with exception: %s" %
                  (featureName, e))
            break

        processed += 1
        if len(names) and len(values):
            # BUGFIX: was `np.all(np.isnan(values))`, which printed "OK" when
            # every value was NaN — the opposite of the intent
            msg = "OK" if np.all(np.isfinite(values)) else "NOT FINITE!"
            print("%s %s: %s" % (msg, names[0], values[0]))
            if len(values) != 1:
                print("lengths: features: %s values: %s" %
                      (len(names), len(values)))
        else:
            skipped.append(featureName)

    time.sleep(0.2)
    # BUGFIX: total previously printed the last loop *index*, not the count
    print("total %s skipped: %s" % (processed, len(skipped)))
    print("skipped: %s" % skipped)
    print("elapsed: %s" % timedelta(seconds=time.time() - start))
示例#3
0
def feetsExtractFeatures(extractParams: dict, dbParams: dict, lcTable: str,
                         featuresTable: str, limit: int):
    """Runs light curves through 'feets' library obtaining feature vectors and
    persisting them to the features table. Performs the extraction using
    multiprocessing; output order of jobs will not necessarily correspond to
    input order, so each feature vector is stored with its uid and label.

    :param extractParams: extract parameters ("excludedFeatures", optional
        "offset" into the LC table)
    :param dbParams: db parameters, must include "commitFrequency"
    :param lcTable: name of lc table
    :param featuresTable: name of features table
    :param limit: upper limit on the number of LC processed
    """
    # recommended excludes (slow): "CAR_mean", "CAR_sigma", "CAR_tau"
    # also produces nan's: "ls_fap"
    logger.info("Excluded features: %s", extractParams["excludedFeatures"])
    fs = getFeatureSpace(extractParams)

    ciFreq = dbParams["commitFrequency"]
    conn = connFromParams(dbParams)
    cursor = conn.cursor()
    insertOrReplQry = INSERT_REPLACE_INTO_FEATURES % featuresTable
    reportTableCount(cursor, featuresTable, msg="before extracting")

    offset = extractParams.get("offset", 0)
    logger.info("Beginning extraction at offset: %s in LC table", offset)

    jobs = feetsJobGenerator(fs, dbParams, lcTable, offset=offset)
    lcCount = 0
    dbExceptions = 0
    for uid, label, ftNames, features in reportingImapUnordered(
            feetsExtract, jobs):
        # loop variables come from lcml.utils.multiprocess._feetsExtract
        args = (uid, label, serArray(features))
        try:
            cursor.execute(insertOrReplQry, args)
        except OperationalError:
            logger.exception("Failed to insert %s", args)
            dbExceptions += 1
        else:
            # count / commit only on successful inserts
            lcCount += 1
            if lcCount % ciFreq == 0:
                logger.info("commit progress: %s", lcCount)
                conn.commit()

        # BUGFIX: the check previously ran *before* the increment with a
        # strict `>`, so limit+2 LCs were processed instead of limit
        if lcCount >= limit:
            break

    reportTableCount(cursor, featuresTable, msg="after extracting")
    conn.commit()
    conn.close()

    if dbExceptions:
        logger.warning("Db exception count: %s", dbExceptions)
示例#4
0
def feetsJobGenerator(fs: FeatureSpace,
                      dbParams: dict,
                      tableName: str,
                      selRows: str = "*",
                      offset: int = 0):
    """Returns a generator of tuples of the form:
    (featureSpace (feets.FeatureSpace),  id (str), label (str), times (ndarray),
     mags (ndarray), errors(ndarray))
    Each tuple is used to perform a 'feets' feature extraction job.

    :param fs: feets.FeatureSpace object required to perform extraction
    :param dbParams: additional params, must include "pageSize"
    :param tableName: table containing light curves
    :param selRows: which rows to select from clean LC table
    :param offset: number of light curves to skip in db table before processing
    """
    pageSize = dbParams["pageSize"]
    conn = connFromParams(dbParams)
    try:
        cursor = conn.cursor()
        column = "id"  # PK
        previousId = ""  # low precedence text value
        rows = True
        while rows:
            _fmtPrevId = "\"{}\"".format(previousId)
            q = SINGLE_COL_PAGED_SELECT_QRY.format(selRows, tableName, column,
                                                   _fmtPrevId, pageSize, offset)
            cursor.execute(q)
            rows = cursor.fetchall()

            # BUGFIX: the offset is meant to skip rows once, at the start;
            # paging then advances via previousId. Re-applying it to every
            # page silently skipped `offset` rows per page.
            offset = 0

            for r in rows:
                times, mags, errors = deserLc(*r[2:])
                # intended args for lcml.utils.multiprocess._feetsExtract
                yield (fs, r[0], r[1], times, mags, errors)

            if rows:
                previousId = rows[-1][0]
    finally:
        # close even if the consumer abandons the generator before exhaustion
        conn.close()
示例#5
0
def cleanLightCurves(params: dict, dbParams: dict, rawTable: str,
                     cleanTable: str, limit: float):
    """Clean lightcurves and report details on discards.

    :param params: cleaning params: optional "filter", "stdLimit",
        "errorLimit", "standardize"
    :param dbParams: db params, must include "commitFrequency"
    :param rawTable: name of the table containing raw light curves
    :param cleanTable: name of the destination table for cleaned light curves
    :param limit: upper bound on the number of LCs processed (may be inf)
    :raises ValueError: if preprocessLc reports an unrecognized issue reason
    """
    removes = set(params["filter"]) if "filter" in params else set()
    removes = removes.union(NON_FINITE_VALUES)
    stdLimit = params.get("stdLimit", DEFAULT_STD_LIMIT)
    errorLimit = params.get("errorLimit", DEFAULT_ERROR_LIMIT)
    commitFrequency = dbParams["commitFrequency"]
    conn = connFromParams(dbParams)
    cursor = conn.cursor()
    reportTableCount(cursor, cleanTable, msg="before cleaning")
    insertOrReplace = INSERT_REPLACE_INTO_LCS % cleanTable
    totalLcs = tableCount(cursor, rawTable)
    if limit != float("inf"):
        # BUGFIX: was max(), which *inflated* the denominator used for the
        # rate reports; a finite limit caps the number of LCs examined
        totalLcs = min(totalLcs, limit)

    shortIssueCount = 0
    bogusIssueCount = 0
    outlierIssueCount = 0
    insertCount = 0
    scaler = StandardScaler(copy=False)
    standardize = params.get("standardize", False)
    itr = singleColPagingItr(cursor,
                             rawTable,
                             columnName="id",
                             columnIndex=0,
                             columnEscaped=True)
    # 1-based count so `count >= limit` processes exactly `limit` rows
    # (previously 0-based `i >= limit` processed limit + 1 rows)
    for count, r in enumerate(itr, 1):
        times, mags, errors = deserLc(*r[2:])
        lc, issue, _ = preprocessLc(times,
                                    mags,
                                    errors,
                                    removes=removes,
                                    stdLimit=stdLimit,
                                    errorLimit=errorLimit)
        if lc:
            if standardize:
                lc[1] = _standardizeArray(scaler, lc[1])
                lc[2] = _standardizeArray(scaler, lc[2])

            args = (r[0], r[1]) + serLc(*lc)
            cursor.execute(insertOrReplace, args)
            insertCount += 1
            if insertCount % commitFrequency == 0:
                logger.info("progress: %s", insertCount)
                conn.commit()

        elif issue == INSUFFICIENT_DATA_REASON:
            shortIssueCount += 1
        elif issue == BOGUS_DATA_REASON:
            bogusIssueCount += 1
        elif issue == OUTLIERS_REASON:
            outlierIssueCount += 1
        else:
            raise ValueError("Bad reason: %s" % issue)

        if count >= limit:
            break

    reportTableCount(cursor, cleanTable, msg="after cleaning")
    conn.commit()
    conn.close()

    passRate = fmtPct(insertCount, totalLcs)
    shortRate = fmtPct(shortIssueCount, totalLcs)
    bogusRate = fmtPct(bogusIssueCount, totalLcs)
    outlierRate = fmtPct(outlierIssueCount, totalLcs)
    logger.info("Dataset size: %d Pass rate: %s", totalLcs, passRate)
    logger.info("Discard rates: short: %s bogus: %s outlier: %s", shortRate,
                bogusRate, outlierRate)