Example #1
def fromRelativePath(relPath: str) -> BatchPipeline:
    """Constructs a pipeline from config found at relative path. Relative config
    overwrites general config found at `$LCML/conf/common/pipeline.json`

    :param relPath: relative path to specific config overriding default config
    :return: constructed BatchPipeline object
    """
    defaultConf = loadJson(joinRoot(_DEFAULT_PIPE_CONF_REL_PATH))
    relConf = loadJson(joinRoot(relPath))
    conf = recursiveMerge(defaultConf.copy(), relConf)
    t = PrettyTable(["global param", "value"])
    t.align = "l"
    for k, v in sorted(conf[GLOBAL_PARAMS].items()):
        t.add_row([k, v])
    logger.info("Global params\n%s", str(t))
    pipeConf = loadPipelineConf(conf)
    pipeType = pipeConf.globalParams["type"]
    if pipeType == "supervised":
        pipe = SupervisedPipeline(pipeConf)
    elif pipeType == "unsupervised":
        pipe = UnsupervisedPipeline(pipeConf)
    else:
        raise ValueError("unsupported pipeline type: %s" % pipeType)

    return pipe
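
The merge above relies on a recursiveMerge helper that is not shown in this example. A minimal sketch of the assumed behavior, where nested dicts are merged key by key and all other values are replaced (an assumption, not the project's actual implementation):

def recursiveMerge(base: dict, override: dict) -> dict:
    """Hypothetical sketch: overlays `override` onto `base` in place."""
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            recursiveMerge(base[key], value)
        else:
            base[key] = value
    return base
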
Example #2
def main():
    inPath = joinRoot("data/macho/macho-classifications.csv")
    outDir = joinRoot("data/macho/class")
    commandBase = tapCommandBase()
    query = ("SELECT dateobs, rmag, rerr, bmag, berr "
             "FROM public.photometry_view "
             "WHERE fieldid=%s AND tileid=%s AND seqn=%s")

    classCounts = defaultdict(int)
    classData = np.loadtxt(fname=inPath, dtype=int, delimiter=",", skiprows=1)
    logger.critical("processing %d requests", len(classData))
    smallestLcNoRetry = 20
    for field, tile, seqn, classif in classData:
        classCounts[classif] += 1
        fname = "field=%s_tile=%s_seqn=%s_class=%s" % (field, tile, seqn,
                                                       classif)
        outPath = os.path.join(outDir, fname + ".csv")
        if os.path.exists(outPath):
            _tempData = np.loadtxt(outPath, dtype=str, delimiter=",")
            if len(_tempData) > smallestLcNoRetry:
                # skip download if we already have a file with sufficient data
                continue

        logger.critical(outPath)
        fullQuery = query % (field, tile, seqn)
        cmd = commandBase + ["adql=" + fullQuery, "out=" + outPath]
        try:
            subprocess.check_output(cmd)
        except CalledProcessError:
            logger.exception("JAR call failed")
            continue

    # +----------+--------+------------+
    # | Category | Counts | Percentage |
    # +----------+--------+------------+
    # |        1 |   7405 |      34.48 |
    # |        2 |   1765 |       8.22 |
    # |        3 |    315 |       1.47 |
    # |        4 |   1185 |       5.52 |
    # |        5 |    683 |       3.18 |
    # |        6 |    315 |       1.47 |
    # |        7 |    822 |       3.83 |
    # |        8 |   1134 |       5.28 |
    # |        9 |    778 |       3.62 |
    # |       10 |   6835 |      31.83 |
    # |       11 |    237 |        1.1 |
    # +----------+--------+------------+
    t = PrettyTable(["Category", "Counts", "Percentage"])
    totalCounts = sum(classCounts.values())
    for cat, counts in sorted(classCounts.items()):
        t.add_row([cat, counts, round(100.0 * counts / totalCounts, 2)])
    logger.critical("\n" + str(t))
Example #3
def main():
    start = time.time()
    args = _getArgs()
    dataset = "macho"
    dataDir = joinRoot("data", dataset)
    outDir = joinRoot("results", dataset)
    if not os.path.exists(outDir):
        os.makedirs(outDir)

    logger.info("Loading RF classifier...")
    randomForestModel = upsilon.load_rf_model()
    runDataset(dataDir, randomForestModel, outDir, args.threads, args.rows)
    logger.info("finished in: %.2fs", time.time() - start)
Example #4
def loadFlatLcDataset(params: dict, dbParams: dict, table: str, limit: float):
    """Loads and aggregates light curves from single csv file of individual data
    points storing results in a database."""
    dataPath = joinRoot(params["relativePath"])
    logger.info("Loading from: %s", dataPath)
    skiprows = params["skiprows"]
    commitFrequency = dbParams["commitFrequency"]

    dataName = params["dataName"]
    logger.info("Using %s LC adapter", dataName)
    if dataName == "ogle3":
        adapter = Ogle3Adapter
    elif dataName == "macho":
        adapter = MachoAdapter
    elif dataName == "k2":
        adapter = K2Adapter
    else:
        raise ValueError("Unsupported dataName: %s" % dataName)

    conn = connFromParams(dbParams)

    cursor = conn.cursor()
    reportTableCount(cursor, table, msg="before loading")
    insertOrReplaceQuery = INSERT_REPLACE_INTO_LCS % table
    with open(dataPath, "r") as f:
        reader = csv.reader(f, delimiter=",")
        for _ in range(skiprows):
            next(reader)

        completedLcs = 0
        uid = label = times = mags = errors = None
        for row in reader:
            if adapter.rowEquals(row, uid):
                # continue building current LC
                adapter.appendRow(times, mags, errors, row)
            else:
                if uid is not None:
                    # finish current LC, except for first time
                    args = (uid, label) + serLc(times, mags, errors)
                    cursor.execute(insertOrReplaceQuery, args)
                    completedLcs += 1
                    if logger.isEnabledFor(logging.DEBUG):
                        logger.debug("completed lc with len: %s", len(times))

                    if not completedLcs % commitFrequency:
                        logger.info("committing progress: %s", completedLcs)
                        conn.commit()

                    if completedLcs >= limit:
                        break

                # initialize new LC
                uid, label, times, mags, errors = adapter.initLcFrom(row)

    if uid is not None and completedLcs < limit:
        # the loop only finalizes an LC when the next one begins, so flush
        # the final accumulated LC after the file is exhausted
        args = (uid, label) + serLc(times, mags, errors)
        cursor.execute(insertOrReplaceQuery, args)
        completedLcs += 1

    logger.info("committing progress: %s", completedLcs)
    conn.commit()
    reportTableCount(cursor, table, msg="after loading")
    conn.close()
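
The loader above depends on an adapter contract with three methods: rowEquals to detect whether a row continues the current light curve, initLcFrom to start a new one, and appendRow to extend it. A sketch of that assumed interface (the real Ogle3Adapter, MachoAdapter, and K2Adapter are project-specific):

class LcAdapter:
    """Hypothetical sketch of the adapter interface assumed above."""

    @staticmethod
    def rowEquals(row, uid) -> bool:
        """True iff `row` belongs to the light curve identified by `uid`."""
        raise NotImplementedError

    @staticmethod
    def initLcFrom(row) -> tuple:
        """Returns (uid, label, times, mags, errors) begun from `row`."""
        raise NotImplementedError

    @staticmethod
    def appendRow(times, mags, errors, row):
        """Parses `row` and appends its values to the running lists."""
        raise NotImplementedError
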
Example #5
def getDatasetFilePaths(datasetName: str, ext: str) -> List[str]:
    """Returns the full paths of all dataset files in project data directory:
    ./light_curve_ml/data/
    :param datasetName - Name of specific data whose individual file paths will
    be returned
    :param ext - Required file extension of dataset files
    """
    path = joinRoot("data", datasetName)
    return [os.path.join(path, f) for f in os.listdir(path) if f.endswith(ext)]
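
A usage sketch: collecting every csv file for the MACHO dataset (the directory contents are assumed):

machoCsvs = getDatasetFilePaths("macho", ".csv")
for p in machoCsvs:
    print(p)
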
Example #6
def connFromParams(dbParams: dict) -> Union[Connection, None]:
    p = joinRoot(dbParams["dbPath"])
    timeout = dbParams["timeout"]
    conn = None
    try:
        conn = sqlite3.connect(p, timeout=timeout)
    except sqlite3.OperationalError:
        logger.exception("Cannot resolve path: %s", p)

    return conn
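
Because the function returns None on failure, callers should guard before use. A sketch with hypothetical dbParams values (the keys match those read above):

dbParams = {"dbPath": "data/db/lcs.db", "timeout": 60}  # hypothetical values
conn = connFromParams(dbParams)
if conn is not None:
    cursor = conn.cursor()
    cursor.execute("SELECT 1")  # use the connection before closing
    conn.close()
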
Example #7
def main():
    paths = [
        "data/macho/macho-sample.csv", "data/ucr_lcs/StarLightCurves_TEST.csv"
    ]
    for path in paths:
        fullPath = joinRoot(path)
        ext = path.split(".")[-1]
        if ext == "csv":
            obj = loadCsv(fullPath)
        elif ext == "json":
            obj = loadJson(fullPath)
        else:
            print("bad ext: " + ext)
            continue

        dumpWhereFound(obj, fullPath, ext)
Example #8
def main():
    np.random.seed(0)
    dataPath = joinRoot("data/rf/breast-cancer-wisconsin.csv")
    dataset = pd.read_csv(dataPath)
    headers = list(dataset)
    description = dataset.describe()
    print("Missing features: %s" % missingFeatures(dataset, description))

    # remove missing data
    dataset = dataset[dataset[headers[6]] != "?"]

    # features exclude the 'CodeNumber' and 'CancerType' columns;
    # 'CancerType' is the prediction target
    features = dataset[headers[1:-1]]
    target = dataset[headers[-1]]
    trainRatio = 0.7
    xTrain, xTest, yTrain, yTest = train_test_split(features, target,
                                                    train_size=trainRatio)

    # Train and Test dataset size details
    print("\nTrain & Test sizes")
    print("Train_x Shape: ", xTrain.shape)
    print("Train_y Shape: ", yTrain.shape)
    print("Test_x Shape: ", xTest.shape)
    print("Test_y Shape: ", yTest.shape)

    model = trainRfClassifier(xTrain, yTrain)
    testPredictions = model.predict(xTest)
    trainPredictions = model.predict(xTrain)

    reportSample = 10
    print("\nSample performance")
    t = PrettyTable(["Predicted", "Actual"])

    # convert the Series to a list so predictions and actuals can be paired
    # by position
    testYList = list(yTest)
    for i in range(reportSample):
        t.add_row([testPredictions[i], testYList[i]])

    print(t)

    # accuracy
    print("\nFull performance")
    print("Train accuracy: ", accuracy_score(yTrain, trainPredictions))
    print("Test accuracy: ", accuracy_score(yTest, testPredictions))
    print("Confusion: ", confusion_matrix(yTest, testPredictions))
Example #9
def main():
    """Generates a .csv file containing the labeled MACHO training set.
    Columns of macho-train.csv output:
    0 - macho_uid
    1 - classification
    2 - date_observed
    3 - magnitude
    4 - error

    Additionally generates a second csv file containing the UIDs of missing data
    files.
    """
    inDir = joinRoot("data/macho/class")
    redBands = [
        ",".join([
            "field-tile-seqn-band", "classLabel", "date_observed", "magnitude",
            "error"
        ]) + "\n"
    ]
    blueBands = []

    # N.B. pt1 generated file names of the form:
    # 'field=1_tile=33_seqn=10_class=6.csv'
    pattern = r"""\d+"""
    dataLengths = Counter()

    # Heading for missing UID file
    missing = [",".join(("field", "tile", "seqn")) + "\n"]
    for f in absoluteFilePaths(inDir, ext="csv"):
        try:
            data = np.loadtxt(f, skiprows=1, delimiter=",")
        except ValueError:
            logger.critical("can't load file: %s", f)
            continue

        fileName = f.split("/")[-1].split(".")[0]
        field, tile, seqn, classNum = re.findall(pattern, fileName)
        label = MACHO_NUM_TO_LABEL[classNum]
        prefix = [field, tile, seqn]
        for r in data:
            # column format for source file
            # 0=dateobs, 1=rmag, 2=rerr, 3=bmag, 4=berr

            # uid, class label, dateobs, rmag, rerr
            _rVals = ([machoUid(prefix + ["R"]), label] +
                      [str(_) for _ in r[:3]])

            # uid, class label, dateobs, bmag, berr
            _bVals = ([machoUid(prefix + ["B"]), label] + [str(r[0])] +
                      [str(_) for _ in r[3:]])
            redBands.append(",".join(_rVals) + "\n")
            blueBands.append(",".join(_bVals) + "\n")

        dataLengths[len(data) // 10] += 1  # data length histogram in 10s
        if not len(data):
            missing.append(",".join((field, tile, seqn)) + "\n")

    outDir = joinRoot("data/macho")
    trainFile = os.path.join(outDir, "macho-train.csv")
    with open(trainFile, "w") as f:
        f.writelines(redBands)
        f.writelines(blueBands)

    missingFile = os.path.join(outDir, "macho-train-fails.csv")
    with open(missingFile, "w") as f:
        f.writelines(missing)

    logger.critical("LC length distribution: %s",
                    sorted(list(dataLengths.items())))
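
machoUid is assumed to join the field, tile, seqn, and band parts into the 'field-tile-seqn-band' uid named in the csv header above; a sketch under that assumption:

def machoUid(parts) -> str:
    """Hypothetical sketch: builds a 'field-tile-seqn-band' uid from parts."""
    return "-".join(str(p) for p in parts)
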
Example #10
def main():
    outDir = joinRoot("data/macho/raw")
    commandBase = tapCommandBase()

    returnedLimit = 500000
    limit = int(10e7)
    # testQuery = "SELECT TOP 10 * FROM public.star_view"
    joinQuery = (
        "SELECT TOP %s b.poc, a.fieldid, a.tileid, a.seqn, "
        "a.obsid, a.dateobs, a.rmag, a.rerr, a.bmag, a.berr "
        "FROM public.photometry_view AS a "
        "JOIN public.varstar_view AS b "
        "ON (a.fieldid=b.field AND a.tileid=b.tile AND a.seqn=b.seqn) "
        "WHERE a.fieldid=%s AND b.poc='%s'")

    # The TAP service returns at most 500K records per query, so the data is
    # grabbed across a series of queries, one per observation field and poc
    # category.

    # fields based on data shown at http://macho.nci.org.au/macho_photometry/
    fields = (genList(25, 180) + genList(206, 208) + genList(211, 213) +
              genList(301, 311) + genList(401, 403))
    categoryStart, categoryEnd = 1, 11
    classCounts = defaultdict(int)
    allStart = time.time()
    for field in fields:
        for cat in range(categoryStart, categoryEnd + 1):
            logger.info("Field: %s Class: %s", field, cat)
            outPath = os.path.join(outDir, "c%s_f%s.csv" % (cat, field))
            fullQuery = joinQuery % (limit, field, cat)
            cmd = commandBase + ["adql=" + fullQuery, "out=" + outPath]
            apiStart = time.time()
            try:
                output = subprocess.check_output(cmd)
            except CalledProcessError:
                logger.exception("JAR call failed")
                return

            if logger.isEnabledFor(logging.DEBUG):
                logger.debug("call took: %.01fs", time.time() - apiStart)
                if output:
                    logger.debug("subprocess output: %s",
                                 output.decode("utf-8"))

            # if the outfile holds only a header row, the query matched
            # nothing: log and delete it
            with open(outPath, "r") as outFile:
                outLineCount = sum(1 for _ in outFile)

            classCounts[cat] += outLineCount
            if outLineCount == 1:
                logger.info("Skipping empty result")
                os.remove(outPath)

            if outLineCount >= returnedLimit:
                logger.warning("Reached TAP limit! Data likely missed: %s",
                               outLineCount)

    t = PrettyTable(["Category", "Counts", "Percentage"])
    totalCounts = sum(classCounts.values())
    for cat, counts in sorted(classCounts.items()):
        t.add_row([cat, counts, round(100.0 * counts / totalCounts, 2)])

    # +----------+---------+------------+
    # | Category |  Counts | Percentage |
    # +----------+---------+------------+
    # |        1 | 2668376 |      32.29 |
    # |        2 |  612715 |       7.41 |
    # |        3 |  111089 |       1.34 |
    # |        4 |  619357 |       7.49 |
    # |        5 |  318188 |       3.85 |
    # |        6 |   55188 |       0.67 |
    # |        7 |  152359 |       1.84 |
    # |        8 |  352080 |       4.26 |
    # |        9 |  187325 |       2.27 |
    # |       10 | 3048492 |      36.89 |
    # |       11 |  138465 |       1.68 |
    # +----------+---------+------------+

    logger.info(t)
    logger.info("Entire harvest took: %.01fm", (time.time() - allStart) / 60)
Example #11
def tapCommandBase(jreBinaryPath="/usr/bin/java"):
    jarPath = joinRoot("jars/stilts.jar")
    commandBase = [jreBinaryPath, "-jar", jarPath, "tapquery"]
    return commandBase + [
        "tapurl=http://machotap.asvo.nci.org.au/ncitap/tap", "compress=true"
    ]