import logging
import sys
import traceback

from quilt3distribute import Dataset

log = logging.getLogger(__name__)


def main():
    # Args is the script's argparse wrapper (defined elsewhere, not shown).
    # Parse it before the try block so the exception handler can always read args.debug.
    args = Args()

    try:

        # Create dataset
        ds = Dataset(dataset=args.dataset_path,
                     name=args.dataset_name,
                     package_owner=args.package_owner,
                     readme_path=args.readme_path)

        # Handle optional arguments if provided
        if args.usage_doc_or_link:
            ds.add_usage_doc(args.usage_doc_or_link)
        if args.license_doc_or_link:
            ds.add_license(args.license_doc_or_link)
        if args.metadata_columns:
            ds.set_metadata_columns(args.metadata_columns)
        if args.path_columns:
            ds.set_path_columns(args.path_columns)

        # Distribute
        pkg = ds.distribute(push_uri=args.push_uri, message=args.message)
        log.info(
            f"Completed distribution. "
            f"Package [name: '{args.package_owner}/{args.dataset_name}', version: {pkg.top_hash}]"
        )

    except Exception as e:
        log.error("=============================================")
        if args.debug:
            log.error("\n\n" + traceback.format_exc())
            log.error("=============================================")
        log.error("\n\n" + str(e) + "\n")
        log.error("=============================================")
        sys.exit(1)
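
main() reads its inputs from an Args() object that is not shown in this snippet. The sketch below is a hypothetical argparse-backed stand-in, written only to illustrate the attributes main() expects (dataset_path, dataset_name, package_owner, readme_path, push_uri, message, usage_doc_or_link, license_doc_or_link, metadata_columns, path_columns, debug); the real script's Args class may be implemented differently.

# Hypothetical Args wrapper, for illustration only; the script's real argument
# parser is not shown above and may differ.
import argparse


class Args(argparse.Namespace):
    def __init__(self):
        super().__init__()
        p = argparse.ArgumentParser(description="Distribute a dataset as a Quilt package")
        p.add_argument("dataset_path")
        p.add_argument("dataset_name")
        p.add_argument("package_owner")
        p.add_argument("readme_path")
        p.add_argument("push_uri")
        p.add_argument("--message", default=None)
        p.add_argument("--usage-doc-or-link", dest="usage_doc_or_link", default=None)
        p.add_argument("--license-doc-or-link", dest="license_doc_or_link", default=None)
        p.add_argument("--metadata-columns", dest="metadata_columns", nargs="*", default=None)
        p.add_argument("--path-columns", dest="path_columns", nargs="*", default=None)
        p.add_argument("--debug", action="store_true")
        p.parse_args(namespace=self)
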
Example #2

import subprocess

import pandas as pd
from quilt3distribute import Dataset


def distribute_cellprofiler_features(
    test=False,
    csv_loc="/allen/aics/gene-editing/FISH/2019/chaos/data/cp_20201022/merged_features/features2quilt/features2quilt.csv",
    dataset_name="2d_autocontrasted_single_cell_features_actn2_2",
    package_owner="tanyasg",
    s3_bucket="s3://allencell-internal-quilt",
):
    df = pd.read_csv(csv_loc)

    # Subsample the features to build a small test dataset
    if test:
        # Write the test feature CSV and the test image counts CSV
        # (make_test_csv is a helper defined elsewhere in this module)
        make_test_csv(csv_loc=csv_loc)
        cell_line = df["cell_line"][0]
        cellprofiler_id = df["cellprofiler_id"][0]

        # make test manifest
        df = pd.DataFrame({
            "feature_file": ["cp_features_test.csv"],
            "image_object_count_file": ["image_object_counts_test.csv"],
            "cell_line": [cell_line],
            "cellprofiler_id": [cellprofiler_id],
        })

        dataset_name = f"{dataset_name}_test"

    # Create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path="/allen/aics/gene-editing/FISH/2019/chaos/data/cp_20201022/merged_features/features2quilt/README.md",
    )

    # Optionally add common additional requirements
    ds.add_usage_doc("https://docs.quiltdata.com/walkthrough/reading-from-a-package")
    ds.add_license("https://www.allencell.org/terms-of-use.html")

    # Optionally indicate column values to use for file metadata
    ds.set_metadata_columns(["cell_line", "cellprofiler_id"])

    # Optionally rename the columns on the package level
    ds.set_column_names_map({
        "feature_file": "features",
        "image_object_count_file": "object_counts"
    })

    # Add the current git commit hash to the distribution message
    label = subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    # Distribute
    ds.distribute(push_uri=s3_bucket,
                  message=f"git commit hash of fish_morphology_code = {label}")
Example #3
                right_on="CellId",
                suffixes=("_raw", "_fe_link"))

# Step 3:
# Validate and prune the raw data
# We shouldn't lose any rows here, but this is done as a safety measure
cleaned = validate(raw, drop_on_error=True)
print(f"Dropped {len(raw) - len(cleaned.data)} rows during validation.")

# Step 4:
# Send to dataset object for package construction
ds = Dataset(cleaned.data, "Pipeline Integrated Cell", "aics", "readme.md")

# Step 5:
# Add a license
ds.add_license("https://www.allencell.org/terms-of-use.html")

# Indicate column values to use for file metadata
ds.set_metadata_columns([
    "CellId", "CellIndex", "CellLine", "NucMembSegmentationAlgorithm",
    "NucMembSegmentationAlgorithmVersion", "FOVId", "Gene", "PlateId",
    "WellId", "ProteinDisplayName", "StructureDisplayName", "Workflow",
    "FeatureExplorerURL"
])

# Set produced package directory naming
ds.set_column_names_map({
    "MembraneContourReadPath": "membrane_contours",
    "MembraneSegmentationReadPath": "membrane_segmentations",