def read_file(request_files):
    """Given a Django request.FILES object with exactly one file,
    return a response with a "filename" string, a "content type",
    and a "content" BytesIO."""
    if len(request_files.keys()) < 1:
        return failure("No files in request")
    if len(request_files.keys()) > 1:
        return failure("Multiple upload files not allowed")
    upload_file = list(request_files.values())[0]
    filename = upload_file.name
    if not filename.endswith(".xlsx"):
        return failure("Only .xlsx files are supported at this time.")
    content = BytesIO()
    try:
        for chunk in upload_file.chunks():
            content.write(chunk)
    except Exception as e:
        return failure("Invalid upload", {"exception": e})
    return success({
        "filename": filename,
        "content type": xlsx,
        "content": content
    })
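# Hedged sketch: read_file expects Django's request.FILES from an upload view.
# The view function below is hypothetical; only read_file and failed come from
# the code above.
def upload_view(request):
    response = read_file(request.FILES)
    if failed(response):
        return response  # hand the failure back to the caller / template
    filename = response["filename"]  # original .xlsx file name
    content = response["content"]    # BytesIO holding the uploaded bytes
    ...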
def get_assay_header(column):
    """Given a column name that is an OBI or ONTIE ID
    (with an optional suffix for stddev, normalized, or qualitative),
    return the pair of a header dict and an error dict."""
    header = None
    assay_id = column.replace("obi_", "OBI:").replace("ontie_", "ONTIE:")
    if assay_id in config.labels and config.labels[assay_id] in config.assays:
        header = config.assays[config.labels[assay_id]].copy()
    elif assay_id in config.labels and config.labels[assay_id] in config.parameters:
        header = config.parameters[config.labels[assay_id]].copy()
    if header:
        return header, None

    root_id = (
        assay_id.replace("_stddev", "")
        .replace("_normalized", "")
        .replace("_qualitative", "")
    )
    if root_id not in config.labels:
        return None, failure(f"Unrecognized assay '{root_id}' for column '{column}'")
    root_label = config.labels[root_id]
    if root_label in config.assays:
        header = config.assays[root_label].copy()
    elif root_label in config.parameters:
        header = config.parameters[root_label].copy()
    else:
        return None, failure(f"Unrecognized assay '{root_id}' for column '{column}'")

    if column.endswith("_stddev"):
        header["label"] = f"Standard deviation in {header['units']}"
        header["description"] = f"The standard deviation of the value in '{root_label}'"
        header.pop("example", None)
    elif column.endswith("_normalized"):
        header["label"] = f"{root_label} normalized value"
        header["type"] = "score 0-1"
        header["description"] = f"The normalized value for '{root_label}' from 0-1"
        header.pop("example", None)
    elif column.endswith("_qualitative"):
        header["label"] = f"{root_label} qualitative value"
        header["type"] = "text"
        header["terminology"] = "qualitative_measures"
        header["description"] = f"The qualitative value for '{root_label}'"
        header.pop("example", None)
    else:
        return None, failure(f"Unrecognized assay suffix for column '{column}'")
    return header, None
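# Hedged sketch of the header/error pair returned by get_assay_header.
# "obi_0001643" is a hypothetical column name; it only resolves if "OBI:0001643"
# is present in config.labels and its label is in config.assays or config.parameters.
header, error = get_assay_header("obi_0001643_stddev")
if error is None:
    # `header` is a copy of the base assay header with its label and
    # description rewritten for the standard-deviation column, e.g.
    # header["label"] == f"Standard deviation in {header['units']}"
    pass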
def get_value(scope, dataset, key=None):
    """Given a scope (staging or secret), a dataset ID, and an optional key,
    return the value or values in the "data" key."""
    try:
        return success({"data": datasets.get_value(scope, dataset, key)})
    except Exception as e:
        return failure(e)
def get_assay_headers(dataset_id):
    """Given a dataset ID, return the assay headers."""
    dataset_path = get_staging_path(dataset_id)
    if dataset_id == "spr":
        path = "examples/spr-dataset.yml"
    elif not config.staging:
        raise Exception("CVDB_STAGING directory is not configured")
    else:
        path = os.path.join(dataset_path, "dataset.yml")
    if not os.path.isfile(path):
        raise Exception(f"File does not exist '{path}'")
    with open(path, "r") as f:
        dataset = yaml.load(f, Loader=yaml.SafeLoader)
    columns = dataset["Columns"]

    headers = []
    terminology_count = 0
    for column in columns:
        header = None
        if column in config.fields:
            header = config.fields[column].copy()
        elif column.startswith("obi_") or column.startswith("ontie_"):
            header, error = get_assay_header(column)
            if error:
                return error
        if not header:
            return failure(f"Unrecognized column '{column}'")
        if not isinstance(header, dict):
            return failure(f"Error processing column '{column}': {header}")
        header["value"] = column
        header["locked"] = True
        if "terminology" in header and header["terminology"] != "":
            terms = list(getattr(config, header["terminology"]))
            col = chr(65 + terminology_count)  # spreadsheet column letter: A, B, C, ...
            end = len(terms) + 1
            formula = f"=Terminology!${col}$2:${col}${end}"
            header["terminology"] = terms
            header["validations"] = [{
                "type": "list",
                "formula1": formula,
                "allow_blank": True
            }]
            terminology_count += 1
        headers.append(header)
    return headers
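# For reference, a hedged sketch of the dataset.yml shape that get_assay_headers
# reads; it mirrors what create() writes below, with hypothetical column names:
#
#   Dataset ID: ds:1
#   Dataset status: configured
#   Columns:
#   - ab_label
#   - obi_0001643
#   - obi_0001643_stddev
#
# Each entry under "Columns" must be a known field (config.fields) or an
# obi_/ontie_ column handled by get_assay_header above.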
def set_value(scope, dataset, key, value):
    """Given a scope (staging or secret), a dataset ID, a key string,
    and a simple value that can be represented in YAML,
    add the key and value to the dataset metadata,
    possibly overwriting an existing value."""
    try:
        datasets.set_value(scope, dataset, key, value)
        return success()
    except Exception as e:
        return failure(e)
def read_path(path, sheet=None):
    """Read a TSV or Excel file from a path
    and return a response with a "table" key."""
    table = None
    filename, extension = os.path.splitext(path)
    extension = extension.lower()
    if extension == ".xlsx":
        table = workbooks.read(path, sheet)
    elif extension == ".tsv":
        table = tables.read_tsv(path)
    else:
        return failure(f"Unsupported input format for '{path}'")
    return success({"table": table})
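# Hedged usage sketch for read_path; the path is a hypothetical example.
response = read_path("build/dataset-1/assays.tsv")
if not failed(response):
    table = response["table"]  # rows as dicts, per tables.read_tsv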
def get_secret_value(dataset_id, key=None):
    """Given a dataset ID and an optional key,
    return the value or values from the dataset secret metadata."""
    if key in ["ds_id"]:
        return failure(f"Key '{key}' cannot be retrieved")
    path = os.path.join(config.secret.working_tree_dir, "datasets.tsv")
    rows = tables.read_tsv(path)
    for row in rows:
        if row["ds_id"] == dataset_id:
            if key:
                return row[key]
            else:
                return row
    raise Exception(f"No row found for dataset '{dataset_id}'")
def promote(name, email, dataset_id):
    """Given an author name, email address, and dataset ID,
    promote the dataset from staging to public
    and commit to both repositories."""
    author = Actor(name, email)

    # staging
    if not config.staging:
        return failure("CVDB_STAGING directory is not configured")
    staging_dataset_path = os.path.join(config.staging.working_tree_dir,
                                        "datasets", str(dataset_id))
    paths = []
    try:
        set_staging_value(dataset_id, "Dataset status", "promoted")
        path = os.path.join(staging_dataset_path, "dataset.yml")
        paths.append(path)
    except Exception as e:
        return failure("Failed to update dataset status", {"exception": e})
    try:
        config.staging.index.add(paths)
        config.staging.index.commit(f"Promote dataset {dataset_id}",
                                    author=author,
                                    committer=config.covic)
    except Exception as e:
        return failure(f"Failed to commit '{path}'", {"exception": e})

    # public
    if not config.public:
        return failure("CVDB_PUBLIC directory is not configured")
    public_dataset_path = os.path.join(config.public.working_tree_dir,
                                       "datasets", str(dataset_id))
    try:
        os.makedirs(public_dataset_path)
    except Exception as e:
        return failure(f"Could not create '{public_dataset_path}'", {"exception": e})
    try:
        paths = []
        for filename in ["dataset.yml", "assays.tsv"]:
            src = os.path.join(staging_dataset_path, filename)
            dst = os.path.join(public_dataset_path, filename)
            shutil.copyfile(src, dst)
            paths.append(dst)
    except Exception as e:
        return failure(f"Could not copy '{src}' to '{dst}'", {"exception": e})
    try:
        config.public.index.add(paths)
        config.public.index.commit(f"Promote dataset {dataset_id}",
                                   author=config.covic,
                                   committer=config.covic)
    except Exception as e:
        return failure(f"Failed to commit '{public_dataset_path}'", {"exception": e})

    print(f"Promoted dataset {dataset_id} from staging to public")
    return success({"dataset_id": dataset_id})
def set_secret_value(dataset_id, key, value):
    """Given a dataset ID, key, and value, update the secret `datasets.tsv`."""
    if key in ["ds_id"]:
        return failure(f"Key '{key}' cannot be updated")
    path = os.path.join(config.secret.working_tree_dir, "datasets.tsv")
    rows = tables.read_tsv(path)
    done = False
    for row in rows:
        if row["ds_id"] == dataset_id:
            row[key] = str(value)
            done = True
        elif key not in row:
            row[key] = None
    if done:
        tables.write_tsv(rows, path)
    else:
        raise Exception(f"No row found for dataset '{dataset_id}'")
def read(source, sheet=None):
    """Read a source and return a response with a "table" key."""
    if tables.is_table(source):
        return success({"table": source})
    if responses.is_response(source):
        if "table" in source:
            return success({"table": source["table"]})
        else:
            return failure(f"Response does not have 'table': '{source}'")
    if isinstance(source, str) or hasattr(source, "read"):
        return read_path(source, sheet)
    if requests.is_request(source):
        response = requests.read_file(source)
        if failed(response):
            return response
        table = workbooks.read(response["content"], sheet)
        return success({"table": table})
    raise Exception(f"Unknown input '{source}'")
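# Hedged sketch of the kinds of sources read() dispatches on; the path below
# is a hypothetical example:
# - a table (list of row dicts)            -> returned as-is
# - a previous response with a "table" key -> its table is reused
# - a path string or file-like object      -> read_path()
# - a Django upload request                -> requests.read_file() + workbooks.read()
response = read("examples/antibodies-submission.xlsx")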
def convert(source, destination):
    """Given a source and a destination (format or path),
    convert the table to that format
    and return a response with a "content" key."""
    table = None
    grid = None
    if grids.is_grid(source):
        grid = source
    else:
        response = read(source)
        if failed(response):
            return response
        table = response["table"]

    output_format = destination.lower()
    if output_format not in ["tsv", "html"]:
        filename, extension = os.path.splitext(destination)
        output_format = extension.lower().lstrip(".")

    if output_format == "tsv":
        content = tables.table_to_tsv_string(table)
        return success({
            "table": table,
            "content type": responses.tsv,
            "content": content
        })
    elif output_format == "html":
        if not grid:
            grid = grids.table_to_grid(config.prefixes, config.fields, table)
        html = grids.grid_to_html(grid)
        content = templates.render_html("templates/grid.html", {"html": html})
        return success({
            "table": table,
            "grid": grid,
            "html": html,
            "content type": responses.html,
            "content": content,
        })
    else:
        return failure(f"Unsupported output format for '{destination}'")
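# Hedged usage sketch for convert(); `submission` stands in for any table
# accepted by tables.is_table, and the output path is hypothetical.
response = convert(submission, "build/submission.html")
if not failed(response):
    html_page = response["content"]  # full page rendered from templates/grid.html
# A bare format name also works:
tsv_response = convert(submission, "tsv")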
def validate(headers, table):
    """Given the headers and a (validated!) table,
    return a response with "grid" and maybe "errors"."""
    errors = []
    rows = []
    unique = defaultdict(set)

    blinded_antibodies = config.read_blinded_antibodies()
    ab_ids = [x["ab_id"] for x in blinded_antibodies] + [
        x["id"] for x in config.ab_controls.values()
    ]
    ab_labels = [x["ab_id"].replace(":", "-") for x in blinded_antibodies] + list(
        config.ab_controls.keys()
    )

    columns = set()
    for header in headers:
        try:
            columns.add(header["label"])
        except KeyError as e:
            raise Exception(f"Bad header {header}", e)

    new_table = []
    for i in range(0, len(table)):
        row = table[i]

        # Skip blank rows
        values = ""
        for value in row.values():
            values += str(value).strip()
        if values == "":
            continue
        new_table.append(row)

        extra_columns = set(row.keys()) - columns
        extra_columns.discard(None)
        if extra_columns:
            extra = ", ".join(extra_columns)
            errors.append(f"Extra columns not allowed: {extra}")
        missing_columns = columns - set(row.keys())
        if missing_columns:
            missing = ", ".join(missing_columns)
            errors.append(f"Missing columns: {missing}")

        newrow = []
        for header in headers:
            column = header["label"]
            error = None
            if column not in row:
                # Should be handled above
                continue
            value = str(row[column]).strip()
            if "field" in header and header["field"] == "ab_id":
                if value not in ab_ids:
                    error = (
                        f"'{value}' is not a valid COVIC antibody ID or control antibody ID "
                        + "in column 'Antibody ID'")
            elif "field" in header and header["field"] == "ab_label":
                if value.lower() in ["na", "n/a"]:
                    continue
                if value not in ab_labels:
                    error = (
                        f"'{value}' is not a valid COVIC antibody label or control antibody label "
                        + "in column 'Antibody label'")
            elif "required" in header and header["required"] and value == "":
                error = f"Missing required value in column '{column}'"
            elif "unique" in header and header["unique"] and value in unique[column]:
                error = f"Duplicate value '{value}' is not allowed in column '{column}'"
            elif "terminology" in header and value != "" and value not in header["terminology"]:
                error = f"'{value}' is not a valid term in column '{column}'"
            elif "type" in header and value != "":
                error = validate_field(column, header["type"], value)
            if "unique" in header and header["unique"]:
                unique[column].add(value)
            cell = None
            if error:
                cell = grids.error_cell(value, error)
                errors.append("Error in row {0}: {1}".format(i + 2, error))
            else:
                cell = grids.value_cell(value)
            newrow.append(cell)
        rows.append(newrow)

    table = new_table
    grid = {"headers": [headers], "rows": rows}

    unique_errors = []
    for error in errors:
        if error not in unique_errors:
            unique_errors.append(error)
    errors = unique_errors

    error_count = len(errors)
    if error_count > 0:
        return failure(
            f"There were {error_count} errors",
            {
                "errors": errors,
                "table": table,
                "grid": grid
            },
        )
    return success({"table": table, "grid": grid})
def submit(name, email, organization, table):
    """Given a submitter name, email, organization, and a new table of antibodies:
    1. validate the table,
    2. assign IDs and append them to the secret antibodies table,
    3. append the blinded antibodies to the staging and public tables,
    4. return a response with the submission table and its new IDs."""
    response = validate(table)
    if failed(response):
        return response
    table = response["table"]  # blank rows removed

    if not config.secret:
        return failure("CVDB_SECRET directory is not configured")
    secret = []
    path = os.path.join(config.secret.working_tree_dir, "antibodies.tsv")
    if os.path.isfile(path):
        secret = tables.read_tsv(path)
    blind = config.read_blinded_antibodies()
    if len(secret) != len(blind):
        return failure(
            f"Different number of antibody rows: {len(secret)} != {len(blind)}")

    current_id = "COVIC:0"
    if len(blind) > 0:
        current_id = blind[-1]["ab_id"]

    submission = []
    for row in table:
        current_id = names.increment_id(current_id)

        # secrets: write this to the secret repo
        secret_row = OrderedDict()
        secret_row["ab_id"] = current_id
        secret_row["ab_name"] = row["Antibody name"]
        secret_row["ab_details"] = row["Antibody details"]
        secret_row["ab_comment"] = row["Antibody comment"]
        secret_row["org_name"] = organization
        secret_row["submitter_email"] = email
        secret.append(secret_row)

        # blind: write this to staging/public repos
        blind_row = OrderedDict()
        blind_row["ab_id"] = current_id

        # submission: return this to the submitter
        submission_row = OrderedDict()
        submission_row["ab_id"] = current_id
        submission_row["ab_name"] = row["Antibody name"]

        # for each header, add cells to blind and submission;
        # `headers` is the module-level list of antibody headers (not shown here)
        for header in headers[1:]:
            column = header["value"]
            value = row[header["label"]]
            if column.endswith("_label"):
                i = config.ids.get(value, "")
                blind_row[column.replace("_label", "_id")] = i
                submission_row[column.replace("_label", "_id")] = i
                submission_row[column] = value
            else:
                blind_row[column] = value
                submission_row[column] = value

        blind.append(blind_row)
        submission.append(submission_row)

    author = Actor(name, email)

    # secret
    try:
        path = os.path.join(config.secret.working_tree_dir, "antibodies.tsv")
        tables.write_tsv(secret, path)
    except Exception as e:
        return failure(f"Failed to write '{path}'", {"exception": e})
    try:
        config.secret.index.add([path])
        config.secret.index.commit("Submit antibodies",
                                   author=author,
                                   committer=config.covic)
    except Exception as e:
        return failure(f"Failed to commit '{path}'", {"exception": e})

    # staging
    try:
        path = os.path.join(config.staging.working_tree_dir, "antibodies.tsv")
        tables.write_tsv(blind, path)
    except Exception as e:
        return failure(f"Failed to write '{path}'", {"exception": e})
    try:
        config.staging.index.add([path])
        config.staging.index.commit("Submit antibodies",
                                    author=author,
                                    committer=config.covic)
    except Exception as e:
        return failure(f"Failed to commit '{path}'", {"exception": e})

    # public
    if not config.public:
        return failure("CVDB_PUBLIC directory is not configured")
    try:
        path = os.path.join(config.public.working_tree_dir, "antibodies.tsv")
        tables.write_tsv(blind, path)
    except Exception as e:
        return failure(f"Failed to write '{path}'", {"exception": e})
    try:
        config.public.index.add([path])
        config.public.index.commit("Submit antibodies",
                                   author=config.covic,
                                   committer=config.covic)
    except Exception as e:
        return failure(f"Failed to commit '{path}'", {"exception": e})

    grid = grids.table_to_grid(config.prefixes, config.fields, submission)
    print("Submitted antibodies")
    return success({"table": submission, "grid": grid})
def submit(name, email, dataset_id, table):
    """Given a dataset ID and a new table of assays,
    validate it, save it to staging, and commit."""
    response = validate(dataset_id, table)
    if failed(response):
        return response
    table = response["table"]  # blank rows removed

    ab_ids = {}
    for ab in config.read_blinded_antibodies():
        ab_id = ab["ab_id"]
        ab_label = ab_id.replace(":", "-")
        ab_ids[ab_label] = ab_id
    for row in config.ab_controls.values():
        ab_ids[row["label"]] = row["id"]

    assay_headers = get_assay_headers(dataset_id)
    assays = []
    for row in table:
        assay = OrderedDict()
        for header in assay_headers:
            value = header["value"]
            label = header["label"]
            if value == "ab_label":
                row[label] = row[label].strip()
                assay["ab_id"] = ab_ids[row[label]]
            else:
                assay[value] = row[label]
        assays.append(assay)

    author = Actor(name, email)

    # staging
    if not config.staging:
        return failure("CVDB_STAGING directory is not configured")
    dataset_path = os.path.join(config.staging.working_tree_dir,
                                "datasets", str(dataset_id))
    paths = []
    try:
        set_staging_value(dataset_id, "Dataset status", "submitted")
        path = os.path.join(dataset_path, "dataset.yml")
        paths.append(path)
    except Exception as e:
        return failure("Failed to update dataset status", {"exception": e})
    try:
        path = os.path.join(dataset_path, "assays.tsv")
        tables.write_tsv(assays, path)
        paths.append(path)
    except Exception as e:
        return failure(f"Failed to write '{path}'", {"exception": e})
    try:
        config.staging.index.add(paths)
        config.staging.index.commit(
            f"Submit assays to dataset {dataset_id}",
            author=author,
            committer=config.covic,
        )
    except Exception as e:
        return failure(f"Failed to commit '{path}'", {"exception": e})

    grid = grids.table_to_grid(config.prefixes, config.fields, table)
    print(f"Submitted assays to dataset {dataset_id}")
    return success({"table": table, "grid": grid, "dataset_id": dataset_id})
def create(name, email, columns=[]):
    """Given an author name, email, and a list of column names,
    validate the columns, create a new dataset in the secret and staging
    repositories, and return a response with the new dataset ID."""
    if not config.staging:
        return failure("CVDB_STAGING directory is not configured")
    for column in columns:
        if column in config.fields:
            continue
        if column.startswith("obi_") or column.startswith("ontie_"):
            assay_id = column.replace("obi_", "OBI:").replace("ontie_", "ONTIE:")
            root_id = (assay_id.replace("_stddev", "")
                       .replace("_normalized", "")
                       .replace("_qualitative", ""))
            if assay_id in config.labels:
                continue
            if root_id in config.labels:
                if column.endswith("_stddev"):
                    continue
                if column.endswith("_normalized"):
                    continue
                if column.endswith("_qualitative"):
                    continue
        return failure(f"Unrecognized column '{column}'")

    datasets_path = os.path.join(config.staging.working_tree_dir, "datasets")
    current_id = 0
    if not os.path.exists(datasets_path):
        os.makedirs(datasets_path)
    if not os.path.isdir(datasets_path):
        return failure(f"'{datasets_path}' is not a directory")
    for root, dirs, files in os.walk(datasets_path):
        # use `dirname` so we do not shadow the `name` argument used for the author
        for dirname in dirs:
            if re.match(r"\d+", dirname):
                current_id = max(current_id, int(dirname))
    dataset_id = current_id + 1
    author = Actor(name, email)

    # secret
    try:
        path = os.path.join(config.secret.working_tree_dir, "datasets.tsv")
        datasets = []
        if os.path.isfile(path):
            datasets = tables.read_tsv(path)
        datasets.append(
            OrderedDict({
                "ds_id": dataset_id,
                "submitter_email": email
            }))
        tables.write_tsv(datasets, path)
    except Exception as e:
        return failure(f"Failed to update '{path}'", {"exception": e})
    try:
        config.secret.index.add([path])
        config.secret.index.commit(f"Create dataset {dataset_id}",
                                   author=author,
                                   committer=config.covic)
    except Exception as e:
        return failure(f"Failed to commit '{path}'", {"exception": e})

    # staging
    try:
        dataset_path = os.path.join(datasets_path, str(dataset_id))
        os.mkdir(dataset_path)
    except Exception as e:
        return failure(f"Failed to create '{dataset_path}'", {"exception": e})
    try:
        dataset = {
            "Dataset ID": f"ds:{dataset_id}",
            "Dataset status": "configured",
            "Columns": columns,
        }
        path = os.path.join(dataset_path, "dataset.yml")
        with open(path, "w") as outfile:
            yaml.dump(dataset, outfile, sort_keys=False)
    except Exception as e:
        return failure(f"Failed to write '{path}'", {"exception": e})
    try:
        config.staging.index.add([path])
        config.staging.index.commit(f"Create dataset {dataset_id}",
                                    author=author,
                                    committer=config.covic)
    except Exception as e:
        return failure(f"Failed to commit '{path}'", {"exception": e})

    print(f"Created dataset {dataset_id}")
    return success({"dataset_id": dataset_id})
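# Hedged end-to-end sketch of the dataset workflow defined above, assuming the
# CVDB_SECRET/CVDB_STAGING/CVDB_PUBLIC repositories are configured. The name,
# email, and column names are hypothetical examples.
response = create("Jane Submitter", "jane@example.com", ["ab_label", "obi_0001643"])
if not failed(response):
    dataset_id = response["dataset_id"]
    # ... collect an assays table from the submitter, then:
    # submit("Jane Submitter", "jane@example.com", dataset_id, table)
    # promote("Jane Submitter", "jane@example.com", dataset_id)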