def remove_low_counts(data, method="sum", threshold=50, target_identifier=""):
    # Drop genes whose summed count across samples falls below the threshold.
    # Note: "method" and "target_identifier" are accepted but not used yet.
    params = cc.CleanParameters("umc")
    params.min_sum_count = threshold
    report = cc.clean_gene_count_file(data, params)
    dataset = dr.create_data_set(report.dataset_identifier,
                                 "genes low counts removed",
                                 report.dataset_location)
    print(dataset.added_on)
    return dataset
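# A minimal, self-contained illustration of the filter above, assuming that
# cc.clean_gene_count_file with min_sum_count keeps genes whose summed count
# meets the threshold and that the first column holds gene identifiers. The
# function name and sample data are made up for the sketch.
def _demo_low_count_filter():
    import pandas as pd
    counts = pd.DataFrame({"gene": ["g1", "g2"], "s1": [100, 3], "s2": [80, 1]})
    # Keep rows whose counts (all columns after the id column) sum to >= 50.
    kept = counts[counts.iloc[:, 1:].sum(axis=1) >= 50]
    return kept  # only g1 survives the threshold of 50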
def remove_non_expressed_genes(data, source_identifier, custom_prefix="no_outliers_"):
    # Keep only genes with a non-zero total count; the first column is assumed
    # to hold gene identifiers. Boolean negation needs "~", not unary "-".
    new_set = data[~(data.iloc[:, 1:].sum(axis=1) == 0)]
    target_name = custom_prefix + source_identifier
    location = APP_CONFIG["application_base_location"] + target_name + ".txt"
    created = dr.create_data_set(identifier=target_name,
                                 type=dr.ACTION_TYPES["zcounts"],
                                 location=location)
    new_set.to_csv(location, index=False, sep="\t")
    return created
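# The same zero-expression mask on a tiny frame, for illustration only; the
# helper name and sample data are hypothetical.
def _demo_zero_expression_mask():
    import pandas as pd
    counts = pd.DataFrame({"gene": ["g1", "g2"], "s1": [0, 5], "s2": [0, 2]})
    # "~" inverts the boolean mask: rows whose counts sum to zero are dropped.
    expressed = counts[~(counts.iloc[:, 1:].sum(axis=1) == 0)]
    return expressed  # g1 is removed: its counts are zero in every sample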
def normalize_gene_counts(data, method, threshold=None, target_identifier="", experiment_identifier=""):
    # TODO: if a normalized dataset already exists -> fetch it from the db
    processing_result = cn.bioconductor_normalization(data, method)
    new_intern_identifier = identifier_generator.get_generated_guid_as_string()
    hdf_storage.store_hdf(processing_result.frame, new_intern_identifier)
    print("compressed pytable storage done")
    package = mr.get_package_by_name_and_version(processing_result.package,
                                                 processing_result.version).public_identifier
    dataset = dr.create_data_set(new_intern_identifier,
                                 package + "_" + target_identifier,
                                 "genes normalized dataset",
                                 package_identifier=package,
                                 experiment_identifier=experiment_identifier)
    print("new dataset saved")
    # er.link_dataset_to_experiment(experiment_identifier, dataset.public_identifier)
    # print("new dataset linked")  # linking is disabled above, so its log line is too
    return dataset
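# A sketch of the compressed PyTables round trip that hdf_storage.store_hdf
# presumably performs: persist a frame under a GUID key, then read it back.
# The store path, key prefix, and compression settings are assumptions, and
# the "tables" package must be installed for to_hdf to work.
def _demo_hdf_round_trip():
    import uuid
    import pandas as pd
    frame = pd.DataFrame({"gene": ["g1"], "s1": [10]})
    key = "ds_" + uuid.uuid4().hex  # prefixed so the key is a valid identifier
    frame.to_hdf("demo_store.h5", key=key, complevel=9, complib="zlib")
    return pd.read_hdf("demo_store.h5", key=key)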
def post(self):
    args = dataset_parser.parse_args()
    print("dataset post received")
    # upload a new dataset
    if args.dataset_identifier is None:
        print("creating new data set")
        new_file = args.file
        filename = secure_filename(new_file.filename)
        print("uploaded " + filename)
        intern_identifier = ig.get_generated_guid_as_string()
        new_file.save(filename)
        try:
            with open(filename, "rb") as fl:
                frame = dr.get_data_frame_from_csv(fl)
                intern_location = dr.store_data_frame_to_hdf(frame, intern_identifier)
                # raw counts by default
                data_entity = dr.create_data_set(intern_identifier,
                                                 public_identifier=filename,
                                                 dataset_type="raw gene counts",
                                                 experiment_identifier="raw_data_container")  # for demo -> add exp identifier
                server_hash = ig.md5_for_file(fl)
            print(filename + " is saved")
            return cr.JsonResource({"filename": filename,
                                    "intern_identifier": data_entity.intern_identifier,
                                    "public_identifier": data_entity.public_identifier,
                                    "server_md5": server_hash}), 201
        except IntegrityError:
            return cr.StringApiResource("Public identifier already taken"), 409
        except Exception:
            return cr.StringApiResource("An error has occurred, check that your data set complies with the expected format"), 400
        finally:
            if filename is not None:
                os.remove(filename)
                print("file removed")
    else:
        # a new dataset is created from an existing source dataset
        try:
            print("pre-processing existing dataset")
            source_data = args.dataset_identifier
            print("source data: " + source_data)
            # target_data = args.target_dataset_identifier
            # TODO: link to the experiment identifier
            # experiment_identifier = args.experiment_identifier
            method_identifier = args.preprocessing_method_identifier
            print("source " + source_data + " method " + method_identifier)
            path = APP_CONFIG["application_files_location"] + APP_CONFIG["application_store_name"]
            df = dr.get_data_frame_from_hdf(source_data, path)
            print("data frame is loaded")
            new_data_set = ed.normalize_gene_counts(df, method_identifier,
                                                    target_identifier=source_data,
                                                    experiment_identifier=args.experiment_identifier)
            return cr.JsonResource(eto.SummaryDatasetView(new_data_set).to_json()), 201
        except Exception as e:
            print(e.__str__())
            return cr.StringApiResource("Explosion! Tool down..."), 400
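# A client-side sketch of calling the upload branch above. The URL and route
# are assumptions (the resource's registration is not shown here); the parser
# is expected to accept a "file" part when dataset_identifier is absent.
def _demo_upload_request():
    import requests
    with open("raw_counts.txt", "rb") as fh:
        resp = requests.post("http://localhost:5000/datasets", files={"file": fh})
    return resp.status_code, resp.json()  # expect 201 and the saved identifiers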