def get_all_records_columns_server(self, sort_by='_id', sort_direction=-1, projection=dict(), filter_by=dict(),
                                   search_term=str(), limit=0, skip=0):
    filter_by["deleted"] = data_utils.get_not_deleted_flag()

    # 'name' seems to be the only reasonable field to restrict searching; other fields are resolved
    filter_by["name"] = {'$regex': search_term, "$options": 'i'}

    if self.profile_id:
        filter_by["profile_id"] = self.profile_id

    if skip > 0:
        records = self.get_collection_handle().find(
            filter_by, projection).sort([[sort_by, sort_direction]]).skip(skip).limit(limit)
    else:
        records = self.get_collection_handle().find(
            filter_by, projection).sort([[sort_by, sort_direction]]).limit(limit)

    return cursor_to_list(records)

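# A minimal usage sketch for get_all_records_columns_server (not from the original
# codebase; 'da_object' stands in for any DAComponent-style instance exposing this
# method): fetch the second page of ten records whose name matches 'lib', newest
# first, returning only the 'name' column.
def _example_paged_fetch(da_object):
    return da_object.get_all_records_columns_server(
        sort_by='date_modified', sort_direction=-1,
        projection={'name': 1}, search_term='lib',
        limit=10, skip=10)
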
def get_counts(self):
    """
    Method to return current numbers of Publication, Person, Data,
    Sample and Submission objects in the given profile
    :return: Dictionary containing the data
    """
    num_dict = dict(num_pub="publication",
                    num_person="person",
                    num_data="datafile",
                    num_sample="sample",
                    num_submission="submission",
                    num_annotation="annotation")

    status = dict()
    for k, v in num_dict.items():
        if handle_dict.get(v, None):
            status[k] = handle_dict.get(v).count({
                'profile_id': self.profile_id,
                'deleted': data_utils.get_not_deleted_flag()
            })

    return status

def get_all_records(self, sort_by='_id', sort_direction=-1, **kwargs):
    doc = dict(deleted=data_utils.get_not_deleted_flag())
    if self.profile_id:
        doc["profile_id"] = self.profile_id

    return cursor_to_list(self.get_collection_handle().find(doc).sort(
        [[sort_by, sort_direction]]))

def get_by_file_id(self, file_id=None):
    docs = None
    if file_id:
        docs = self.get_collection_handle().find_one(
            {"file_id": file_id, "deleted": data_utils.get_not_deleted_flag()})

    return docs

def source_count(self):
    return handle_dict.get("source").count({
        'profile_id': self.profile_id,
        'deleted': data_utils.get_not_deleted_flag()
    })

def get_by_file_name_id(self, file_id=None):
    docs = None
    if file_id:
        docs = self.get_collection_handle().find_one(
            {"_id": ObjectId(file_id),
             "deleted": data_utils.get_not_deleted_flag()},
            {"name": 1})

    return docs

def get_all_records_columns(self, sort_by='_id', sort_direction=-1, projection=dict(), filter_by=dict()):
    filter_by["deleted"] = data_utils.get_not_deleted_flag()
    if self.profile_id:
        filter_by["profile_id"] = self.profile_id

    return cursor_to_list(self.get_collection_handle().find(
        filter_by, projection).sort([[sort_by, sort_direction]]))

def get_for_user(self, user=None):
    if not user:
        user = ThreadLocal.get_current_user().id
    docs = self.get_collection_handle().find(
        {"user_id": user, "deleted": data_utils.get_not_deleted_flag()}).sort([['_id', -1]])

    if docs:
        return docs
    else:
        return None

def get_component_count(self):
    count = 0
    if self.get_collection_handle():
        count = self.get_collection_handle().count({
            'profile_id': self.profile_id,
            'deleted': data_utils.get_not_deleted_flag()
        })
    return count

def save_record(self, auto_fields=dict(), **kwargs):
    fields = dict()

    # set auto fields
    if auto_fields:
        fields = DecoupleFormSubmission(
            auto_fields, self.get_schema().get("schema")).get_schema_fields_updated()

    # should have target_id for updates and return empty string for inserts
    target_id = kwargs.pop("target_id", str())

    # set system fields
    system_fields = dict(
        date_modified=data_utils.get_datetime(),
        deleted=data_utils.get_not_deleted_flag()
    )

    if not target_id:
        system_fields["date_created"] = data_utils.get_datetime()
        system_fields["profile_id"] = self.profile_id

    # extend system fields
    for k, v in kwargs.items():
        system_fields[k] = v

    # add system fields to 'fields' and set default values - insert mode only
    for f in self.get_schema().get("schema"):
        f_id = f.id.split(".")[-1]

        if f_id in system_fields:
            fields[f_id] = system_fields.get(f_id)

        if not target_id and f_id not in fields:
            fields[f_id] = data_utils.default_jsontype(f.type)

    # if True, the database action (save/update) is never performed; the validated 'fields' is returned instead
    validate_only = kwargs.pop("validate_only", False)

    # prefer this strict test to safeguard against the various values 'validate_only' can assume
    if validate_only is True:
        return fields
    else:
        if target_id:
            self.get_collection_handle().update(
                {"_id": ObjectId(target_id)}, {'$set': fields})
        else:
            doc = self.get_collection_handle().insert(fields)
            target_id = str(doc)

    # return saved record
    rec = self.get_record(target_id)

    return rec

def save_record(self, auto_fields=dict(), **kwargs):
    fields = dict()
    schema = kwargs.get("schema", list()) or self.get_component_schema()

    # set auto fields
    if auto_fields:
        fields = DecoupleFormSubmission(
            auto_fields, schema).get_schema_fields_updated_dict()

    # should have target_id for updates and return empty string for inserts
    target_id = kwargs.pop("target_id", str())

    # set system fields
    system_fields = dict(date_modified=data_utils.get_datetime(),
                         deleted=data_utils.get_not_deleted_flag())

    if not target_id:
        system_fields["date_created"] = data_utils.get_datetime()
        system_fields["profile_id"] = self.profile_id

    # extend system fields
    for k, v in kwargs.items():
        system_fields[k] = v

    # add system fields to 'fields' and set default values - insert mode only
    for f in schema:
        f_id = f["id"].split(".")[-1]

        if f_id in system_fields:
            fields[f_id] = system_fields.get(f_id)

        if not target_id and f_id not in fields:
            fields[f_id] = data_utils.default_jsontype(f["type"])

    # if True, the database action (save/update) is never performed; the validated 'fields' are returned instead
    validate_only = kwargs.pop("validate_only", False)

    if validate_only is True:
        return fields
    else:
        if target_id:
            self.get_collection_handle().update(
                {"_id": ObjectId(target_id)}, {'$set': fields})
        else:
            doc = self.get_collection_handle().insert(fields)
            target_id = str(doc)

    # return saved record
    rec = self.get_record(target_id)

    return rec

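# A usage sketch for save_record (illustrative only; 'da_object' and the
# 'copo.sample.name' field id are assumptions, not taken from this code). It
# contrasts the validate_only dry run with a real insert and a follow-up update.
def _example_save_record(da_object, name_value):
    # dry run: resolves and returns 'fields' without writing to the database
    fields = da_object.save_record(
        auto_fields={"copo.sample.name": name_value}, validate_only=True)

    # insert: no target_id, so date_created and profile_id are stamped in
    rec = da_object.save_record(auto_fields={"copo.sample.name": name_value})

    # update: passing target_id refreshes date_modified and applies a $set
    rec = da_object.save_record(
        auto_fields={"copo.sample.name": name_value + "-renamed"},
        target_id=str(rec["_id"]))

    return fields, rec
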
def get_for_user(self, user=None):
    if not user:
        user = data_utils.get_current_user().id
    docs = self.get_collection_handle().find({
        "user_id": user,
        "deleted": data_utils.get_not_deleted_flag()
    }).sort([['_id', -1]])

    if docs:
        return docs
    else:
        return None

def get_shared_for_user(self, user=None):
    # get profiles shared with user
    if not user:
        user = data_utils.get_current_user().id
    groups = CopoGroup().Group.find({'member_ids': str(user)})
    p_list = list()
    for g in groups:
        gp = dict(g)
        p_list.extend(gp['shared_profile_ids'])
    # remove duplicates
    # p_list = list(set(p_list))
    docs = self.get_collection_handle().find({
        "_id": {"$in": p_list},
        "deleted": data_utils.get_not_deleted_flag()
    })
    out = list(docs)
    for d in out:
        d['shared'] = True

    return out

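# Sketch of the group document shape get_shared_for_user queries against (the
# field names are taken from the query above; the values are hypothetical): a
# profile is shared with a user when some group lists the user's id in
# 'member_ids' and the profile's ObjectId in 'shared_profile_ids'.
def _example_group_document(user_id, profile_oid):
    return {
        "member_ids": [str(user_id)],         # matched by {'member_ids': str(user)}
        "shared_profile_ids": [profile_oid],  # extended into p_list, then queried with $in
    }
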
def get_all_records(self):
    doc = {'profile_id': self.profile_id,
           'deleted': data_utils.get_not_deleted_flag()}
    return cursor_to_list(self.RemoteFileCollection.find(doc))

def get_by_datafile(self, datafile_id):
    doc = {'datafile_id': ObjectId(datafile_id),
           'deleted': data_utils.get_not_deleted_flag()}
    return cursor_to_list(self.RemoteFileCollection.find(doc))

def get_all_records(self, sort_by='_id', sort_direction=-1):
    doc = dict(deleted=data_utils.get_not_deleted_flag())
    if self.profile_id:
        doc["profile_id"] = self.profile_id

    return cursor_to_list(self.get_collection_handle().find(doc).sort([[sort_by, sort_direction]]))

def get_sra_samples(self, submission_location=str()):
    """
    function retrieves study samples and presents them in a format for building an sra sample set
    :param submission_location:
    :return:
    """
    sra_samples = list()

    # get datafiles
    datafiles = cursor_to_list(ghlper.get_datafiles_handle().find(
        {"description_token": self.description_token,
         'deleted': data_utils.get_not_deleted_flag()},
        {'_id': 1, 'file_location': 1, "description.attributes": 1, "name": 1, "file_hash": 1}))

    if not len(datafiles):
        self.__converter_errors.append("No datafiles found in submission!")
        return sra_samples

    df = pd.DataFrame(datafiles)
    df['file_id'] = df._id.astype(str)
    df['file_path'] = df['file_location'].fillna('')
    df['upload_status'] = False
    df = df[['file_id', 'file_path', 'upload_status']]

    bundle = list(df.file_id)
    bundle_meta = df.to_dict('records')

    submission_record = dict(bundle=bundle, bundle_meta=bundle_meta)

    ghlper.get_submission_handle().update(
        {"_id": ObjectId(self.submission_id)},
        {'$set': submission_record})

    samples_id = list()
    df_attributes = []  # datafiles attributes

    for datafile in datafiles:
        datafile_attributes = [v for k, v in datafile.get("description", dict()).get("attributes", dict()).items()]
        new_dict = dict()
        for d in datafile_attributes:
            new_dict.update(d)

        new_dict['datafile_id'] = str(datafile['_id'])
        new_dict['datafile_name'] = datafile.get('name', str())
        new_dict['datafile_hash'] = datafile.get('file_hash', str())
        new_dict['datafile_location'] = datafile.get('file_location', str())

        df_attributes.append(new_dict)

    # process datafiles attributes
    df_attributes_df = pd.DataFrame(df_attributes)
    df_columns = df_attributes_df.columns

    # replace null values
    for k in df_columns:
        df_attributes_df[k].fillna('', inplace=True)

    if 'study_samples' in df_columns:
        df_attributes_df['study_samples'] = df_attributes_df['study_samples'].apply(
            lambda x: x[0] if isinstance(x, list) else x.split(",")[-1])
        samples_id = list(df_attributes_df['study_samples'].unique())
        samples_id = [x for x in samples_id if x]

    if not samples_id:
        self.__converter_errors.append("No samples associated with datafiles!")
        return sra_samples

    file_path = os.path.join(submission_location, "datafiles.csv")
    df_attributes_df.to_csv(path_or_buf=file_path, index=False)

    samples_id_object_list = [ObjectId(sample_id) for sample_id in samples_id]
    sample_records = ghlper.get_samples_handle().find(
        {"_id": {"$in": samples_id_object_list}})

    # get sources
    sources = ghlper.get_sources_handle().find(
        {"profile_id": self.profile_id,
         'deleted': data_utils.get_not_deleted_flag()})
    sra_sources = dict()

    for source in sources:
        sra_source = dict()
        sra_sources[str(source["_id"])] = sra_source

        sra_source["name"] = source["name"]
        sra_source["taxon_id"] = source.get("organism", dict()).get('termAccession', str())
        if 'NCBITaxon_' in sra_source["taxon_id"]:
            sra_source["taxon_id"] = sra_source["taxon_id"].split('NCBITaxon_')[-1]

        sra_source["scientific_name"] = source.get("organism", dict()).get('annotationValue', str())
        sra_source['attributes'] = self.get_attributes(source.get("characteristics", list()))
        sra_source['attributes'] = sra_source['attributes'] + self.get_attributes(
            source.get("factorValues", list()))

    for sample in sample_records:
        sra_sample = dict()
        sra_sample['sample_id'] = str(sample['_id'])
        sra_sample['name'] = sample['name']
        sra_sample['attributes'] = self.get_attributes(sample.get("characteristics", list()))
        sra_sample['attributes'] = sra_sample['attributes'] + self.get_attributes(
            sample.get("factorValues", list()))

        # retrieve sample source
        source_id = sample.get("derivesFrom", list())
        source_id = source_id[0] if source_id else ''

        sample_source = sra_sources.get(source_id, dict())

        if sample_source:
            sra_sample['attributes'].append(
                dict(tag="Source Name", value=sample_source.get("name", str())))
        else:
            self.__converter_errors.append("Sample: " + sample['name'] + " has no source information")

        if sample_source.get("taxon_id", str()):
            sra_sample['taxon_id'] = sample_source.get("taxon_id", str())
        else:
            self.__converter_errors.append(
                "Sample: " + sample['name'] + " has no TAXON_ID. Please make sure an organism has "
                "been set for the source of this sample from the NCBITAXON ontology.")

        if sample_source.get("scientific_name", str()):
            sra_sample['scientific_name'] = sample_source.get("scientific_name", str())
        else:
            self.__converter_errors.append(
                "Sample: " + sample['name'] + " has no SCIENTIFIC_NAME. Please make sure an organism has "
                "been set for the source of this sample from an ontology.")

        if sample_source.get("attributes", list()):
            sra_sample['attributes'] = sra_sample['attributes'] + sample_source.get("attributes", list())

        sra_samples.append(sra_sample)

    return sra_samples

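# Standalone sketch of the attribute-flattening pattern used in get_sra_samples
# (the data is made up): 'description.attributes' maps wizard-stage keys to small
# dicts, which are merged into one flat dict per datafile before framing with pandas.
def _example_flatten_attributes():
    description_attributes = {
        "attach_samples": {"study_samples": ["5f1e0000aa00bb00cc00dd00"]},  # hypothetical id
        "library_meta": {"library_layout": "PAIRED"},
    }
    flat = dict()
    for part in description_attributes.values():
        flat.update(part)
    return flat  # {'study_samples': [...], 'library_layout': 'PAIRED'}
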
def generate_server_side_table_records(profile_id=str(), component=str(), request=dict()):
    # function generates component records for building a UI table using server-side processing
    # - please note that for effective data display,
    # all array and object-type fields (e.g., characteristics) are deferred to sub-table display.
    # please define such in the schema as "show_in_table": false and "show_as_attribute": true

    data_set = list()

    n_size = int(request.get("length", 10))  # assumes 10 records per page if length not set
    draw = int(request.get("draw", 1))
    start = int(request.get("start", 0))

    # instantiate data access object
    da_object = DAComponent(profile_id, component)

    return_dict = dict()

    records_total = da_object.get_collection_handle().count(
        {'profile_id': profile_id, 'deleted': data_utils.get_not_deleted_flag()})

    # retrieve and process records
    filter_by = dict()

    if component == "datafile":
        # get all active bundles in the profile
        existing_bundles = Description().get_all_records_columns(
            projection=dict(_id=1),
            filter_by=dict(profile_id=profile_id, component=component))
        existing_bundles = [str(x["_id"]) for x in existing_bundles]

        records_total = da_object.get_collection_handle().count({"$and": [
            {"profile_id": profile_id, 'deleted': data_utils.get_not_deleted_flag()},
            {"$or": [
                {"description_token": {"$in": [None, False, ""]}},
                {"description_token": {"$nin": existing_bundles}}]}
        ]})

        filter_by = {"$or": [
            {"description_token": {"$in": [None, False, ""]}},
            {"description_token": {"$nin": existing_bundles}}]}

    # get and filter schema elements based on displayable columns
    schema = [x for x in da_object.get_schema().get("schema_dict") if x.get("show_in_table", True)]

    # build db column projection
    projection = [(x["id"].split(".")[-1], 1) for x in schema]

    # order by
    sort_by = request.get('order[0][column]', '0')
    sort_by = request.get('columns[' + sort_by + '][data]', '')
    sort_direction = request.get('order[0][dir]', 'asc')

    sort_by = '_id' if not sort_by else sort_by
    sort_direction = 1 if sort_direction == 'asc' else -1

    # search
    search_term = request.get('search[value]', '').strip()

    records = da_object.get_all_records_columns_server(
        sort_by=sort_by, sort_direction=sort_direction, search_term=search_term,
        projection=dict(projection), limit=n_size, skip=start, filter_by=filter_by)

    records_filtered = records_total

    if search_term:
        records_filtered = da_object.get_collection_handle().count(
            {'profile_id': profile_id,
             'deleted': data_utils.get_not_deleted_flag(),
             'name': {'$regex': search_term, "$options": 'i'}})

    if records:
        df = pd.DataFrame(records)

        df['record_id'] = df._id.astype(str)
        df["DT_RowId"] = df.record_id
        df.DT_RowId = 'row_' + df.DT_RowId
        df = df.drop('_id', axis='columns')

        for x in schema:
            x["id"] = x["id"].split(".")[-1]
            df[x["id"]] = df[x["id"]].apply(resolve_control_output_apply, args=(x,)).astype(str)

        data_set = df.to_dict('records')

    return_dict["records_total"] = records_total
    return_dict["records_filtered"] = records_filtered
    return_dict["data_set"] = data_set
    return_dict["draw"] = draw

    return return_dict

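# Sketch of the DataTables server-side request this function consumes (the
# parameter names follow the DataTables protocol; the ids and values here are
# hypothetical): 'order[0][column]' is resolved through 'columns[i][data]' to a
# sortable field id, exactly as done above.
def _example_datatables_request():
    request = {
        "draw": "3",                 # echoed back so the client can match responses
        "start": "20",               # offset of the first record on this page
        "length": "10",              # page size
        "search[value]": "lib",      # global search box contents
        "order[0][column]": "1",     # index of the ordered column
        "order[0][dir]": "desc",     # sort direction
        "columns[1][data]": "name",  # maps that column index to a field id
    }
    return generate_server_side_table_records(
        profile_id="5d41402abc4b2a76b9719d91", component="sample", request=request)
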
def perform_datafile_pairing(self, next_stage_index):
    """
    stage callback function: determines if the pairing of datafiles
    should be performed given the 'library_layout'
    :param next_stage_index:
    :return:
    """
    description = Description().GET(self.__wzh.description_token)
    stages = description["stages"]
    attributes = description["attributes"]
    meta = description.get("meta", dict())

    # validate stage
    stage = dict()
    if next_stage_index < len(stages):
        stage = stages[next_stage_index]

    # first, target repository
    relevant_repos = ["ena"]  # add a repo to this list if it requires datafile pairing

    target_repository = attributes.get("target_repository", dict()).get("deposition_context", str())

    if target_repository not in relevant_repos:
        # no items to pair, clear any previous pairing information
        self.remove_pairing_info(stage["ref"], attributes, meta)
        return False

    # get records in bundle
    records = cursor_to_list(DataFile().get_collection_handle().find(
        {"$and": [
            {"description_token": self.__wzh.description_token,
             'deleted': d_utils.get_not_deleted_flag()},
            {'description.attributes': {"$exists": True}}]},
        {'description.attributes': 1, 'name': 1}))

    if not records:
        # no items to pair, clear any previous pairing information
        self.remove_pairing_info(stage["ref"], attributes, meta)
        return False

    for rec in records:
        datafile_attributes = [v for k, v in rec['description'].get('attributes', dict()).items()]
        new_dict = dict()
        for d in datafile_attributes:
            new_dict.update(d)

        rec['attributes'] = new_dict
        rec['pairing'] = rec['attributes'].get('library_layout', '').upper()

    df = pd.DataFrame(records)
    df._id = df['_id'].astype(str)
    df.index = df._id

    df = df[df['pairing'] == 'PAIRED']

    if not len(df):
        # no items to pair, clear any previous pairing information
        self.remove_pairing_info(stage["ref"], attributes, meta)
        return False

    # remove extraneous columns
    df = df.drop(columns=['description'])

    if not len(df) % 2 == 0:
        stage["error"] = "Pairing requires even number of datafiles!"
        stage["refresh_wizard"] = True
    else:
        # get previous pairing candidates
        paired_candidates_old = meta.get(stage["ref"] + "_paired_candidates", list())
        paired_candidates = list(df.index)

        paired_candidates_old.sort()
        paired_candidates.sort()

        if not paired_candidates_old == paired_candidates:
            stage["refresh_wizard"] = True

        # if there's a valid stored map, use it
        stage_data = list()
        saved_copy = attributes.get(stage["ref"], list())

        if saved_copy:
            stored_pairs_df = pd.DataFrame(saved_copy)
            stored_pairs_list = list(stored_pairs_df._id) + list(stored_pairs_df._id2)
            stored_pairs_list.sort()

            if stored_pairs_list == paired_candidates:
                df_dict = df.to_dict()
                df_dict = df_dict["name"]

                stored_pairs_df["name"] = stored_pairs_df['_id'].apply(lambda x: str(df_dict[x]))
                stored_pairs_df["name2"] = stored_pairs_df['_id2'].apply(lambda x: str(df_dict[x]))

                df_result = stored_pairs_df[['name', 'name2']]
                df_result.columns = ['file1', 'file2']
                stage_data = df_result.to_dict('records')

        if not stage_data:
            # define fresh pairing map
            # sort by file name to reflect pairing
            df = df.sort_values(by=['name'])

            s_even = df._id.iloc[1::2]
            s_odd = df._id.iloc[::2]

            df_odd = df[df.index.isin(s_odd)].copy()
            df_even = df[df.index.isin(s_even)].copy()

            df_even['_id2'] = df_even['_id']
            df_even['name2'] = df_even['name']
            df_even = df_even[['_id2', 'name2']]
            df_odd = df_odd[['_id', 'name']]

            df_odd.index = range(0, len(df_odd))
            df_even.index = range(0, len(df_even))

            df_result = pd.concat([df_odd, df_even], axis=1).reindex(df_odd.index)

            saved_copy = df_result[['_id', '_id2']].to_dict('records')

            df_result = df_result[['name', 'name2']]
            df_result.columns = ['file1', 'file2']
            stage_data = df_result.to_dict('records')

        stage["data"] = stage_data

        # save state
        attributes[stage["ref"]] = saved_copy
        meta[stage["ref"] + "_paired_candidates"] = paired_candidates

        save_dict = dict(attributes=attributes, meta=meta)
        Description().edit_description(self.__wzh.description_token, save_dict)

    stage["message"] = self.__wzh.wiz_message["datafiles_pairing_message"]["text"]

    return stage
