def CreateMetadata(pd_seq_list, metadata_destination, extract_all_samples,
                   pd_envoi_qenome_quebec, pd_sgil_extract):
    """Assemble the metadata table for the requested sequences.

    The sample list is derived from ``pd_seq_list`` with ``BuildSeqList`` and
    used to query ``MySQLcovid19Selector.GetMetadataAsPdDataFrame``. When both
    ``pd_envoi_qenome_quebec`` and ``pd_sgil_extract`` are provided, samples
    reported missing by ``CheckMissingSpec`` are looked up first in the SGIL
    extract and then in the Envois Genome Quebec table, and the rows found are
    appended to the metadata.

    After every change to the table the ``'LSPQ-'`` prefix is removed from the
    ``sample`` column and the missing-sample check is run again. Finally
    ``sample_date`` is rewritten as ``YYYY-MM-DD`` text and only the first row
    per ``sample`` is kept.

    Returns:
        A two-element list: the metadata DataFrame and the result of the last
        ``CheckMissingSpec`` call.
    """
    MySQLcovid19.SetConnection()
    wanted_samples = BuildSeqList(pd_seq_list)

    def _strip_prefix_and_check(frame):
        # Drop the 'LSPQ-' prefix in place, then recompute what is still missing.
        frame['sample'] = frame['sample'].str.replace('LSPQ-', '')
        return CheckMissingSpec(frame, wanted_samples)

    metadata = MySQLcovid19Selector.GetMetadataAsPdDataFrame(
        MySQLcovid19.GetConnection(),
        wanted_samples,
        metadata_destination,
        extract_all_samples,
    )
    still_missing = _strip_prefix_and_check(metadata)

    if pd_envoi_qenome_quebec is not None and pd_sgil_extract is not None:
        # First fallback: rows recovered from the SGIL extract.
        rows_from_sgil = AddMissingFromSgilExtract(
            still_missing, pd_sgil_extract, metadata.columns)
        print(rows_from_sgil)
        metadata = pd.concat([metadata, rows_from_sgil])
        still_missing = _strip_prefix_and_check(metadata)

        # Second fallback: rows recovered from the Envois Genome Quebec table.
        rows_from_envois = AddMissingFromEnvoisGenomeQuebec(
            still_missing, pd_envoi_qenome_quebec, metadata.columns)
        metadata = pd.concat([metadata, rows_from_envois])
        still_missing = _strip_prefix_and_check(metadata)

    # Parse the dates, then store them back as ISO-formatted strings.
    metadata['sample_date'] = pd.to_datetime(metadata.sample_date)
    metadata['sample_date'] = metadata['sample_date'].dt.strftime('%Y-%m-%d')

    metadata = metadata.drop_duplicates(subset='sample', keep='first')
    return [metadata, still_missing]
def Main():
    """Register every entry of the base directory as a plate and build reports.

    Sets the module-level ``basedir`` from ``PlateDirManager.GetBaseDir(_DEBUG)``,
    adds each name returned by ``os.listdir(basedir)`` to a fresh
    ``PlateManager``, and hands that manager to ``BuildSeqReports``.
    """
    global basedir

    MySQLcovid19.SetConnection()
    basedir = PlateDirManager.GetBaseDir(_DEBUG)

    plates = PlateManager()
    for entry_name in os.listdir(basedir):
        plates.AddPlate(entry_name)

    BuildSeqReports(plates)
def GetQcDataframeFromDSPdb(id_list):
    """Return the metadata DataFrame for ``id_list``.

    Calls ``MySQLcovid19.SetConnection()`` and then forwards the connection and
    ``id_list`` to ``MySQLcovid19Selector.GetMetadataAsPdDataFrame``.
    """
    MySQLcovid19.SetConnection()
    connection = MySQLcovid19.GetConnection()
    return MySQLcovid19Selector.GetMetadataAsPdDataFrame(connection, id_list)
def GetSampleDate(self):
    """Return the sample date looked up for this object's sample name.

    The name comes from ``self.sample.GetSampleName()`` and is passed, together
    with the cursor from ``MySQLcovid19.GetCursor()``, to
    ``MySQLcovid19Selector.GetSampleDate``.
    """
    cursor = MySQLcovid19.GetCursor()
    sample_name = self.sample.GetSampleName()
    return MySQLcovid19Selector.GetSampleDate(cursor, sample_name)
def CreateMetadataForAllDspDbSamples(metadata_destination):
    """Return the metadata DataFrame produced for ``metadata_destination``.

    ``MySQLcovid19Selector.GetMetadataAsPdDataFrame`` is called with a sample
    list holding a single empty string and ``True`` as its fourth argument
    (presumably the "extract all samples" flag; confirm against the selector).
    """
    MySQLcovid19.SetConnection()
    connection = MySQLcovid19.GetConnection()
    placeholder_ids = [""]
    return MySQLcovid19Selector.GetMetadataAsPdDataFrame(
        connection, placeholder_ids, metadata_destination, True)
def GetMetadataDfFromCovBank(id_list):
    """Return the metadata DataFrame for ``id_list``.

    Calls ``MySQLcovid19.SetConnection()`` and then forwards the connection and
    ``id_list`` to ``MySQLcovid19Selector.GetMetadataAsPdDataFrame``.
    """
    MySQLcovid19.SetConnection()
    connection = MySQLcovid19.GetConnection()
    return MySQLcovid19Selector.GetMetadataAsPdDataFrame(connection, id_list)
def CreateMetadata(self, max_sample_date, tolerated_rej_samples):
    """Build ``self.pd_metadata`` for the samples in ``self.samples_list``.

    Steps, in order:

    1. Re-order ``self.samples_list`` by the identifiers returned from
       ``GetUpperCorrectedSamplesList`` and record the identifier -> sample
       mapping in ``self.mysqlId_to_belugalistId``.
    2. Fetch metadata from the database, then append rows for samples that
       ``CheckMissingSpec`` still reports missing, first from
       ``self.pd_sgil_extract`` and then from ``self.pd_envoi_qenome_quebec``.
    3. Normalise ``sample_date``, keep the first row per sample, and split off
       samples whose ``rss`` is ``'INDETERMINE'``.
    4. Filter by date and, when ``_outbreak_`` is truthy, by outbreak status.

    Results are stored on the instance: ``pd_metadata``,
    ``pd_missing_samples``, ``pd_samples_missing_rss``, ``samples_list`` and
    ``mysqlId_to_belugalistId``.

    Args:
        max_sample_date: Inclusive upper bound compared against the
            ``sample_date`` column.
        tolerated_rej_samples: Values tested with ``Series.isin`` against the
            ``sample`` column; only used when ``_outbreak_`` is truthy.
    """
    MySQLcovid19.SetConnection()
    # Lower bound for the date filter below: 2020-01-01 00:00.
    year_2020 = datetime.datetime.strptime("2020", "%Y")
    # NOTE(review): this list is never read afterwards in this method; it is
    # kept because GetSGILfoldernoFromOrdno may have side effects - confirm.
    sgil_corrected_samples_list = list(
        map(self.GetSGILfoldernoFromOrdno, self.samples_list))
    upper_corrected_samples_list = list(
        map(self.GetUpperCorrectedSamplesList, self.samples_list))
    # Pair each corrected identifier with its original entry and sort the
    # pairs, so the mapping and the re-ordered sample list share one order.
    zip_list = list(zip(upper_corrected_samples_list, self.samples_list))
    zip_list.sort()
    self.mysqlId_to_belugalistId = dict(zip_list)
    self.samples_list = [x[1] for x in zip_list]
    self.pd_metadata = MySQLcovid19Selector.GetMetadataAsPdDataFrame(
        MySQLcovid19.GetConnection(), upper_corrected_samples_list, 'LSPQ',
        False)
    #self.pd_metadata['sample'] = self.pd_metadata['sample'].str.replace('LSPQ-','') not necessary
    self.pd_metadata['sample'] = self.pd_metadata['sample'].str.strip(' ')
    # Replace each database identifier with the value returned by
    # GetBelugaIdFromMySQLid, going through a temporary column.
    self.pd_metadata['temp'] = self.pd_metadata['sample'].apply(
        self.GetBelugaIdFromMySQLid)
    self.pd_metadata['sample'] = self.pd_metadata['temp']
    self.pd_metadata = self.pd_metadata.drop(columns=['temp'])
    # First fallback: samples still missing are looked up in the SGIL extract.
    pd_missing_samples = self.CheckMissingSpec(self.pd_metadata,
                                               self.samples_list)
    pd_missing_get_from_sgil_extract = self.AddMissingFromSgilExtract(
        pd_missing_samples, self.pd_sgil_extract, self.pd_metadata.columns)
    self.pd_metadata = pd.concat(
        [self.pd_metadata, pd_missing_get_from_sgil_extract])
    #self.pd_metadata.to_csv("/home/[email protected]/temp/20201111/test.tsv",sep="\t",index=False)
    #self.pd_metadata['sample'] = self.pd_metadata['sample'].str.replace('LSPQ-','')
    self.pd_metadata['sample'] = self.pd_metadata['sample'].str.strip(' ')
    # Second fallback: whatever is still missing is looked up in the
    # Envois Genome Quebec table.
    pd_missing_samples = self.CheckMissingSpec(self.pd_metadata,
                                               self.samples_list)
    pd_missing_get_from_EnvoisGenomeQuebec = self.AddMissingFromEnvoisGenomeQuebec(
        pd_missing_samples, self.pd_envoi_qenome_quebec,
        self.pd_metadata.columns)
    self.pd_metadata = pd.concat(
        [self.pd_metadata, pd_missing_get_from_EnvoisGenomeQuebec])
    #self.pd_metadata['sample'] = self.pd_metadata['sample'].str.replace('LSPQ-','') not necessary
    self.pd_metadata['sample'] = self.pd_metadata['sample'].str.strip(' ')
    # Final missing-sample report, kept on the instance.
    self.pd_missing_samples = self.CheckMissingSpec(
        self.pd_metadata, self.samples_list)
    # Parse the dates and store them as 'YYYY-MM-DD' strings; they are cast
    # back to datetime64 further down, before the date comparisons.
    self.pd_metadata['sample_date'] = pd.to_datetime(
        self.pd_metadata.sample_date)
    self.pd_metadata['sample_date'] = self.pd_metadata[
        'sample_date'].dt.strftime('%Y-%m-%d')
    self.pd_metadata = self.pd_metadata.drop_duplicates(subset='sample',
                                                        keep='first')
    # The concatenations above kept each frame's own index; renumber it.
    self.pd_metadata.reset_index(drop=True, inplace=True)
    # Record the samples whose 'rss' is 'INDETERMINE', then drop them from
    # the metadata.
    self.pd_samples_missing_rss = self.pd_metadata.loc[
        self.pd_metadata['rss'] == 'INDETERMINE', ['sample']]
    self.pd_metadata = self.pd_metadata.loc[
        self.pd_metadata['rss'] != 'INDETERMINE', :]
    #print(self.pd_metadata.index)
    #print(self.pd_metadata[self.pd_metadata.index.duplicated()])
    # Append '2D' to every sample name containing 'HGA-'.
    # NOTE(review): this assigns into a frame obtained from a boolean .loc
    # selection just above, which may emit pandas' SettingWithCopyWarning.
    self.pd_metadata.loc[self.pd_metadata['sample'].str.contains('HGA-'),
                         'sample'] = self.pd_metadata['sample'] + '2D'
    self.pd_metadata = self.pd_metadata.sort_values(by=['sample'])
    self.pd_metadata['sample_date'] = self.pd_metadata[
        'sample_date'].astype('datetime64[ns]')
    #print(self.pd_metadata.dtypes)
    #self.pd_metadata = self.pd_metadata.loc[(self.pd_metadata['sample_date'] <= max_sample_date) & (self.pd_metadata['sample_date'] >= year_2020 ),:]
    # Keep rows dated within [min_sample_date, max_sample_date] and not
    # earlier than 2020.
    # NOTE(review): min_sample_date is neither a parameter nor a local here;
    # presumably a module-level global - confirm it is defined.
    self.pd_metadata = self.pd_metadata.loc[
        (self.pd_metadata['sample_date'] >= min_sample_date) &
        (self.pd_metadata['sample_date'] <= max_sample_date) &
        (self.pd_metadata['sample_date'] >= year_2020), :]
    #print(self.pd_metadata)
    #print(self.pd_samples_missing_rss)
    # Null or 'NA' outbreak values are replaced with 'NoOutbreakRelated'.
    self.pd_metadata.loc[(self.pd_metadata['OUTBREAK'].isnull()) |
                         (self.pd_metadata['OUTBREAK'] == 'NA'),
                         ['OUTBREAK']] = 'NoOutbreakRelated'
    # _outbreak_ is not defined in this method (presumably a module-level
    # flag). When truthy, drop rows that are both 'NoOutbreakRelated' and
    # listed in tolerated_rej_samples.
    if _outbreak_:
        self.pd_metadata = self.pd_metadata.loc[~(
            (self.pd_metadata['OUTBREAK'] == 'NoOutbreakRelated') &
            (self.pd_metadata['sample'].isin(tolerated_rej_samples))), :]