def test_duplicate_column(self):
    """Check the column index parsed from two platform files on disk.

    The expected names include suffixed entries (``SPOT_ID.1``,
    ``ENSEMBL_ID.1``, ``ENSEMBL_ID.2``) where a column name occurs more
    than once.
    """
    expected_by_file = [
        ("GPL4133.txt",
         ['ID', 'COL', 'ROW', 'NAME', 'SPOT_ID', 'CONTROL_TYPE', 'REFSEQ',
          'GB_ACC', 'GENE', 'GENE_SYMBOL', 'GENE_NAME', 'UNIGENE_ID',
          'ENSEMBL_ID', 'TIGR_ID', 'ACCESSION_STRING',
          'CHROMOSOMAL_LOCATION', 'CYTOBAND', 'DESCRIPTION', 'GO_ID',
          'SEQUENCE', 'SPOT_ID.1', 'ORDER']),
        ("GPL4134.txt",
         ['ID', 'COL', 'ROW', 'NAME', 'SPOT_ID', 'CONTROL_TYPE',
          'ENSEMBL_ID', 'GB_ACC', 'GENE', 'GENE_SYMBOL', 'ENSEMBL_ID.1',
          'UNIGENE_ID', 'ENSEMBL_ID.2', 'TIGR_ID', 'ACCESSION_STRING',
          'CHROMOSOMAL_LOCATION', 'CYTOBAND', 'DESCRIPTION', 'GO_ID',
          'SEQUENCE', 'SPOT_ID.1', 'ORDER']),
    ]
    # Parse and check one file at a time, in the same order as before.
    for filename, expected_columns in expected_by_file:
        platform = GEO.get_GEO(filepath=join(download_geo, filename))
        self.assertEqual(list(platform.columns.index), expected_columns)
def test_get_geo_and_data_with_annotations(self):
    """Fetch GPL96 with ``annotate_gpl=True`` and check identity and table size."""
    gpl = GEO.get_GEO(geo="GPL96", destdir=download_geo, annotate_gpl=True)
    # assertIsInstance reports the actual type on failure, unlike
    # assertTrue(isinstance(...)).
    self.assertIsInstance(gpl, GPL)
    self.assertEqual(gpl.name, "GPL96")
    self.assertEqual(gpl.get_metadata_attribute('platform'), "GPL96")
    self.assertEqual(len(gpl.table.index), 22283)
    self.assertEqual(len(gpl.columns), 21)
def test_pivot_samples(self):
    """Compare ``pivot_samples("VALUE")`` with the stored reference table."""
    family_path = join(download_geo, "soft_ex_family.txt")
    reference_path = join(download_geo, "test_sample_pivoted_by_value.tab")

    gse = GEO.get_GEO(filepath=family_path, geotype="GSE")
    expected = read_table(reference_path, index_col=0)
    # The reference file carries no name for the column axis; set it so the
    # frames compare equal.
    expected.columns.name = 'name'

    pivoted = gse.pivot_samples("VALUE")
    assert_frame_equal(pivoted, expected)
def test_get_geo_and_data(self):
    """Fetch GPL96 without annotations and check identity and table size."""
    gpl = GEO.get_GEO(geo="GPL96", destdir=download_geo)
    # assertIsInstance reports the actual type on failure.
    self.assertIsInstance(gpl, GPL)
    self.assertEqual(gpl.name, "GPL96")
    self.assertEqual(gpl.get_accession(), "GPL96")
    self.assertEqual(len(gpl.table.index), 22283)
    self.assertEqual(len(gpl.columns), 16)
def test_download_SRA_parallel_by_sra(self):
    """Download SRA files for three filtered samples through ``GSE.download_SRA``.

    Checks that exactly the three expected GSMs come back and that every
    reported SRA path exists on disk.
    """
    geo_id = 'GSE63525'  # Hi-C dataset from Rao et al.

    def filterby(x):
        # Keep only the samples whose title mentions one of three libraries.
        title = x.metadata['title'][0]
        return any(tag in title for tag in ('HIC173', 'HIC174', 'HIC175'))

    destdir = "./TMP_SOFT_parallel_by_sra"
    gse = GEO.get_GEO(geo=geo_id, destdir=destdir)
    downloaded_paths = gse.download_SRA(
        "*****@*****.**",  # some unused e-mail
        directory=destdir,
        filetype='sra',
        filterby=filterby,
        silent=True,
        keep_sra=True,
        nproc=3)
    print(downloaded_paths)

    self.assertTrue(isdir(destdir))
    self.assertEqual(len(downloaded_paths), 3)
    # assertIn names the missing key on failure.
    for gsm_name in downloaded_paths:
        self.assertIn(gsm_name, gse.gsms)
    for gsm_name in ['GSM1551718', 'GSM1551719', 'GSM1551720']:
        self.assertIn(gsm_name, downloaded_paths)
    for gsm_name in downloaded_paths:
        for sra_file in downloaded_paths[gsm_name]['SRA']:
            self.assertTrue(isfile(sra_file))
def test_download_SRA_parallel_by_gsm(self):
    """Download SRA files sample by sample through ``GSM.download_SRA``.

    The series is filtered down to three samples, each is downloaded with
    ``nproc=3``, and the resulting paths are checked on disk.
    """
    geo_id = 'GSE63525'  # Hi-C dataset from Rao et al.

    def filterby(x):
        # Keep only the samples whose title mentions one of three libraries.
        title = x.metadata['title'][0]
        return any(tag in title for tag in ('HIC173', 'HIC174', 'HIC175'))

    destdir = "./TMP_SOFT_parallel_by_gsm"
    gse = GEO.get_GEO(geo=geo_id, destdir=destdir)
    gsms_to_use = [gsm for gsm in gse.gsms.values() if filterby(gsm)]

    downloaded_paths = dict()
    for gsm in gsms_to_use:
        downloaded_paths[gsm.name] = gsm.download_SRA(
            "*****@*****.**",  # some unused e-mail
            directory=destdir,
            nproc=3,
            return_list=False,
            filetype='sra',
            silent=True,
            keep_sra=True)

    self.assertTrue(isdir(destdir))
    self.assertEqual(len(downloaded_paths), 3)
    # assertIn names the missing key on failure.
    for gsm_name in downloaded_paths:
        self.assertIn(gsm_name, gse.gsms)
    for gsm_name in ['GSM1551718', 'GSM1551719', 'GSM1551720']:
        self.assertIn(gsm_name, downloaded_paths)
    for gsm_name in downloaded_paths:
        for sra_file in downloaded_paths[gsm_name]['SRA']:
            self.assertTrue(isfile(sra_file))
def test_get_geo_and_data(self):
    """Fetch GSM11805 and check its type, accession, table size and metadata count."""
    gsm = GEO.get_GEO(geo="GSM11805", destdir=download_geo)
    # assertIsInstance reports the actual type on failure.
    self.assertIsInstance(gsm, GSM)
    self.assertEqual(gsm.get_accession(), "GSM11805")
    self.assertEqual(len(gsm.table.index), 22283)
    self.assertEqual(len(gsm.columns), 3)
    self.assertEqual(len(gsm.metadata.keys()), 28)
def test_get_geo_and_data(self):
    """Fetch GDS507 and check table size, metadata counts and subset types."""
    gds = GEO.get_GEO(geo="GDS507", destdir=download_geo)
    # assertIsInstance reports the actual type on failure.
    self.assertIsInstance(gds, GDS)
    self.assertEqual(len(gds.table.index), 22645)
    self.assertEqual(len(gds.table.columns), 19)
    # we omit DATABASE and SUBSET ! entries
    self.assertEqual(len(gds.metadata.keys()), 16)
    self.assertEqual(len(gds.database.metadata.keys()), 5)
    for subset_name, subset in iteritems(gds.subsets):
        self.assertEqual(len(subset.metadata.keys()), 4)
        self.assertIsInstance(subset, GDSSubset)
def test_merge_and_average(self):
    """Check ``merge_and_average`` against a stored table and its error cases.

    A platform passed as a name that is not in the series must raise
    ``KeyError``; a platform passed as a list must raise ``ValueError``.
    """
    gse = GEO.get_GEO(filepath=join(download_geo, "soft_ex_family.txt"),
                      geotype="GSE")
    result = read_table(
        join(download_geo, "test_merged_by_id_and_averaged_by_gb_acc.tab"),
        index_col=0)
    # gse.gsms is a dict so the columns might be in different order.
    # `.loc` replaces `.ix`, which has been removed from pandas; both select
    # by label here.
    result = result.loc[sorted(result.index), sorted(result.columns)]

    # `next(iter(...))` picks the first platform on Python 2 and 3 alike;
    # `keys()[0]` fails on Python 3 because dict views are not indexable.
    first_gpl = gse.gpls[next(iter(gse.gpls))]
    merged = gse.merge_and_average(first_gpl, "VALUE", "GB_ACC",
                                   gpl_on="ID", gsm_on="ID_REF")
    # gse.gsms is a dict so the columns might be in different order.
    merged = merged[sorted(merged.columns)]
    assert_frame_equal(merged, result)

    with self.assertRaises(KeyError):
        gse.merge_and_average("platform", "VALUE", "GB_ACC",
                              gpl_on="ID", gsm_on="ID_REF")
    with self.assertRaises(ValueError):
        gse.merge_and_average(["platform"], "VALUE", "GB_ACC",
                              gpl_on="ID", gsm_on="ID_REF")
def test_pivot_and_annotate(self):
    """Check ``pivot_and_annotate`` with a GPL object, a raw table and a bad type.

    Both the GPL object and its ``table`` must give the stored reference
    frame; a plain string as ``gpl`` must raise ``TypeError``.
    """
    family_path = join(download_geo, "soft_ex_family.txt")
    gse = GEO.get_GEO(filepath=family_path, geotype="GSE")
    first_platform_name = next(iter(gse.gpls))
    gpl = gse.gpls[first_platform_name]

    reference_path = join(
        download_geo,
        "test_sample_pivoted_by_value_and_annotated_by_gbacc.tab")
    result = read_table(reference_path, index_col=0)
    result.columns.name = 'name'

    # Annotate with the GPL object itself.
    pivoted = gse.pivot_and_annotate(values="VALUE",
                                     gpl=gpl,
                                     annotation_column="GB_ACC")
    assert_frame_equal(result, pivoted)

    # Annotate with the platform's raw table.
    pivoted_from_table = gse.pivot_and_annotate(values="VALUE",
                                                gpl=gpl.table,
                                                annotation_column="GB_ACC")
    assert_frame_equal(pivoted_from_table, result)

    with self.assertRaises(TypeError):
        gse.pivot_and_annotate(values="VALUE",
                               gpl="gpl",
                               annotation_column="GB_ACC")
def test_annotate(self):
    """Check ``GSM.annotate`` with a GPL object, a raw table, a bad type and in place."""
    family_path = join(download_geo, "soft_ex_family.txt")
    gse = GEO.get_GEO(filepath=family_path, geotype="GSE")
    sample_title = "Triple-Fusion Transfected Embryonic Stem Cells Replicate 1"
    gsm = gse.gsms[sample_title]
    result = read_table(join(download_geo, "test_gsm_annotated.tab"))
    gpl = gse.gpls[next(iter(gse.gpls))]

    # Both ways of passing the platform must give the same annotated frame.
    annotated_from_gpl = gsm.annotate(gpl, annotation_column="GB_ACC")
    assert_frame_equal(result, annotated_from_gpl)
    annotated_from_table = gsm.annotate(gpl.table, annotation_column="GB_ACC")
    assert_frame_equal(result, annotated_from_table)

    with self.assertRaises(TypeError):
        gsm.annotate("platform", annotation_column="GB_ACC")

    # With in_place=True the sample's own table is replaced.
    gsm.annotate(gpl.table, annotation_column="GB_ACC", in_place=True)
    assert_frame_equal(result, gsm.table)
def test_soft_format_gse(self):
    """Fetch GSE1563 and check sample/platform counts, types and table lengths.

    Written to run on both Python 2 and Python 3: ``print`` is called as a
    function, the first key is taken with ``next(iter(...))`` instead of
    ``keys()[0]``, and ``.values()`` replaces ``.iteritems()``.
    """
    print(download_geo)
    gse = GEO.get_GEO(geo="GSE1563", destdir=download_geo)
    self.assertIsInstance(gse, GSE)
    self.assertEqual(gse.get_accession(), "GSE1563")
    self.assertEqual(len(gse.gsms.keys()), 62)
    self.assertEqual(len(gse.gpls.keys()), 1)

    first_gpl = gse.gpls[next(iter(gse.gpls))]
    first_gsm = gse.gsms[next(iter(gse.gsms))]
    self.assertEqual(len(first_gpl.table.index), 12625)
    self.assertEqual(len(first_gsm.table.index), 12625)

    for gsm in gse.gsms.values():
        self.assertEqual(len(gsm.table.index), 12625)
        self.assertIsInstance(gsm, GSM)
    for gpl in gse.gpls.values():
        self.assertEqual(len(gpl.table.index), 12625)
        self.assertIsInstance(gpl, GPL)
def test_download_SRA(self):
    """Fetch GSE1563 and check sample/platform counts, types and table lengths.

    Despite its name, this test makes no ``download_SRA`` call; it only
    validates the parsed series.
    """
    gse = GEO.get_GEO(geo="GSE1563", destdir=download_geo)
    # assertIsInstance reports the actual type on failure.
    self.assertIsInstance(gse, GSE)
    self.assertEqual(gse.get_accession(), "GSE1563")
    self.assertEqual(len(gse.gsms.keys()), 62)
    self.assertEqual(len(gse.gpls.keys()), 1)

    first_gpl = gse.gpls[next(iter(gse.gpls))]
    first_gsm = gse.gsms[next(iter(gse.gsms))]
    self.assertEqual(len(first_gpl.table.index), 12625)
    self.assertEqual(len(first_gsm.table.index), 12625)

    for gsm_name, gsm in iteritems(gse.gsms):
        self.assertEqual(len(gsm.table.index), 12625)
        self.assertIsInstance(gsm, GSM)
    for gpl_name, gpl in iteritems(gse.gpls):
        self.assertEqual(len(gpl.table.index), 12625)
        self.assertIsInstance(gpl, GPL)
def test_get_geo_gpl_partially(self):
    """Fetch GPL20082 with ``partial`` and check that only the listed samples load."""
    partial = [
        "GSM1662787",
        "GSM1662789",
        "GSM1662791",
        "GSM1859499",
    ]
    gpl = GEO.get_GEO(geo="GPL20082",
                      destdir=download_geo,
                      include_data=True,
                      partial=partial)
    self.assertIsInstance(gpl, GPL)
    self.assertEqual(gpl.get_accession(), "GPL20082")
    # assertIn names the unexpected sample on failure.
    for gsm in gpl.gsms:
        self.assertIn(gsm, partial)
    self.assertEqual(4, len(gpl.gsms))
def test_get_geo_gpl_sequencing(self):
    """Fetch GPL20082 with data and check its samples and per-series sample counts."""
    gpl = GEO.get_GEO(geo="GPL20082",
                      destdir=download_geo,
                      include_data=True)
    self.assertIsInstance(gpl, GPL)
    self.assertEqual(gpl.get_accession(), "GPL20082")
    samples = [
        "GSM1662787",
        "GSM1662788",
        "GSM1662789",
        "GSM1662790",
        "GSM1662791",
        "GSM1677167",
        "GSM1859499",
        "GSM1875285",
    ]
    # assertIn names the missing sample on failure.
    for sample in samples:
        self.assertIn(sample, gpl.gsms)
    self.assertEqual(6, len(gpl.gses["GSE68087"].gsms))
    self.assertEqual(2, len(gpl.gses["GSE67974"].gsms))
def GSEA(geo_ID, gene_list):
    """Return, as a string, a two-sample KS p-value for a gene list.

    The series ``geo_ID`` is downloaded into the current directory. Each
    sample gets a binary ``Type`` (1 when its phenotype ``description``
    contains "control", else 0). For every column of the transposed VALUE
    pivot table except the first three, the correlation between ``Type``
    and that column is computed. The correlations of columns named in
    ``gene_list`` are compared with all correlations using ``ks_2samp``.
    """
    gse = GEOparse.get_GEO(geo=geo_ID, destdir="./")
    expression = gse.pivot_samples('VALUE').T

    # Build {row position: {"Type": 0 or 1}} and turn it into a frame with
    # one row per sample.
    sample_types = {}
    for position, (_, row) in enumerate(gse.phenotype_data.iterrows()):
        is_control = "control" in row["description"]
        sample_types[position] = {"Type": 1 if is_control else 0}
    experiments = pd.DataFrame(sample_types).T

    all_genes_set = []
    all_corr_set = []
    genes_corr_set = []
    for position, gene in enumerate(expression):
        # The first three columns are skipped.
        if position < 3:
            continue
        all_genes_set.append(gene)
        corr_matrix = np.corrcoef(
            [list(experiments['Type']), list(expression[gene])])
        correlation = corr_matrix[0, 1]
        all_corr_set.append(correlation)
        if gene in gene_list:
            genes_corr_set.append(correlation)

    # ks_2samp returns (statistic, p-value); only the p-value is reported.
    p_value = ks_2samp(genes_corr_set, all_corr_set)[1]
    return str(p_value)
def test_no_table(self):
    """Parsing ``GSM2795971.txt`` as a GSM must not raise."""
    try:
        GEO.get_GEO(filepath=join(download_geo, 'GSM2795971.txt'),
                    geotype='GSM')
    except Exception:
        # Any exception while parsing is reported as a test failure.
        self.fail("No data in the file error.")
def handle(self, *args, **options):
    """Refreshes the metadata for all experiments, or experiments from a specific database

    Reads the optional ``source_database`` option. When it is given but is
    not one of ARRAY_EXPRESS, GEO or SRA, an error is logged and the process
    exits with status 1. Experiments are processed one page at a time with
    a five-minute sleep between pages. A failure on one experiment is
    logged and does not stop the run.
    """
    possible_source_databases = ["ARRAY_EXPRESS", "GEO", "SRA"]

    requested_database = options.get("source_database", None)
    if requested_database is None:
        experiments = Experiment.objects.all()
    elif requested_database in possible_source_databases:
        experiments = Experiment.objects.filter(
            source_database=requested_database)
    else:
        invalid_message = 'Invalid source database "{}"'.format(
            requested_database)
        choices_message = "\nPossible source databases: {}".format(
            ", ".join(possible_source_databases))
        logger.error(invalid_message + choices_message)
        sys.exit(1)

    paginator = PerformantPaginator(experiments, PAGE_SIZE)
    page = paginator.page()

    while True:
        for experiment in page.object_list:
            logger.debug("Refreshing metadata for an experiment.",
                         experiment=experiment.accession_code)
            try:
                if experiment.source_database == "SRA":
                    first_sample_accession = (
                        experiment.samples.first().accession_code)
                    metadata = SraSurveyor.gather_all_metadata(
                        first_sample_accession)
                    SraSurveyor._apply_metadata_to_experiment(
                        experiment, metadata)

                elif experiment.source_database == "GEO":
                    gse = GEOparse.get_GEO(
                        experiment.accession_code,
                        destdir="/tmp/management",
                        silent=True,
                    )
                    GeoSurveyor._apply_metadata_to_experiment(
                        experiment, gse)

                elif experiment.source_database == "ARRAY_EXPRESS":
                    request_url = EXPERIMENTS_URL + experiment.accession_code
                    session = utils.requests_retry_session()
                    experiment_request = session.get(request_url, timeout=60)
                    try:
                        payload = experiment_request.json()
                        parsed_json = payload["experiments"]["experiment"][0]
                    except KeyError:
                        # NOTE(review): self.survey_job is not assigned
                        # anywhere in the visible code -- confirm it exists
                        # on this command.
                        logger.error(
                            "Remote experiment has no Experiment data!",
                            experiment_accession_code=experiment.
                            accession_code,
                            survey_job=self.survey_job.id,
                        )
                        # Skips the save below for this experiment.
                        continue
                    ArrayExpressSurveyor._apply_metadata_to_experiment(
                        experiment, parsed_json)

                experiment.save()

            # If there are any errors, just continue. It's likely that it's
            # just a problem with this experiment.
            except Exception:
                logger.exception(
                    "exception caught while updating metadata for {}".format(
                        experiment.accession_code))

        if not page.has_next():
            break
        page = paginator.page(page.next_page_number())

        # 2000 samples queued up every five minutes should be fast
        # enough and also not thrash the DB.
        time.sleep(60 * 5)
def handle(self, *args, **options):
    """Re-surveys GEO experiments containing samples with incorrect platform information.

    GEO experiments not yet listed in ``CdfCorrectedAccession`` are fetched
    page by page. An experiment counts as wrong when any stored sample's
    ``platform_accession_code`` differs from the internal accession derived
    from the sample's GEO ``platform_id``. With ``options["dry_run"]`` set
    the action is only logged.
    """
    # Check against CDF corrected accessions table to prevent recorrection
    # of the same samples.
    already_corrected = CdfCorrectedAccession.objects.all().values(
        "accession_code")
    gse_experiments = Experiment.objects.filter(
        source_database="GEO").exclude(
            accession_code__in=already_corrected)

    paginator = Paginator(gse_experiments, PAGE_SIZE)
    page = paginator.page()

    while True:
        for experiment in page.object_list:
            try:
                gse = GEOparse.get_GEO(experiment.accession_code,
                                       destdir=GEO_TEMP_DIR,
                                       how="brief",
                                       silent=True)

                sample_accessions = list(gse.gsms.keys())
                samples = Sample.objects.filter(
                    accession_code__in=sample_accessions)

                # Stops at the first sample whose stored platform disagrees
                # with the platform reported by GEO.
                wrong_platform = any(
                    get_internal_microarray_accession(
                        gse.gsms[sample.accession_code]
                        .metadata["platform_id"][0])
                    != sample.platform_accession_code
                    for sample in samples)

                if wrong_platform:
                    if options["dry_run"]:
                        logger.info(
                            "Would have re-surveyed experiment with accession code %s",
                            experiment.accession_code,
                        )
                    else:
                        logger.info(
                            "Re-surveying experiment with accession code %s",
                            experiment.accession_code,
                        )
                        purge_experiment(experiment.accession_code)
                        queue_surveyor_for_accession(
                            experiment.accession_code)

                        # NOTE(review): the nesting of this bookkeeping was
                        # reconstructed from whitespace-collapsed source --
                        # confirm it belongs to the non-dry-run branch.
                        current_time = timezone.now()
                        CdfCorrectedAccession(
                            accession_code=experiment.accession_code,
                            created_at=current_time).save()
            except Exception:
                logger.exception("Caught an exception with %s!",
                                 experiment.accession_code)
            finally:
                # GEOparse leaves its downloaded archive behind, so remove
                # it after every experiment.
                download_path = (GEO_TEMP_DIR + experiment.accession_code +
                                 "_family.soft.gz")
                try:
                    os.remove(download_path)
                except Exception:
                    # Cleanup must never interrupt the run, e.g. when the
                    # path turns out to be a directory rather than a file.
                    logger.exception("Failed to delete an archive.")

        if not page.has_next():
            break
        page = paginator.page(page.next_page_number())
import GEOparse

# Directory passed to GEOparse as `destdir`.
cache = "E:/ncbigeo"

# Load the GPL77 platform record quietly (silent=True).
gpl_data = GEOparse.get_GEO(geo="GPL77", destdir=cache, silent=True)
import GEOparse
import pandas as pd

# Directory used both as the GEOparse destdir and for the output files.
working_dir = "/home/yunsunglee/data/second_project/"
target = 'GSE118260'
gse = GEOparse.get_GEO(target, destdir=working_dir)

### Pull phenotype data
info = gse.phenotype_data

### Stack genomic data
# Build one wide table: an ID_REF column plus one VALUE column per sample,
# each VALUE column named after its sample.
for one_person in info.index:
    if one_person == info.index[0]:
        # The first sample seeds the table.
        expr = gse.gsms[one_person].table[['ID_REF', 'VALUE']]
        expr.columns = ['ID_REF', one_person]
    else:
        t1 = gse.gsms[one_person].table[['ID_REF', 'VALUE']]
        t1.columns = ['ID_REF', one_person]
        # Left join: only ID_REFs already present in `expr` are kept.
        expr = expr.merge(t1, on="ID_REF", how="left")
    print(one_person)

### Save the resulting data in your working directory
# NOTE(review): `feather` is not imported in the visible part of this
# script; unless it is imported elsewhere these calls raise NameError.
feather.write_dataframe(expr, dest=working_dir + target + "_expr.feather")
feather.write_dataframe(info, dest=working_dir + target + "_info.feather")
def create_experiment_and_samples_from_api(
        self, experiment_accession_code) -> (Experiment, List[Sample]):
    """ The main surveyor - find the Experiment and Samples from NCBI GEO.

    Uses the GEOParse library, for which docs can be found here:
    https://geoparse.readthedocs.io/en/latest/usage.html#working-with-geo-objects

    Fetches the series in "brief" mode, creates the Experiment (with its
    annotation) if it does not exist yet, then creates a Sample for every
    GSM that does not exist yet, along with OriginalFile records for its
    supplementary files. Series-level supplementary files and the MINiML
    family file are associated afterwards, and the temp directory is
    removed.

    Returns the experiment and the list of samples created by this call.
    RNA-SEQ samples and samples that already existed are not in that list.

    NOTE(review): block nesting was reconstructed from whitespace-collapsed
    source -- confirm against the original file.
    """
    # Cleaning up is tracked here: https://github.com/guma44/GEOparse/issues/41
    gse = GEOparse.get_GEO(experiment_accession_code,
                           destdir=self.get_temp_path(),
                           how="brief",
                           silent=True)
    preprocessed_samples = harmony.preprocess_geo(gse.gsms.items())
    harmonized_samples = harmony.harmonize(preprocessed_samples)

    # Create the experiment object
    try:
        experiment_object = Experiment.objects.get(
            accession_code=experiment_accession_code)
        logger.debug(
            "Experiment %s already exists, skipping object creation.",
            experiment_accession_code,
            survey_job=self.survey_job.id)
    except Experiment.DoesNotExist:
        experiment_object = Experiment()
        experiment_object.accession_code = experiment_accession_code
        experiment_object.source_url = (
            "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=" +
            experiment_accession_code)
        experiment_object.source_database = "GEO"
        experiment_object.title = gse.metadata.get('title', [''])[0]
        experiment_object.description = gse.metadata.get('summary', [''])[0]

        # Source doesn't provide time information, assume midnight.
        submission_date = gse.metadata["submission_date"][
            0] + " 00:00:00 UTC"
        experiment_object.source_first_published = dateutil.parser.parse(
            submission_date)
        last_updated_date = gse.metadata["last_update_date"][
            0] + " 00:00:00 UTC"
        experiment_object.source_last_updated = dateutil.parser.parse(
            last_updated_date)

        # De-duplicated via a set, so the joined order is not fixed.
        unique_institutions = list(set(gse.metadata["contact_institute"]))
        experiment_object.submitter_institution = ", ".join(
            unique_institutions)
        experiment_object.pubmed_id = gse.metadata.get("pubmed_id", [""])[0]

        # Scrape publication title and authorship from Pubmed
        if experiment_object.pubmed_id:
            pubmed_metadata = utils.get_title_and_authors_for_pubmed_id(
                experiment_object.pubmed_id)
            experiment_object.publication_title = pubmed_metadata[0]
            experiment_object.publication_authors = pubmed_metadata[1]

        experiment_object.save()

        experiment_annotation = ExperimentAnnotation()
        experiment_annotation.data = gse.metadata
        experiment_annotation.experiment = experiment_object
        experiment_annotation.is_ccdl = False
        experiment_annotation.save()

    # Okay, here's the situation!
    # Sometimes, samples have a direct single representation for themselves.
    # Othertimes, there is a single file with references to every sample in it.
    created_samples = []
    for sample_accession_code, sample in gse.gsms.items():
        try:
            sample_object = Sample.objects.get(
                accession_code=sample_accession_code)
            logger.debug(
                "Sample %s from experiment %s already exists, skipping object creation.",
                sample_accession_code,
                experiment_object.accession_code,
                survey_job=self.survey_job.id)

            # Associate it with the experiment, but since it
            # already exists it already has original files
            # associated with it and it's already been downloaded,
            # so don't add it to created_samples.
            ExperimentSampleAssociation.objects.get_or_create(
                experiment=experiment_object, sample=sample_object)
            ExperimentOrganismAssociation.objects.get_or_create(
                experiment=experiment_object,
                organism=sample_object.organism)
        except Sample.DoesNotExist:
            organism = Organism.get_object_for_name(
                sample.metadata['organism_ch1'][0].upper())

            sample_object = Sample()
            sample_object.source_database = "GEO"
            sample_object.accession_code = sample_accession_code
            sample_object.organism = organism

            # If data processing step, it isn't raw.
            sample_object.has_raw = not sample.metadata.get(
                'data_processing', None)

            ExperimentOrganismAssociation.objects.get_or_create(
                experiment=experiment_object, organism=organism)
            sample_object.title = sample.metadata['title'][0]

            self.set_platform_properties(sample_object, sample.metadata,
                                         gse)

            # Directly assign the harmonized properties
            # (harmonized_samples is looked up by sample title)
            harmonized_sample = harmonized_samples[sample_object.title]
            for key, value in harmonized_sample.items():
                setattr(sample_object, key, value)

            # Sample-level protocol_info
            sample_object.protocol_info = self.get_sample_protocol_info(
                sample.metadata, sample_accession_code)

            sample_object.save()
            logger.debug("Created Sample: " + str(sample_object))

            sample_annotation = SampleAnnotation()
            sample_annotation.sample = sample_object
            sample_annotation.data = sample.metadata
            sample_annotation.is_ccdl = False
            sample_annotation.save()

            sample_supplements = sample.metadata.get(
                'supplementary_file', [])
            for supplementary_file_url in sample_supplements:

                # Why do they give us this?
                # ("NONE" stops processing of all remaining URLs for this
                # sample, not just this entry)
                if supplementary_file_url == "NONE":
                    break

                # We never want these!
                if "idat.gz" in supplementary_file_url.lower():
                    continue
                if "chp.gz" in supplementary_file_url.lower():
                    continue
                if "ndf.gz" in supplementary_file_url.lower():
                    continue
                if "pos.gz" in supplementary_file_url.lower():
                    continue
                if "pair.gz" in supplementary_file_url.lower():
                    continue
                if "gff.gz" in supplementary_file_url.lower():
                    continue

                # Sometimes, we are lied to about the data processing step.
                lower_file_url = supplementary_file_url.lower()
                if '.cel' in lower_file_url \
                        or ('_non_normalized.txt' in lower_file_url) \
                        or ('_non-normalized.txt' in lower_file_url) \
                        or ('-non-normalized.txt' in lower_file_url) \
                        or ('-non_normalized.txt' in lower_file_url):
                    sample_object.has_raw = True
                    sample_object.save()

                # filename and source_filename are the same for these
                filename = supplementary_file_url.split('/')[-1]
                original_file = OriginalFile.objects.get_or_create(
                    source_url=supplementary_file_url,
                    filename=filename,
                    source_filename=filename,
                    has_raw=sample_object.has_raw,
                    is_archive=True)[0]
                logger.debug("Created OriginalFile: " + str(original_file))

                # The returned (object, created) pair is not used further.
                original_file_sample_association = OriginalFileSampleAssociation.objects.get_or_create(
                    original_file=original_file, sample=sample_object)

                if original_file.is_affy_data():
                    # Only Affymetrix Microarrays produce .CEL files
                    sample_object.technology = 'MICROARRAY'
                    # NOTE(review): 'AFFYMETRTIX' looks like a misspelling
                    # of 'AFFYMETRIX' -- confirm before changing, stored
                    # rows may already carry this value.
                    sample_object.manufacturer = 'AFFYMETRTIX'
                    sample_object.save()

            # It's okay to survey RNA-Seq samples from GEO, but we
            # don't actually want to download/process any RNA-Seq
            # data unless it comes from SRA.
            if sample_object.technology != 'RNA-SEQ':
                created_samples.append(sample_object)

            # Now that we've determined the technology at the
            # sample level, we can set it at the experiment level,
            # just gotta make sure to only do it once. There can
            # be more than one technology, this should be changed
            # as part of:
            # https://github.com/AlexsLemonade/refinebio/issues/1099
            if not experiment_object.technology:
                experiment_object.technology = sample_object.technology
                experiment_object.save()

            ExperimentSampleAssociation.objects.get_or_create(
                experiment=experiment_object, sample=sample_object)

    # These supplementary files _may-or-may-not_ contain the type of raw data we can process.
    for experiment_supplement_url in gse.metadata.get(
            'supplementary_file', []):

        # filename and source_filename are the same for these
        filename = experiment_supplement_url.split('/')[-1]
        # NOTE(review): sample_object here is whatever the sample loop bound
        # last, and is unbound when gse.gsms is empty -- confirm intended.
        original_file = OriginalFile.objects.get_or_create(
            source_url=experiment_supplement_url,
            filename=filename,
            source_filename=filename,
            has_raw=sample_object.has_raw,
            is_archive=True)[0]
        logger.debug("Created OriginalFile: " + str(original_file))

        lower_supplement_url = experiment_supplement_url.lower()
        if ('_non_normalized.txt' in lower_supplement_url) \
                or ('_non-normalized.txt' in lower_supplement_url) \
                or ('-non-normalized.txt' in lower_supplement_url) \
                or ('-non_normalized.txt' in lower_supplement_url):
            # A non-normalized series file marks every created sample as
            # raw and links it to this file.
            for sample_object in created_samples:
                sample_object.has_raw = True
                sample_object.save()

                OriginalFileSampleAssociation.objects.get_or_create(
                    sample=sample_object, original_file=original_file)

        # Delete this Original file if it isn't being used.
        if OriginalFileSampleAssociation.objects.filter(
                original_file=original_file).count() == 0:
            original_file.delete()

    # These are the Miniml/Soft/Matrix URLs that are always(?) provided.
    # GEO describes different types of data formatting as "families"
    family_url = self.get_miniml_url(experiment_accession_code)
    # NOTE(review): same leftover `sample_object` dependency as above.
    miniml_original_file = OriginalFile.objects.get_or_create(
        source_url=family_url,
        source_filename=family_url.split('/')[-1],
        has_raw=sample_object.has_raw,
        is_archive=True)[0]
    for sample_object in created_samples:
        # We don't need a .txt if we have a .CEL
        if sample_object.has_raw:
            continue
        OriginalFileSampleAssociation.objects.get_or_create(
            sample=sample_object, original_file=miniml_original_file)

    # Delete this Original file if it isn't being used.
    if OriginalFileSampleAssociation.objects.filter(
            original_file=miniml_original_file).count() == 0:
        miniml_original_file.delete()

    # Trash the temp path
    try:
        shutil.rmtree(self.get_temp_path())
    except Exception:
        # There was a problem during surveying so this didn't get created.
        # It's not a big deal.
        pass

    return experiment_object, created_samples
def DownloadGEODataset(gse, gseDir):
    """Download a GEO series and write its RNA expression and metadata as TSV.

    Only samples whose ``type`` metadata is ``'RNA'`` and whose table is
    non-empty are used. Their VALUE columns are outer-merged on ``ID_REF``.
    For every platform in the series the merged table is written once,
    inner-joined to the platform's first ID column and first gene
    symbol/name/id column when both exist, unjoined otherwise. Downloaded
    ``*.soft.gz`` files in ``gseDir`` are removed afterwards.

    Args:
        gse: GEO series accession, also used in the output file names.
        gseDir: Directory for the download and for the output files.

    Returns:
        ``(exprDataFiles, metaDataFiles)``: two parallel lists of written
        file paths. Both are empty when no usable sample was found.
    """
    # Local imports keep this block self-contained.
    import glob
    import os

    # Query for retrieving GEO data
    gseData = GEOparse.get_GEO(geo=gse,
                               destdir=gseDir,
                               how='full',
                               annotate_gpl=True,
                               include_data=True,
                               silent=True)

    # Initialise data containers
    exprData = pd.DataFrame(columns=["ID_REF"])
    metaFrames = []
    exprDataMapped = []
    exprDataFiles = []
    metaDataFiles = []

    # Collect data only from RNA samples that carry a data table.
    for gsmName, gsm in gseData.gsms.items():
        if gsm.metadata['type'][0] != 'RNA' or len(gsm.table) == 0:
            continue

        # Expression: one column per sample, merged on ID_REF.
        tmpExpr = pd.DataFrame({
            "ID_REF": list(gsm.table['ID_REF']),
            gsmName: list(gsm.table['VALUE'])
        })
        exprData = tmpExpr if exprData.shape[0] == 0 else exprData.merge(
            tmpExpr, how='outer', on='ID_REF')

        # Metadata: one row per sample, one column per metadata key.
        # list(...) is needed because not every pandas version accepts a
        # dict view as constructor data.
        tmpMetadata = pd.DataFrame(list(gsm.metadata.items()),
                                   columns=['MetaData', gsmName])
        metaFrames.append(tmpMetadata.set_index('MetaData').transpose())

    # DataFrame.append was removed in pandas 2.0, so stack the per-sample
    # rows with a single concat.
    metaData = pd.concat(metaFrames) if metaFrames else pd.DataFrame()

    # Map platform data only if expression data was collected.
    if exprData.shape[0] > 0:
        # Map ID_REF to a gene name for each platform.
        for gplName, gpl in gseData.gpls.items():
            # Columns holding the probe ID and the gene symbol/name/id.
            idCol = [
                col for col in gpl.table.columns
                if re.search("^ID", col, re.IGNORECASE)
            ]
            gsCol = [
                col for col in gpl.table.columns
                if re.search("^gene(.+|)(symbol|name|id)", col,
                             re.IGNORECASE)
            ]

            if len(gsCol) > 0 and len(idCol) > 0:
                platformData = pd.DataFrame({
                    "ID_REF": list(gpl.table[idCol[0]]),
                    "geneName": list(gpl.table[gsCol[0]]),
                })
                exprDataMapped.append(
                    pd.merge(platformData,
                             exprData,
                             how="inner",
                             on="ID_REF"))
            else:
                # No usable annotation columns: keep the unmapped table.
                exprDataMapped.append(exprData)

    # Save every non-empty mapped table next to a copy of the metadata.
    for index, eData in enumerate(exprDataMapped):
        if eData.shape[0] > 0 and metaData.shape[0] > 0:
            exprDataFile = "{0}/{1}.exprs.{2}.tsv".format(
                gseDir, gse, index)
            metaDataFile = "{0}/{1}.meta.{2}.tsv".format(
                gseDir, gse, index)
            eData.to_csv(exprDataFile,
                         sep='\t',
                         index=False,
                         encoding='utf-8')
            metaData.to_csv(metaDataFile,
                            sep='\t',
                            index=True,
                            encoding='utf-8')
            exprDataFiles.append(exprDataFile)
            metaDataFiles.append(metaDataFile)

    # Clean soft.gz files without a shell: paths containing spaces or shell
    # metacharacters cannot break or hijack the command. Failures are
    # ignored, matching the previous best-effort `rm`.
    for softPath in glob.glob(os.path.join(gseDir, "*.soft.gz")):
        try:
            os.remove(softPath)
        except OSError:
            pass

    # Return metadata and expression files
    return exprDataFiles, metaDataFiles
import GEOparse
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Old platform annotations, parsed from a local file.
probes_conv = GEOparse.parse_GSM(
    "/home/cyril/Documents/Master/sem_1/Case_study/module1/data/GPL6457_old_annotations.txt.gz")
gse = GEOparse.get_GEO("GSE24616", destdir="./")
# gse = GEOparse.get_GEO(filepath="./GSM606890.TXT.GZ.soft")

# Collect per-sample characteristics, with samples visited in name order.
# Entries 1-3 of 'characteristics_ch1' are read as "<key>: <value>" and
# only the value part is kept.
char = {"stage": [], "time": [], "sex": [], "sample_name": []}
# .items() works on Python 2 and 3; .iteritems() exists only on Python 2.
for gsm_name, gsm in sorted(gse.gsms.items()):
    char["stage"].append(gsm.metadata['characteristics_ch1'][1].split(": ")[1])
    char["time"].append(gsm.metadata['characteristics_ch1'][2].split(": ")[1])
    char["sex"].append(gsm.metadata['characteristics_ch1'][3].split(": ")[1])
    char["sample_name"].append(gsm.name)
print(char["stage"][3], char["time"][3], char["sex"][3], char["sample_name"][3])

# First platform of the series. `values()[0]` fails on Python 3 because
# dict views are not indexable.
GPL = next(iter(gse.gpls.values()))
pivoted_samples = gse.pivot_samples('VALUE')
# Re-index the pivoted table by the platform's SPOT_ID column.
pivoted_samples.set_index(GPL.table.SPOT_ID, inplace=True)
# pivoted_samples.hist()

# Tab-separated file without a header; columns are named below and the
# table is indexed by ProbeID.
strata = pd.read_csv("../phylostrata.txt", sep="\t", header=None)
strata.columns = ["GeneID", "ProbeID", "age"]
strata.set_index("ProbeID", inplace=True)
def test_name(self):
    """A series parsed from ``GSE105845_family.soft`` must be named ``GSE105845``."""
    soft_path = join(download_geo, "GSE105845_family.soft")
    series = GEO.get_GEO(filepath=soft_path, geotype="GSE")
    self.assertEqual(series.name, "GSE105845")
def fetch_data(gse_acc):
    """Create ``./<gse_acc>/`` and download the series into it.

    Returns the object produced by ``GEOparse.get_GEO``.
    """
    target_dir = "./" + gse_acc + "/"
    mkdir(target_dir)
    series = GEOparse.get_GEO(geo=gse_acc, destdir=target_dir)
    #pivoted_control_samples = gse.pivot_samples('VALUE')
    return series
import numpy as np
from pathlib import Path
import GEOparse
import pandas as pd
from matplotlib import pyplot as plt

# Local SOFT family file to analyse.
path = Path(
    "C:/Users/Daniel/Desktop/Andi project/database/Soft/GSE64392_family.soft")
# Number of rows kept from the sorted means below.
num_top_n = 1000
gse = GEOparse.get_GEO(filepath=str(path))

# Build one wide table indexed by ID_REF with one VALUE column per sample.
# `cntr` is the 1-based position of the current sample.
cntr = 1
for gsm_name, gsm in gse.gsms.items():
    print(f"Analysing dataset {cntr}/{len(gse.gsms)}")
    temp_df = pd.DataFrame(data=gsm.table["VALUE"].values.transpose(),
                           index=gsm.table["ID_REF"],
                           columns=[gsm_name])
    if cntr == 1:
        # The first sample seeds the table.
        joined_df = temp_df.copy()
    else:
        # DataFrame.join defaults to a left join on the index.
        joined_df = joined_df.join(temp_df)
    cntr += 1

# Row means across samples. sort_values() is ascending by default, so this
# keeps the `num_top_n` smallest means.
# NOTE(review): confirm smallest rather than largest is intended.
sorted_means = joined_df.mean(axis=1).sort_values()[:num_top_n]
# Annotation table of the first platform in the series.
gpl_table = gse.gpls[list(gse.gpls.keys())[0]].table
# #for j in range(1,144): # page = requests.get("https://ftp.ncbi.nih.gov/geo/series/GSE"+str(j)+'nnn'+"/") # tree = html.fromstring(page.content) # r = tree.xpath('//a/text()') # #print(type(r)) # del r[0] # num.append(len(r)) # for i in r: # nr.append(i.replace("/","")) #print("array 생성완료 ") #nr = pd.read_csv('./nr.tsv', sep='\t').values.tolist() nd = pd.read_csv('nd.tsv', sep='\t').values.tolist() for i in nd[96000:]: gse = GEOparse.get_GEO(geo=i[0], destdir="./geoData2") #return filepath, getype #print() #print("GSM example:") #for gsm_name,gsm in gse.gsms.items(): # print("Name: ", gsm_name) # print("Metadata:",) # for key, value in gsm.metadata.items(): # print(" - %s : %s" % (key, ", ".join(value))) # print ("Table data:",) # print (gsm.table.head()) # break #print()
# #for j in range(1,144): # page = requests.get("https://ftp.ncbi.nih.gov/geo/series/GSE"+str(j)+'nnn'+"/") # tree = html.fromstring(page.content) # r = tree.xpath('//a/text()') # #print(type(r)) # del r[0] # num.append(len(r)) # for i in r: # nr.append(i.replace("/","")) #print("array 생성완료 ") #nr = pd.read_csv('./nr.tsv', sep='\t').values.tolist() nd = pd.read_csv('nd.tsv',sep='\t').values.tolist() for i in nd[50000:62000]: gse = GEOparse.get_GEO(geo=i[0], destdir="/drive/My Drive/geoData4") print(str(i)) #return filepath, getype #print() #print("GSM example:") #for gsm_name,gsm in gse.gsms.items(): # print("Name: ", gsm_name) # print("Metadata:",) # for key, value in gsm.metadata.items(): # print(" - %s : %s" % (key, ", ".join(value))) # print ("Table data:",) # print (gsm.table.head()) # break
def load_series(self, geo_id: str) -> GSE:
    """Fetch ``geo_id`` with GEOparse, using ``self.temp`` as the download directory."""
    series = GEOparse.get_GEO(geo_id, destdir=self.temp)
    return series
def set_platform_properties(self, sample_object: Sample,
                            sample_metadata: Dict,
                            gse: GEOparse.GSM) -> Sample:
    """Sets platform-related properties on `sample_object`.

    Uses metadata from `gse` to populate platform_name,
    platform_accession_code, technology and manufacturer on
    `sample_object`.

    Args:
        sample_object: The sample to update; it is mutated in place and
            also returned.
        sample_metadata: Not read anywhere in this method.
        gse: Object whose ``metadata["platform_id"]`` names the platform.
            NOTE(review): annotated as ``GEOparse.GSM`` although the
            parameter is called ``gse`` - confirm what callers pass.

    Returns:
        The same `sample_object`, after its platform fields were set.
    """
    # Determine platform information
    external_accession = get_normalized_platform(
        gse.metadata.get("platform_id", [UNKNOWN])[0])

    if external_accession == UNKNOWN:
        sample_object.platform_accession_code = UNKNOWN
        sample_object.platform_name = UNKNOWN
        sample_object.manufacturer = UNKNOWN
        # If this sample is Affy, we potentially can extract the
        # platform information from the .CEL file. If it's not we
        # can't do anything. Therefore assume the technology is
        # microarray when we have no platform information.
        sample_object.technology = "MICROARRAY"
        return sample_object

    platform_accession_code = UNKNOWN

    # Fetch the platform record itself to learn its title.
    gpl = GEOparse.get_GEO(external_accession,
                           destdir=self.get_temp_path(),
                           how="brief",
                           silent=True)
    platform_title = gpl.metadata.get("title", [UNKNOWN])[0]

    # Check if this is a supported microarray platform.
    # There is no `break`, so if several entries share the external
    # accession the last one wins.
    for platform in get_supported_microarray_platforms():
        if platform["external_accession"] == external_accession:
            platform_accession_code = platform["platform_accession"]

    if platform_accession_code != UNKNOWN:
        # It's a supported microarray platform.

        # We are using the brain array package as the platform accession code,
        # so, for instance, GPL3213 becomes 'chicken'.
        sample_object.platform_accession_code = platform_accession_code
        sample_object.technology = "MICROARRAY"
        try:
            # Related: https://github.com/AlexsLemonade/refinebio/issues/354
            # If it's Affy we can get a readable name:
            sample_object.platform_name = get_readable_affymetrix_names(
            )[platform_accession_code]
            sample_object.manufacturer = "AFFYMETRIX"

            # Sometimes Affymetrix samples have weird channel
            # protocol metadata, so if we find that it's
            # Affymetrix return it now. Example: GSE113945
            return sample_object
        except KeyError:
            # Otherwise we'll use what we've got.
            sample_object.platform_name = platform_title

        # Determine manufacturer
        # Only reached when the readable-name lookup raised KeyError.
        platform = sample_object.pretty_platform.upper()
        if "AGILENT" in platform:
            sample_object.manufacturer = "AGILENT"
        elif "ILLUMINA" in platform or "NEXTSEQ" in platform:
            sample_object.manufacturer = "ILLUMINA"
        elif "AFFYMETRIX" in platform:
            sample_object.manufacturer = "AFFYMETRIX"
        else:
            sample_object.manufacturer = UNKNOWN

        return sample_object

    # Check to see if this is a supported RNASeq technology:

    # GEO RNASeq platform titles often have organisms appended to
    # an otherwise recognizable platform. The list of supported
    # RNASeq platforms isn't long, so see if any of them are
    # contained within what GEO gave us.
    # Example: GSE69572 has a platform title of:
    # 'Illumina Genome Analyzer IIx (Glycine max)'
    # Which should really just be 'Illumina Genome Analyzer IIx'
    # because RNASeq platforms are organism agnostic. However,
    # the platforms 'Illumina Genome Analyzer' and 'Illumina
    # Genome Analyzer II' would also be matched, so make sure that
    # the longest platform names are tested first:
    sorted_platform_list = get_supported_rnaseq_platforms().copy()
    sorted_platform_list.sort(key=len, reverse=True)

    for platform in sorted_platform_list:
        if platform.upper() in platform_title.upper():
            sample_object.technology = "RNA-SEQ"
            sample_object.platform_name = platform
            # We just use RNASeq platform titles as accessions
            sample_object.platform_accession_code = platform

            # NOTE(review): here "NEXTSEQ" becomes its own manufacturer,
            # while the microarray branch above maps it to "ILLUMINA" -
            # confirm this difference is intended.
            if "ILLUMINA" in sample_object.platform_name.upper():
                sample_object.manufacturer = "ILLUMINA"
            elif "NEXTSEQ" in sample_object.platform_name.upper():
                sample_object.manufacturer = "NEXTSEQ"
            elif "ION TORRENT" in sample_object.platform_name.upper():
                sample_object.manufacturer = "ION_TORRENT"
            else:
                sample_object.manufacturer = UNKNOWN

            return sample_object

    # If we've made it this far, we don't know what this platform
    # is, therefore we can't know what its technology is. What we
    # do know is what GEO said its platform's accession and title
    # are, and that it's unsupported.
    sample_object.platform_name = platform_title
    sample_object.platform_accession_code = external_accession
    sample_object.technology = UNKNOWN
    sample_object.manufacturer = UNKNOWN

    return sample_object
import GEOparse

# Download GSE13037 .. GSE13999 into ./geoData, one accession at a time.
# (The loops that printed GSM / GPL metadata and table heads are disabled.)
for i in range(13037, 14000):
    accession = f"GSE{i}"
    gse = GEOparse.get_GEO(geo=accession, destdir="./geoData")
def download_and_parse_geo_data(
        geo_id,
        directory_path=getcwd(),
):
    """Download a GEO series and arrange it into a dict of tables.

    Note that the default for ``directory_path`` is the working directory
    at the time this function is *defined*, not at call time.

    The column names of every ``gsm.table`` / ``gpl.table`` are lower-cased
    (spaces become underscores) in place, and each platform table is
    re-indexed by its ``id`` column in place.

    Returns a dict with the keys ``id_x_sample`` (sample values indexed by
    ``id_ref``), ``id_gene_symbol`` and ``gene_x_sample`` (set only when a
    platform offers a gene-symbol column; a later platform overwrites an
    earlier one) and ``information_x_sample`` (transposed phenotype data).

    Raises:
        ValueError: if any sample table is empty.
    """
    print(f'Establishing {geo_id} @ {directory_path} ...')
    gse = GEOparse.get_GEO(geo=geo_id, destdir=directory_path)

    print(f"Title: {gse.get_metadata_attribute('title')}")
    print(f"N sample: {len(gse.get_metadata_attribute('sample_id'))}")

    geo_dict = dict.fromkeys((
        'id_x_sample',
        'id_gene_symbol',
        'gene_x_sample',
        'information_x_sample',
    ))

    # --- samples -----------------------------------------------------------
    values = []
    for sample_id, gsm in gse.gsms.items():
        print(f'{sample_id} ...')

        sample_table = gsm.table
        if sample_table.empty:
            raise ValueError(
                'Sample {} has empty table (perhaps this is a single cell experiment.)'
                .format(gsm.name))

        normalized = sample_table.columns.str.lower().str.replace(' ', '_')
        sample_table.columns = normalized

        # One value column squeezes to a Series; several stay a DataFrame
        # and get "<sample> (<column>)" labels instead.
        sample_values = sample_table.set_index('id_ref').squeeze()
        sample_values.name = sample_id
        if isinstance(sample_values, DataFrame):
            sample_values.columns = [
                f'{sample_id} ({column})'
                for column in sample_values.columns
            ]

        values.append(sample_values)

    id_x_sample = concat(values, axis=1).sort_index().sort_index(axis=1)
    geo_dict['id_x_sample'] = id_x_sample
    print(f'id_x_sample.shape: {id_x_sample.shape}')

    # --- platforms ---------------------------------------------------------
    id_gene_symbol = None
    for platform_id, gpl in gse.gpls.items():
        print(f'{platform_id} ...')

        platform_table = gpl.table
        platform_table.columns = platform_table.columns.str.lower(
        ).str.replace(' ', '_')
        platform_table.set_index('id', inplace=True)

        if 'gene_symbol' not in platform_table.columns:
            if 'gene_assignment' in platform_table.columns:
                # The symbol is the second '//'-separated field.
                platform_table['gene_symbol'] = [
                    assignment.split(sep='//')[1].strip()
                    if not isna(assignment) and '//' in assignment else
                    'NO GENE NAME'
                    for assignment in platform_table['gene_assignment']
                ]
            else:
                # Otherwise take the first alternative column that exists.
                for fallback in ('oligoset_genesymbol', 'ilmn_gene', 'gene'):
                    if fallback in platform_table.columns:
                        platform_table['gene_symbol'] = platform_table[
                            fallback]
                        break

        if 'gene_symbol' not in platform_table:
            column_names = ', '.join(platform_table.columns)
            print(
                f'\tgene_symbol is not a GPL column ({column_names}); IDs may be already gene symbols.'
            )
            continue

        id_gene_symbol = platform_table['gene_symbol'].dropna()
        id_gene_symbol.index = id_gene_symbol.index.astype(str)
        geo_dict['id_gene_symbol'] = id_gene_symbol

        print(f'id_gene_symbol.shape:{id_gene_symbol.shape}')
        n_valid = (id_gene_symbol != 'NO GENE NAME').sum()
        print(f'N valid gene_symbol: {n_valid}')

        # Relabel the id-indexed matrix with gene symbols, then discard the
        # rows whose id has no usable symbol.
        id_gene_symbol = id_gene_symbol.to_dict()
        gene_x_sample = geo_dict['id_x_sample'].copy()
        gene_x_sample.index = gene_x_sample.index.map(
            lambda index: id_gene_symbol.get(str(index), 'NO GENE NAME'))
        gene_x_sample = gene_x_sample.drop('NO GENE NAME', errors='ignore')
        gene_x_sample.index.name = 'gene_symbol'

        geo_dict['gene_x_sample'] = gene_x_sample.sort_index().sort_index(
            axis=1)
        print('gene_x_sample.shape: {}'.format(
            geo_dict['gene_x_sample'].shape))

    geo_dict['information_x_sample'] = gse.phenotype_data.T
    print('information_x_sample.shape: {}'.format(
        geo_dict['information_x_sample'].shape))

    return geo_dict
def test_get_geo_and_data(self):
    """GPL96 is returned as a GPL with the expected accession and sizes."""
    platform = GEO.get_GEO(geo="GPL96", destdir=download_geo)
    self.assertIsInstance(platform, GPL)

    accession = platform.get_accession()
    self.assertEqual(accession, "GPL96")

    n_rows = len(platform.table.index)
    self.assertEqual(n_rows, 22283)

    n_columns = len(platform.columns)
    self.assertEqual(n_columns, 16)
def micro_analysis(accession_id, control_samples, treated_samples):
    """Return the up- and down-regulated gene lists for a GEO series.

    The series is fetched with GEOparse, probes whose median VALUE over the
    selected samples lies below the 30th percentile are discarded, probe IDs
    are mapped to gene symbols through ``PROBE2GENE`` and ``chdir`` is run
    on the result. Of the 600 coefficients with the largest magnitude, the
    positive ones form the "up" list and the negative ones the "down" list.

    Args:
        accession_id: GEO accession handed to ``GEOparse.get_GEO``.
        control_samples: iterable of sample names forming the control group.
        treated_samples: iterable of sample names forming the treated group.

    Returns:
        ``(up_list, dn_list)``: two lists of gene symbols.

    Side effects:
        Downloads into ``./`` and globally silences ``DeprecationWarning``
        and ``RuntimeWarning``.
    """
    # Sample name -> group label.
    control_samples = {i: 'control' for i in control_samples}
    treated_samples = {i: 'treated' for i in treated_samples}
    all_samples = merge(control_samples, treated_samples)
    list_samples = list(all_samples.keys())

    # Parse the GEO data and pivot it once (probes x samples).
    gse = GEOparse.get_GEO(geo=accession_id, destdir="./")
    pivoted_all = gse.pivot_samples('VALUE')

    # Raises KeyError if a requested sample is not part of the series.
    pivoted_samples = pivoted_all[list_samples]

    # Keep the probes whose median over the selected samples reaches the
    # 30th percentile of those medians.
    pivoted_samples_average = pivoted_samples.median(axis=1)
    expression_threshold = pivoted_samples_average.quantile(0.3)
    expressed_probes = pivoted_samples_average[
        pivoted_samples_average >= expression_threshold].index.tolist()

    # Expressed probes x selected samples. The columns stay in the pivot
    # table's own order, which need not be "controls first, treated after".
    exprsdata = pivoted_all.loc[expressed_probes]
    exprsdata = exprsdata.loc[:, exprsdata.columns.isin(list_samples)].copy()

    # NOTE(review): the previous revision also evaluated
    # `exprsdata.dropna(axis=1)` and a rank-based quantile normalisation at
    # this point but discarded both results, so neither ever affected the
    # output. They are left out rather than switched on, because applying
    # them would change the returned gene lists - confirm the intent.
    # Rows with missing values are still removed by the dropna() below.

    # Replace probe IDs by gene symbols and drop rows without a symbol.
    exprsdata.index = exprsdata.index.astype(str)
    exprsdata['symbol'] = exprsdata.index.to_series().map(PROBE2GENE)
    exprsdata.reset_index(inplace=True)
    data = exprsdata.set_index('symbol')
    data = data.drop('ID_REF', axis=1)
    data = data.reset_index().dropna().set_index('symbol')

    warnings.filterwarnings("ignore", category=DeprecationWarning)
    warnings.filterwarnings("ignore", category=RuntimeWarning)

    # Class codes (1 = control, 2 = treated, 0 = neither) are looked up per
    # data column. Building them from the control-then-treated sample list
    # instead would mislabel samples whenever GEO orders them differently.
    class_codes = {'control': 1, 'treated': 2}
    sample_classes = np.array(
        [class_codes.get(all_samples.get(column), 0)
         for column in data.columns],
        dtype=np.int32)

    cd_res = chdir(data.values,
                   sample_classes,
                   data.index,
                   gamma=.5,
                   sort=False,
                   calculate_sig=False)

    # Rank genes by absolute coefficient and keep the top 600.
    cd_coefs = np.array([result[0] for result in cd_res])
    srt_idx = np.abs(cd_coefs).argsort()[::-1]
    cd_coefs = cd_coefs[srt_idx][:600]
    sorted_DEGs = data.index[srt_idx][:600]

    # Going through dicts collapses repeated gene symbols, as before.
    up_genes = dict(zip(sorted_DEGs[cd_coefs > 0], cd_coefs[cd_coefs > 0]))
    dn_genes = dict(zip(sorted_DEGs[cd_coefs < 0], cd_coefs[cd_coefs < 0]))

    return list(up_genes.keys()), list(dn_genes.keys())
def test_get_geo_and_data_with_annotations(self):
    """With annotate_gpl=True, GPL96 reports its platform and 21 columns."""
    annotated = GEO.get_GEO(geo="GPL96",
                            destdir=download_geo,
                            annotate_gpl=True)
    self.assertIsInstance(annotated, GPL)

    platform_attr = annotated.get_metadata_attribute('platform')
    self.assertEqual(platform_attr, "GPL96")

    n_rows = len(annotated.table.index)
    self.assertEqual(n_rows, 22283)

    n_columns = len(annotated.columns)
    self.assertEqual(n_columns, 21)
# -*- coding: utf-8 -*-
r"""
"C:\Users\Aditya\Desktop\new_dataset"
Created on Thu Nov 21 01:34:39 2019

@author: Aditya
"""
# The docstring above is a raw string on purpose: in a normal string the
# "\U" of "C:\Users" is an invalid unicode escape and Python 3 refuses to
# compile the file.

# For single GSE SOFT file parsing in GEOParse
import GEOparse
import os

gse = GEOparse.get_GEO(
    filepath=r'C:\Users\Aditya\Desktop\GSE15824_family.soft.gz',
    destdir=r'E:\CodingProjects')

# Write each sample table to "<sample name>.txt" (tab separated, no index
# column) in the current working directory.
for name, extra in gse.gsms.items():
    name = name.strip('\n')
    print('\t\t' + name + ' transferred\n')
    # Use the sample object from the loop directly; looking it up again
    # under the stripped name would fail if stripping changed the key.
    extra.table.to_csv(name + '.txt', index=False, sep='\t', mode='w')

# For multiple files, store list of GSE files you have downloaded
# in a text file and read it sequentially to access all the GSMs

# Storing list of files in variable putingeo
with open(r'fetchtheseGSEfromGEO.txt', 'r') as f:
    putingeo = f.readlines()

num = len(putingeo)
for i in putingeo:
    print(str(num) + ' files remaining')
def test_pivot_samples(self):
    """pivot_samples("VALUE") reproduces the stored pivoted table."""
    family_path = join(download_geo, "soft_ex_family.txt")
    expected_path = join(download_geo, "test_sample_pivoted_by_value.tab")

    series = GEO.get_GEO(filepath=family_path, geotype="GSE")

    expected = read_table(expected_path, index_col=0)
    expected.columns.name = 'name'

    actual = series.pivot_samples("VALUE")
    assert_frame_equal(actual, expected)
def test_empty_line(self):
    """Parsing GSM32878.txt must not raise IndexError."""
    sample_path = join(download_geo, 'GSM32878.txt')
    try:
        GEO.get_GEO(filepath=sample_path, geotype='GSM')
    except IndexError:
        self.fail("Empty line in the file causes an error.")
import numpy as np
from pathlib import Path
import GEOparse
import pandas as pd
from matplotlib import pyplot as plt

# Local copy of the GPL16686 family SOFT archive.
path = Path(
    "C:/Users/Daniel/Desktop/Andi project/database/Soft/GPL16686_family.soft.gz"
)

# Restrict parsing to the GPL16686 entry through `partial`.
gse = GEOparse.get_GEO(filepath=str(path), partial=['GPL16686'])

# Table of the first platform found in the parsed result.
platform_names = list(gse.gpls)
gpl_table = gse.gpls[platform_names[0]].table

print('Finished!')
def test_name(self):
    """A GPL parsed from a family SOFT file is named after its accession."""
    soft_path = join(download_geo, "GPL20814_family.soft")
    platform = GEO.get_GEO(filepath=soft_path, geotype="GPL")
    self.assertEqual(platform.name, "GPL20814")
def create_experiment_and_samples_from_api(
        self, experiment_accession_code) -> (Experiment, List[Sample]):
    """ The main surveyor - find the Experiment and Samples from NCBI GEO.

    Uses the GEOParse library, for which docs can be found here:
    https://geoparse.readthedocs.io/en/latest/usage.html#working-with-geo-objects

    Args:
        experiment_accession_code: accession handed to GEOparse.get_GEO
            and used to look up / create the Experiment.

    Returns:
        A tuple ``(experiment_object, created_samples)``. ``created_samples``
        only holds samples that were newly created during this call and
        whose technology is not "RNA-SEQ"; samples that already existed
        are merely associated with the experiment.
    """
    # Cleaning up is tracked here: https://github.com/guma44/GEOparse/issues/41
    gse = GEOparse.get_GEO(experiment_accession_code,
                           destdir=self.get_temp_path(),
                           how="brief",
                           silent=True)
    preprocessed_samples = harmony.preprocess_geo(gse.gsms.items())
    harmonized_samples = harmony.harmonize(preprocessed_samples)

    # Create the experiment object
    try:
        experiment_object = Experiment.objects.get(
            accession_code=experiment_accession_code)
        logger.debug(
            "Experiment %s already exists, skipping object creation.",
            experiment_accession_code,
            survey_job=self.survey_job.id,
        )
    except Experiment.DoesNotExist:
        experiment_object = Experiment()
        experiment_object.accession_code = experiment_accession_code
        GeoSurveyor._apply_metadata_to_experiment(experiment_object, gse)
        experiment_object.save()

        # The raw GEO metadata is kept as an annotation, but only when
        # the experiment is created here.
        experiment_annotation = ExperimentAnnotation()
        experiment_annotation.data = gse.metadata
        experiment_annotation.experiment = experiment_object
        experiment_annotation.is_ccdl = False
        experiment_annotation.save()

    # Okay, here's the situation!
    # Sometimes, samples have a direct single representation for themselves.
    # Othertimes, there is a single file with references to every sample in it.
    created_samples = []
    for sample_accession_code, sample in gse.gsms.items():

        try:
            sample_object = Sample.objects.get(
                accession_code=sample_accession_code)
            logger.debug(
                "Sample %s from experiment %s already exists, skipping object creation.",
                sample_accession_code,
                experiment_object.accession_code,
                survey_job=self.survey_job.id,
            )

            # Associate it with the experiment, but since it
            # already exists it already has original files
            # associated with it and it's already been downloaded,
            # so don't add it to created_samples.
            ExperimentSampleAssociation.objects.get_or_create(
                experiment=experiment_object, sample=sample_object)

            ExperimentOrganismAssociation.objects.get_or_create(
                experiment=experiment_object,
                organism=sample_object.organism)
        except Sample.DoesNotExist:
            organism = Organism.get_object_for_name(
                sample.metadata["organism_ch1"][0].upper())

            sample_object = Sample()
            sample_object.source_database = "GEO"
            sample_object.accession_code = sample_accession_code
            sample_object.organism = organism

            # If data processing step, it isn't raw.
            sample_object.has_raw = not sample.metadata.get(
                "data_processing", None)

            ExperimentOrganismAssociation.objects.get_or_create(
                experiment=experiment_object, organism=organism)
            sample_object.title = sample.metadata["title"][0]

            # NOTE(review): the whole series `gse` is passed as the third
            # argument, not the individual `sample` - confirm this is
            # what set_platform_properties expects.
            self.set_platform_properties(sample_object, sample.metadata,
                                         gse)

            # Harmonized metadata is looked up by sample title.
            GeoSurveyor._apply_harmonized_metadata_to_sample(
                sample_object, harmonized_samples[sample_object.title])

            # Sample-level protocol_info
            sample_object.protocol_info = self.get_sample_protocol_info(
                sample.metadata, sample_accession_code)

            sample_object.save()
            logger.debug("Created Sample: " + str(sample_object))

            sample_annotation = SampleAnnotation()
            sample_annotation.sample = sample_object
            sample_annotation.data = sample.metadata
            sample_annotation.is_ccdl = False
            sample_annotation.save()

            sample_supplements = sample.metadata.get(
                "supplementary_file", [])
            for supplementary_file_url in sample_supplements:

                # Why do they give us this?
                # (`break`, not `continue`: every later URL of this
                # sample is skipped too.)
                if supplementary_file_url == "NONE":
                    break

                # We never want these!
                if "idat.gz" in supplementary_file_url.lower():
                    continue
                if "chp.gz" in supplementary_file_url.lower():
                    continue
                if "ndf.gz" in supplementary_file_url.lower():
                    continue
                if "pos.gz" in supplementary_file_url.lower():
                    continue
                if "pair.gz" in supplementary_file_url.lower():
                    continue
                if "gff.gz" in supplementary_file_url.lower():
                    continue

                # Sometimes, we are lied to about the data processing step.
                lower_file_url = supplementary_file_url.lower()
                if (".cel" in lower_file_url
                        or ("_non_normalized.txt" in lower_file_url)
                        or ("_non-normalized.txt" in lower_file_url)
                        or ("-non-normalized.txt" in lower_file_url)
                        or ("-non_normalized.txt" in lower_file_url)):
                    sample_object.has_raw = True
                    sample_object.save()

                # filename and source_filename are the same for these
                filename = FileUtils.get_filename(supplementary_file_url)
                original_file = OriginalFile.objects.get_or_create(
                    source_url=supplementary_file_url,
                    filename=filename,
                    source_filename=filename,
                    has_raw=sample_object.has_raw,
                    is_archive=FileUtils.is_archive(filename),
                )[0]

                logger.debug("Created OriginalFile: " + str(original_file))

                # The returned association is not used afterwards.
                original_file_sample_association = OriginalFileSampleAssociation.objects.get_or_create(
                    original_file=original_file, sample=sample_object)

                if original_file.is_affy_data():
                    # Only Affymetrix Microarrays produce .CEL files
                    sample_object.technology = "MICROARRAY"
                    sample_object.manufacturer = "AFFYMETRIX"
                    sample_object.save()

            # It's okay to survey RNA-Seq samples from GEO, but we
            # don't actually want to download/process any RNA-Seq
            # data unless it comes from SRA.
            if sample_object.technology != "RNA-SEQ":
                created_samples.append(sample_object)

            # Now that we've determined the technology at the
            # sample level, we can set it at the experiment level,
            # just gotta make sure to only do it once. There can
            # be more than one technology, this should be changed
            # as part of:
            # https://github.com/AlexsLemonade/refinebio/issues/1099
            if not experiment_object.technology:
                experiment_object.technology = sample_object.technology
                experiment_object.save()

            ExperimentSampleAssociation.objects.get_or_create(
                experiment=experiment_object, sample=sample_object)

    # These supplementary files _may-or-may-not_ contain the type of raw data we can process.
    for experiment_supplement_url in gse.metadata.get(
            "supplementary_file", []):

        # filename and source_filename are the same for these
        filename = experiment_supplement_url.split("/")[-1]
        # NOTE(review): `sample_object` here is whatever an earlier loop
        # left behind (the last sample visited, or the last entry of
        # created_samples after the inner loop below has run once). It is
        # unbound if the series has no samples - confirm which has_raw
        # value is intended.
        original_file = OriginalFile.objects.get_or_create(
            source_url=experiment_supplement_url,
            filename=filename,
            source_filename=filename,
            has_raw=sample_object.has_raw,
            is_archive=True,
        )[0]

        logger.debug("Created OriginalFile: " + str(original_file))

        lower_supplement_url = experiment_supplement_url.lower()
        if (("_non_normalized.txt" in lower_supplement_url)
                or ("_non-normalized.txt" in lower_supplement_url)
                or ("-non-normalized.txt" in lower_supplement_url)
                or ("-non_normalized.txt" in lower_supplement_url)):
            # A non-normalized series-level file marks every newly
            # created sample as raw and links it to that file.
            for sample_object in created_samples:
                sample_object.has_raw = True
                sample_object.save()

                OriginalFileSampleAssociation.objects.get_or_create(
                    sample=sample_object, original_file=original_file)

        # Delete this Original file if it isn't being used.
        if (OriginalFileSampleAssociation.objects.filter(
                original_file=original_file).count() == 0):
            original_file.delete()

    # These are the Miniml/Soft/Matrix URLs that are always(?) provided.
    # GEO describes different types of data formatting as "families"
    family_url = self.get_miniml_url(experiment_accession_code)
    # NOTE(review): same leaked `sample_object` as above.
    miniml_original_file = OriginalFile.objects.get_or_create(
        source_url=family_url,
        source_filename=family_url.split("/")[-1],
        has_raw=sample_object.has_raw,
        is_archive=True,
    )[0]
    for sample_object in created_samples:
        # We don't need a .txt if we have a .CEL
        if sample_object.has_raw:
            continue
        OriginalFileSampleAssociation.objects.get_or_create(
            sample=sample_object, original_file=miniml_original_file)

    # Delete this Original file if it isn't being used.
    if (OriginalFileSampleAssociation.objects.filter(
            original_file=miniml_original_file).count() == 0):
        miniml_original_file.delete()

    # Trash the temp path
    try:
        shutil.rmtree(self.get_temp_path())
    except Exception:
        # There was a problem during surveying so this didn't get created.
        # It's not a big deal.
        pass

    return experiment_object, created_samples
def load_meta(datasets):
    """Collect age and injury labels for the cells of every dataset.

    Per-cell metadata is read from the GSE120744 family SOFT file, keyed by
    sample title. For dataset number ``i`` the cell ids come from the first
    line of ``data_names[i] + '.tsv.gz'`` (its first field is skipped).
    Cells whose ``age`` value is not one of the recognised labels are left
    out of all outputs.

    Returns:
        ``(qc_idx, cell_types, ages, injured)``: a list of running indices
        and three numpy arrays with one entry per kept cell.

    NOTE(review): ``n_valid`` only advances for kept cells, so ``qc_idx`` is
    always ``0 .. k-1``; confirm that skipped cells are not supposed to
    advance it.
    """
    # Recognised age labels -> (age in days, prefix of the age string).
    known_ages = {
        '16 weeks': (16 * 7, 'P'),
        'embryonal': (16.5, 'E'),  # Sic.
        '03_w': (3 * 7, 'P'),
        '16_w': (16 * 7, 'P'),
    }
    # (substring of the treatment text, injury label), tested in order.
    treatment_labels = (
        ('demyelination', 'demyelination'),
        ('remyelination', 'remyelination'),
        ('Facial_nerve_axotomy', 'fxn'),
    )

    # Load metadata for each cell.
    id_to_meta = {}
    soft_fnames = [
        'data/microglia/masuda2019/mouse/GSE120744_family.soft.gz',
    ]
    for fname in soft_fnames:
        gsms = GEOparse.get_GEO(filepath=fname, silent=True).gsms
        for geo_id in gsms:
            sample_meta = gsms[geo_id].metadata
            cell_id = sample_meta['title'][0]
            # Each characteristic looks like "key: value".
            fields = {}
            for attr in sample_meta['characteristics_ch1']:
                pieces = attr.split(':')
                fields[pieces[0].strip()] = pieces[1].strip()
            id_to_meta[cell_id] = fields

    n_valid = 0
    qc_idx, ages, injured, cell_types = [], [], [], []

    # Map cell ids to important attributes.
    for dataset_idx in range(len(datasets)):
        with gzip.open(data_names[dataset_idx] + '.tsv.gz') as f:
            header = f.readline().decode('utf-8')
        cell_ids = header.rstrip().split()[1:]

        for cell_id in cell_ids:
            meta = id_to_meta[cell_id]

            age_entry = known_ages.get(meta['age'])
            if age_entry is None:
                continue
            age, prefix = age_entry
            age_str = '{}{}'.format(prefix, age)

            if prefix == 'P':
                # Rescale postnatal ages relative to the 19..60 range.
                min_age = 19.
                max_age = 60.
                age = 19 + ((age - min_age) / (max_age - min_age) * 3)
            ages.append(age)

            inj = 'none'
            if 'treatment' in meta:
                for needle, label in treatment_labels:
                    if needle in meta['treatment']:
                        inj = label
                        break
            injured.append(inj)

            cell_types.append('{}_{}'.format(age_str, inj))
            qc_idx.append(n_valid)
            n_valid += 1

    return qc_idx, np.array(cell_types), np.array(ages), np.array(injured)
import GEOparse from tissuespecific.reconstruction import Builder #set paths path = '/home/acabbia/Documents/Muscle_Model/models' ref_model = path + "/recon2.2.xml" output_folder = path + "/library_GEO_GSE25941_v2/" # import reference model (RECON2.2) recon22 = cobra.io.read_sbml_model(ref_model) #Gene expression data GEO ID (Raue 2012) GEO_accession_nr = "GSE25941" #get data from GEO serie = GEOparse.get_GEO(geo=GEO_accession_nr) gsm = 'GSM637527' #build translator dict table = serie.gsms[gsm].table translator = Builder.affyprobe_translator(table, 'hgnc_id') #Build confidence dict confidence = Builder.rxn_confidence_2(recon22, table, translator, 'hgnc_id') ########################################################################################################################## #%% # reactionsto be added add = [ 'ATPS4m', 'ENO', 'PDHm', 'PYK', 'G3PD1', 'G6PDH2r', 'AKGDm', 'CYOOm3', 'r0913', 'GLCt2_2', 'EX_glc(e)', 'EX_fru(e)', 'EX_ppa(e)', 'EX_but(e)', 'EX_hdca(e)', 'EX_ocdca(e)', 'EX_arach(e)', 'EX_doco13ac_', 'EX_lgnc(e)',
def run(self, inputs, outputs):
    """Run the analysis."""
    accession = inputs.gse_accession

    # Only the start of the string is checked against the GSE pattern.
    if re.match(r"(GSE\d{1,8})", accession) is None:
        self.error(
            f"GEO series accessions (GSE) are supported but {accession} was provided."
        )

    try:
        gse = GEOparse.get_GEO(geo=accession, destdir="./")
    except IOError:
        self.error(
            f"Download of {accession} failed. ID could be incorrect or the data might not be "
            "public yet.")
    except Exception as err:
        self.error(
            f"Download of {accession} failed. GEO parse failed with {err}")

    rna_seq_type = "Expression profiling by high throughput sequencing"
    array_type = "Expression profiling by array"
    supported = [rna_seq_type, array_type]

    # get_type() may hand back one type or a list of them.
    reported_type = gse.get_type()
    gse_type = reported_type if type(reported_type) is list else [
        reported_type
    ]

    if not set(gse_type) & set(supported):
        self.error(
            f"No supported series types found. Got {', '.join(gse_type)} but only {' and '.join(supported)} "
            "are supported.")
    elif "SuperSeries of" in gse.relations:
        # This is a mixed GSE series which needs to be unpacked.
        super_series = [
            GEOparse.get_GEO(geo=sub_accession, destdir="./")
            for sub_accession in gse.relations["SuperSeries of"]
        ]
    else:
        super_series = [gse]

    # Upload each series with the handler for its type and collect the
    # resulting metadata tables by series name.
    metadata_tables = {}
    for series in super_series:
        series_type = series.get_type()
        if series_type == rna_seq_type:
            run_info = self.upload_rna_gse(inputs, series)
        elif series_type == array_type:
            run_info = self.upload_ma_gse(inputs, series)
        else:
            self.warning(
                f"The upload of {series_type} is currently not supported. Samples from {series.name} will be "
                "skipped.")
            continue
        metadata_tables[series.name] = create_metadata(series, run_info)

    # Write the combined metadata table and hand it to the upload process.
    meta_file = f"{accession}_metadata.tsv"
    metadata = pd.concat(metadata_tables.values(),
                         join="outer",
                         ignore_index=False)
    metadata.to_csv(meta_file, sep="\t", index=False)
    self.run_process("upload-orange-metadata", {"src": meta_file})

    # Attach a descriptor to the last data object found for each sample.
    for entity_name in metadata["mS#Sample name"].values:
        objects = Data.filter(entity__name=entity_name)
        if len(objects) > 1:
            self.warning(
                f"Multiple samples with entity name {entity_name} are present, descriptor will be added only "
                "to the last one")
        obj = objects[-1]
        obj.entity.descriptor = construct_descriptor(
            metadata, obj.entity_name)
import GEOparse
import pandas as pd
import numpy as np
from functools import *
import re

gse1 = GEOparse.get_GEO(filepath="./Data/Human/GSE2508_family.soft.gz")
plats_1 = list(gse1.gpls.keys())
samples1 = gse1.phenotype_data[["platform_id", "title"]]

# Group by the plain column name (not a one-element list) so that
# get_group() accepts the scalar platform id on current pandas as well.
sample1 = samples1.groupby("platform_id")

# Per platform: how often each "Lean/Obese F/M" tag occurs in the
# concatenated sample titles.
d = {}
for l in plats_1:
    ls = "".join(sample1.get_group(l)['title'])
    d[l] = {
        "LF": len(re.findall("Lean F", ls)),
        "OF": len(re.findall("Obese F", ls)),
        "LM": len(re.findall("Lean M", ls)),
        "OM": len(re.findall("Obese M", ls)),
    }

x = samples1.copy()
x["samples"] = x.index

# Drop the last whitespace-separated word of every title; the first two
# words of what remains are then read as BMI class and gender.
x["title"] = x['title'].apply(
    lambda title: title[:-len(title.split()[-1])].strip())
x['gender'] = x['title'].map(lambda title: title.split(' ')[1])
x['cbmi'] = x['title'].map(lambda title: title.split(' ')[0].lower())

grouped = x.groupby("title")
# One column per trimmed title, listing the sample labels in that group.
l = pd.DataFrame.from_dict(grouped.groups)
y = x[["title", "gender", "cbmi"]]
def get_geo_database(geo_dataset_id):
    """Return the ``table`` attribute of the GEO entry ``geo_dataset_id``."""
    geo_entry = GEOparse.get_GEO(geo=geo_dataset_id)
    return geo_entry.table
import GEOparse
import attractors

gse = GEOparse.get_GEO(filepath="./gse_soft_files/GSE61470_family.soft")

# Print name, metadata, table head and column description of every sample,
# then the same for every platform.
for entries in (gse.gsms, gse.gpls):
    for entry_name, entry in entries.items():
        print("Name: ", entry_name)
        print("Metadata:")
        for key, value in entry.metadata.items():
            joined_values = ", ".join(value)
            print(f" - {key} : {joined_values}")
        print("Table data:")
        print(entry.table.head())
        print(entry.columns)
def download_soft(gse_acc):
    """Download the GEO entry ``gse_acc`` into the directory ``./<gse_acc>/``.

    The directory is created when missing. An already existing directory is
    reused, so calling this again for the same accession no longer fails
    with FileExistsError.
    """
    # Imported locally so the block does not depend on file-level imports.
    from os import makedirs

    target_dir = "./" + gse_acc + "/"
    makedirs(target_dir, exist_ok=True)
    GEOparse.get_GEO(geo=gse_acc, destdir=target_dir)
import GEOparse
import pandas as pd

gse2 = GEOparse.get_GEO(filepath="./Data/Human/GSE26637_family.soft.gz")
plats_2 = list(gse2.gpls)[0]

# Phenotype columns to keep, mapped to their short names.
renamed_columns = {
    'characteristics_ch1.0.gender': 'gender',
    'characteristics_ch1.2.stimulation': 'fasting_status',
    'characteristics_ch1.3.resistance status': 'insulin_status',
}
samples2 = gse2.phenotype_data[list(renamed_columns)]
samples2 = samples2.rename(columns=renamed_columns)

# Insulin-sensitive samples are labelled lean, all others obese.
samples2['cbmi'] = samples2['insulin_status'].map(
    lambda status: 'obese' if status != 'sensitive' else 'lean')

# Store the phenotype table as a pickle and as a tab-separated text file.
samples2.to_pickle('./Preprocessed_Data/Human/batch2_pheno.p')
with open('./Preprocessed_Data/Human/batch2_pheno.txt', 'w') as handle:
    samples2.to_csv(handle, sep='\t')

# Expression values of exactly those samples.
samples2_exprs = gse2.pivot_samples('VALUE')[list(samples2.index)]

# Add the GPL570 "Gene Symbol" column by matching ID_REF against ID.
probe_symbols = gse2.gpls['GPL570'].table[["ID", "Gene Symbol"]]
samples2_ann = samples2_exprs.reset_index().merge(probe_symbols,
                                                  left_on='ID_REF',
                                                  right_on="ID")
samples2_ann = samples2_ann.set_index('ID_REF')
samples2_ann = samples2_ann.drop(columns='ID')
def download(geo_accession):
    """Fetch ``geo_accession`` into ``../../data/geo/`` and return the object
    produced by ``GEOparse.get_GEO``.

    The target directory is created when missing.
    """
    geo_dir = "../../data/geo/"
    # exist_ok avoids the check-then-create race of an explicit exists()
    # test followed by makedirs().
    os.makedirs(geo_dir, exist_ok=True)
    return GEOparse.get_GEO(geo=geo_accession, destdir=geo_dir)