Example #1
 def test_duplicate_column(self):
     columns = ['ID',
                'COL',
                'ROW',
                'NAME',
                'SPOT_ID',
                'CONTROL_TYPE',
                'REFSEQ',
                'GB_ACC',
                'GENE',
                'GENE_SYMBOL',
                'GENE_NAME',
                'UNIGENE_ID',
                'ENSEMBL_ID',
                'TIGR_ID',
                'ACCESSION_STRING',
                'CHROMOSOMAL_LOCATION',
                'CYTOBAND',
                'DESCRIPTION',
                'GO_ID',
                'SEQUENCE',
                'SPOT_ID.1',
                'ORDER']
     columns2 = ['ID', 'COL', 'ROW', 'NAME', 'SPOT_ID', 'CONTROL_TYPE',
                 'ENSEMBL_ID', 'GB_ACC',
                 'GENE', 'GENE_SYMBOL', 'ENSEMBL_ID.1', 'UNIGENE_ID',
                 'ENSEMBL_ID.2', 'TIGR_ID',
                 'ACCESSION_STRING', 'CHROMOSOMAL_LOCATION', 'CYTOBAND',
                 'DESCRIPTION', 'GO_ID',
                 'SEQUENCE', 'SPOT_ID.1', 'ORDER']
     gpl = GEO.get_GEO(filepath=join(download_geo, "GPL4133.txt"))
     self.assertEqual(list(gpl.columns.index), columns)
     gpl2 = GEO.get_GEO(filepath=join(download_geo, "GPL4134.txt"))
     self.assertEqual(list(gpl2.columns.index), columns2)
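The assertions above pin down how GEOparse resolves duplicate column names: repeated platform columns are kept and disambiguated with pandas-style numeric suffixes. A minimal sketch of that behavior, assuming the same GPL4134.txt fixture is available locally:

import GEOparse

gpl = GEOparse.get_GEO(filepath="GPL4134.txt")
# A column declared three times comes back as
# 'ENSEMBL_ID', 'ENSEMBL_ID.1', 'ENSEMBL_ID.2'
print(list(gpl.columns.index))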
Example #2
 def test_get_geo_and_data_with_annotations(self):
     gpl = GEO.get_GEO(geo="GPL96", destdir=download_geo, annotate_gpl=True)
     self.assertTrue(isinstance(gpl, GPL))
     self.assertEqual(gpl.name, "GPL96")
     self.assertEqual(gpl.get_metadata_attribute('platform'), "GPL96")
     self.assertEqual(len(gpl.table.index), 22283)
     self.assertEqual(len(gpl.columns), 21)
Example #3
 def test_pivot_samples(self):
     gse = GEO.get_GEO(filepath=join(download_geo, "soft_ex_family.txt"),
                       geotype="GSE")
     result = read_table(
         join(download_geo, "test_sample_pivoted_by_value.tab"), index_col=0)
     result.columns.name = 'name'
     assert_frame_equal(gse.pivot_samples("VALUE"), result)
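pivot_samples is the GSE helper exercised here: it joins every GSM table on ID_REF and returns a single probes-by-samples frame for the requested value column. A minimal sketch, assuming the same soft_ex_family.txt fixture:

import GEOparse

gse = GEOparse.get_GEO(filepath="soft_ex_family.txt", geotype="GSE")
pivoted = gse.pivot_samples("VALUE")  # index: ID_REF probes, one column per GSM
print(pivoted.head())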
Example #4
 def test_get_geo_and_data(self):
     gpl = GEO.get_GEO(geo="GPL96", destdir=download_geo)
     self.assertTrue(isinstance(gpl, GPL))
     self.assertEqual(gpl.name, "GPL96")
     self.assertEqual(gpl.get_accession(), "GPL96")
     self.assertEqual(len(gpl.table.index), 22283)
     self.assertEqual(len(gpl.columns), 16)
Example #5
    def test_download_SRA_parallel_by_sra(self):
        geo_id = 'GSE63525'  # Hi-C dataset from Rao et al.

        def filterby(x):
            return 'HIC173' in x.metadata['title'][0] \
                   or 'HIC174' in x.metadata['title'][0] \
                   or 'HIC175' in x.metadata['title'][0]

        destdir = "./TMP_SOFT_parallel_by_sra"
        gse = GEO.get_GEO(geo=geo_id, destdir=destdir)
        downloaded_paths = gse.download_SRA("*****@*****.**",  # some unused e-mail
                                            directory=destdir,
                                            filetype='sra',
                                            filterby=filterby,
                                            silent=True,
                                            keep_sra=True,
                                            nproc=3)
        print(downloaded_paths)
        self.assertTrue(isdir(destdir))
        self.assertEqual(len(downloaded_paths), 3)
        for k in downloaded_paths.keys():
            self.assertTrue(k in gse.gsms.keys())
        for k in ['GSM1551718', 'GSM1551719', 'GSM1551720']:
            self.assertTrue(k in downloaded_paths.keys())
        for k in downloaded_paths.keys():
            for f in downloaded_paths[k]['SRA']:
                self.assertTrue(isfile(f))
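Condensed call pattern for the test above: GSE.download_SRA takes a contact e-mail and returns a dict keyed by GSM accession, each value holding an 'SRA' list of downloaded file paths. A hedged sketch with a placeholder address:

import GEOparse

gse = GEOparse.get_GEO(geo="GSE63525", destdir="./sra_tmp")
paths = gse.download_SRA("you@example.org",  # placeholder contact e-mail
                         directory="./sra_tmp",
                         filetype="sra",
                         filterby=lambda gsm: "HIC173" in gsm.metadata["title"][0],
                         silent=True,
                         keep_sra=True,
                         nproc=1)
for gsm_acc, files in paths.items():
    print(gsm_acc, files["SRA"])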
Example #6
    def test_download_SRA_parallel_by_gsm(self):
        geo_id = 'GSE63525'  # Hi-C dataset from Rao et al.

        def filterby(x):
            return 'HIC173' in x.metadata['title'][0] \
                   or 'HIC174' in x.metadata['title'][0] \
                   or 'HIC175' in x.metadata['title'][0]

        destdir = "./TMP_SOFT_parallel_by_gsm"

        gse = GEO.get_GEO(geo=geo_id, destdir=destdir)
        gsms_to_use = [gsm for gsm in gse.gsms.values() if filterby(gsm)]
        downloaded_paths = dict()
        for gsm in gsms_to_use:
            downloaded_paths[gsm.name] = gsm.download_SRA("*****@*****.**",  # some unused e-mail
                                                          directory=destdir,
                                                          nproc=3,
                                                          return_list=False,
                                                          filetype='sra',
                                                          silent=True,
                                                          keep_sra=True)
        self.assertTrue(isdir(destdir))
        self.assertEqual(len(downloaded_paths), 3)
        for k in downloaded_paths.keys():
            self.assertTrue(k in gse.gsms.keys())
        for k in ['GSM1551718', 'GSM1551719', 'GSM1551720']:
            self.assertTrue(k in downloaded_paths.keys())
        for k in downloaded_paths.keys():
            for f in downloaded_paths[k]['SRA']:
                self.assertTrue(isfile(f))
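The per-sample counterpart, condensed from the loop above: each GSM exposes its own download_SRA, and with return_list=False it returns a dict holding an 'SRA' list of paths, which is what the assertions rely on. A hedged sketch with a placeholder address:

import GEOparse

gse = GEOparse.get_GEO(geo="GSE63525", destdir="./sra_tmp")
gsm = gse.gsms["GSM1551718"]
paths = gsm.download_SRA("you@example.org",  # placeholder contact e-mail
                         directory="./sra_tmp",
                         filetype="sra",
                         return_list=False,
                         silent=True,
                         keep_sra=True)
print(paths["SRA"])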
Example #7
 def test_get_geo_and_data(self):
     gsm = GEO.get_GEO(geo="GSM11805", destdir=download_geo)
     self.assertTrue(isinstance(gsm, GSM))
     self.assertEqual(gsm.get_accession(), "GSM11805")
     self.assertEqual(len(gsm.table.index), 22283)
     self.assertEqual(len(gsm.columns), 3)
     self.assertEqual(len(gsm.metadata.keys()), 28)
Example #8
 def test_get_geo_and_data(self):
     gds = GEO.get_GEO(geo="GDS507", destdir=download_geo)
     self.assertTrue(isinstance(gds, GDS))
     self.assertEqual(len(gds.table.index), 22645)
     self.assertEqual(len(gds.table.columns), 19)
     self.assertEqual(len(gds.metadata.keys()), 16) # we omit DATABASE and SUBSET ! entries
     self.assertEqual(len(gds.database.metadata.keys()), 5)
     for subset_name, subset in iteritems(gds.subsets):
         self.assertEqual(len(subset.metadata.keys()), 4)
         self.assertTrue(isinstance(subset, GDSSubset))
Example #9
 def test_merge_and_average(self):
     gse = GEO.get_GEO(filepath=join(download_geo, "soft_ex_family.txt"), geotype="GSE")
     result = read_table(join(download_geo, "test_merged_by_id_and_averaged_by_gb_acc.tab"), index_col=0)
     result = result.loc[sorted(result.index), sorted(result.columns)]  # gse.gsms is a dict so the columns might be in different order
     merged = gse.merge_and_average(gse.gpls[next(iter(gse.gpls))], "VALUE", "GB_ACC", gpl_on="ID", gsm_on="ID_REF")
     merged = merged[sorted(merged.columns)]  # gse.gsms is a dict so the columns might be in different order
     assert_frame_equal(merged, result)
     with self.assertRaises(KeyError):
         gse.merge_and_average("platform", "VALUE", "GB_ACC", gpl_on="ID", gsm_on="ID_REF")
     with self.assertRaises(ValueError):
         gse.merge_and_average(["platform"], "VALUE", "GB_ACC", gpl_on="ID", gsm_on="ID_REF")
Example #10
 def test_pivot_and_annotate(self):
     gse = GEO.get_GEO(filepath=join(download_geo, "soft_ex_family.txt"), geotype="GSE")
     gpl = gse.gpls[next(iter(gse.gpls))]
     result = read_table(join(download_geo, "test_sample_pivoted_by_value_and_annotated_by_gbacc.tab"), index_col=0)
     result.columns.name = 'name'
     pivoted = gse.pivot_and_annotate(values="VALUE", gpl=gpl, annotation_column="GB_ACC")
     assert_frame_equal(result, pivoted)
     assert_frame_equal(gse.pivot_and_annotate(values="VALUE", gpl=gpl.table, annotation_column="GB_ACC"),
                        result)
     with self.assertRaises(TypeError):
         gse.pivot_and_annotate(values="VALUE", gpl="gpl", annotation_column="GB_ACC")
Example #11
 def test_annotate(self):
     gse = GEO.get_GEO(filepath=join(download_geo, "soft_ex_family.txt"), geotype="GSE")
     gsm = gse.gsms["Triple-Fusion Transfected Embryonic Stem Cells Replicate 1"]
     result = read_table(join(download_geo, "test_gsm_annotated.tab"))
     gpl = gse.gpls[next(iter(gse.gpls))]
     assert_frame_equal(result, gsm.annotate(gpl, annotation_column="GB_ACC"))
     assert_frame_equal(result, gsm.annotate(gpl.table, annotation_column="GB_ACC"))
     with self.assertRaises(TypeError):
         gsm.annotate("platform", annotation_column="GB_ACC")
     gsm.annotate(gpl.table, annotation_column="GB_ACC", in_place=True)
     assert_frame_equal(result, gsm.table)
Example #12
 def test_soft_format_gse(self):
     print(download_geo)
     gse = GEO.get_GEO(geo="GSE1563", destdir=download_geo)
     self.assertTrue(isinstance(gse, GSE))
     self.assertEqual(gse.get_accession(), "GSE1563")
     self.assertEqual(len(gse.gsms.keys()), 62)
     self.assertEqual(len(gse.gpls.keys()), 1)
     self.assertEqual(len(gse.gpls[next(iter(gse.gpls))].table.index), 12625)
     self.assertEqual(len(gse.gsms[next(iter(gse.gsms))].table.index), 12625)
     for gsm_name, gsm in gse.gsms.items():
         self.assertEqual(len(gsm.table.index), 12625)
         self.assertTrue(isinstance(gsm, GSM))
     for gpl_name, gpl in gse.gpls.items():
         self.assertEqual(len(gpl.table.index), 12625)
         self.assertTrue(isinstance(gpl, GPL))
Example #13
    def test_download_SRA(self):

        gse = GEO.get_GEO(geo="GSE1563", destdir=download_geo)
        self.assertTrue(isinstance(gse, GSE))
        self.assertEqual(gse.get_accession(), "GSE1563")
        self.assertEqual(len(gse.gsms.keys()), 62)
        self.assertEqual(len(gse.gpls.keys()), 1)
        self.assertEqual(len(gse.gpls[next(iter(gse.gpls))].table.index),
                         12625)
        self.assertEqual(len(gse.gsms[next(iter(gse.gsms))].table.index),
                         12625)
        for gsm_name, gsm in iteritems(gse.gsms):
            self.assertEqual(len(gsm.table.index), 12625)
            self.assertTrue(isinstance(gsm, GSM))
        for gpl_name, gpl in iteritems(gse.gpls):
            self.assertEqual(len(gpl.table.index), 12625)
            self.assertTrue(isinstance(gpl, GPL))
Example #14
    def test_get_geo_gpl_partially(self):
        partial = [
            "GSM1662787",
            "GSM1662789",
            "GSM1662791",
            "GSM1859499"
        ]

        gpl = GEO.get_GEO(geo="GPL20082", destdir=download_geo,
                          include_data=True, partial=partial)
        self.assertTrue(isinstance(gpl, GPL))
        self.assertEqual(gpl.get_accession(), "GPL20082")

        for gsm in gpl.gsms:
            self.assertTrue(gsm in partial)

        self.assertEqual(4, len(gpl.gsms))
Example #15
    def test_get_geo_gpl_sequencing(self):
        gpl = GEO.get_GEO(geo="GPL20082", destdir=download_geo, include_data=True)
        self.assertTrue(isinstance(gpl, GPL))
        self.assertEqual(gpl.get_accession(), "GPL20082")
        
        samples = [
            "GSM1662787",
            "GSM1662788",
            "GSM1662789",
            "GSM1662790",
            "GSM1662791",
            "GSM1677167",
            "GSM1859499",
            "GSM1875285",
        ]

        for sample in samples:
            self.assertTrue(sample in gpl.gsms)

        self.assertEqual(6, len(gpl.gses["GSE68087"].gsms))
        self.assertEqual(2, len(gpl.gses["GSE67974"].gsms))
Example #16
File: GUI.py  Project: anton-shikov/HW_6
# Imports needed by this excerpt (assumed from the original GUI.py)
import GEOparse
import numpy as np
import pandas as pd
from scipy.stats import ks_2samp


def GSEA(geo_ID, gene_list):
    gse = GEOparse.get_GEO(geo=geo_ID, destdir="./")
    expression = gse.pivot_samples('VALUE').T
    experiments = {}
    for i, (idx, row) in enumerate(gse.phenotype_data.iterrows()):
        tmp = {}
        tmp["Type"] = 1 if "control" in row["description"] else 0
        experiments[i] = tmp
    experiments = pd.DataFrame(experiments).T
    counter = 0
    all_genes_set = []
    all_corr_set = []
    genes_corr_set = []
    for gene in expression:
        counter += 1
        if counter <= 3:  # skip the first three columns (non-gene fields)
            continue
        all_genes_set.append(gene)
        corr_matrix = np.corrcoef([list(experiments['Type']), list(expression[gene])])
        all_corr_set.append(corr_matrix[0, 1])
        if gene in gene_list:
            genes_corr_set.append(corr_matrix[0, 1])
    p_value = ks_2samp(genes_corr_set, all_corr_set)[1]
    return str(p_value)
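A hypothetical invocation of the GSEA helper above; the accession and gene symbols are placeholders rather than values from the original project:

p = GSEA("GSE1563", ["TP53", "BRCA1", "EGFR"])
print("KS p-value:", p)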
Example #17
 def test_no_table(self):
     try:
         gsm = GEO.get_GEO(filepath=join(download_geo, 'GSM2795971.txt'),
                           geotype='GSM')
     except Exception:
         self.fail("No data in the file error.")
Example #18
    def handle(self, *args, **options):
        """Refreshes the metadata for all experiments, or experiments from a specific database
        """
        possible_source_databases = ["ARRAY_EXPRESS", "GEO", "SRA"]

        if options.get("source_database", None) is None:
            experiments = Experiment.objects.all()
        elif options["source_database"] in possible_source_databases:
            source_database = options["source_database"]
            experiments = Experiment.objects.filter(
                source_database=source_database)
        else:
            logger.error('Invalid source database "{}"'.format(
                options["source_database"]) +
                         "\nPossible source databases: {}".format(", ".join(
                             possible_source_databases)))
            sys.exit(1)

        paginator = PerformantPaginator(experiments, PAGE_SIZE)
        page = paginator.page()

        while True:
            for experiment in page.object_list:
                logger.debug("Refreshing metadata for an experiment.",
                             experiment=experiment.accession_code)
                try:
                    if experiment.source_database == "SRA":
                        metadata = SraSurveyor.gather_all_metadata(
                            experiment.samples.first().accession_code)
                        SraSurveyor._apply_metadata_to_experiment(
                            experiment, metadata)

                    elif experiment.source_database == "GEO":
                        gse = GEOparse.get_GEO(
                            experiment.accession_code,
                            destdir="/tmp/management",
                            silent=True,
                        )

                        GeoSurveyor._apply_metadata_to_experiment(
                            experiment, gse)

                    elif experiment.source_database == "ARRAY_EXPRESS":
                        request_url = EXPERIMENTS_URL + experiment.accession_code
                        experiment_request = utils.requests_retry_session(
                        ).get(request_url, timeout=60)
                        try:
                            parsed_json = experiment_request.json(
                            )["experiments"]["experiment"][0]
                        except KeyError:
                            logger.error(
                                "Remote experiment has no Experiment data!",
                                experiment_accession_code=experiment.
                                accession_code,
                                survey_job=self.survey_job.id,
                            )
                            continue
                        ArrayExpressSurveyor._apply_metadata_to_experiment(
                            experiment, parsed_json)

                    experiment.save()

                # If there are any errors, just continue. It's likely that it's
                # just a problem with this experiment.
                except Exception:
                    logger.exception(
                        "exception caught while updating metadata for {}".
                        format(experiment.accession_code))

            if not page.has_next():
                break
            else:
                page = paginator.page(page.next_page_number())

            # 2000 samples queued up every five minutes should be fast
            # enough and also not thrash the DB.
            time.sleep(60 * 5)
Example #19
    def handle(self, *args, **options):
        """Re-surveys GEO experiments containing samples with incorrect platform information.
        """
        # Check against CDF corrected accessions table to prevent recorrection of the same samples.
        corrected_experiments = CdfCorrectedAccession.objects.all().values(
            "accession_code")

        gse_experiments = Experiment.objects.filter(
            source_database="GEO").exclude(
                accession_code__in=corrected_experiments)

        paginator = Paginator(gse_experiments, PAGE_SIZE)
        page = paginator.page()

        while True:
            for experiment in page.object_list:
                try:
                    gse = GEOparse.get_GEO(experiment.accession_code,
                                           destdir=GEO_TEMP_DIR,
                                           how="brief",
                                           silent=True)

                    sample_accessions = list(gse.gsms.keys())
                    samples = Sample.objects.filter(
                        accession_code__in=sample_accessions)

                    wrong_platform = False
                    for sample in samples:
                        gpl = gse.gsms[
                            sample.accession_code].metadata["platform_id"][0]
                        internal_accession = get_internal_microarray_accession(
                            gpl)
                        if internal_accession != sample.platform_accession_code:
                            wrong_platform = True
                            break

                    if wrong_platform:
                        if options["dry_run"]:
                            logger.info(
                                "Would have re-surveyed experiment with accession code %s",
                                experiment.accession_code,
                            )
                        else:
                            logger.info(
                                "Re-surveying experiment with accession code %s",
                                experiment.accession_code,
                            )

                            purge_experiment(experiment.accession_code)

                            queue_surveyor_for_accession(
                                experiment.accession_code)

                    current_time = timezone.now()
                    CdfCorrectedAccession(
                        accession_code=experiment.accession_code,
                        created_at=current_time).save()
                except Exception:
                    logger.exception("Caught an exception with %s!",
                                     experiment.accession_code)
                finally:
                    # GEOparse downloads files here and never cleans them up! Grrrr!
                    download_path = GEO_TEMP_DIR + experiment.accession_code + "_family.soft.gz"
                    # Remove the downloaded SOFT archive if it exists.
                    try:
                        os.remove(download_path)
                    except Exception:
                        # Don't let anything interrupt this, like, say,
                        # GEOparse downloading a directory instead of
                        # a file...
                        logger.exception("Failed to delete an archive.")

            if not page.has_next():
                break

            page = paginator.page(page.next_page_number())
Example #20
import GEOparse

cache = "E:/ncbigeo"

gpl_data = GEOparse.get_GEO(geo="GPL77", destdir=cache, silent=True)
Example #21
import GEOparse
import feather  # assumed: the feather-format package, needed for the writes below
import pandas as pd

working_dir = "/home/yunsunglee/data/second_project/"

target = 'GSE118260'
gse = GEOparse.get_GEO(target, destdir=working_dir)

### Pull phenotype data
info = gse.phenotype_data

### Stack genomic data
for one_person in info.index:
    if one_person == info.index[0]:
        expr = gse.gsms[one_person].table[['ID_REF', 'VALUE']]
        expr.columns = ['ID_REF', one_person]
    else:
        t1 = gse.gsms[one_person].table[['ID_REF', 'VALUE']]
        t1.columns = ['ID_REF', one_person]
        expr = expr.merge(t1, on="ID_REF", how="left")
    print(one_person)

### Save the resulting data in your working directory
feather.write_dataframe(expr, dest=working_dir + target + "_expr.feather")
feather.write_dataframe(info, dest=working_dir + target + "_info.feather")
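Reading the saved frames back uses the matching reader from the same feather module (a minimal sketch, assuming the files written above exist):

expr = feather.read_dataframe(working_dir + target + "_expr.feather")
info = feather.read_dataframe(working_dir + target + "_info.feather")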
Example #22
    def create_experiment_and_samples_from_api(
            self, experiment_accession_code) -> (Experiment, List[Sample]):
        """ The main surveyor - find the Experiment and Samples from NCBI GEO.

        Uses the GEOParse library, for which docs can be found here: https://geoparse.readthedocs.io/en/latest/usage.html#working-with-geo-objects

        """
        # Cleaning up is tracked here: https://github.com/guma44/GEOparse/issues/41
        gse = GEOparse.get_GEO(experiment_accession_code,
                               destdir=self.get_temp_path(),
                               how="brief",
                               silent=True)
        preprocessed_samples = harmony.preprocess_geo(gse.gsms.items())
        harmonized_samples = harmony.harmonize(preprocessed_samples)

        # Create the experiment object
        try:
            experiment_object = Experiment.objects.get(
                accession_code=experiment_accession_code)
            logger.debug(
                "Experiment %s already exists, skipping object creation.",
                experiment_accession_code,
                survey_job=self.survey_job.id)
        except Experiment.DoesNotExist:
            experiment_object = Experiment()
            experiment_object.accession_code = experiment_accession_code
            experiment_object.source_url = (
                "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=" +
                experiment_accession_code)
            experiment_object.source_database = "GEO"
            experiment_object.title = gse.metadata.get('title', [''])[0]
            experiment_object.description = gse.metadata.get('summary',
                                                             [''])[0]

            # Source doesn't provide time information, assume midnight.
            submission_date = gse.metadata["submission_date"][
                0] + " 00:00:00 UTC"
            experiment_object.source_first_published = dateutil.parser.parse(
                submission_date)
            last_updated_date = gse.metadata["last_update_date"][
                0] + " 00:00:00 UTC"
            experiment_object.source_last_updated = dateutil.parser.parse(
                last_updated_date)

            unique_institutions = list(set(gse.metadata["contact_institute"]))
            experiment_object.submitter_institution = ", ".join(
                unique_institutions)
            experiment_object.pubmed_id = gse.metadata.get("pubmed_id",
                                                           [""])[0]

            # Scrape publication title and authorship from Pubmed
            if experiment_object.pubmed_id:
                pubmed_metadata = utils.get_title_and_authors_for_pubmed_id(
                    experiment_object.pubmed_id)
                experiment_object.publication_title = pubmed_metadata[0]
                experiment_object.publication_authors = pubmed_metadata[1]

            experiment_object.save()

            experiment_annotation = ExperimentAnnotation()
            experiment_annotation.data = gse.metadata
            experiment_annotation.experiment = experiment_object
            experiment_annotation.is_ccdl = False
            experiment_annotation.save()

        # Okay, here's the situation!
        # Sometimes, samples have a direct single representation for themselves.
        # Other times, there is a single file with references to every sample in it.
        created_samples = []
        for sample_accession_code, sample in gse.gsms.items():

            try:
                sample_object = Sample.objects.get(
                    accession_code=sample_accession_code)
                logger.debug(
                    "Sample %s from experiment %s already exists, skipping object creation.",
                    sample_accession_code,
                    experiment_object.accession_code,
                    survey_job=self.survey_job.id)

                # Associate it with the experiment, but since it
                # already exists it already has original files
                # associated with it and it's already been downloaded,
                # so don't add it to created_samples.
                ExperimentSampleAssociation.objects.get_or_create(
                    experiment=experiment_object, sample=sample_object)

                ExperimentOrganismAssociation.objects.get_or_create(
                    experiment=experiment_object,
                    organism=sample_object.organism)
            except Sample.DoesNotExist:
                organism = Organism.get_object_for_name(
                    sample.metadata['organism_ch1'][0].upper())

                sample_object = Sample()
                sample_object.source_database = "GEO"
                sample_object.accession_code = sample_accession_code
                sample_object.organism = organism

                # If there was a data processing step, it isn't raw.
                sample_object.has_raw = not sample.metadata.get(
                    'data_processing', None)

                ExperimentOrganismAssociation.objects.get_or_create(
                    experiment=experiment_object, organism=organism)
                sample_object.title = sample.metadata['title'][0]

                self.set_platform_properties(sample_object, sample.metadata,
                                             gse)

                # Directly assign the harmonized properties
                harmonized_sample = harmonized_samples[sample_object.title]
                for key, value in harmonized_sample.items():
                    setattr(sample_object, key, value)

                # Sample-level protocol_info
                sample_object.protocol_info = self.get_sample_protocol_info(
                    sample.metadata, sample_accession_code)

                sample_object.save()
                logger.debug("Created Sample: " + str(sample_object))

                sample_annotation = SampleAnnotation()
                sample_annotation.sample = sample_object
                sample_annotation.data = sample.metadata
                sample_annotation.is_ccdl = False
                sample_annotation.save()

                sample_supplements = sample.metadata.get(
                    'supplementary_file', [])
                for supplementary_file_url in sample_supplements:

                    # Why do they give us this?
                    if supplementary_file_url == "NONE":
                        break

                    # We never want these!
                    if "idat.gz" in supplementary_file_url.lower():
                        continue
                    if "chp.gz" in supplementary_file_url.lower():
                        continue
                    if "ndf.gz" in supplementary_file_url.lower():
                        continue
                    if "pos.gz" in supplementary_file_url.lower():
                        continue
                    if "pair.gz" in supplementary_file_url.lower():
                        continue
                    if "gff.gz" in supplementary_file_url.lower():
                        continue

                    # Sometimes, we are lied to about the data processing step.
                    lower_file_url = supplementary_file_url.lower()
                    if '.cel' in lower_file_url \
                    or ('_non_normalized.txt' in lower_file_url) \
                    or ('_non-normalized.txt' in lower_file_url) \
                    or ('-non-normalized.txt' in lower_file_url) \
                    or ('-non_normalized.txt' in lower_file_url):
                        sample_object.has_raw = True
                        sample_object.save()

                    # filename and source_filename are the same for these
                    filename = supplementary_file_url.split('/')[-1]
                    original_file = OriginalFile.objects.get_or_create(
                        source_url=supplementary_file_url,
                        filename=filename,
                        source_filename=filename,
                        has_raw=sample_object.has_raw,
                        is_archive=True)[0]

                    logger.debug("Created OriginalFile: " + str(original_file))

                    original_file_sample_association = OriginalFileSampleAssociation.objects.get_or_create(
                        original_file=original_file, sample=sample_object)

                    if original_file.is_affy_data():
                        # Only Affymetrix Microarrays produce .CEL files
                        sample_object.technology = 'MICROARRAY'
                        sample_object.manufacturer = 'AFFYMETRIX'
                        sample_object.save()

                # It's okay to survey RNA-Seq samples from GEO, but we
                # don't actually want to download/process any RNA-Seq
                # data unless it comes from SRA.
                if sample_object.technology != 'RNA-SEQ':
                    created_samples.append(sample_object)

                # Now that we've determined the technology at the
                # sample level, we can set it at the experiment level,
                # just gotta make sure to only do it once. There can
                # be more than one technology, this should be changed
                # as part of:
                # https://github.com/AlexsLemonade/refinebio/issues/1099
                if not experiment_object.technology:
                    experiment_object.technology = sample_object.technology
                    experiment_object.save()

                ExperimentSampleAssociation.objects.get_or_create(
                    experiment=experiment_object, sample=sample_object)

        # These supplementary files _may-or-may-not_ contain the type of raw data we can process.
        for experiment_supplement_url in gse.metadata.get(
                'supplementary_file', []):

            # filename and source_filename are the same for these
            filename = experiment_supplement_url.split('/')[-1]
            original_file = OriginalFile.objects.get_or_create(
                source_url=experiment_supplement_url,
                filename=filename,
                source_filename=filename,
                has_raw=sample_object.has_raw,
                is_archive=True)[0]

            logger.debug("Created OriginalFile: " + str(original_file))

            lower_supplement_url = experiment_supplement_url.lower()
            if ('_non_normalized.txt' in lower_supplement_url) \
            or ('_non-normalized.txt' in lower_supplement_url) \
            or ('-non-normalized.txt' in lower_supplement_url) \
            or ('-non_normalized.txt' in lower_supplement_url):
                for sample_object in created_samples:
                    sample_object.has_raw = True
                    sample_object.save()

                    OriginalFileSampleAssociation.objects.get_or_create(
                        sample=sample_object, original_file=original_file)

            # Delete this Original file if it isn't being used.
            if OriginalFileSampleAssociation.objects.filter(
                    original_file=original_file).count() == 0:
                original_file.delete()

        # These are the Miniml/Soft/Matrix URLs that are always(?) provided.
        # GEO describes different types of data formatting as "families"
        family_url = self.get_miniml_url(experiment_accession_code)
        miniml_original_file = OriginalFile.objects.get_or_create(
            source_url=family_url,
            source_filename=family_url.split('/')[-1],
            has_raw=sample_object.has_raw,
            is_archive=True)[0]
        for sample_object in created_samples:
            # We don't need a .txt if we have a .CEL
            if sample_object.has_raw:
                continue
            OriginalFileSampleAssociation.objects.get_or_create(
                sample=sample_object, original_file=miniml_original_file)

        # Delete this Original file if it isn't being used.
        if OriginalFileSampleAssociation.objects.filter(
                original_file=miniml_original_file).count() == 0:
            miniml_original_file.delete()

        # Trash the temp path
        try:
            shutil.rmtree(self.get_temp_path())
        except Exception:
            # There was a problem during surveying so this didn't get created.
            # It's not a big deal.
            pass

        return experiment_object, created_samples
Example #23
File: functions.py  Project: wynstep/SMAC
def DownloadGEODataset(gse, gseDir):
    # Query for retrieving GEO data
    gseData = GEOparse.get_GEO(geo=gse,
                               destdir=gseDir,
                               how='full',
                               annotate_gpl=True,
                               include_data=True,
                               silent=True)
    # Initialise data containers
    exprData = pd.DataFrame(columns=["ID_REF"])
    metaData = pd.DataFrame()
    exprDataMapped = []
    exprDataFiles = []
    metaDataFiles = []

    # Iterate over all GSM samples in the GSE and collect data only for RNA-seq
    for gsmName, gsm in gseData.gsms.items():
        if gsm.metadata['type'][0] == 'RNA' and len(
                gsm.table
        ) > 0:  # We are looking at RNA-seq data with stored data
            ##----------------------##
            ##	Expression			##
            ##----------------------##
            # extract expression data into a pandas dataframe
            tmpExpr = pd.DataFrame({
                "ID_REF": list(gsm.table['ID_REF']),
                gsmName: list(gsm.table['VALUE'])
            })
            # Appending to exprData (by column)
            exprData = tmpExpr if exprData.shape[0] == 0 else exprData.merge(
                tmpExpr, how='outer', on='ID_REF')

            ##----------------------##
            ##	Metadata			##
            ##----------------------##
            # extract metadata into a pandas dataframe
            tmpMetadata = pd.DataFrame(gsm.metadata.items(),
                                       columns=['MetaData', gsmName])
            # manipulate metaData for being used by subsequent analyses
            tmpMetadata = tmpMetadata.set_index(
                'MetaData')  # make metadata column as row name
            tmpMetadata = tmpMetadata.transpose()
            # Appending to metaData (by row)
            metaData = tmpMetadata if metaData.shape[0] == 0 \
                else pd.concat([metaData, tmpMetadata])

    # We'll map the platform data only if we have data collected in exprData
    if exprData.shape[0] > 0:
        ##----------------------##
        ##	platform			##
        ##----------------------##
        # Here we aim at mapping the ID_REF to the gene name
        for gplName, gpl in gseData.gpls.items():
            # Get name of column containing gene symbol and ID information
            idCol = [
                col for col in gpl.table.columns
                if re.search("^ID", col, re.IGNORECASE)
            ]
            gsCol = [
                col for col in gpl.table.columns
                if re.search("^gene(.+|)(symbol|name|id)", col, re.IGNORECASE)
            ]
            # Check if gsCol is present
            if len(gsCol) > 0 and len(idCol) > 0:
                platformData = pd.DataFrame({
                    "ID_REF":
                    list(gpl.table[idCol[0]]),
                    "geneName":
                    list(gpl.table[gsCol[0]]),
                })
                ##----------------------------------##
                ##	Map genes to gene platforms		##
                ##----------------------------------##
                exprDataMapped.append(
                    pd.merge(platformData, exprData, how="inner", on="ID_REF"))
            else:
                exprDataMapped.append(exprData)

    ##-----------------------------------------------##
    ##	Save datasets into files -- or delete folder ##
    ##-----------------------------------------------##

    # Iterate over mapped expression data
    if len(exprDataMapped) > 0:
        for index, eData in enumerate(exprDataMapped):
            if eData.shape[0] > 0 and metaData.shape[0] > 0:
                exprDataFile = "{0}/{1}.exprs.{2}.tsv".format(
                    gseDir, gse, index)
                metaDataFile = "{0}/{1}.meta.{2}.tsv".format(
                    gseDir, gse, index)
                eData.to_csv(exprDataFile,
                             sep='\t',
                             index=False,
                             encoding='utf-8')
                metaData.to_csv(metaDataFile,
                                sep='\t',
                                index=True,
                                encoding='utf-8')
                exprDataFiles.append(exprDataFile)
                metaDataFiles.append(metaDataFile)

        # Clean soft.gz files
        subprocess.Popen("rm {0}/*.soft.gz".format(gseDir),
                         shell=True,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE).communicate()
        # Return metadata and expression files
        return exprDataFiles, metaDataFiles
Example #24
import GEOparse
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

probes_conv = GEOparse.parse_GSM( \
    "/home/cyril/Documents/Master/sem_1/Case_study/module1/data/GPL6457_old_annotations.txt.gz")

gse = GEOparse.get_GEO("GSE24616", destdir="./")
# gse = GEOparse.get_GEO(filepath="./GSM606890.TXT.GZ.soft")

char = {"stage": [], "time": [], "sex": [], "sample_name": []}
for gsm_name, gsm in sorted(gse.gsms.items()):
    char["stage"].append(gsm.metadata['characteristics_ch1'][1].split(": ")[1])
    char["time"].append(gsm.metadata['characteristics_ch1'][2].split(": ")[1])
    char["sex"].append(gsm.metadata['characteristics_ch1'][3].split(": ")[1])
    char["sample_name"].append(gsm.name)

print(char["stage"][3], char["time"][3], char["sex"][3],
      char["sample_name"][3])

GPL = list(gse.gpls.values())[0]
pivoted_samples = gse.pivot_samples('VALUE')
pivoted_samples.set_index(GPL.table.SPOT_ID, inplace=True)

# pivoted_samples.hist()

strata = pd.read_csv("../phylostrata.txt", sep="\t", header=None)
strata.columns = ["GeneID", "ProbeID", "age"]
strata.set_index("ProbeID", inplace=True)
Example #25
 def test_name(self):
     gse = GEO.get_GEO(filepath=join(download_geo, "GSE105845_family.soft"),
                       geotype="GSE")
     self.assertEqual(gse.name, "GSE105845")
Example #26
import GEOparse
from os import mkdir  # imports assumed by this excerpt


def fetch_data(gse_acc):
    mkdir("./" + gse_acc + "/")
    gse = GEOparse.get_GEO(geo=gse_acc, destdir="./" + gse_acc + "/")
    #pivoted_control_samples = gse.pivot_samples('VALUE')

    return gse
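A usage sketch for fetch_data; note that mkdir raises if the directory already exists, so repeated calls for the same accession would need a guard. The accession is a placeholder:

gse = fetch_data("GSE1563")
print(len(gse.gsms), "samples")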
Example #27
import numpy as np
from pathlib import Path
import GEOparse
import pandas as pd
from matplotlib import pyplot as plt

path = Path(
    "C:/Users/Daniel/Desktop/Andi project/database/Soft/GSE64392_family.soft")
num_top_n = 1000

gse = GEOparse.get_GEO(filepath=str(path))

cntr = 1
for gsm_name, gsm in gse.gsms.items():
    print(f"Analysing dataset {cntr}/{len(gse.gsms)}")

    temp_df = pd.DataFrame(data=gsm.table["VALUE"].values.transpose(),
                           index=gsm.table["ID_REF"],
                           columns=[gsm_name])

    if cntr == 1:
        joined_df = temp_df.copy()

    else:
        joined_df = joined_df.join(temp_df)

    cntr += 1

sorted_means = joined_df.mean(axis=1).sort_values()[:num_top_n]

gpl_table = gse.gpls[list(gse.gpls.keys())[0]].table
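The incremental join above can also be expressed with the library's own pivot helper, which performs the same ID_REF-indexed merge in a single call; a sketch reusing path and num_top_n from the snippet above, under the assumption that VALUE is the quantified column:

import GEOparse

gse = GEOparse.get_GEO(filepath=str(path))  # same family file as above
joined_df = gse.pivot_samples("VALUE")      # probes x samples in one call
sorted_means = joined_df.mean(axis=1).sort_values()[:num_top_n]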
Example #28
#
#for j in range(1,144):
#    page = requests.get("https://ftp.ncbi.nih.gov/geo/series/GSE"+str(j)+'nnn'+"/")
#    tree = html.fromstring(page.content)
#    r = tree.xpath('//a/text()')
#    #print(type(r))
#    del r[0]
#    num.append(len(r))
#    for i in r:
#        nr.append(i.replace("/",""))
#print("array 생성완료 ")

#nr = pd.read_csv('./nr.tsv', sep='\t').values.tolist()
nd = pd.read_csv('nd.tsv', sep='\t').values.tolist()
for i in nd[96000:]:
    gse = GEOparse.get_GEO(geo=i[0], destdir="./geoData2")

#return filepath, getype

#print()
#print("GSM example:")
#for gsm_name,gsm in gse.gsms.items():
#    print("Name: ", gsm_name)
#    print("Metadata:",)
#    for key, value in gsm.metadata.items():
#        print(" - %s : %s" % (key, ", ".join(value)))
#    print ("Table data:",)
#    print (gsm.table.head())
#    break

#print()
Example #29
#
#for j in range(1,144):
#    page = requests.get("https://ftp.ncbi.nih.gov/geo/series/GSE"+str(j)+'nnn'+"/")
#    tree = html.fromstring(page.content)
#    r = tree.xpath('//a/text()')
#    #print(type(r))
#    del r[0]
#    num.append(len(r))
#    for i in r:
#        nr.append(i.replace("/",""))
#print("array 생성완료 ")

#nr = pd.read_csv('./nr.tsv', sep='\t').values.tolist()
nd = pd.read_csv('nd.tsv',sep='\t').values.tolist()
for i in nd[50000:62000]:
	gse = GEOparse.get_GEO(geo=i[0], destdir="/drive/My Drive/geoData4")
	print(str(i))

#return filepath, getype

#print()
#print("GSM example:")
#for gsm_name,gsm in gse.gsms.items():
#    print("Name: ", gsm_name)
#    print("Metadata:",)
#    for key, value in gsm.metadata.items():
#        print(" - %s : %s" % (key, ", ".join(value)))
#    print ("Table data:",)
#    print (gsm.table.head())
#    break
Example #30
 def load_series(self, geo_id: str) -> GSE:
     return GEOparse.get_GEO(geo_id, destdir=self.temp)
Example #31
File: geo.py  Project: erflynn/refinebio
    def set_platform_properties(self, sample_object: Sample,
                                sample_metadata: Dict,
                                gse: GEOparse.GSM) -> Sample:
        """Sets platform-related properties on `sample_object`.

        Uses metadata from `gse` to populate platform_name,
        platform_accession_code, and technology on `sample_object`.
        """

        # Determine platform information
        external_accession = get_normalized_platform(
            gse.metadata.get("platform_id", [UNKNOWN])[0])

        if external_accession == UNKNOWN:
            sample_object.platform_accession_code = UNKNOWN
            sample_object.platform_name = UNKNOWN
            sample_object.manufacturer = UNKNOWN
            # If this sample is Affy, we potentially can extract the
            # platform information from the .CEL file. If it's not we
            # can't do anything. Therefore assume the technology is
            # microarray when we have no platform information.
            sample_object.technology = "MICROARRAY"
            return sample_object

        platform_accession_code = UNKNOWN

        gpl = GEOparse.get_GEO(external_accession,
                               destdir=self.get_temp_path(),
                               how="brief",
                               silent=True)
        platform_title = gpl.metadata.get("title", [UNKNOWN])[0]

        # Check if this is a supported microarray platform.
        for platform in get_supported_microarray_platforms():
            if platform["external_accession"] == external_accession:
                platform_accession_code = platform["platform_accession"]

        if platform_accession_code != UNKNOWN:
            # It's a supported microarray platform.

            # We are using the brain array package as the platform accession code,
            # so, for instance, GPL3213 becomes 'chicken'.
            sample_object.platform_accession_code = platform_accession_code
            sample_object.technology = "MICROARRAY"
            try:

                # Related: https://github.com/AlexsLemonade/refinebio/issues/354
                # If it's Affy we can get a readable name:
                sample_object.platform_name = get_readable_affymetrix_names(
                )[platform_accession_code]
                sample_object.manufacturer = "AFFYMETRIX"

                # Sometimes Affymetrix samples have weird channel
                # protocol metadata, so if we find that it's
                # Affymetrix return it now. Example: GSE113945
                return sample_object
            except KeyError:
                # Otherwise we'll use what we've got.
                sample_object.platform_name = platform_title

            # Determine manufacturer

            platform = sample_object.pretty_platform.upper()
            if "AGILENT" in platform:
                sample_object.manufacturer = "AGILENT"
            elif "ILLUMINA" in platform or "NEXTSEQ" in platform:
                sample_object.manufacturer = "ILLUMINA"
            elif "AFFYMETRIX" in platform:
                sample_object.manufacturer = "AFFYMETRIX"
            else:
                sample_object.manufacturer = UNKNOWN

            return sample_object

        # Check to see if this is a supported RNASeq technology:

        # GEO RNASeq platform titles often have organisms appended to
        # an otherwise recognizable platform. The list of supported
        # RNASeq platforms isn't long, so see if any of them are
        # contained within what GEO gave us.
        # Example: GSE69572 has a platform title of:
        # 'Illumina Genome Analyzer IIx (Glycine max)'
        # Which should really just be 'Illumina Genome Analyzer IIx'
        # because RNASeq platforms are organism agnostic.  However,
        # the platforms 'Illumina Genome Analyzer' and 'Illumina
        # Genome Analyzer II' would also be matched, so make sure that
        # the longest platform names are tested first:
        sorted_platform_list = get_supported_rnaseq_platforms().copy()
        sorted_platform_list.sort(key=len, reverse=True)

        for platform in sorted_platform_list:
            if platform.upper() in platform_title.upper():
                sample_object.technology = "RNA-SEQ"
                sample_object.platform_name = platform
                # We just use RNASeq platform titles as accessions
                sample_object.platform_accession_code = platform

                if "ILLUMINA" in sample_object.platform_name.upper():
                    sample_object.manufacturer = "ILLUMINA"
                elif "NEXTSEQ" in sample_object.platform_name.upper():
                    sample_object.manufacturer = "NEXTSEQ"
                elif "ION TORRENT" in sample_object.platform_name.upper():
                    sample_object.manufacturer = "ION_TORRENT"
                else:
                    sample_object.manufacturer = UNKNOWN

                return sample_object

        # If we've made it this far, we don't know what this platform
        # is, therefore we can't know what its technology is. What we
        # do know is what GEO said its platform's accession and
        # title are, and that it's unsupported.
        sample_object.platform_name = platform_title
        sample_object.platform_accession_code = external_accession
        sample_object.technology = UNKNOWN
        sample_object.manufacturer = UNKNOWN

        return sample_object
Example #32
import GEOparse

#i=1
for i in range(13037, 14000):
    gse = GEOparse.get_GEO(geo="GSE" + str(i), destdir="./geoData")
#return filepath, getype

#print()
#print("GSM example:")
#for gsm_name,gsm in gse.gsms.items():
#    print("Name: ", gsm_name)
#    print("Metadata:",)
#    for key, value in gsm.metadata.items():
#        print(" - %s : %s" % (key, ", ".join(value)))
#    print ("Table data:",)
#    print (gsm.table.head())
#    break

#print()
#print("GPL example:")
#for gpl_name, gpl in gse.gpls.items():
#    print("Name: ", gpl_name)
#    print("Metadata:",)
#    for key, value in gpl.metadata.items():
#        print(" - %s : %s" % (key, ", ".join(value)))
#    print("Table data:",)
#    print(gpl.table.head())
#    break
Example #33
def download_and_parse_geo_data(
        geo_id,
        directory_path=getcwd(),
):

    print('Establishing {} @ {} ...'.format(
        geo_id,
        directory_path,
    ))

    gse = GEOparse.get_GEO(
        geo=geo_id,
        destdir=directory_path,
    )

    print('Title: {}'.format(gse.get_metadata_attribute('title')))

    print('N sample: {}'.format(len(gse.get_metadata_attribute('sample_id'))))

    geo_dict = {
        'id_x_sample': None,
        'id_gene_symbol': None,
        'gene_x_sample': None,
        'information_x_sample': None,
    }

    values = []

    for sample_id, gsm in gse.gsms.items():

        print('{} ...'.format(sample_id))

        sample_table = gsm.table

        if sample_table.empty:

            raise ValueError(
                'Sample {} has empty table (perhaps this is a single cell experiment.)'
                .format(gsm.name))

        sample_table.columns = sample_table.columns.str.lower().str.replace(
            ' ',
            '_',
        )

        sample_values = sample_table.set_index('id_ref').squeeze()

        sample_values.name = sample_id

        if isinstance(
                sample_values,
                DataFrame,
        ):

            sample_values.columns = ('{} ({})'.format(
                sample_id,
                column,
            ) for column in sample_values.columns)

        values.append(sample_values)

    geo_dict['id_x_sample'] = concat(
        values,
        axis=1,
    ).sort_index().sort_index(axis=1)

    print('id_x_sample.shape: {}'.format(geo_dict['id_x_sample'].shape))

    id_gene_symbol = None

    for platform_id, gpl in gse.gpls.items():

        print('{} ...'.format(platform_id))

        platform_table = gpl.table

        platform_table.columns = platform_table.columns.str.lower(
        ).str.replace(
            ' ',
            '_',
        )

        platform_table.set_index(
            'id',
            inplace=True,
        )

        if 'gene_symbol' not in platform_table.columns:

            if 'gene_assignment' in platform_table.columns:

                gene_symbols = []

                for assignment in platform_table['gene_assignment']:

                    if not isna(assignment) and '//' in assignment:

                        gene_symbols.append(
                            assignment.split(sep='//')[1].strip())

                    else:

                        gene_symbols.append('NO GENE NAME')

                platform_table['gene_symbol'] = gene_symbols

            elif 'oligoset_genesymbol' in platform_table.columns:

                platform_table['gene_symbol'] = platform_table[
                    'oligoset_genesymbol']

            elif 'ilmn_gene' in platform_table.columns:

                platform_table['gene_symbol'] = platform_table['ilmn_gene']

            elif 'gene' in platform_table.columns:

                platform_table['gene_symbol'] = platform_table['gene']

        if 'gene_symbol' in platform_table:

            id_gene_symbol = platform_table['gene_symbol'].dropna()

            id_gene_symbol.index = id_gene_symbol.index.astype(str)

            geo_dict['id_gene_symbol'] = id_gene_symbol

            print('id_gene_symbol.shape:{}'.format(id_gene_symbol.shape))

            print('N valid gene_symbol: {}'.format(
                (id_gene_symbol != 'NO GENE NAME').sum()))

            gene_x_sample = geo_dict['id_x_sample'].copy()

            id_gene_symbol = id_gene_symbol.to_dict()

            gene_x_sample.index = geo_dict['id_x_sample'].index.map(
                lambda index: id_gene_symbol.get(
                    str(index),
                    'NO GENE NAME',
                ))

            gene_x_sample.drop(
                'NO GENE NAME',
                inplace=True,
                errors='ignore',
            )

            gene_x_sample.index.name = 'gene_symbol'

            geo_dict['gene_x_sample'] = gene_x_sample.sort_index().sort_index(
                axis=1)

            print('gene_x_sample.shape: {}'.format(
                geo_dict['gene_x_sample'].shape))

        else:

            print(
                '\tgene_symbol is not a GPL column ({}); IDs may already be gene symbols.'
                .format(', '.join(platform_table.columns)))

        geo_dict['information_x_sample'] = gse.phenotype_data.T

        print('information_x_sample.shape: {}'.format(
            geo_dict['information_x_sample'].shape))

    return geo_dict
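A hypothetical invocation of download_and_parse_geo_data; the accession and directory are placeholders:

geo = download_and_parse_geo_data("GSE1563", directory_path="./geo_cache")
print(geo["id_x_sample"].shape)
if geo["gene_x_sample"] is not None:
    print(geo["gene_x_sample"].head())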
Example #34
 def test_get_geo_and_data(self):
     gpl = GEO.get_GEO(geo="GPL96", destdir=download_geo)
     self.assertTrue(isinstance(gpl, GPL))
     self.assertEqual(gpl.get_accession(), "GPL96")
     self.assertEqual(len(gpl.table.index), 22283)
     self.assertEqual(len(gpl.columns), 16)
Example #35
def micro_analysis(accession_id, control_samples, treated_samples):
    #Creating a dictionary of assigned control and treated samples

    control_samples = {i: 'control' for i in control_samples}
    treated_samples = {i: 'treated' for i in treated_samples}
    all_samples = {**control_samples, **treated_samples}

    #Parse the GEO data using the Accession ID
    gse = GEOparse.get_GEO(geo=accession_id, destdir="./")
    #Create a list of samples to use in the development of the expression matrix
    list_samples = list(all_samples.keys())

    #Visualization of expression matrix
    pivoted_samples = gse.pivot_samples('VALUE')[list_samples]
    pivoted_samples.head()
    #Determine the total amount of probes used in the study
    pivoted_samples_average = pivoted_samples.median(axis=1)
    #Filtering out unexpressed probes
    expression_threshold = pivoted_samples_average.quantile(0.3)
    expressed_probes = pivoted_samples_average[
        pivoted_samples_average >= expression_threshold].index.tolist()

    #Redefine expression data using only the expressed probes
    exprsdata = gse.pivot_samples("VALUE").loc[expressed_probes]
    exprsdata = exprsdata.T
    #Deletes additional samples that aren't being analyzed
    exprsdata = exprsdata[exprsdata.index.isin(list_samples)]
    #Drop any probe columns where expression data is missing
    exprsdata = exprsdata.dropna(axis=1)

    #Quantile normalization of data
    rank_mean = exprsdata.stack().groupby(
        exprsdata.rank(method='first').stack().astype(int)).mean()
    exprsdata = exprsdata.rank(
        method='min').stack().astype(int).map(rank_mean).unstack().dropna(
            axis=1)
    #Making Dataframe of samples
    samplesDf = pd.DataFrame.from_dict(all_samples,
                                       orient='index',
                                       columns=['type'])
    samplesDf.reset_index(inplace=True)

    #Transpose data matrix for sorting, index correlated to probe IDs
    exprsdata = exprsdata.T
    #Upload annotation file as dictionary

    #Reset index and replace with gene symbols, view as dataframe
    exprsdata = pd.DataFrame(exprsdata)
    exprsdata.index = exprsdata.index.astype(str, copy=False)
    exprsdata['symbol'] = exprsdata.index.to_series().map(PROBE2GENE)
    exprsdata.reset_index(inplace=True)
    data = exprsdata.set_index('symbol')

    #Drop probe id column
    data = data.drop('ID_REF', axis=1)
    #Drop rows that aren't associated with a particular gene symbol
    data = data.reset_index().dropna().set_index('symbol')

    #Utilize warning statements
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    warnings.filterwarnings("ignore", category=RuntimeWarning)

    #Make sample classes, ensure that there is a distinction between control/treated samples
    data_cd = {}
    sample_classes = {}
    sample_class = np.zeros(data.shape[1], dtype=np.int32)
    sample_class[samplesDf['type'].values == 'control'] = 1
    sample_class[samplesDf['type'].values == 'treated'] = 2
    sample_classes = sample_class

    #CD results
    cd_res = chdir(data.values,
                   sample_classes,
                   data.index,
                   gamma=.5,
                   sort=False,
                   calculate_sig=False)
    cd_coefs = np.array(list(map(lambda x: x[0], cd_res)))
    srt_idx = np.abs(cd_coefs).argsort()[::-1]
    cd_coefs = cd_coefs[srt_idx][:600]
    sorted_DEGs = data.index[srt_idx][:600]
    up_genes = dict(zip(sorted_DEGs[cd_coefs > 0], cd_coefs[cd_coefs > 0]))
    dn_genes = dict(zip(sorted_DEGs[cd_coefs < 0], cd_coefs[cd_coefs < 0]))
    data_cd['up'] = up_genes
    data_cd['dn'] = dn_genes

    #Retrieve up and down gene sets
    up_list = list(up_genes.keys())
    dn_list = list(dn_genes.keys())
    #Up genes and down genes
    return up_list, dn_list
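A hypothetical call to micro_analysis; the accession and GSM lists are placeholders, and the function assumes PROBE2GENE and chdir (the characteristic-direction routine imported elsewhere in the project) are in scope:

up, dn = micro_analysis("GSE1563",
                        control_samples=["GSM0001", "GSM0002"],  # placeholder accessions
                        treated_samples=["GSM0003", "GSM0004"])  # placeholder accessions
print(len(up), "up-regulated;", len(dn), "down-regulated")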
Example #36
 def test_get_geo_and_data_with_annotations(self):
     gpl = GEO.get_GEO(geo="GPL96", destdir=download_geo, annotate_gpl=True)
     self.assertTrue(isinstance(gpl, GPL))
     self.assertEqual(gpl.get_metadata_attribute('platform'), "GPL96")
     self.assertEqual(len(gpl.table.index), 22283)
     self.assertEqual(len(gpl.columns), 21)
示例#37
0
# -*- coding: utf-8 -*-
"""
"C:\Users\Aditya\Desktop\new_dataset"

Created on Thu Nov 21 01:34:39 2019

@author: Aditya
"""
#For single GSE SOFT file parsing in GEOParse
import GEOparse

gse = GEOparse.get_GEO(filepath=r'C:\Users\Aditya\Desktop\GSE15824_family.soft.gz', destdir=r'E:\CodingProjects')
for name, gsm in gse.gsms.items():
    print('\t\t'+name+' transferred\n')
    gsm.table.to_csv(name+'.txt', index=None, sep='\t', mode='w')

#For multiple files, store the list of GSE accessions you want to fetch
#in a text file and read it sequentially to access all the GSMs

#Storing the list of accessions in the variable putingeo
with open(r'fetchtheseGSEfromGEO.txt', 'r') as f:
    putingeo = [line.strip() for line in f if line.strip()]

num = len(putingeo)
for accession in putingeo:
    print(str(num) + ' files remaining')
    #Fetch each series; GEOparse downloads the SOFT file into destdir
    GEOparse.get_GEO(geo=accession, destdir=r'E:\CodingProjects')
    num -= 1
示例#39
0
 def test_empty_line(self):
     try:
         gsm = GEO.get_GEO(filepath=join(download_geo, 'GSM32878.txt'),
                           geotype='GSM')
     except IndexError:
         self.fail("Empty line in the file causes an error.")
示例#40
0
import numpy as np
from pathlib import Path
import GEOparse
import pandas as pd
from matplotlib import pyplot as plt

path = Path("C:/Users/Daniel/Desktop/Andi project/database/Soft/GPL16686_family.soft.gz")

gse = GEOparse.get_GEO(filepath=str(path), partial=['GPL16686'])

gpl_table = gse.gpls[list(gse.gpls.keys())[0]].table

print('Finished!')
示例#41
0
 def test_name(self):
     gpl = GEO.get_GEO(filepath=join(download_geo, "GPL20814_family.soft"),
                       geotype="GPL")
     self.assertEqual(gpl.name, "GPL20814")
示例#42
0
File: geo.py Project: erflynn/refinebio
    def create_experiment_and_samples_from_api(
            self, experiment_accession_code) -> (Experiment, List[Sample]):
        """ The main surveyor - find the Experiment and Samples from NCBI GEO.

        Uses the GEOParse library, for which docs can be found here: https://geoparse.readthedocs.io/en/latest/usage.html#working-with-geo-objects

        """
        # Cleaning up is tracked here: https://github.com/guma44/GEOparse/issues/41
        gse = GEOparse.get_GEO(experiment_accession_code,
                               destdir=self.get_temp_path(),
                               how="brief",
                               silent=True)
        preprocessed_samples = harmony.preprocess_geo(gse.gsms.items())
        harmonized_samples = harmony.harmonize(preprocessed_samples)

        # Create the experiment object
        try:
            experiment_object = Experiment.objects.get(
                accession_code=experiment_accession_code)
            logger.debug(
                "Experiment %s already exists, skipping object creation.",
                experiment_accession_code,
                survey_job=self.survey_job.id,
            )
        except Experiment.DoesNotExist:
            experiment_object = Experiment()
            experiment_object.accession_code = experiment_accession_code
            GeoSurveyor._apply_metadata_to_experiment(experiment_object, gse)
            experiment_object.save()

            experiment_annotation = ExperimentAnnotation()
            experiment_annotation.data = gse.metadata
            experiment_annotation.experiment = experiment_object
            experiment_annotation.is_ccdl = False
            experiment_annotation.save()

        # Okay, here's the situation!
        # Sometimes, samples have a direct single representation for themselves.
        # Other times, there is a single file with references to every sample in it.
        created_samples = []
        for sample_accession_code, sample in gse.gsms.items():

            try:
                sample_object = Sample.objects.get(
                    accession_code=sample_accession_code)
                logger.debug(
                    "Sample %s from experiment %s already exists, skipping object creation.",
                    sample_accession_code,
                    experiment_object.accession_code,
                    survey_job=self.survey_job.id,
                )

                # Associate it with the experiment, but since it
                # already exists it already has original files
                # associated with it and it's already been downloaded,
                # so don't add it to created_samples.
                ExperimentSampleAssociation.objects.get_or_create(
                    experiment=experiment_object, sample=sample_object)

                ExperimentOrganismAssociation.objects.get_or_create(
                    experiment=experiment_object,
                    organism=sample_object.organism)
            except Sample.DoesNotExist:
                organism = Organism.get_object_for_name(
                    sample.metadata["organism_ch1"][0].upper())

                sample_object = Sample()
                sample_object.source_database = "GEO"
                sample_object.accession_code = sample_accession_code
                sample_object.organism = organism

                # If data processing step, it isn't raw.
                sample_object.has_raw = not sample.metadata.get(
                    "data_processing", None)

                ExperimentOrganismAssociation.objects.get_or_create(
                    experiment=experiment_object, organism=organism)
                sample_object.title = sample.metadata["title"][0]

                self.set_platform_properties(sample_object, sample.metadata,
                                             gse)

                GeoSurveyor._apply_harmonized_metadata_to_sample(
                    sample_object, harmonized_samples[sample_object.title])

                # Sample-level protocol_info
                sample_object.protocol_info = self.get_sample_protocol_info(
                    sample.metadata, sample_accession_code)

                sample_object.save()
                logger.debug("Created Sample: " + str(sample_object))

                sample_annotation = SampleAnnotation()
                sample_annotation.sample = sample_object
                sample_annotation.data = sample.metadata
                sample_annotation.is_ccdl = False
                sample_annotation.save()

                sample_supplements = sample.metadata.get(
                    "supplementary_file", [])
                for supplementary_file_url in sample_supplements:

                    # Why do they give us this?
                    if supplementary_file_url == "NONE":
                        break

                    # We never want these!
                    if "idat.gz" in supplementary_file_url.lower():
                        continue
                    if "chp.gz" in supplementary_file_url.lower():
                        continue
                    if "ndf.gz" in supplementary_file_url.lower():
                        continue
                    if "pos.gz" in supplementary_file_url.lower():
                        continue
                    if "pair.gz" in supplementary_file_url.lower():
                        continue
                    if "gff.gz" in supplementary_file_url.lower():
                        continue

                    # Sometimes, we are lied to about the data processing step.
                    lower_file_url = supplementary_file_url.lower()
                    if (".cel" in lower_file_url
                            or ("_non_normalized.txt" in lower_file_url)
                            or ("_non-normalized.txt" in lower_file_url)
                            or ("-non-normalized.txt" in lower_file_url)
                            or ("-non_normalized.txt" in lower_file_url)):
                        sample_object.has_raw = True
                        sample_object.save()

                    # filename and source_filename are the same for these
                    filename = FileUtils.get_filename(supplementary_file_url)
                    original_file = OriginalFile.objects.get_or_create(
                        source_url=supplementary_file_url,
                        filename=filename,
                        source_filename=filename,
                        has_raw=sample_object.has_raw,
                        is_archive=FileUtils.is_archive(filename),
                    )[0]

                    logger.debug("Created OriginalFile: " + str(original_file))

                    original_file_sample_association = OriginalFileSampleAssociation.objects.get_or_create(
                        original_file=original_file, sample=sample_object)

                    if original_file.is_affy_data():
                        # Only Affymetrix Microarrays produce .CEL files
                        sample_object.technology = "MICROARRAY"
                        sample_object.manufacturer = "AFFYMETRIX"
                        sample_object.save()

                # It's okay to survey RNA-Seq samples from GEO, but we
                # don't actually want to download/process any RNA-Seq
                # data unless it comes from SRA.
                if sample_object.technology != "RNA-SEQ":
                    created_samples.append(sample_object)

                # Now that we've determined the technology at the
                # sample level, we can set it at the experiment level,
                # just gotta make sure to only do it once. There can
                # be more than one technology, this should be changed
                # as part of:
                # https://github.com/AlexsLemonade/refinebio/issues/1099
                if not experiment_object.technology:
                    experiment_object.technology = sample_object.technology
                    experiment_object.save()

                ExperimentSampleAssociation.objects.get_or_create(
                    experiment=experiment_object, sample=sample_object)

        # These supplementary files _may-or-may-not_ contain the type of raw data we can process.
        for experiment_supplement_url in gse.metadata.get(
                "supplementary_file", []):

            # filename and source_filename are the same for these
            filename = experiment_supplement_url.split("/")[-1]
            original_file = OriginalFile.objects.get_or_create(
                source_url=experiment_supplement_url,
                filename=filename,
                source_filename=filename,
                has_raw=sample_object.has_raw,
                is_archive=True,
            )[0]

            logger.debug("Created OriginalFile: " + str(original_file))

            lower_supplement_url = experiment_supplement_url.lower()
            if (("_non_normalized.txt" in lower_supplement_url)
                    or ("_non-normalized.txt" in lower_supplement_url)
                    or ("-non-normalized.txt" in lower_supplement_url)
                    or ("-non_normalized.txt" in lower_supplement_url)):
                for sample_object in created_samples:
                    sample_object.has_raw = True
                    sample_object.save()

                    OriginalFileSampleAssociation.objects.get_or_create(
                        sample=sample_object, original_file=original_file)

            # Delete this Original file if it isn't being used.
            if (OriginalFileSampleAssociation.objects.filter(
                    original_file=original_file).count() == 0):
                original_file.delete()

        # These are the Miniml/Soft/Matrix URLs that are always(?) provided.
        # GEO describes different types of data formatting as "families"
        family_url = self.get_miniml_url(experiment_accession_code)
        miniml_original_file = OriginalFile.objects.get_or_create(
            source_url=family_url,
            source_filename=family_url.split("/")[-1],
            has_raw=sample_object.has_raw,
            is_archive=True,
        )[0]
        for sample_object in created_samples:
            # We don't need a .txt if we have a .CEL
            if sample_object.has_raw:
                continue
            OriginalFileSampleAssociation.objects.get_or_create(
                sample=sample_object, original_file=miniml_original_file)

        # Delete this Original file if it isn't being used.
        if (OriginalFileSampleAssociation.objects.filter(
                original_file=miniml_original_file).count() == 0):
            miniml_original_file.delete()

        # Trash the temp path
        try:
            shutil.rmtree(self.get_temp_path())
        except Exception:
            # There was a problem during surveying so this didn't get created.
            # It's not a big deal.
            pass

        return experiment_object, created_samples
示例#43
0
def load_meta(datasets):
    n_valid = 0
    qc_idx = []
    ages = []
    injured = []
    cell_types = []

    # Load metadata for each cell.

    id_to_meta = {}
    soft_fnames = [
        'data/microglia/masuda2019/mouse/GSE120744_family.soft.gz',
    ]
    for fname in soft_fnames:
        gsms = GEOparse.get_GEO(filepath=fname, silent=True).gsms
        for geo_id in gsms:
            cell_id = gsms[geo_id].metadata['title'][0]
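            # Each 'characteristics_ch1' entry is a 'key: value' string
            # (e.g. 'age: 16 weeks'); split each into an attribute dict.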
            meta = {
                attr.split(':')[0].strip(): attr.split(':')[1].strip()
                for attr in gsms[geo_id].metadata['characteristics_ch1']
            }
            id_to_meta[cell_id] = meta

    # Map cell ids to important attributes.

    for i in range(len(datasets)):
        with gzip.open(data_names[i] + '.tsv.gz') as f:
            cell_ids = f.readline().decode('utf-8').rstrip().split()[1:]

        for cell_id in cell_ids:
            meta = id_to_meta[cell_id]

            age_str = meta['age']
            if age_str == '16 weeks':
                age = 16 * 7
                age_str = 'P{}'.format(age)
            elif age_str == 'embryonal':  # Sic.
                age = 16.5
                age_str = 'E{}'.format(age)
            elif age_str == '03_w':
                age = 3 * 7
                age_str = 'P{}'.format(age)
            elif age_str == '16_w':
                age = 16 * 7
                age_str = 'P{}'.format(age)
            else:
                continue
            if age_str.startswith('P'):
                min_age = 19.
                max_age = 60.
                age = 19 + ((age - min_age) / (max_age - min_age) * 3)
            ages.append(age)

            if 'treatment' in meta:
                if 'demyelination' in meta['treatment']:
                    inj = 'demyelination'
                elif 'remyelination' in meta['treatment']:
                    inj = 'remyelination'
                elif 'Facial_nerve_axotomy' in meta['treatment']:
                    inj = 'fxn'
                else:
                    inj = 'none'
            else:
                inj = 'none'
            injured.append(inj)

            cell_types.append('{}_{}'.format(age_str, inj))

            qc_idx.append(n_valid)
            n_valid += 1

    return qc_idx, np.array(cell_types), np.array(ages), np.array(injured)
示例#44
0
import cobra
import GEOparse
from tissuespecific.reconstruction import Builder

#set paths
path = '/home/acabbia/Documents/Muscle_Model/models'
ref_model = path + "/recon2.2.xml"
output_folder = path + "/library_GEO_GSE25941_v2/"

# import reference model (RECON2.2)
recon22 = cobra.io.read_sbml_model(ref_model)

#Gene expression data GEO ID (Raue 2012)
GEO_accession_nr = "GSE25941"

#get data from GEO
serie = GEOparse.get_GEO(geo=GEO_accession_nr)

gsm = 'GSM637527'
#build translator dict
table = serie.gsms[gsm].table
translator = Builder.affyprobe_translator(table, 'hgnc_id')
#Build confidence dict
confidence = Builder.rxn_confidence_2(recon22, table, translator, 'hgnc_id')
##########################################################################################################################
#%%

# reactionsto be added
add = [
    'ATPS4m', 'ENO', 'PDHm', 'PYK', 'G3PD1', 'G6PDH2r', 'AKGDm', 'CYOOm3',
    'r0913', 'GLCt2_2', 'EX_glc(e)', 'EX_fru(e)', 'EX_ppa(e)', 'EX_but(e)',
    'EX_hdca(e)', 'EX_ocdca(e)', 'EX_arach(e)', 'EX_doco13ac_', 'EX_lgnc(e)',
示例#45
0
    def run(self, inputs, outputs):
        """Run the analysis."""

        if not re.match(r"(GSE\d{1,8})", inputs.gse_accession):
            self.error(
                f"GEO series accessions (GSE) are supported but {inputs.gse_accession} was provided."
            )

        try:
            gse = GEOparse.get_GEO(geo=inputs.gse_accession, destdir="./")
        except IOError:
            self.error(
                f"Download of {inputs.gse_accession} failed. ID could be incorrect or the data might not be "
                "public yet.")
        except Exception as err:
            self.error(
                f"Download of {inputs.gse_accession} failed. GEO parse failed with {err}"
            )

        supported = [
            "Expression profiling by high throughput sequencing",
            "Expression profiling by array",
        ]

        gse_type = gse.get_type() if isinstance(gse.get_type(), list) else [gse.get_type()]
        if set(gse_type).intersection(set(supported)):
            if "SuperSeries of" in gse.relations:
                # This is a mixed GSE series which needs to be unpacked.
                super_series = [
                    GEOparse.get_GEO(geo=accession, destdir="./")
                    for accession in gse.relations["SuperSeries of"]
                ]
            else:
                super_series = [gse]
        else:
            self.error(
                f"No supported series types found. Got {', '.join(gse_type)} but only {' and '.join(supported)} "
                "are supported.")

        metadata_tables = {}
        for series in super_series:
            series_type = series.get_type()
            if series_type == "Expression profiling by high throughput sequencing":
                run_info = self.upload_rna_gse(inputs, series)
                metadata_tables[series.name] = create_metadata(
                    series, run_info)
            elif series_type == "Expression profiling by array":
                run_info = self.upload_ma_gse(inputs, series)
                metadata_tables[series.name] = create_metadata(
                    series, run_info)
            else:
                self.warning(
                    f"The upload of {series_type} is currently not supported. Samples from {series.name} will be "
                    "skipped.")
        meta_file = f"{inputs.gse_accession}_metadata.tsv"
        metadata = pd.concat(metadata_tables.values(),
                             join="outer",
                             ignore_index=False)
        metadata.to_csv(meta_file, sep="\t", index=False)
        self.run_process("upload-orange-metadata", {"src": meta_file})

        for entity_name in metadata["mS#Sample name"].values:
            objects = Data.filter(entity__name=entity_name)
            if len(objects) > 1:
                self.warning(
                    f"Multiple samples with entity name {entity_name} are present, descriptor will be added only "
                    "to the last one")
            obj = objects[-1]
            obj.entity.descriptor = construct_descriptor(
                metadata, obj.entity_name)
示例#46
0
import GEOparse
import pandas as pd
import numpy as np
from functools import *
import re

gse1 = GEOparse.get_GEO(filepath="./Data/Human/GSE2508_family.soft.gz")

plats_1 = list(gse1.gpls.keys())

samples1 = gse1.phenotype_data[["platform_id", "title"]]
sample1 = samples1.groupby(["platform_id"])
print(sample1.groups)  # inspect the per-platform sample groups
d = {}
for l in plats_1:
    ls = "".join(list(sample1.get_group(l)['title']))
    lf = re.findall("Lean F", ls)
    of = re.findall("Obese F", ls)
    lm = re.findall("Lean M", ls)
    om = re.findall("Obese M", ls)
    d[l] = {"LF": len(lf), "OF": len(of), "LM": len(lm), "OM": len(om)}
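
# d now maps each platform ID to the counts of Lean/Obese female/male samples
# parsed from the sample titles.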

x = samples1.copy()
x["samples"] = x.index
x["title"] = x['title'].apply(lambda x: x[:-len(x.split()[-1])].strip()).to_frame('samples')
x['gender'] = x['title'].map(lambda x: x.split(' ')[1])
x['cbmi'] = x['title'].map(lambda x: x.split(' ')[0].lower())

grouped = x.groupby("title")
l = pd.DataFrame.from_dict(grouped.groups)

y = x[["title", "gender", "cbmi"]]
示例#47
0
def get_geo_database(geo_dataset_id):
    return GEOparse.get_GEO(geo=geo_dataset_id).table
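
# Minimal usage sketch with an illustrative accession (assumes `import GEOparse`
# at the top of the module). Only GDS, GPL and GSM objects expose a top-level
# .table in GEOparse, so this helper is meant for GDS (dataset) accessions:
if __name__ == "__main__":
    print(get_geo_database("GDS507").head())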
示例#48
0
import GEOparse
import attractors

gse = GEOparse.get_GEO(filepath="./gse_soft_files/GSE61470_family.soft")

for gsm_name, gsm in gse.gsms.items():
    print("Name: ", gsm_name)
    print("Metadata:", )
    for key, value in gsm.metadata.items():
        print(" - %s : %s" % (key, ", ".join(value)))
    print("Table data:", )
    print(gsm.table.head())
    print(gsm.columns)

for gpl_name, gpl in gse.gpls.items():
    print("Name: ", gpl_name)
    print("Metadata:", )
    for key, value in gpl.metadata.items():
        print(" - %s : %s" % (key, ", ".join(value)))
    print("Table data:", )
    print(gpl.table.head())
    print(gpl.columns)
示例#49
0
import os
import GEOparse

def download_soft(gse_acc):
    os.makedirs("./" + gse_acc + "/", exist_ok=True)
    GEOparse.get_GEO(geo=gse_acc, destdir="./" + gse_acc + "/")
示例#50
0
import GEOparse
import pandas as pd

gse2 = GEOparse.get_GEO(filepath="./Data/Human/GSE26637_family.soft.gz")

plats_2 = list(gse2.gpls.keys())[0]

samples2 = gse2.phenotype_data[[
    "characteristics_ch1.0.gender", "characteristics_ch1.2.stimulation",
    "characteristics_ch1.3.resistance status"
]]
samples2 = samples2.rename(
    columns={
        'characteristics_ch1.0.gender': 'gender',
        'characteristics_ch1.2.stimulation': 'fasting_status',
        'characteristics_ch1.3.resistance status': 'insulin_status'
    })
samples2['cbmi'] = samples2['insulin_status'].map(
    lambda x: 'lean' if x == 'sensitive' else 'obese')

samples2.to_pickle('./Preprocessed_Data/Human/batch2_pheno.p')
with open('./Preprocessed_Data/Human/batch2_pheno.txt', 'w') as handle:
    samples2.to_csv(handle, sep='\t')

samples2_exprs = gse2.pivot_samples('VALUE')[list(samples2.index)]

samples2_ann = samples2_exprs.reset_index().merge(
    gse2.gpls['GPL570'].table[["ID", "Gene Symbol"]],
    left_on='ID_REF',
    right_on="ID").set_index('ID_REF')
samples2_ann.drop('ID', inplace=True, axis=1)
示例#51
0
import os
import GEOparse

def download(geo_accession):
    if not os.path.exists("../../data/geo/"):
        os.makedirs("../../data/geo/")
    gse = GEOparse.get_GEO(geo=geo_accession, destdir="../../data/geo/")
    return gse