def __init__(self, gds_name, remove_unknown=None):
    """
    Retrieve a specific GEO DataSet as a :obj:`Orange.data.Table`.

    The constructor builds an object that can retrieve a GEO DataSet
    (samples and gene expressions). The local cache directory is checked
    first; if the data file is not available locally it is downloaded from
    `NCBI's GEO FTP site <ftp://ftp.ncbi.nih.gov/pub/geo/DATA/SOFT/GDS/>`_.

    :param gds_name: NCBI's ID of the data set, in the form "GDSn" where
        "n" is the GDS ID number.

    :param remove_unknown: Threshold for removing spots whose sample
        profiles include unknown values. A spot is removed when the
        proportion of samples with unknown values is above this threshold.
        If None, nothing is removed.

    """
    self.gds_name = gds_name
    self.filename = serverfiles.localpath(DOMAIN, self.gds_name + '.soft.gz')

    # Make sure the data set file is available before it is parsed.
    gds_ensure_downloaded(self.gds_name)

    # Start from an empty state, then parse the file.
    self.spot2gene = {}
    self.gene2spots = {}
    self.info = None
    self.gds_data = None
    self.parse_file(remove_unknown=remove_unknown)

    # The taxonomy id is recorded only when the exact-name search yields
    # exactly one match; any other number of matches is stored as None.
    matches = taxonomy.search(self.info["sample_organism"], exact=True)
    if len(matches) == 1:
        self.info["taxid"] = matches[0]
    else:
        self.info["taxid"] = None

    # Sorted identifier lists, plus the gene count stored in the info dict.
    self.genes = sorted(self.gene2spots)
    self.spots = sorted(self.spot2gene)
    self.info["gene_count"] = len(self.genes)
def get_data(gds_id, report_genes, transpose, sample_type, title):
    """
    Load the GEO DataSet ``gds_id`` and return its data named ``title``.

    :param gds_id: Identifier passed to ``gds_ensure_downloaded`` and to
        the ``GDS`` constructor.
    :param report_genes: Forwarded unchanged to ``GDS.get_data``.
    :param transpose: Forwarded unchanged to ``GDS.get_data``.
    :param sample_type: Forwarded unchanged to ``GDS.get_data``.
    :param title: Value assigned to the ``name`` attribute of the result.
    :return: The object returned by ``GDS.get_data``, with ``name`` set.
    """
    # NOTE(review): `progress` is not a parameter of this function, so it
    # must be bound in an enclosing or module scope -- confirm where.
    gds_ensure_downloaded(gds_id, progress)

    table = GDS(gds_id).get_data(
        report_genes=report_genes,
        transpose=transpose,
        sample_type=sample_type,
    )
    table.name = title
    return table
def test_gds_data(self):
    # Walks the sample data set through the whole cycle: URL lookup,
    # cache miss, download, cache hit, then parsing and table conversion.

    # a download URL must be available for the sample
    self.assertIsNotNone(gds_download_url(self.test_sample))

    # the file must not be cached yet
    self.assertFalse(gds_is_cached(self.test_sample))

    # download gds from serverfiles; the target directory has to exist
    try:
        makedirs(serverfiles.localpath(DOMAIN))
    except OSError:
        # An already existing path is fine; any other creation failure
        # must surface.
        if not path.exists(serverfiles.localpath(DOMAIN)):
            raise

    gds_download(self.test_sample)

    # the file is now cached
    self.assertIsNone(gds_ensure_downloaded(self.test_sample))
    self.assertTrue(gds_is_cached(self.test_sample))

    gds = GDS(self.test_sample)
    info = gds.info
    self.assertIsNotNone(info)

    # expected metadata of the sample data set
    self.assertEqual(info['gene_count'], 9561)
    self.assertEqual(len(info['samples']), 4)
    self.assertEqual(len(info['subsets']), 2)
    self.assertEqual(info['taxid'], self.test_organism)

    # both orientations must come back as a Table
    self.assertIsInstance(gds.get_data(), Table)
    self.assertIsInstance(gds.get_data(transpose=True), Table)