def get_available_organisms(self):
    """Fill the organism list and combo box from the common taxonomy ids.

    Stores the taxonomy ids in ``self.organisms`` and adds the matching
    names to ``self.organism_select_combobox``; both are ordered by name,
    so a combo box index maps directly onto ``self.organisms``.
    """
    # Pair every common taxonomy id with its name, then order by name.
    id_name_pairs = [
        (tax_id, taxonomy.name(tax_id)) for tax_id in taxonomy.common_taxids()
    ]
    id_name_pairs.sort(key=lambda pair: pair[1])

    self.organisms = [tax_id for tax_id, _ in id_name_pairs]
    self.organism_select_combobox.addItems(
        [name for _, name in id_name_pairs])
def upload_genesets():
    """Build the default gene sets for every common organism and register them.

    Each gene set builder is called for each common taxonomy id; the result
    is split by hierarchy and every group is passed to
    ``register_serverfiles`` and ``register``. Failures for a single
    builder/organism combination are printed and do not stop the run.
    """
    builders = (
        go_gene_sets,
        kegg_gene_sets,
        # omim_gene_sets is no longer supported: files not updated since 2011
        cytoband_gene_sets,
        reactome_gene_sets,
        dicty_mutant_gene_sets,
    )
    tax_ids = taxonomy.common_taxids()

    for build in builders:
        for tax_id in tax_ids:
            try:
                try:
                    groups = build(tax_id).split_by_hierarchy()
                except AttributeError as error:
                    # Only a failure while building/splitting is reported
                    # here; the organism is then skipped for this builder.
                    print(error)
                else:
                    for group in groups:
                        register_serverfiles(group)  # server files
                        register(group)
            except taxonomy.UnknownSpeciesIdentifier:
                print("Organism ontology not available %s" % tax_id)
            except GeneSetException:
                print("Empty gene sets. %s" % tax_id)
def listAvailable():
    """Map organism names to their listed gene association file names.

    Only common taxonomy ids whose ``gene_association.<taxid>`` file appears
    in the ``DOMAIN`` file listing are included.

    Returns:
        dict: ``{organism name: 'gene_association.<taxid>'}``.
    """
    # Fetch the listing once. It does not depend on the taxid, so asking for
    # it again for every organism only repeats the same lookup.
    listed_files = list(serverfiles.ServerFiles().listfiles(DOMAIN))

    available = {}
    for taxid in taxonomy.common_taxids():
        file_name = 'gene_association.{}'.format(taxid)
        if (DOMAIN, file_name) in listed_files:
            available[taxonomy.name(taxid)] = file_name
    return available
def test_uncommon_taxonomy(self):
    """An organism outside the common set has a name but no reverse lookup."""
    dog = self.dog
    dog_name = 'Canis lupus familiaris'

    self.assertNotIn(dog, taxonomy.common_taxids())
    self.assertEqual(taxonomy.name(dog), dog_name)

    # not supported yet.
    resolved_id = taxonomy.species_name_to_taxid(dog_name)
    self.assertIsNone(resolved_id)

    short_names = taxonomy.shortname(dog)
    self.assertFalse(len(short_names))
def _get_available_organisms(self):
    """Fill the organism list and combo box from the common taxonomy ids.

    ``self.organisms`` receives the taxonomy ids and ``self.organismComboBox``
    the matching names, both ordered by name so that indexes line up.
    """
    ordered = sorted(
        ((tax_id, taxonomy.name(tax_id))
         for tax_id in taxonomy.common_taxids()),
        key=lambda entry: entry[1],
    )

    # Split the sorted pairs into two index-aligned lists.
    tax_ids = []
    names = []
    for tax_id, name in ordered:
        tax_ids.append(tax_id)
        names.append(name)

    self.organisms = tax_ids
    self.organismComboBox.addItems(names)
def create_model(self):
    """Build the item model that lists every known data set.

    One row is created per key found in ``self.allinfo_local`` and, when it
    is not ``None``, ``self.allinfo_remote`` (keys are sorted). Each row has
    one item per entry of ``self.HEADER_SCHEMA``.

    Returns:
        tuple: ``(model, current_index)``. ``model`` is the populated
        ``QStandardItemModel``; ``current_index`` is the row whose joined key
        equals ``self.selected_id``, or ``-1`` when no row matches.
    """
    all_keys = set(self.allinfo_local)
    if self.allinfo_remote is not None:
        all_keys |= set(self.allinfo_remote)

    model = QStandardItemModel(self)
    model.setHorizontalHeaderLabels(self._header_labels)

    # The set of common taxonomy ids does not change while the model is
    # built, so look it up once instead of once per row.
    known_taxids = common_taxids()

    current_index = -1
    for i, file_path in enumerate(sorted(all_keys)):
        data_info = self._parse_info(file_path)
        row = []

        for info_tag, _header_setting in self.HEADER_SCHEMA:
            item = QStandardItem()

            # Unknown tag in JSON: fall back to an empty value.
            data = getattr(data_info, info_tag, '')

            if info_tag == 'islocal':
                # First column indicates cached data sets and carries the
                # parsed info object for later retrieval.
                item.setData(' ' if data else '', Qt.DisplayRole)
                item.setData(data_info, Qt.UserRole)
            else:
                # Show the common name instead of the raw taxonomy id.
                if info_tag == 'taxid' and data in known_taxids:
                    data = shortname(data)[0].title()

                if info_tag == 'tags':
                    data = ', '.join(data) if data else ''

                item.setData(data, Qt.DisplayRole)

            # Set icon to Target column.
            if info_tag == 'target' and data:
                item.setIcon(
                    Orange.widgets.data.owdatasets.variable_icon(data))

            row.append(item)

        model.appendRow(row)

        if os.path.join(*file_path) == self.selected_id:
            current_index = i

    return model, current_index
def test_common_taxonomy(self):
    """Common organisms resolve id -> name, name -> id and have short names."""
    self.assertGreater(len(taxonomy.common_taxids()), 0)

    # Expected values are the organisms' scientific names.
    human_name = 'Homo sapiens'
    dicty_name = 'Dictyostelium discoideum'

    self.assertEqual(taxonomy.name(self.human), human_name)
    self.assertEqual(taxonomy.name(self.dicty), dicty_name)

    self.assertEqual(taxonomy.species_name_to_taxid(human_name), self.human)
    self.assertEqual(taxonomy.species_name_to_taxid(dicty_name), self.dicty)

    self.assertGreater(len(taxonomy.shortname(self.human)), 0)
    self.assertGreater(len(taxonomy.shortname(self.dicty)), 0)
mapper[MAP_LOCUS][gene.locus_tag].append(gene) mapper[MAP_GENE_IDS][gene.gene_id].append(gene) for gene_synonym in gene.synonyms: mapper[MAP_SYNONYMS][gene_synonym].append(gene) for source_id in gene.sources.values(): mapper[MAP_SOURCES][source_id].append(gene) print("Creating gene name mapper ...") con = sqlite3.connect(db_path, timeout=15) cursor = con.cursor() for taxonomy_id in common_taxids(): g_db = GeneInfoDB() gene_mapper = { MAP_GENE_IDS: defaultdict(list), MAP_SOURCES: defaultdict(list), MAP_SYMBOLS: defaultdict(list), MAP_SYNONYMS: defaultdict(list), MAP_LOCUS: defaultdict(list) } for record in g_db.select_gene_matcher_data(taxonomy_id): parse_gene_record(taxonomy_id, gene_mapper, record) with open(os.path.join(domain_path, MATCHER_FILENAME.format(taxonomy_id)), 'wb') as file: pickle.dump(gene_mapper, file, protocol=pickle.HIGHEST_PROTOCOL)
class FileUploadHelper(QDialog):
    """Dialog for adding a user-selected file to a local serverfiles domain.

    The user chooses a domain (plus a hierarchy for gene sets), an organism,
    a title, tags and a file. On accept the file is copied into the domain's
    local folder under a generated name and an info file is created for it.
    """

    # settings
    kegg_domain = 'KEGG'

    # combo box label -> domain identifier
    supported_domains = OrderedDict({
        'Gene Ontology': gene_ontology_domain,
        'Gene Sets': gene_sets_domain
    })

    supported_organisms = [
        common_taxid_to_name(tax_id) for tax_id in common_taxids()
    ]

    # combo box label -> gene set hierarchy
    hierarchies = {
        'GO - Biological Process': ('GO', 'biological_process'),
        'GO - Molecular Function': ('GO', 'molecular_function'),
        'GO - Cellular Component': ('GO', 'cellular_component'),
        'KEGG - Pathways': ('KEGG', 'pathways'),
        'KEGG - Orthologs': ('KEGG', 'orthologs')
    }

    def __init__(self, parent=None):
        """Build the dialog widgets.

        Args:
            parent: optional parent widget. When given, it is notified after
                a successful upload and when the dialog closes.
        """
        super(FileUploadHelper, self).__init__(
            parent,
            Qt.Window | Qt.WindowTitleHint | Qt.CustomizeWindowHint |
            Qt.WindowCloseButtonHint | Qt.WindowMaximizeButtonHint)
        self.setAttribute(Qt.WA_DeleteOnClose)
        self.setWindowTitle('Add new file')

        self.info_state = INFO_FILE_SCHEMA
        self.layout = QVBoxLayout(self)

        # domain selection combobox
        self.domain_selection = QComboBox()
        self.domain_selection.addItems(self.supported_domains.keys())
        self.domain_selection.currentIndexChanged.connect(
            self.__on_domain_selection)
        self.__create_selection_row('Domain: ', self.domain_selection)

        # hierarchy selection combobox (only shown for the gene sets domain)
        self.hierarchy_selection = QComboBox()
        self.hierarchy_selection.addItems(self.hierarchies.keys())
        self.layout.addWidget(self.hierarchy_selection,
                              alignment=Qt.AlignVCenter)
        # sync the hierarchy visibility with the initially selected domain
        self.__on_domain_selection()

        # select organism
        self.organism_selection = QComboBox()
        self.organism_selection.addItems(self.supported_organisms)
        self.__create_selection_row('Organism: ', self.organism_selection)

        # title
        self.line_edit_title = QLineEdit()
        self.__create_selection_row('Title: ', self.line_edit_title)

        # tags
        self.line_edit_tags = QLineEdit()
        self.__create_selection_row('Tags (comma-separated): ',
                                    self.line_edit_tags)

        # file selector
        self.file_info = QLabel()
        self.file_select_btn = QPushButton('Select File', self)
        self.file_select_btn.clicked.connect(self.__handle_file_selector)
        self.__create_selection_row(' ', self.file_select_btn)

        # add file info section
        self.layout.addWidget(self.file_info, alignment=Qt.AlignCenter)
        self.layout.addStretch(1)

        # Ok and Cancel buttons
        self.buttons = QDialogButtonBox(
            QDialogButtonBox.Ok | QDialogButtonBox.Cancel,
            Qt.Horizontal, self)
        self.layout.addWidget(self.buttons, alignment=Qt.AlignJustify)
        self.buttons.accepted.connect(self.__accept)
        self.buttons.rejected.connect(self.__close)

        # path to a selected file
        self.file_path = None

    def __on_domain_selection(self):
        """Show the hierarchy combobox only for the gene sets domain."""
        selected = self.__get_selected_domain() == gene_sets_domain
        self.hierarchy_selection.setVisible(selected)

    def __get_selected_domain(self):
        """Return the domain identifier for the selected domain label."""
        domain_label = list(self.supported_domains.keys())[
            self.domain_selection.currentIndex()]
        return self.supported_domains[domain_label]

    def __get_selected_hier(self):
        """Return the hierarchy tuple for the selected hierarchy label."""
        hier_label = list(
            self.hierarchies.keys())[self.hierarchy_selection.currentIndex()]
        return self.hierarchies[hier_label]

    def __create_selection_row(self, label, widget):
        """Append a text label followed by ``widget`` to the dialog layout."""
        self.layout.addWidget(QLabel(label), alignment=Qt.AlignLeft)
        self.layout.addWidget(widget, alignment=Qt.AlignVCenter)

    def __accept(self):
        """Store the selected file (if any), refresh the parent and close."""
        if self.file_path:
            self.info_state = self.__parse_selection()
            self.__move_to_serverfiles_folder(self.file_path)
            # The dialog may be created without a parent (parent=None).
            parent = self.parent()
            if parent is not None:
                parent.initialize_files_view()
        self.close()

    def __close(self):
        """Close the dialog without storing anything."""
        self.close()

    def closeEvent(self, event):
        """Clear the parent's reference to this dialog when it closes."""
        # clean-up; nothing to clear when the dialog has no parent
        parent = self.parent()
        if parent is not None:
            parent._dialog = None

    def __filename(self, domain, organism):
        """Create a filename based on domain name and organism.

        Returns:
            The generated file name, or ``None`` when the domain is not
            supported or ``organism`` is empty.
        """
        if not organism or domain not in self.supported_domains.values():
            return None
        if domain == gene_ontology_domain:
            return FILENAME_ANNOTATION.format(organism)
        if domain == gene_sets_domain:
            return filename(self.__get_selected_hier(), organism)
        return None

    def __parse_selection(self):
        """Collect the current dialog state as keyword data for the info file.

        Returns:
            dict with the keys ``domain``, ``organism``, ``filename``,
            ``title``, ``tags`` and ``source``.

        Raises:
            KeyError: propagated unchanged from the domain or organism lookup.
        """
        domain = self.__get_selected_domain()
        organism = taxname_to_taxid(
            self.supported_organisms[self.organism_selection.currentIndex()])

        # Strip whitespace around each tag and drop empty entries so that
        # "a, b" yields ['a', 'b'] and an empty field yields [].
        raw_tags = self.line_edit_tags.text().split(',')
        tags = [tag.strip() for tag in raw_tags if tag.strip()]

        return {
            'domain': domain,
            'organism': organism,
            'filename': self.__filename(domain, organism),
            'title': self.line_edit_title.text(),
            'tags': tags,
            'source': SOURCE_USER
        }

    def __move_to_serverfiles_folder(self, selected_file_path):
        """Copy the selected file into the domain folder and create its info file.

        Raises:
            IOError: propagated from ``copyfile`` when the copy fails; the
                info file is not created in that case.
        """
        domain_path = serverfiles.localpath(self.info_state['domain'])
        file_path = os.path.join(domain_path, self.info_state['filename'])
        create_folder(domain_path)

        # TODO: handle copy errors properly (currently they propagate)
        copyfile(selected_file_path, file_path)

        # copy succeeded, create the .info file
        create_info_file(file_path, **self.info_state)

    def __handle_file_selector(self):
        """Ask the user for a file and show its base name in the dialog."""
        self.file_path = QFileDialog.getOpenFileName(self, 'Open File')[0]
        self.file_info.setText('Selected File: {}'.format(
            os.path.basename(self.file_path)))
from orangecontrib.bioinformatics.ncbi.taxonomy.utils import Taxonomy # columns indexes # ftp://ftp.ncbi.nlm.nih.gov/gene/README under "gene_info" section tax_id, gene_id, symbol, synonyms, db_refs, description = 0, 1, 2, 4, 5, 8 locus_tag, chromosome, map_location, type_of_gene, modification_date = 3, 6, 7, 9, 14 symbol_from_nomenclature_authority, full_name_from_nomenclature_authority = 10, 11 nomenclature_status, other_designations = 12, 13 domain_path = sf_local.localpath(DOMAIN) temp_path = os.path.join(domain_path, sf_temp) file_path = os.path.join(domain_path, FILENAME) create_folder(domain_path) create_folder(temp_path) parent_tax_ids = common_taxids() # we must include all strains of organism. Genes refer to specific strain and not to parent organism tax_ids = [] tax_obj = Taxonomy() for parent_id in parent_tax_ids: strains = tax_obj.get_all_strains(parent_id) tax_ids.append(parent_id) # print(parent_id, len(strains)) if strains: [tax_ids.append(strain_id) for strain_id in strains] init_table = """ CREATE TABLE "gene_info" ( tax_id INTEGER NOT NULL, gene_id INTEGER NOT NULL UNIQUE,
# read the information from the local file with open(localfile, 'rb') as f: gds_info, excluded = pickle.load(f, encoding='latin1') f.close() except FileNotFoundError as e: print('{} file on the server not found!'.format(GDS_INFO)) force_update = True # if needed to refresh the data base if force_update: gds_info, excluded = ({}, {}) # list of common organisms may have changed, rescan excluded list excluded = dict([(id, taxid) for id, taxid in excluded.items() if taxid not in taxonomy.common_taxids()]) excluded.update([(id, info["taxid"]) for id, info in gds_info.items() if info["taxid"] not in taxonomy.common_taxids()]) gds_info = dict([(id, info) for id, info in gds_info.items() if info["taxid"] in taxonomy.common_taxids()]) # get the list of GDS files from NCBI directory print("Retrieving ftp directory ...") ftp = ftplib.FTP(FTP_NCBI) ftp.login() ftp.cwd(NCBI_DIR) dirlist = [] ftp.dir(dirlist.append) m = re.compile("GDS[0-9]*") gds_names = [m.search(d).group(0) for d in dirlist if m.search(d)]
'tax_id', 'GeneID', 'GO_ID', 'Evidence', 'Qualifier', 'GO_term' 'PubMed', 'Category' ] data = gene2go.get(species, None) if data is not None: with open(f'data/go/{species}.tab', 'w') as fp: csv_writer = csv.writer(fp, delimiter='\t') csv_writer.writerow(header) csv_writer.writerows(data) def gene_ontology(file_path: str) -> None: copy2(file_path, f'data/go/gene_ontology.obo') if __name__ == "__main__": taxonomy_db = Taxonomy() supported_taxonomies = [[tax] + taxonomy_db.get_all_strains(tax) for tax in common_taxids()] to_species = { tax: taxonomy_db.get_species(tax) for strains in supported_taxonomies for tax in strains } gene2go = load_gene2go(sys.argv[1]) for tax in common_taxids(): gene_annotations(tax) gene_ontology(sys.argv[2])
def setUp(self):
    """Prepare common taxonomy ids, (name, id) pairs and a Taxonomy object."""
    self.common_ids = taxonomy.common_taxids()

    name_id_pairs = []
    for tax_id in self.common_ids:
        name_id_pairs.append((taxonomy.name(tax_id), tax_id))
    self.organisms = name_id_pairs

    self.taxon = taxonomy.Taxonomy()
with bz2.BZ2File(os.path.join(temp_path, FILENAME_ONTOLOGY), mode='w', compresslevel=9) as f_compressed: shutil.copyfileobj(open(os.path.join(domain_path, FILENAME_ONTOLOGY), 'rb'), f_compressed) create_info_file(os.path.join(temp_path, FILENAME_ONTOLOGY), domain=DOMAIN, filename=FILENAME_ONTOLOGY, source=SOURCE_SERVER, title=ONTOLOGY_TITLE, tags=ONTOLOGY_TAGS, uncompressed=db_size, compression='bz2') # GENE ANNOTATIONS tax_ids = common_taxids() taxonomy = Taxonomy() store_lines_by_taxid = defaultdict(list) stream = urlopen(FTP_URL_ANNOTATIONS, timeout=30) with open(os.path.join(domain_path, FTP_FILE_ANNOTATIONS), 'wb') as f: shutil.copyfileobj(stream, f) with gzip.open(os.path.join(domain_path, FTP_FILE_ANNOTATIONS), 'rb') as gene2go: header = gene2go.readline() for line in gene2go: split_line = line.decode().split('\t')
name=str(cell_type), genes=set([str(gene) for gene in genes if gene != '?']), hierarchy=('Marker Genes', file_name_to_hier[file_name]), organism=tax_id, link='') gene_sets.append(gs) for gs_group in GeneSets(gene_sets).split_by_hierarchy(): hierarchy = gs_group.common_hierarchy() gs_group.to_gmt_file_format( f'{data_path}/gene_sets/{filename(hierarchy, tax_id)}') if __name__ == "__main__": for common_tax_id in taxonomy.common_taxids(): reactome_gene_sets(common_tax_id) cytoband_gene_sets(common_tax_id) dicty_mutant_gene_sets(common_tax_id) try: kegg_gene_sets(common_tax_id) except taxonomy.utils.UnknownSpeciesIdentifier as e: # KEGG organism code not found pass try: go_gene_sets(common_tax_id) except FileNotFoundError as e: # Organism is not supported in Gene Ontology module pass