def prepare_go_serverfiles():
    """Copy the GO data files into ``serverfiles/go/`` and convert them.

    Every file returned by ``list_files('data/go/')`` is copied into
    ``serverfiles/go/`` and wrapped in a ``ServerFile`` whose
    ``to_server_format()`` is then invoked.  The file named
    ``gene_ontology.obo`` receives the ontology metadata; any other file is
    handled as an annotation file whose name, with ``.tab`` removed, is used
    as the taxonomy id.
    """
    Path('serverfiles/go/').mkdir(parents=True, exist_ok=True)

    for entry in list_files('data/go/'):
        name = entry.name

        if name != 'gene_ontology.obo':
            # annotation file: the taxonomy id is encoded in the file name
            taxid = name.replace('.tab', '')
            # taxonomy_db.get_entry(tax_id).name
            organism = common_taxid_to_name(taxid)
            metadata = {
                'title': f'GO Annotations for {organism}',
                'tags': ['gene', 'annotation', 'ontology', 'GO', taxid, organism],
                'description': 'The gene association file ingested from GO Consortium members.',
            }
        else:
            metadata = {
                'title': 'Gene Ontology (GO)',
                'tags': ['gene', 'ontology', 'GO'],
                'description': (
                    'Basic version of the GO, filtered such that the graph is guaranteed to be '
                    'acyclic and annotations can be propagated up the graph.'
                ),
            }

        target = f'serverfiles/go/{name}'
        shutil.copy2(str(entry.absolute()), target)

        # prepare serverfiles
        ServerFile(domain='go', file_path=target, **metadata).to_server_format()
def gene_info_to_dict(gene_data: tuple):
    """Convert one gene record tuple into a dictionary.

    Columns are read from ``gene_data`` through the index names defined in
    the enclosing scope (``tax_id``, ``gene_id``, ``symbol``, ...).  Columns
    that hold the placeholder ``'-'`` are reported as ``None``.  The homology
    group is looked up in ``homologs`` by the stringified gene id, and
    ``homologs`` in the result maps the tax id of every other member of that
    group to its gene id.
    """

    def or_none(column):
        # '-' marks a missing value in the record
        value = gene_data[column]
        return value if value != '-' else None

    own_tax = gene_data[tax_id]
    group_id = homologs.get(str(gene_data[gene_id]), [None])[0]

    # genes from the same homology group that belong to a different organism
    related = {}
    for _, other_tax, other_gene in homologs_by_group.get(group_id, []):
        if group_id and other_tax != own_tax:
            related[other_tax] = other_gene

    return {
        'species': common_taxid_to_name(to_species[gene_data[tax_id]]),
        'tax_id': gene_data[tax_id],
        'gene_id': gene_data[gene_id],
        'symbol': gene_data[symbol],
        'synonyms': pipe_delimited_to_list(gene_data[synonyms]),
        'db_refs': parse_db_refs(gene_data[db_refs]),
        'description': or_none(description),
        'locus_tag': or_none(locus_tag),
        'chromosome': or_none(chromosome),
        'map_location': or_none(map_location),
        'type_of_gene': or_none(type_of_gene),
        'symbol_from_nomenclature_authority': or_none(symbol_from_nomenclature_authority),
        'full_name_from_nomenclature_authority': or_none(full_name_from_nomenclature_authority),
        'nomenclature_status': or_none(nomenclature_status),
        'other_designations': pipe_delimited_to_list(gene_data[other_designations]),
        'modification_date': gene_data[modification_date],
        'homology_group_id': group_id,
        'homologs': related,
    }
def prepare_gene_sets_serverfiles():
    """Copy the gene set files into ``serverfiles/gene_sets/`` and convert them.

    Each file returned by ``list_files('data/gene_sets/')`` is copied first;
    its name is then parsed with ``filename_parse`` into a hierarchy and a
    taxonomy id, which are used to build the title and the tags of the
    ``ServerFile`` that is finally converted with ``to_server_format()``.
    """
    target_dir = 'serverfiles/gene_sets/'
    Path(target_dir).mkdir(parents=True, exist_ok=True)

    for entry in list_files('data/gene_sets/'):
        destination = f'{target_dir}{entry.name}'
        shutil.copy2(str(entry.absolute()), destination)

        hierarchy, tax_id = filename_parse(entry.name)

        title = 'Gene sets: {} ({})'.format(', '.join(hierarchy), common_taxid_to_name(tax_id))
        tags = [*hierarchy, 'gene sets', common_taxid_to_name(tax_id)] + shortname(tax_id)

        converted = ServerFile(
            domain='gene_sets',
            file_path=destination,
            title=title,
            description='',
            tags=tags,
        )
        converted.to_server_format()
# Locations of the uncompressed matcher file and of its bz2 copy.
matcher_filename = MATCHER_FILENAME.format(taxonomy_id)
matcher_path = os.path.join(domain_path, matcher_filename)
compressed_path = os.path.join(temp_path, matcher_filename)

# The size before compression is stored in the info file.
uncompressed_size = os.stat(matcher_path).st_size

# Both files are opened through context managers so the source handle is
# closed as soon as the copy finishes (it was previously left to the
# garbage collector).
with bz2.BZ2File(compressed_path, mode='w', compresslevel=9) as f, \
        open(matcher_path, "rb") as source:
    shutil.copyfileobj(source, f)

create_info_file(compressed_path,
                 title=MATCHER_TITLE + ' for ' + common_taxid_to_name(taxonomy_id),
                 tags=MATCHER_TAGS + [taxonomy_id],
                 uncompressed=uncompressed_size,
                 compression='bz2')

con.close()

helper = SyncHelper(DOMAIN, GeneInfo)

# sync files with remote server
helper.run_tests()
helper.sync_files()

helper.remove_update_folder()
class FileUploadHelper(QDialog):
    """Dialog that registers a user-selected file in the local serverfiles.

    The user chooses a domain (plus a hierarchy when the gene sets domain is
    selected), an organism, a title, tags and a file.  When the dialog is
    accepted with a file selected, the file is copied into the local folder
    of the chosen domain and ``create_info_file`` is called for the copy.
    The parent is expected to provide ``initialize_files_view()`` and a
    ``_dialog`` attribute, both of which are used here.
    """

    # settings
    kegg_domain = 'KEGG'

    # combo box label -> serverfiles domain
    supported_domains = OrderedDict({
        'Gene Ontology': gene_ontology_domain,
        'Gene Sets': gene_sets_domain
    })

    supported_organisms = [
        common_taxid_to_name(tax_id) for tax_id in common_taxids()
    ]

    # combo box label -> hierarchy tuple handed to ``filename``
    hierarchies = {
        'GO - Biological Process': ('GO', 'biological_process'),
        'GO - Molecular Function': ('GO', 'molecular_function'),
        'GO - Cellular Component': ('GO', 'cellular_component'),
        'KEGG - Pathways': ('KEGG', 'pathways'),
        'KEGG - Orthologs': ('KEGG', 'orthologs')
    }

    def __init__(self, parent=None):
        super(FileUploadHelper, self).__init__(
            parent, Qt.Window | Qt.WindowTitleHint | Qt.CustomizeWindowHint
            | Qt.WindowCloseButtonHint | Qt.WindowMaximizeButtonHint)
        self.setAttribute(Qt.WA_DeleteOnClose)
        self.setWindowTitle('Add new file')

        self.info_state = INFO_FILE_SCHEMA
        self.layout = QVBoxLayout(self)

        # domain selection combobox
        self.domain_selection = QComboBox()
        self.domain_selection.addItems(self.supported_domains.keys())
        self.domain_selection.currentIndexChanged.connect(
            self.__on_domain_selection)
        self.__create_selection_row('Domain: ', self.domain_selection)

        # hierarchy selection combobox
        self.hierarchy_selection = QComboBox()
        self.hierarchy_selection.addItems(self.hierarchies.keys())
        self.layout.addWidget(self.hierarchy_selection,
                              alignment=Qt.AlignVCenter)
        # set the initial visibility of the hierarchy combobox
        self.__on_domain_selection()

        # select organism
        self.organism_selection = QComboBox()
        self.organism_selection.addItems(self.supported_organisms)
        self.__create_selection_row('Organism: ', self.organism_selection)

        # title
        self.line_edit_title = QLineEdit()
        self.__create_selection_row('Title: ', self.line_edit_title)

        # tags
        self.line_edit_tags = QLineEdit()
        self.__create_selection_row('Tags (comma-separated): ',
                                    self.line_edit_tags)

        # file selector
        self.file_info = QLabel()
        self.file_select_btn = QPushButton('Select File', self)
        self.file_select_btn.clicked.connect(self.__handle_file_selector)
        self.__create_selection_row(' ', self.file_select_btn)

        # add file info section
        self.layout.addWidget(self.file_info, alignment=Qt.AlignCenter)

        self.layout.addStretch(1)

        # Ok and Cancel buttons
        self.buttons = QDialogButtonBox(
            QDialogButtonBox.Ok | QDialogButtonBox.Cancel, Qt.Horizontal, self)
        self.layout.addWidget(self.buttons, alignment=Qt.AlignJustify)

        self.buttons.accepted.connect(self.__accept)
        self.buttons.rejected.connect(self.__close)

        # path to a selected file
        self.file_path = None

    def __on_domain_selection(self):
        """Show the hierarchy combobox only for the gene sets domain."""
        selected = self.__get_selected_domain() == gene_sets_domain
        self.hierarchy_selection.setVisible(selected)

    def __get_selected_domain(self):
        """Return the domain that belongs to the current combobox entry."""
        domain_label = list(self.supported_domains.keys())[
            self.domain_selection.currentIndex()]
        return self.supported_domains[domain_label]

    def __get_selected_hier(self):
        """Return the hierarchy tuple of the current combobox entry."""
        hier_label = list(
            self.hierarchies.keys())[self.hierarchy_selection.currentIndex()]
        return self.hierarchies[hier_label]

    def __create_selection_row(self, label, widget):
        """Append a text label followed by ``widget`` to the dialog layout."""
        self.layout.addWidget(QLabel(label), alignment=Qt.AlignLeft)
        self.layout.addWidget(widget, alignment=Qt.AlignVCenter)

    def __accept(self):
        """Store the selected file, refresh the parent view and close."""
        if self.file_path:
            self.info_state = self.__parse_selection()
            self.__move_to_serverfiles_folder(self.file_path)

            self.parent().initialize_files_view()
            self.close()

    def __close(self):
        self.close()

    def closeEvent(self, event):
        # clean-up
        self.parent()._dialog = None

    def __filename(self, domain, organism):
        """
        Create filename based on domain name and organism.

        Returns ``None`` when ``organism`` is empty or ``domain`` is not one
        of the supported domains.
        """
        if not organism or domain not in self.supported_domains.values():
            return None

        if domain == gene_ontology_domain:
            return FILENAME_ANNOTATION.format(organism)

        if domain == gene_sets_domain:
            return filename((self.__get_selected_hier()), organism)

        return None

    @staticmethod
    def __parse_tags(text):
        """Split comma-separated ``text`` into stripped, non-empty tags."""
        stripped = (tag.strip() for tag in text.split(','))
        return [tag for tag in stripped if tag]

    def __parse_selection(self):
        """Collect the current dialog state as keyword arguments for the
        info file.

        Exceptions raised while resolving the domain or the organism
        propagate to the caller unchanged.
        """
        domain = self.__get_selected_domain()
        organism = taxname_to_taxid(self.supported_organisms[
            self.organism_selection.currentIndex()])

        return {
            'domain': domain,
            'organism': organism,
            'filename': self.__filename(domain, organism),
            'title': self.line_edit_title.text(),
            # an empty field yields [] rather than ['']
            'tags': self.__parse_tags(self.line_edit_tags.text()),
            'source': SOURCE_USER
        }

    def __move_to_serverfiles_folder(self, selected_file_path):
        """Copy the selected file into the domain folder and describe it.

        An ``IOError`` raised by the copy propagates to the caller; the info
        file is only created after a successful copy.
        """
        domain_path = serverfiles.localpath(self.info_state['domain'])
        file_path = os.path.join(domain_path, self.info_state['filename'])
        create_folder(domain_path)

        # TODO: handle error properly
        copyfile(selected_file_path, file_path)

        # if copy successful create .info file
        create_info_file(file_path, **self.info_state)

    def __handle_file_selector(self):
        """Ask for a file and remember it; a cancelled dialog changes
        nothing."""
        selected_path = QFileDialog.getOpenFileName(self, 'Open File')[0]
        if not selected_path:
            # cancelled: keep the previously selected file, if any
            return

        self.file_path = selected_path
        self.file_info.setText('Selected File: {}'.format(
            os.path.basename(self.file_path)))
def __init__(self, parent=None):
    """Initialise the widget state and build its GUI.

    Sets the data/ontology/annotation attributes to their empty defaults,
    creates the "Input", "Filter" and "Select" control tabs, the tree view
    and the table of significant terms in the main area, and finally
    collects the available annotation files (remote and local) into
    ``self.available_annotations``.
    """
    # NOTE(review): `self` is passed explicitly here in addition to the
    # implicit binding done by super() - confirm the base class accepts the
    # extra positional argument.
    super().__init__(self, parent)

    self.input_data = None
    self.ref_data = None
    self.ontology = None
    self.annotations = None
    self.loaded_annotation_code = None
    self.treeStructRootKey = None
    self.probFunctions = [statistics.Binomial(), statistics.Hypergeometric()]
    self.selectedTerms = []

    self.selectionChanging = 0
    self.__state = State.Ready
    # single-shot timer whose timeout triggers self.__update
    self.__scheduletimer = QTimer(self, singleShot=True)
    self.__scheduletimer.timeout.connect(self.__update)

    #############
    # GUI
    #############
    self.tabs = gui.tabWidget(self.controlArea)
    # Input tab
    self.inputTab = gui.createTabPage(self.tabs, "Input")
    box = gui.widgetBox(self.inputTab, "Info")
    self.infoLabel = gui.widgetLabel(box, "No data on input\n")

    gui.button(box, self, "Ontology/Annotation Info",
               callback=self.ShowInfo,
               tooltip="Show information on loaded ontology and annotations")

    self.referenceRadioBox = gui.radioButtonsInBox(
        self.inputTab, self, "useReferenceDataset",
        ["Entire genome", "Reference set (input)"],
        tooltips=["Use entire genome for reference",
                  "Use genes from Referece Examples input signal as reference"],
        box="Reference",
        callback=self.__invalidate)

    # the "Reference set (input)" option starts out disabled
    self.referenceRadioBox.buttons[1].setDisabled(True)
    gui.radioButtonsInBox(
        self.inputTab, self, "aspectIndex",
        ["Biological process", "Cellular component", "Molecular function"],
        box="Aspect",
        callback=self.__invalidate)

    # Filter tab
    self.filterTab = gui.createTabPage(self.tabs, "Filter")
    box = gui.widgetBox(self.filterTab, "Filter GO Term Nodes")
    gui.checkBox(box, self, "filterByNumOfInstances", "Genes",
                 callback=self.FilterAndDisplayGraph,
                 tooltip="Filter by number of input genes mapped to a term")
    ibox = gui.indentedBox(box)
    gui.spin(ibox, self, 'minNumOfInstances', 1, 100,
             step=1, label='#:', labelWidth=15,
             callback=self.FilterAndDisplayGraph,
             callbackOnReturn=True,
             tooltip="Min. number of input genes mapped to a term")

    gui.checkBox(box, self, "filterByPValue_nofdr", "p-value",
                 callback=self.FilterAndDisplayGraph,
                 tooltip="Filter by term p-value")

    gui.doubleSpin(gui.indentedBox(box), self, 'maxPValue_nofdr', 1e-8, 1,
                   step=1e-8, label='p:', labelWidth=15,
                   callback=self.FilterAndDisplayGraph,
                   callbackOnReturn=True,
                   tooltip="Max term p-value")

    # use filterByPValue for FDR, as it was the default in prior versions
    gui.checkBox(box, self, "filterByPValue", "FDR",
                 callback=self.FilterAndDisplayGraph,
                 tooltip="Filter by term FDR")
    gui.doubleSpin(gui.indentedBox(box), self, 'maxPValue', 1e-8, 1,
                   step=1e-8, label='p:', labelWidth=15,
                   callback=self.FilterAndDisplayGraph,
                   callbackOnReturn=True,
                   tooltip="Max term p-value")

    box = gui.widgetBox(box, "Significance test")

    gui.radioButtonsInBox(box, self, "probFunc", ["Binomial", "Hypergeometric"],
                          tooltips=["Use binomial distribution test",
                                    "Use hypergeometric distribution test"],
                          callback=self.__invalidate)  # TODO: only update the p values
    box = gui.widgetBox(self.filterTab, "Evidence codes in annotation",
                        addSpace=True)
    # one check box per evidence type, kept in a dict keyed by the type
    self.evidenceCheckBoxDict = {}
    for etype in go.evidenceTypesOrdered:
        ecb = QCheckBox(
            etype, toolTip=go.evidenceTypes[etype],
            checked=self.useEvidenceType[etype])
        ecb.toggled.connect(self.__on_evidenceChanged)
        box.layout().addWidget(ecb)
        self.evidenceCheckBoxDict[etype] = ecb

    # Select tab
    self.selectTab = gui.createTabPage(self.tabs, "Select")
    box = gui.radioButtonsInBox(
        self.selectTab, self, "selectionDirectAnnotation",
        ["Directly or Indirectly", "Directly"],
        box="Annotated genes",
        callback=self.ExampleSelection)

    box = gui.widgetBox(self.selectTab, "Output", addSpace=True)
    gui.radioButtonsInBox(
        box, self, "selectionDisjoint",
        btnLabels=["All selected genes",
                   "Term-specific genes",
                   "Common term genes"],
        tooltips=["Outputs genes annotated to all selected GO terms",
                  "Outputs genes that appear in only one of selected GO terms",
                  "Outputs genes common to all selected GO terms"],
        callback=self.ExampleSelection)

    # ListView for DAG, and table for significant GOIDs
    self.DAGcolumns = ['GO term', 'Cluster', 'Reference', 'p-value',
                       'FDR', 'Genes', 'Enrichment']

    self.splitter = QSplitter(Qt.Vertical, self.mainArea)
    self.mainArea.layout().addWidget(self.splitter)

    # list view
    self.listView = GOTreeWidget(self.splitter)
    self.listView.setSelectionMode(QTreeView.ExtendedSelection)
    self.listView.setAllColumnsShowFocus(1)
    self.listView.setColumnCount(len(self.DAGcolumns))
    self.listView.setHeaderLabels(self.DAGcolumns)

    self.listView.header().setSectionsClickable(True)
    self.listView.header().setSortIndicatorShown(True)
    # initial sort indicator: the 'p-value' column, ascending
    self.listView.header().setSortIndicator(self.DAGcolumns.index('p-value'), Qt.AscendingOrder)
    self.listView.setSortingEnabled(True)
    # column 6 is 'Enrichment' in self.DAGcolumns
    self.listView.setItemDelegateForColumn(
        6, EnrichmentColumnItemDelegate(self))
    self.listView.setRootIsDecorated(True)

    self.listView.itemSelectionChanged.connect(self.ViewSelectionChanged)

    # table of significant GO terms
    self.sigTerms = QTreeWidget(self.splitter)
    self.sigTerms.setColumnCount(len(self.DAGcolumns))
    self.sigTerms.setHeaderLabels(self.DAGcolumns)
    self.sigTerms.setSortingEnabled(True)
    self.sigTerms.setSelectionMode(QTreeView.ExtendedSelection)
    self.sigTerms.header().setSortIndicator(self.DAGcolumns.index('p-value'), Qt.AscendingOrder)
    # column 6 is 'Enrichment' in self.DAGcolumns
    self.sigTerms.setItemDelegateForColumn(
        6, EnrichmentColumnItemDelegate(self))

    self.sigTerms.itemSelectionChanged.connect(self.TableSelectionChanged)

    self.sigTableTermsSorted = []
    self.graph = {}
    self.originalGraph = None

    # trailing stretch on every control tab
    self.inputTab.layout().addStretch(1)
    self.filterTab.layout().addStretch(1)
    self.selectTab.layout().addStretch(1)

    # Simple record describing one annotation file.
    class AnnotationSlot(SimpleNamespace):
        taxid = ...  # type: str
        name = ...  # type: str
        filename = ...  # type:str

        @staticmethod
        def parse_tax_id(f_name):
            # the tax id is the second dot-separated part of the file name
            return f_name.split('.')[1]

    try:
        remote_files = serverfiles.ServerFiles().listfiles(DOMAIN)
    except (ConnectTimeout, RequestException, ConnectionError):
        # TODO: Warn user about failed connection to the remote server
        remote_files = []

    # Union of remote and local listings; each entry unpacks into two items
    # of which the second is the file name. The ontology file is excluded.
    self.available_annotations = [
        AnnotationSlot(
            taxid=AnnotationSlot.parse_tax_id(annotation_file),
            name=taxonomy.common_taxid_to_name(AnnotationSlot.parse_tax_id(annotation_file)),
            filename=FILENAME_ANNOTATION.format(AnnotationSlot.parse_tax_id(annotation_file))
        )
        for _, annotation_file in set(remote_files + serverfiles.listfiles(DOMAIN))
        if annotation_file != FILENAME_ONTOLOGY
    ]
    self._executor = ThreadExecutor()
# Feed every gene matcher record of this organism into the mapper.
for record in g_db.select_gene_matcher_data(taxonomy_id):
    parse_gene_record(taxonomy_id, gene_mapper, record)

# Locations of the uncompressed matcher file and of its bz2 copy.
matcher_filename = MATCHER_FILENAME.format(taxonomy_id)
matcher_path = os.path.join(domain_path, matcher_filename)
compressed_path = os.path.join(temp_path, matcher_filename)

with open(matcher_path, 'wb') as pickle_file:
    pickle.dump(gene_mapper, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

# The size before compression is stored in the info file.
uncompressed_size = os.stat(matcher_path).st_size

# Both files are opened through context managers so the source handle is
# closed as soon as the copy finishes (it was previously left to the
# garbage collector).
with bz2.BZ2File(compressed_path, mode='w', compresslevel=9) as f, \
        open(matcher_path, "rb") as source:
    shutil.copyfileobj(source, f)

create_info_file(compressed_path,
                 domain=DOMAIN,
                 filename=matcher_filename,
                 source=SOURCE_SERVER,
                 title=MATCHER_TITLE + ' for ' + common_taxid_to_name(taxonomy_id),
                 tags=MATCHER_TAGS + [taxonomy_id],
                 uncompressed=uncompressed_size,
                 compression='bz2')

con.close()

helper = SyncHelper(DOMAIN, GeneInfo)

# sync files with remote server
helper.run_tests()
helper.sync_files()

helper.remove_update_folder()
def set_data(self, data: Table) -> None:
    """Accept a new input table, validate its annotations and commit.

    The table must carry the ``TableAnnotation`` attributes this widget
    reads: the genes-as-attribute-names flag, the tax id, and either the
    gene id attribute (genes in columns) or a gene id column that is part
    of the domain (genes in rows).  When one is missing the matching
    warning is raised, ``self.data`` is reset to ``None`` and the method
    returns.  For an empty input the summaries and labels are reset and
    ``None`` is sent to the ``genes`` output.

    With valid data the target organism selector is repopulated without the
    source organism, the previously selected organism is restored when it
    is still listed in ``self.taxonomy_names``, the info labels are updated
    and ``self.commit()`` is called.
    """
    self.Warning.clear()
    self.data = data

    if self.data:
        attrs = self.data.attributes

        if TableAnnotation.gene_as_attr_name not in attrs:
            self.Warning.mising_gene_as_attribute_name()
            self.data = None
            return

        if attrs[TableAnnotation.gene_as_attr_name]:
            if TableAnnotation.gene_id_attribute not in attrs:
                self.Warning.mising_gene_id_attribute()
                self.data = None
                return
            # The tax id is read unconditionally further down, so it has to
            # be validated on this branch as well; otherwise a table without
            # it fails with a KeyError instead of a warning.
            if TableAnnotation.tax_id not in attrs:
                self.Warning.missing_tax_id()
                self.data = None
                return
        else:
            if TableAnnotation.tax_id not in attrs:
                self.Warning.missing_tax_id()
                self.data = None
                return
            if TableAnnotation.gene_id_column not in attrs:
                # NOTE(review): this reuses the genes-as-attribute-names
                # warning for a missing gene id column - confirm intended.
                self.Warning.mising_gene_as_attribute_name()
                self.data = None
                return
            if attrs[TableAnnotation.gene_id_column] not in self.data.domain:
                self.Warning.missing_gene_id()
                self.data = None
                return
    else:
        self.info.set_input_summary("0")
        self.info.set_output_summary("0")
        self.info_gene.clear()
        self.info_gene_type.setText("No data on input.")
        self.Outputs.genes.send(None)
        return

    self.source_tax = data.attributes[TableAnnotation.tax_id]
    taxonomy = common_taxid_to_name(self.source_tax)

    # offer every known organism except the one the data comes from
    self.target_organism.clear()
    self.target_organism.addItems([
        tax_name for tax_name in self.taxonomy_names if tax_name != taxonomy
    ])

    if taxonomy == self.selected_organism:
        # the stored selection equals the source organism: fall back to the
        # first known organism
        self.combo_box_id = -1
        self.selected_organism = self.taxonomy_names[0]
        self.target_tax = species_name_to_taxid(self.selected_organism)
    else:
        try:
            self.combo_box_id = self.taxonomy_names.index(
                self.selected_organism)
        except ValueError:
            # the stored selection is not a known organism
            self.combo_box_id = -1

        if self.combo_box_id != -1:
            self.target_organism.setCurrentIndex(self.combo_box_id)
            self.selected_organism = self.taxonomy_names[self.combo_box_id]
            self.target_tax = species_name_to_taxid(self.selected_organism)
        else:
            self.target_organism.setCurrentIndex(0)
            self.selected_organism = self.taxonomy_names[0]
            self.target_tax = species_name_to_taxid(self.selected_organism)

    self.info_gene_type.setText(f"Organism: {taxonomy}")

    # genes are counted as columns when they are stored as attribute names,
    # as rows otherwise
    data_len = (len(data.domain.attributes)
                if self.data.attributes[TableAnnotation.gene_as_attr_name]
                else len(data))
    self.info_gene.setText(f"Number of genes: {data_len}")
    self.info.set_input_summary(f"{data_len}")

    self.commit()
for line in gene2go: split_line = line.decode().split('\t') if split_line[0] in tax_ids: store_lines_by_taxid[split_line[0]].append(line) #else: #parent = taxonomy.parent(split_line[0]) #if parent in tax_ids: #store_lines_by_taxid[parent].append(line) for org, lines in store_lines_by_taxid.items(): filename = FILENAME_ANNOTATION.format(org) FILE_PATH = os.path.join(domain_path, filename) TITLE = "GO Annotations for " + common_taxid_to_name(org) TAGS = ["gene", "annotation", "ontology", "GO", org] with open(FILE_PATH, 'wb') as f: f.write(header) f.writelines(lines) db_size = os.stat(FILE_PATH).st_size # store uncompressed database size with bz2.BZ2File(os.path.join(temp_path, filename), mode='w', compresslevel=9) as f_compressed: shutil.copyfileobj(open(os.path.join(domain_path, filename), 'rb'), f_compressed) create_info_file(os.path.join(temp_path, filename), domain=DOMAIN, filename=filename, source=SOURCE_SERVER,