def read_file(self):
    """Parse the OBO file at ``self.in_path`` and return the resulting data frame.

    The file is parsed with ``OboParser.obo_to_df`` using
    ``self.quadruple_list``. The fourth element of every quadruple is taken
    as an expected column name. When the number of parsed columns differs
    from the number of expected columns:

    * in interactive mode the user is asked (GUI or CLI) whether to
      continue, and afterwards every expected column that is absent from the
      data frame is added and filled with ``np.nan``;
    * otherwise the problem is logged as an error and the process exits
      via ``sys.exit`` with the same message.

    Returns:
        The data frame produced by the parser (possibly with the missing
        columns added).
    """
    parser = OboParser()
    with FileReader.open_file(self.in_path) as handle:
        df = parser.obo_to_df(handle, self.quadruple_list)

    parsed_cols = df.columns
    expected_cols = [quadruple[3] for quadruple in self.quadruple_list]

    # Nothing to reconcile when the column counts agree.
    if len(parsed_cols) == len(expected_cols):
        return df

    missing_cols = [col for col in expected_cols if col not in parsed_cols]
    info_string = "Reader %s should parse %s but there are no occurrences in file %s. " % (
        str(self.readerType),
        str(missing_cols),
        self.in_path,
    )

    if not globalConfig.INTERACTIVE_MODE:
        logging.error(info_string)
        sys.exit(info_string)
    else:
        prompt = info_string + "Continue if you do not need these edges in your graph"
        if globConst.GUI_MODE:
            # Imported lazily so the GUI module is only loaded in GUI mode.
            from openbiolink.gui import gui

            gui.askForExit(prompt)
        else:
            Cli.ask_for_exit(prompt)
        # The user chose to continue: add the absent columns as all-NaN.
        for col in missing_cols:
            df[col] = np.nan
    return df
def create_nodes_and_edges(self, edge_metadata, tn=None):
    """Build the node and edge sets of one edge type from its edges file.

    The edges file (``edge_metadata.edges_file_path``) is read as a
    ``;``-delimited CSV. For every row the two raw ids are optionally
    translated through the id mappings and the alternative-id mappings, the
    quality score is compared against the configured cutoff, and an
    ``Edge`` plus its two ``Node`` objects are collected for every
    surviving id pair. If the edge type is not directional but
    ``graphProp.DIRECTED`` is set, the reverse edge is added as well.
    Collected statistics are handed to ``self.print_graph_stats``.

    Args:
        edge_metadata: Description of the edge type. The attributes read here
            are the edges file path, column indices, mapping files and their
            indices, cutoffs, node types, edge type and ``is_directional``.
        tn: Passed through unchanged to ``self.print_graph_stats``.

    Returns:
        A tuple ``(nodes1, nodes2, edges)`` of sets. Three empty sets are
        returned when the edges file or one of the configured mapping files
        does not exist.
    """

    def _report_missing_file(path):
        """Tell the user (or the log) that ``path`` is missing; return empty sets."""
        message = 'File does not exist: %s ! Edgetype %s will not be created' % (
            path,
            str(edge_metadata.edgeType),
        )
        if globalConfig.INTERACTIVE_MODE:
            if globConst.GUI_MODE:
                # Imported lazily so the GUI module is only loaded in GUI mode.
                from openbiolink.gui import gui

                gui.askForExit(message)
            else:
                Cli.ask_for_exit(message)
        else:
            logging.error(message)
        return set(), set(), set()

    if not os.path.isfile(edge_metadata.edges_file_path):
        return _report_missing_file(edge_metadata.edges_file_path)

    # --- mapping ---
    # Verify that every configured mapping file exists *before* loading any
    # of them, and report the path of the mapping file that is actually
    # missing (not the edges file path).
    mapping_files = (
        edge_metadata.mapping1_file,
        edge_metadata.mapping2_file,
        edge_metadata.altid_mapping1_file,
        edge_metadata.altid_mapping2_file,
    )
    for mapping in mapping_files:
        if mapping is None:
            continue
        infile_folder = os.path.join(globConst.WORKING_DIR, gcConst.IN_FILE_FOLDER_NAME)
        mapping_path = os.path.join(infile_folder, mapping)
        if not os.path.isfile(mapping_path):
            return _report_missing_file(mapping_path)

    mapping1 = utils.db_mapping_file_to_dic(
        edge_metadata.mapping1_file,
        edge_metadata.map1_sourceindex,
        edge_metadata.map1_targetindex,
    )
    mapping2 = utils.db_mapping_file_to_dic(
        edge_metadata.mapping2_file,
        edge_metadata.map2_sourceindex,
        edge_metadata.map2_targetindex,
    )
    altid_mapping1 = utils.db_mapping_file_to_dic(
        edge_metadata.altid_mapping1_file,
        edge_metadata.altid_map1_sourceindex,
        edge_metadata.altid_map1_targetindex,
    )
    altid_mapping2 = utils.db_mapping_file_to_dic(
        edge_metadata.altid_mapping2_file,
        edge_metadata.altid_map2_sourceindex,
        edge_metadata.altid_map2_targetindex,
    )

    # --- edges ---
    nodes1 = set()
    nodes2 = set()
    edges = set()
    ids1_no_mapping = set()
    ids2_no_mapping = set()
    ids1 = set()
    ids2 = set()

    nr_edges = 0
    nr_edges_return_dir = 0
    nr_edges_incl_dup = 0
    nr_edges_below_cutoff = 0
    nr_edges_no_mapping = 0

    no_cutoff_defined = edge_metadata.cutoff_num is None and edge_metadata.cutoff_txt is None

    with open(edge_metadata.edges_file_path, "r", encoding="utf8") as edge_content:
        reader = csv.reader(edge_content, delimiter=";")
        for row in reader:
            raw_id1 = row[edge_metadata.colindex1]
            raw_id2 = row[edge_metadata.colindex2]
            if edge_metadata.colindex_qscore is not None:
                qscore = row[edge_metadata.colindex_qscore]
            else:
                qscore = None
            edge_id1 = None
            edge_id2 = None
            ids1.add(raw_id1)
            ids2.add(raw_id2)

            # apply mapping: without a mapping file the raw id is used as is;
            # with a mapping file an unmapped id leaves edge_idX as None.
            if edge_metadata.mapping1_file is not None and raw_id1 in mapping1:
                edge_id1 = mapping1.get(raw_id1)
            elif edge_metadata.mapping1_file is None:
                edge_id1 = [raw_id1]
            if edge_metadata.mapping2_file is not None and raw_id2 in mapping2:
                edge_id2 = mapping2.get(raw_id2)
            elif edge_metadata.mapping2_file is None:
                edge_id2 = [raw_id2]

            if edge_id1 is not None and edge_id2 is not None:
                # mapped successfully: one candidate edge per (id1, id2) pair
                for id1 in edge_id1:
                    # apply alt_id mapping 1
                    if edge_metadata.altid_mapping1_file is not None and id1 in altid_mapping1:
                        id1 = altid_mapping1[id1][0]  # there should only be one
                    for id2 in edge_id2:
                        # apply alt_id mapping 2
                        if edge_metadata.altid_mapping2_file is not None and id2 in altid_mapping2:
                            id2 = altid_mapping2[id2][0]  # there should only be one

                        # check for quality cutoff
                        within_num_cutoff = (
                            edge_metadata.cutoff_num is not None
                            and float(qscore) > edge_metadata.cutoff_num
                        )
                        within_text_cutoff = (
                            edge_metadata.cutoff_txt is not None
                            and qscore not in edge_metadata.cutoff_txt
                        )
                        if no_cutoff_defined or within_num_cutoff or within_text_cutoff:
                            bimeg_id1 = edge_metadata.node1_type.name + '_' + id1
                            bimeg_id2 = edge_metadata.node2_type.name + '_' + id2
                            edges.add(Edge(bimeg_id1, edge_metadata.edgeType, bimeg_id2, None, qscore))
                            # add an edge in the other direction when the edge
                            # is undirectional and the graph is directional
                            if (not edge_metadata.is_directional) and graphProp.DIRECTED:
                                edges.add(Edge(bimeg_id2, edge_metadata.edgeType, bimeg_id1, None, qscore))
                                nr_edges_incl_dup += 1
                                nr_edges_return_dir += 1
                            nodes1.add(Node(bimeg_id1, edge_metadata.node1_type))
                            nodes2.add(Node(bimeg_id2, edge_metadata.node2_type))
                            nr_edges_incl_dup += 1
                        else:
                            nr_edges_below_cutoff += 1
            else:
                # not mapped successfully: remember which raw ids had no mapping
                nr_edges_no_mapping += 1
                if edge_id1 is None and edge_metadata.mapping1_file is not None:
                    ids1_no_mapping.add(raw_id1)
                if edge_id2 is None and edge_metadata.mapping2_file is not None:
                    ids2_no_mapping.add(raw_id2)
            nr_edges += 1

    # ``edges`` is a set, so this is the number of distinct edges kept.
    nr_edges_after_mapping = len(edges)

    if not no_cutoff_defined and nr_edges_below_cutoff == 0:
        logging.warning("No edges of type %s were cut off by quality cutoff, maybe the metric has changed?" % edge_metadata.edgeType.name)
    if nr_edges_after_mapping == 0:
        logging.warning("No edges of type %s are left after mapping and cutoff!" % edge_metadata.edgeType.name)

    # print statistics
    stats_dic = {
        'edge_type': edge_metadata.edgeType,
        'node1_type': edge_metadata.node1_type,
        'node2_type': edge_metadata.node2_type,
        'nr_edges': nr_edges,
        'nr_edges_below_cutoff': nr_edges_below_cutoff,
        'nr_edges_no_mapping': nr_edges_no_mapping,
        'nr_edges_after_mapping': nr_edges_after_mapping,
        'nr_edges_incl_dup': nr_edges_incl_dup,
        'nr_edges_return_dir': nr_edges_return_dir,
        'ids1_no_mapping': ids1_no_mapping,
        'ids2_no_mapping': ids2_no_mapping,
        'ids1': ids1,
        'ids2': ids2,
    }
    self.print_graph_stats(stats_dic, tn)
    return nodes1, nodes2, edges
def init_custom_sources_bottom_up(self, use_db_metdata_classes):
    """Helper ``__init__`` function for custom db_metadata_classes.

    Restricts the instance to the given DB metadata and then removes, step by
    step, everything that depends on a removed item: file readers (by
    ``dbType``), file processors (by ``readerType``), infile metadata (by
    ``infileType``) and edge metadata (by
    ``EDGE_INMETA_CLASS.INFILE_TYPE``). Finally, edges whose mapping
    dependencies were removed are dropped as well, after warning the user.

    Args:
        use_db_metdata_classes: Iterable of DB metadata classes or instances.
            Classes are instantiated without arguments; instances are used
            as they are.

    Raises:
        SystemExit: In non-GUI, non-interactive mode when edges have to be
            removed because of deleted mapping dependencies.
    """
    # make sure to use instances of classes
    self.db_file_metadata = [
        x() if inspect.isclass(x) else x for x in use_db_metdata_classes
    ]

    # remove readers
    keep_dbType = [x.dbType for x in self.db_file_metadata]
    logging.info("readers removed: " + str(
        [x.__class__.__name__ for x in self.file_readers if x.dbType not in keep_dbType]
    ))
    self.file_readers = [x for x in self.file_readers if x.dbType in keep_dbType]
    self.dbType_reader_map = utils.cls_list_to_dic(self.file_readers, "dbType")

    # remove processors
    keep_readerType = [x.readerType for x in self.file_readers]
    logging.info("processors removed: %s" % (str(
        [x.__class__.__name__ for x in self.file_processors if x.readerType not in keep_readerType]
    )))
    self.file_processors = [x for x in self.file_processors if x.readerType in keep_readerType]
    self.readerType_processor_map = utils.cls_list_to_dic(self.file_processors, "readerType")

    # remove infile metadata
    keep_infileType = [x.infileType for x in self.file_processors]
    # This step removes infile metadata, so label the log line accordingly
    # (it used to repeat the "processors removed" label of the step above).
    logging.info("infile metadata removed: " + str(
        [x.__class__.__name__ for x in self.infile_metadata if x.infileType not in keep_infileType]
    ))
    self.infile_metadata = [x for x in self.infile_metadata if x.infileType in keep_infileType]
    self.infileType_inMetadata_map = {x.infileType: x for x in self.infile_metadata}

    # remove edge metadata
    logging.info("edges removed: " + str([
        x.__class__.__name__
        for x in self.edge_metadata + self.tn_edge_metadata
        if x.EDGE_INMETA_CLASS.INFILE_TYPE not in keep_infileType
    ]))
    self.edge_metadata = [
        x for x in self.edge_metadata
        if x.EDGE_INMETA_CLASS.INFILE_TYPE in keep_infileType
    ]
    self.tn_edge_metadata = [
        x for x in self.tn_edge_metadata
        if x.EDGE_INMETA_CLASS.INFILE_TYPE in keep_infileType
    ]

    # check for deleted dependencies of mappings
    additional_remove_metaEdges = []
    additional_remove_mapping_infileType = []
    for metaEdge in self.edge_metadata + self.tn_edge_metadata:
        mappings = [
            metaEdge.MAP1_META_CLASS,
            metaEdge.MAP2_META_CLASS,
            metaEdge.MAP1_ALT_ID_META_CLASS,
            metaEdge.MAP2_ALT_ID_META_CLASS,
        ]
        for mapping in mappings:
            if mapping is not None and mapping.INFILE_TYPE not in keep_infileType:
                # Record each edge / infile type once, so an edge with several
                # missing mappings is not listed repeatedly in the warning.
                if metaEdge not in additional_remove_metaEdges:
                    additional_remove_metaEdges.append(metaEdge)
                if mapping.INFILE_TYPE not in additional_remove_mapping_infileType:
                    additional_remove_mapping_infileType.append(mapping.INFILE_TYPE)

    if additional_remove_metaEdges:
        message = (
            "\nDue to manual exclusion of DB resources, also the edges: %s\n "
            "will be removed due to deleted dependencies of used mappings (i.e. %s)\n "
            "Consider manually excluding edges instead of DB resources."
            % (
                str([x.__class__.__name__ for x in additional_remove_metaEdges]),
                str([str(x) for x in additional_remove_mapping_infileType]),
            )
        )
        logging.warning(message)

        if globConst.GUI_MODE:
            # Imported lazily so the GUI module is only loaded in GUI mode.
            from openbiolink.gui import gui

            gui.askForExit(message)
        elif globConst.INTERACTIVE_MODE:
            Cli.ask_for_exit(message)
        else:
            # NOTE(review): a bare sys.exit() terminates with exit status 0;
            # confirm whether a non-zero status is wanted for this abort.
            sys.exit()

        self.edge_metadata = [
            x for x in self.edge_metadata if x not in additional_remove_metaEdges
        ]
        self.tn_edge_metadata = [
            x for x in self.tn_edge_metadata if x not in additional_remove_metaEdges
        ]