class MixedFoodCatalog(Catalog):
    """Catalog of food names read from a plain-text file, one entry per line."""

    def __init__(self, mixed_tidy_food_file_path):
        """Store the path of the food file; nothing is loaded until ``initialize``."""
        super().__init__()
        self.mixed_tidy_food_file_path = mixed_tidy_food_file_path
        self.__hash_tree = None
        self.__food = None

    def __str__(self):
        return "mixed food catalog"

    def get_list(self):
        """Return the list of lines loaded by ``initialize`` (``None`` before that)."""
        return self.__food

    def initialize(self):
        """Read the food file, build the search tree over its lines and log the timing."""
        started = time()
        logger.info('Creating mixed food catalog...')
        with open(self.mixed_tidy_food_file_path) as source:
            entries = source.read().splitlines()
            self.__hash_tree = HashTree(entries)
            self.__food = entries
        finished = time()
        logger.info('Done creating mixed food catalog. Total time: %.2f sec.' % (finished - started))

    def find(self, sentence_text):
        """Search ``sentence_text`` with the hash tree and wrap every hit as a food entity.

        All entities get the placeholder group ``'nogroup'`` and the ``FOOD_TAG`` tag.
        """
        hits = self.__hash_tree.search(sentence_text)
        found = [Entity(hit, 'nogroup', FOOD_TAG) for hit in hits]
        return EntityCollection(found, FOOD_TAG)
class PrebioticsCatalog(Catalog):
    """Catalog of prebiotic names read from a plain-text file, one entry per line."""

    def get_list(self):
        """Return the list of lines loaded by ``initialize`` (``None`` before that).

        Previously this was an empty stub that always returned ``None`` even
        though ``initialize`` stores the loaded names; it now exposes them.
        """
        return self.__prebiotics

    def __init__(self, tidy_csv_path):
        """Store the path of the prebiotics file; nothing is loaded until ``initialize``."""
        super().__init__()
        self.tidy_csv_path = tidy_csv_path
        self.__hash_tree = None
        self.__prebiotics = None

    def initialize(self):
        """Read the prebiotics file, build the search tree over its lines and log the timing."""
        t1 = time()
        logger.info('Creating prebiotics catalog...')
        with open(self.tidy_csv_path) as file:
            prebiotics = file.read().splitlines()
        self.__hash_tree = HashTree(prebiotics)
        self.__prebiotics = prebiotics
        t2 = time()
        logger.info('Done creating prebiotics catalog. Total time: %.2f sec.' % (t2 - t1))

    def find(self, sentence_text):
        """Search ``sentence_text`` with the hash tree and wrap every hit as a prebiotic entity.

        All entities get the placeholder id ``'noid'`` and the ``PREBIOTIC_TAG`` tag.
        """
        prebiotic_names = self.__hash_tree.search(sentence_text)
        entities = EntityCollection(
            [Entity(name, 'noid', PREBIOTIC_TAG) for name in prebiotic_names],
            PREBIOTIC_TAG)
        return entities
class UsdaFoodCatalog(Catalog):
    """Catalog of food names and their groups, loaded from a comma-separated table.

    The table must provide the columns ``group`` and ``name``.
    """

    def __init__(self, food_file_path):
        """Store the path of the food table; nothing is loaded until ``initialize``."""
        super().__init__()
        self.food_file_path = food_file_path
        self.__hash_tree = None
        self.__food_dict = None
        self.__group_by_food_name = None
        self.__food_data_frame = None

    def __str__(self):
        return "usda food catalog"

    def get_list(self):
        """Return the distinct groups followed by every value of the ``name`` column."""
        frame = self.__food_data_frame
        distinct_groups = list(frame['group'].drop_duplicates())
        all_names = list(frame['name'])
        return distinct_groups + all_names

    def initialize(self):
        """Load the table, index food names by group and build the search tree."""
        started = time()
        logger.info('Creating food catalog...')

        frame = pd.read_table(self.food_file_path, sep=',')
        self.__food_data_frame = frame

        # One (initially empty) bucket per group value found in the table.
        names_by_group = {group: [] for group in frame['group'].values}
        self.__food_dict = names_by_group
        for _, row in frame.iterrows():
            names_by_group[row['group']].append(row['name'].strip())

        # Reverse lookup: stripped food name -> group. A name listed under
        # several groups keeps the group that is visited last.
        group_of = {}
        for group, foods in names_by_group.items():
            for food in foods:
                group_of[food] = group
        self.__group_by_food_name = group_of

        self.__hash_tree = HashTree(group_of.keys())
        finished = time()
        logger.info('Done creating food catalog. Total time: %.2f sec.' % (finished - started))

    def find(self, sentence_text):
        """Search ``sentence_text`` with the hash tree and wrap every hit as a food entity.

        Each entity carries the group recorded for the matched name and ``FOOD_TAG``.
        """
        hits = self.__hash_tree.search(sentence_text)
        found = [Entity(hit, self.__group_by_food_name[hit], FOOD_TAG) for hit in hits]
        return EntityCollection(found, FOOD_TAG)
def initialize(self):
    """Read the mixed food file, build the search tree over its lines and log the timing.

    Sets ``self.__hash_tree`` and ``self.__food`` from the lines of
    ``self.mixed_tidy_food_file_path``.
    """
    started = time()
    logger.info('Creating mixed food catalog...')
    with open(self.mixed_tidy_food_file_path) as source:
        entries = source.read().splitlines()
        self.__hash_tree = HashTree(entries)
        self.__food = entries
    finished = time()
    logger.info('Done creating mixed food catalog. Total time: %.2f sec.' % (finished - started))
def initialize(self):
    """Read the prebiotics file, build the search tree over its lines and log the timing.

    Sets ``self.__hash_tree`` and ``self.__prebiotics`` from the lines of
    ``self.tidy_csv_path``.
    """
    started = time()
    logger.info('Creating prebiotics catalog...')
    with open(self.tidy_csv_path) as source:
        entries = source.read().splitlines()
        self.__hash_tree = HashTree(entries)
        self.__prebiotics = entries
    finished = time()
    logger.info('Done creating prebiotics catalog. Total time: %.2f sec.' % (finished - started))
def initialize(self):
    """Load the food table, index food names by group and build the search tree.

    Reads ``self.food_file_path`` as a comma-separated table with the
    columns ``group`` and ``name``.
    """
    started = time()
    logger.info('Creating food catalog...')

    frame = pd.read_table(self.food_file_path, sep=',')
    self.__food_data_frame = frame

    # One (initially empty) bucket per group value found in the table.
    names_by_group = {group: [] for group in frame['group'].values}
    self.__food_dict = names_by_group
    for _, row in frame.iterrows():
        names_by_group[row['group']].append(row['name'].strip())

    # Reverse lookup: stripped food name -> group. A name listed under
    # several groups keeps the group that is visited last.
    group_of = {}
    for group, foods in names_by_group.items():
        for food in foods:
            group_of[food] = group
    self.__group_by_food_name = group_of

    self.__hash_tree = HashTree(group_of.keys())
    finished = time()
    logger.info('Done creating food catalog. Total time: %.2f sec.' % (finished - started))
def initialize(self):
    """Load the diseases table and build the search tree over disease names.

    Reads ``self.diseases_csv_path`` as a tab-separated table with the
    columns ``id`` and ``name``, adds a ``name -> id`` entry to
    ``self.disease_dictionary`` for every row (existing entries are kept or
    overwritten, never cleared) and rebuilds ``self.hash_tree`` from its keys.
    """
    started = time()
    constants.logger.info('Creating diseases catalog...')

    table = pd.read_csv(self.diseases_csv_path, sep="\t")
    records = table[['id', 'name']].to_dict("records")
    self.disease_dictionary.update((record['name'], record['id']) for record in records)

    self.hash_tree = HashTree(self.disease_dictionary.keys())
    finished = time()
    constants.logger.info('Done creating diseases catalog. Total time: %.2f sec.' % (finished - started))
def initialize(self):
    """Load the nutrients table, index names by ``idname`` and build the search tree.

    Reads ``self.path`` as a tab-separated table with the columns
    ``idname`` and ``name``.
    """
    started = time()
    constants.logger.info('Creating nutrients catalog...')

    table = pd.read_csv(self.path, sep="\t")

    # One (initially empty) bucket per idname value found in the table.
    names_by_idname = {idname: [] for idname in table['idname'].values}
    self.__nutrients_by_idname = names_by_idname
    for _, row in table.iterrows():
        names_by_idname[row['idname']].append(row['name'])

    # Reverse lookup: nutrient name -> idname. A name listed under several
    # idnames keeps the idname that is visited last.
    idname_of = {}
    for idname, names in names_by_idname.items():
        for name in names:
            idname_of[name] = idname
    self.__idname_by_nutrient = idname_of

    self.__hash_tree = HashTree(idname_of.keys())
    finished = time()
    constants.logger.info('Done creating nutrients catalog. Total time: %.2f sec.' % (finished - started))
class NutrientsCatalogNikogosov(Catalog):
    """Object holding nutrient ontology"""

    def __init__(self, path):
        """Store the path of the nutrients table; nothing is loaded until ``initialize``."""
        self.path = path
        self.__nutrients_by_idname = None
        self.__idname_by_nutrient = None
        self.__hash_tree = None

    def initialize(self):
        """Load the nutrients table, index names by ``idname`` and build the search tree.

        Reads ``self.path`` as a tab-separated table with the columns
        ``idname`` and ``name``.
        """
        t1 = time()
        constants.logger.info('Creating nutrients catalog...')
        data = pd.read_csv(self.path, sep="\t")
        # One (initially empty) bucket per idname value found in the table.
        self.__nutrients_by_idname = {idname: [] for idname in data['idname'].values}
        for _, record in data.iterrows():
            self.__nutrients_by_idname[record['idname']].append(record['name'])
        # Reverse lookup: nutrient name -> idname.
        self.__idname_by_nutrient = {
            name: idname
            for idname, name_list in self.__nutrients_by_idname.items()
            for name in name_list
        }
        self.__hash_tree = HashTree(self.__idname_by_nutrient.keys())
        t2 = time()
        constants.logger.info('Done creating nutrients catalog. Total time: %.2f sec.' % (t2 - t1))

    def find(self, sentence_text):
        """
        Uses previously generated hash tree to search sentence for nutrient names
        input:
            sentence_text: sentence to search for nutrient names
        returns:
            EntityCollection with one Entity (name, idname, NUTRIENT_TAG)
            per nutrient name found
        """
        nutr_names = self.__hash_tree.search(sentence_text)
        # The collection is given its tag explicitly, as every other catalog
        # does for its own tag; the original call omitted the argument.
        entities = EntityCollection(
            [Entity(nutrient, self.__idname_by_nutrient[nutrient], NUTRIENT_TAG)
             for nutrient in nutr_names],
            NUTRIENT_TAG)
        return entities

    def get_list(self):
        """Return the first name recorded for every ``idname``."""
        return [names[0] for names in self.__nutrients_by_idname.values()]
def initialize(self):
    """Load the bacteria table and build the lookup dictionaries and search tree.

    Reads ``self.all_bact_path`` as a comma-separated table.

    creates:
        self.__scientific_names: dictionary with id as key and name as value,
            built from the rows selected by ``self.sci_names``
        self.__bact_id_dict: dictionary with every value of the ``name``
            column as key and the matching ``id`` as value
        self.__hash_tree: hash tree over the keys of ``self.__bact_id_dict``
    """
    started = time()
    logger.info('Creating all bacterial catalog...')

    table = pd.read_table(self.all_bact_path, sep=',')

    scientific = self.sci_names(table_names=table)
    # Transposing leaves a single row, so the first "record" is the whole
    # id -> name mapping.
    scientific_by_id = scientific.set_index('id').T
    self.__scientific_names = scientific_by_id.to_dict('records')[0]

    # Same trick for the name -> id mapping over all rows.
    id_by_name = table[['name', 'id']].set_index('name').T
    self.__bact_id_dict = id_by_name.to_dict('records')[0]

    self.__hash_tree = HashTree(self.__bact_id_dict.keys())
    finished = time()
    logger.info('Done creating bacterial catalog. Total time: %.2f sec.' % (finished - started))
class DiseasesCatalog(Catalog):
    """Catalog of disease names mapped to ids, loaded from a tab-separated table."""

    def __init__(self, diseases_csv_path):
        """Store the path of the diseases table; nothing is loaded until ``initialize``."""
        self.disease_dictionary = {}
        self.hash_tree = None
        self.diseases_csv_path = diseases_csv_path

    def __str__(self):
        return "diseases catalog"

    def get_list(self):
        """Not implemented for this catalog; returns ``None``."""
        return None

    def initialize(self):
        """Load the diseases table and build the search tree over disease names.

        The table must provide the columns ``id`` and ``name``. Entries are
        added to ``self.disease_dictionary`` (which is never cleared) and the
        hash tree is rebuilt from its keys.
        """
        started = time()
        constants.logger.info('Creating diseases catalog...')

        table = pd.read_csv(self.diseases_csv_path, sep="\t")
        records = table[['id', 'name']].to_dict("records")
        self.disease_dictionary.update((record['name'], record['id']) for record in records)

        self.hash_tree = HashTree(self.disease_dictionary.keys())
        finished = time()
        constants.logger.info('Done creating diseases catalog. Total time: %.2f sec.' % (finished - started))

    def find(self, sentence_text):
        """
        Uses previously generated hash tree to search sentence for disease names.
        Apostrophes (both ' and ’) are removed from the sentence before searching.
        input:
            sentence_text: sentence to search for disease names
        returns:
            EntityCollection with one Entity (name, id, DISEASE_TAG) per name found
        """
        cleaned = re.sub('[’\']', '', sentence_text)
        hits = self.hash_tree.search(cleaned)
        found = [Entity(hit, self.disease_dictionary[hit], DISEASE_TAG) for hit in hits]
        return EntityCollection(found, DISEASE_TAG)
class AllBacteriaCatalog(Catalog):
    """Object holding NCBI ontology"""

    def __init__(self, all_bact_path):
        """Store the path of the bacteria table; nothing is loaded until ``initialize``."""
        self.all_bact_path = all_bact_path
        self.__scientific_names = None
        self.__bact_id_dict = None
        self.__hash_tree = None

    def __str__(self):
        return "all bacteria catalog"

    def get_list(self):
        """Not implemented for this catalog; returns ``None``."""
        return None

    def initialize(self):
        """Load the bacteria table and build the lookup dictionaries and search tree.

        Reads ``self.all_bact_path`` as a comma-separated table with the
        columns ``name``, ``id`` and ``class``.

        creates:
            self.__scientific_names: dictionary with id as key and the
                scientific name (see ``sci_names``) as value
            self.__bact_id_dict: dictionary with every value of the ``name``
                column as key and the matching ``id`` as value
            self.__hash_tree: hash tree over the keys of ``self.__bact_id_dict``
        """
        started = time()
        logger.info('Creating all bacterial catalog...')

        table = pd.read_table(self.all_bact_path, sep=',')

        scientific = self.sci_names(table_names=table)
        # Transposing leaves a single row, so the first "record" is the whole
        # id -> name mapping.
        scientific_by_id = scientific.set_index('id').T
        self.__scientific_names = scientific_by_id.to_dict('records')[0]

        # Same trick for the name -> id mapping over all rows.
        id_by_name = table[['name', 'id']].set_index('name').T
        self.__bact_id_dict = id_by_name.to_dict('records')[0]

        self.__hash_tree = HashTree(self.__bact_id_dict.keys())
        finished = time()
        logger.info('Done creating bacterial catalog. Total time: %.2f sec.' % (finished - started))

    def find(self, sentence_text):
        """
        Uses previously generated hash tree to search sentence for bacterial names
        input:
            sentence_text: sentence to search for bacterial names
        returns:
            EntityCollection with one Entity per name found, carrying the name,
            its id, BACTERIA_TAG and the extra tag list [ALL_BACTERIA_TAG]
        """
        hits = self.__hash_tree.search(sentence_text)
        codes = [self.__bact_id_dict[hit] for hit in hits]
        found = [
            Entity(hit, code, BACTERIA_TAG, [ALL_BACTERIA_TAG])
            for hit, code in zip(hits, codes)
        ]
        return EntityCollection(found, BACTERIA_TAG)

    def get_scientific_name(self, ncbi_id):
        """Return the scientific name stored for ``ncbi_id``."""
        return self.__scientific_names[ncbi_id]

    def sci_names(self, table_names):
        """Select the ``name``/``id`` pairs of scientific names, one row per id.

        Keeps rows whose ``class`` equals ``'scientific name'`` and whose
        ``id`` is not null, then drops repeated ids (first occurrence wins).
        """
        is_scientific = table_names['class'] == 'scientific name'
        has_id = ~table_names['id'].isnull()
        selected = table_names.loc[is_scientific & has_id, ['name', 'id']]
        return selected.drop_duplicates(subset=['id'])