def create_bacterial_df(nodes_ncbi_path, names_ncbi_path):
    """Build the full bacterial names catalog from NCBI taxonomy dumps.

    The NCBI names dump is streamed in chunks (it is large) and rows whose
    name class appears in ``CLASS_EXCLUSIONS`` are dropped. The surviving
    names are restricted to the bacterial ids returned by
    ``create_all_bact_catalog`` and then expanded via
    ``generate_excessive_dictionary_bact``.

    :param nodes_ncbi_path: path to the NCBI ``nodes`` dump (tab-separated).
    :param names_ncbi_path: path to the NCBI ``names`` dump (tab-separated).
    :return: a single-element list wrapping the resulting names DataFrame
        (callers expect the list wrapper).
    """
    # Stream the names file chunk-by-chunk, filtering excluded classes
    # before concatenating, to keep peak memory bounded.
    name_chunks = pd.read_table(
        names_ncbi_path,
        names=['id', 'name', 'class'],
        usecols=[0, 2, 6],
        header=None,
        chunksize=CHUNK_SIZE,
    )
    nodes = pd.read_table(
        nodes_ncbi_path,
        names=['id', 'parent_id', 'rank'],
        usecols=[0, 2, 4],
        header=None,
    )
    kept_parts = [part[~part['class'].isin(CLASS_EXCLUSIONS)]
                  for part in name_chunks]
    all_names = pd.concat(kept_parts)

    # Keep only names whose taxonomy id belongs to the bacterial subtree.
    bact_ids = create_all_bact_catalog(nodes)
    catalog = all_names[all_names['id'].isin(bact_ids['id'].tolist())]
    catalog = pd.merge(catalog, bact_ids, how='left', on='id', copy=False)
    catalog = catalog.drop_duplicates('name')
    catalog = generate_excessive_dictionary_bact(catalog)
    catalog = catalog.drop_duplicates(subset=['name'])
    catalog['id'] = catalog['id'].astype(int)
    return [catalog]
def create_gut_bacterial_df(nodes_ncbi_path, names_ncbi_path, gut_bact_list_path):
    """Build a names catalog for a curated list of gut bacteria.

    Names from *gut_bact_list_path* are matched against the NCBI names dump
    in two passes: first by the full name, then — for rows that did not
    match — by the first word only (presumably the genus; TODO confirm).
    Names still unmatched after both passes are retained with synthetic ids
    and ``'unknown'`` class/rank. Matched ids are expanded with linked ids
    via ``clear_ids_by_rank`` / ``get_bind_ids`` before the final dictionary
    expansion.

    :param nodes_ncbi_path: path to the NCBI ``nodes`` dump (tab-separated).
    :param names_ncbi_path: path to the NCBI ``names`` dump (tab-separated).
    :param gut_bact_list_path: comma-separated file of bacteria names
        (single ``name`` column).
    :return: DataFrame of names with integer ids; synthetic ids for
        NCBI-unknown names start at 1000000000.
    """
    SYNTHETIC_ID = 1000000000  # base id for names missing from NCBI

    gut_names = pd.read_table(gut_bact_list_path, names=['name'], sep=',')
    ncbi_names_iter = pd.read_table(
        names_ncbi_path,
        names=['id', 'name', 'class'],
        usecols=[0, 2, 6],
        header=None,
        chunksize=CHUNK_SIZE,
    )
    ncbi_nodes = pd.read_table(
        nodes_ncbi_path,
        names=['id', 'parent_id', 'rank'],
        usecols=[0, 2, 4],
        header=None,
    )
    ncbi_names = pd.concat(
        [chunk[~chunk['class'].isin(CLASS_EXCLUSIONS)]
         for chunk in ncbi_names_iter])

    # First word of every requested name, kept as the fallback lookup key.
    gut_names_first = gut_names['name'].apply(lambda x: str.split(x, ' ')[0])

    # Pass 1: exact name match against NCBI.
    gut_names = pd.merge(gut_names, ncbi_names[['name', 'id']],
                         how='left', on='name')
    # Compute the missing-id mask once per frame (was np.isnan(...) four
    # times); Series.isna() is the pandas idiom and dtype-safe.
    missing = gut_names['id'].isna()
    gut_names_unknown = gut_names[missing].copy()

    # Pass 2: retry the unmatched rows using only the first word.
    gut_names.loc[missing, 'name'] = gut_names_first[missing]
    gut_names = pd.merge(gut_names[['name']], ncbi_names,
                         how='left', on='name')
    still_missing = gut_names['id'].isna()

    # Rows unmatched after both passes get synthetic ids and labels.
    gut_names_unknown = pd.concat(
        [gut_names_unknown, gut_names[still_missing].copy()])
    gut_names_unknown['id'] = range(
        SYNTHETIC_ID, SYNTHETIC_ID + len(gut_names_unknown))
    gut_names_unknown['class'] = 'unknown'
    gut_names_unknown['rank'] = 'unknown'

    gut_names = gut_names[~still_missing].drop_duplicates(subset='id')

    # Expand matched ids with their taxonomy-linked ids.
    gut_ids = clear_ids_by_rank(gut_names['id'].values, ncbi_nodes)
    gut_parent_ids = get_bind_ids(gut_ids['id'].values, ncbi_nodes)
    gut_ids_table = pd.concat([gut_parent_ids, gut_ids]).drop_duplicates('id')

    gut_names = ncbi_names[ncbi_names['id'].isin(gut_ids_table['id'].tolist())]
    gut_names = pd.merge(gut_names, gut_ids_table, how='left', on='id',
                         copy=False).drop_duplicates('name')
    gut_names = pd.concat([gut_names, gut_names_unknown])
    gut_names = generate_excessive_dictionary_bact(gut_names)
    gut_names = gut_names.drop_duplicates(subset=['name'])
    gut_names['id'] = gut_names['id'].astype(int)
    return gut_names