import os

import pandas as pd

import butterfree
# Helpers such as drop_table, Writer, TrackerCSV, AnnotationCSV, PhenotypeCSV,
# DownloadsCSV, load_annotations, and preprocess_all are assumed to be
# imported from elsewhere in this package.


def get_file_labels(query, filters_dict,
                    keys=['body_site', 'title', 'path'],
                    filter_keys=['sampleID', 'title', 'body_site']):
    """
    args:
        query: A SQL query that returns a table containing filepaths along
            with file ids (paper title).
        filters_dict: A dict of the form {label: q}, where the results of SQL
            query q will have label 'label'. Additionally, the result of this
            query should have file_id as one of the columns.
    returns:
        A dict mapping {filepath: df}, where df is a DataFrame with columns
        sampleID and label.
    """
    conn = butterfree.get_connection()
    curr = conn.cursor()
    # Execute the main query to get (body_site, title, path) rows
    curr.execute(query)
    qf = pd.DataFrame(curr.fetchall(), columns=keys)
    curr.close()

    # Execute each filter query and tag its rows with the corresponding label
    label_df = pd.DataFrame(columns=filter_keys + ['label'])
    for label, f in filters_dict.items():
        curr = conn.cursor()
        curr.execute(f)
        filters = pd.DataFrame(curr.fetchall(), columns=filter_keys)
        filters['label'] = label
        label_df = pd.concat([label_df, filters], ignore_index=True)
        curr.close()

    # Map each path to its (body_site, title) pair
    body_sites_dict = {row['path']: row[['body_site', 'title']]
                       for _, row in qf.iterrows()}
    labels_dict = {}
    for path, row in body_sites_dict.items():
        # Keep rows where both title and body site match
        matched = ((label_df['title'] == row['title'])
                   & (label_df['body_site'] == row['body_site']))
        if matched.any():
            extracted = label_df[matched]
            labels_dict[path] = extracted[['sampleID', 'label']]
    return labels_dict
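# A minimal usage sketch for get_file_labels, assuming an `annotations` table
# with (body_site, title, path, annotation) columns (as queried in
# get_all_columns below) and a `phenotypes` table keyed by sampleID. The
# `disease` column, its values, and the 'otu' annotation name are hypothetical.
def _example_get_file_labels():
    query = ("SELECT body_site, title, path FROM annotations "
             "WHERE annotation = 'otu';")
    filters_dict = {
        'case': ("SELECT sampleID, title, body_site FROM phenotypes "
                 "WHERE disease = 'CRC';"),
        'control': ("SELECT sampleID, title, body_site FROM phenotypes "
                    "WHERE disease = 'healthy';"),
    }
    # Each value is a DataFrame with sampleID and label columns
    labels = get_file_labels(query, filters_dict)
    for path, df in labels.items():
        print(path, df['label'].value_counts().to_dict())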
def get_row_labels(query, filters_dict,
                   keys=['body_site', 'title', 'path'],
                   filter_keys=['sampleID', 'title', 'body_site']):
    """
    args:
        query: A SQL query that returns a table containing filepaths along
            with file ids (paper title).
        filters_dict: A dict of the form {label: q}, where the results of SQL
            query q will have label 'label'. Additionally, the result of this
            query should have file_id as one of the columns.
    returns:
        A dict mapping {label: row_ids}.
    """
    conn = butterfree.get_connection()
    curr = conn.cursor()
    # Execute the main query (its result qf is currently unused here)
    curr.execute(query)
    qf = pd.DataFrame(curr.fetchall(), columns=keys)
    curr.close()

    # Execute each filter query and collect its sample IDs under the label
    labels_dict = {}
    for label, f in filters_dict.items():
        curr = conn.cursor()
        curr.execute(f)
        filters = pd.DataFrame(curr.fetchall(), columns=filter_keys)
        labels_dict[label] = filters['sampleID'].tolist()
        curr.close()
    return labels_dict
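# Usage sketch for get_row_labels with the same hypothetical queries as above;
# here each label maps directly to a flat list of sample IDs.
def _example_get_row_labels():
    query = ("SELECT body_site, title, path FROM annotations "
             "WHERE annotation = 'otu';")
    filters_dict = {
        'case': ("SELECT sampleID, title, body_site FROM phenotypes "
                 "WHERE disease = 'CRC';"),
    }
    labels = get_row_labels(query, filters_dict)
    print(labels['case'][:5])  # first few sample IDs labelled 'case'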
def get_all_columns(annotation):
    """Return the columns of all annotation files matching `annotation`."""
    conn = butterfree.get_connection()
    curr = conn.cursor()
    keys = ['body_site', 'title', 'path']
    curr.execute("SELECT {0} FROM annotations WHERE annotation = '{1}';"
                 .format(', '.join(keys), annotation))
    references = pd.DataFrame(curr.fetchall(), columns=keys)
    columns = load_annotations(references['path'])
    return columns
def upload_references(drop_previous=True):
    table_name = 'annotations'
    # Get the connection unconditionally so the Writer below always has one
    conn = butterfree.get_connection()
    if drop_previous:
        drop_table(table_name, conn)
    tracker = TrackerCSV()
    writer = Writer([tracker], conn)
    writer.write_tables()
    print("Wrote references to table {0}".format(table_name))
def get_unique_phenotypes(column):
    """
    Returns all unique values of a phenotype from a chosen column in the
    'phenotypes' table. Semicolon-delimited entries are split into their
    individual values.
    """
    conn = butterfree.get_connection()
    curr = conn.cursor()
    curr.execute("SELECT DISTINCT {0} FROM phenotypes;".format(column))
    fetched = curr.fetchall()
    result = []
    for f in fetched:
        if f[0]:
            # Cells may hold multiple semicolon-separated values
            result.extend(f[0].split(';'))
    return list(set(result))
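# Usage sketch for get_unique_phenotypes; 'disease' is a hypothetical column
# of the phenotypes table whose cells may hold semicolon-separated values.
def _example_get_unique_phenotypes():
    diseases = get_unique_phenotypes('disease')
    print(sorted(diseases))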
def upload_otus(datasets=butterfree.datasets, drop_previous=True):
    # NOTE: This does not work because of the postgres column limit.
    print("This does not work because of the postgres column limit. "
          "Cancelling request.")
    return
    # The legacy code below is unreachable; it also relies on `annotations`
    # and `args`, which are not defined in this scope.
    if drop_previous:
        conn = butterfree.get_connection()
        for annotation in annotations:
            drop_table(annotation, conn)
    n = len(args['path'])
    for key in args:
        assert len(args[key]) == n
    otus = []
    for i in range(n):
        otus.append(AnnotationCSV(args['path'][i], args['title'][i],
                                  args['body_site'][i], args['annotation'][i]))
    writer = Writer(otus)
    writer.write_tables()
def upload_curated_csvs(datasets=butterfree.datasets, drop_previous=True):
    # Get the connection unconditionally so the Writers below always have one
    conn = butterfree.get_connection()
    if drop_previous:
        drop_table('phenotypes', conn)
        drop_table('downloads', conn)
    phenotypes = []
    ncbi = []
    preprocess_all()
    for dataset in datasets:
        df = pd.read_csv(os.path.join(butterfree.interim_dir,
                                      dataset + '_phenoData.csv'))
        phenotypes.append(PhenotypeCSV(df, dataset))
        ncbi.append(DownloadsCSV(df, dataset))
    pheno_writer = Writer(phenotypes, conn, backend='postgresql')
    pheno_writer.write_tables()
    ncbi_writer = Writer(ncbi, conn)
    ncbi_writer.write_tables()
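# A hypothetical end-to-end upload sketch, assuming the curated CSVs exist in
# butterfree.interim_dir and the postgres instance behind
# butterfree.get_connection() is reachable. upload_curated_csvs populates the
# phenotypes and downloads tables; upload_references populates the annotations
# table that the query helpers in this module rely on.
def _example_upload_all():
    upload_curated_csvs(drop_previous=True)
    upload_references(drop_previous=True)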
def get_row_ids(query, filters_dict,
                keys=['body_site', 'title', 'path'],
                filter_keys=['sampleID', 'title', 'body_site']):
    """
    args:
        query: A SQL query that returns a table containing filepaths along
            with file ids (paper title).
        filters_dict: A dict of the form {label: q}, where the results of SQL
            query q will have label 'label'. Additionally, the result of this
            query should have file_id as one of the columns.
    returns:
        A dict mapping {filepath: row_ids}.
    """
    conn = butterfree.get_connection()
    curr = conn.cursor()
    # Execute the main query to get (body_site, title, path) rows
    curr.execute(query)
    qf = pd.DataFrame(curr.fetchall(), columns=keys)
    curr.close()

    # Execute all filter queries and pool their rows
    filters = pd.DataFrame(columns=filter_keys)
    for f in filters_dict.values():
        curr = conn.cursor()
        curr.execute(f)
        filters = pd.concat(
            [filters, pd.DataFrame(curr.fetchall(), columns=filter_keys)],
            ignore_index=True)
        curr.close()
    conn.close()

    # Attach each matching path and body site to the filter rows
    filters['path'] = ''
    merged = pd.DataFrame(columns=['body_site', 'title', 'path', 'sampleID'])
    for _, row in qf.iterrows():
        matched = filters.loc[filters['title'] == row['title']].copy()
        matched['path'] = row['path']
        matched['body_site'] = row['body_site']
        merged = pd.concat([merged, matched], ignore_index=True)

    # Group the sample IDs by path
    row_ids = {path: [] for path in set(merged.path)}
    for _, row in merged.iterrows():
        row_ids[row['path']].append(row['sampleID'])
    return row_ids
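# Usage sketch for get_row_ids with the same hypothetical queries as above;
# unlike get_file_labels it discards labels and returns {path: [sampleID, ...]}.
def _example_get_row_ids():
    query = ("SELECT body_site, title, path FROM annotations "
             "WHERE annotation = 'otu';")
    filters_dict = {
        'case': ("SELECT sampleID, title, body_site FROM phenotypes "
                 "WHERE disease = 'CRC';"),
    }
    row_ids = get_row_ids(query, filters_dict)
    for path, ids in row_ids.items():
        print(path, len(ids))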