def get_file_labels(query, filters_dict, keys=['body_site', 'title', 'path'], filter_keys=['sampleID', 'title', 'body_site']):
    """
    args:
        query: A SQL query that returns a table containing filepaths along with file ids (paper title)
        filters_dict: A dict of the form { label : q}, where the results of SQL query q will have label 'label'. Additionally, the result
            of this query should have file_id as one of the columns.
    returns:
        Returns a dict mapping { filepaths : df }, where df is a DataFrame with columns row_id and labels
    """
    conn = butterfree.get_connection()
    curr = conn.cursor()
    # Execute query
    curr.execute(query)
    qf = pd.DataFrame(curr.fetchall(), columns=keys)
    # Execute filter queries
    curr.close()
    label_df = pd.DataFrame(columns=filter_keys+['label'])
    for label, f in filters_dict.items():
        curr = conn.cursor()
        curr.execute(f)
        filters = pd.DataFrame(curr.fetchall(), columns=filter_keys)
        filters['label'] = label
        label_df = label_df.append(filters)
        curr.close()

    # Get body sites and paths associated with titles
    body_sites_dict = {row['path']: row[['body_site', 'title']] for _, row in qf.iterrows()}
    labels_dict = {}
    for path, row in body_sites_dict.items():
        matched = (label_df['title'] == row['title'])*(label_df['body_site'] == row['body_site']) # Get rows where title and body site match
        if sum(matched) > 0:
            extracted = label_df[matched]
            labels_dict[path] = extracted[['sampleID', 'label']]
    return labels_dict
def get_row_labels(query, filters_dict, keys=['body_site', 'title', 'path'], filter_keys=['sampleID', 'title', 'body_site']):
    """
    args:
        query: A SQL query that returns a table containing filepaths along with file ids (paper title)
        filters_dict: A dict of the form { label : q}, where the results of SQL query q will have label 'label'. Additionally, the result
            of this query should have file_id as one of the columns.
    returns:
        Returns a dict mapping { labels : row_ids }
    """
    conn = butterfree.get_connection()
    curr = conn.cursor()
    # Execute query
    curr.execute(query)
    qf = pd.DataFrame(curr.fetchall(), columns=keys)
    # Execute filter queries
    curr.close()
    labels_dict = {}
    for label, f in filters_dict.items():
        curr = conn.cursor()
        curr.execute(f)
        filters = pd.DataFrame(curr.fetchall(), columns=filter_keys)
        labels_dict[label] = filters['sampleID'].tolist()
        curr.close()

    return labels_dict
def get_all_columns(annotation):
    conn = butterfree.get_connection()
    curr = conn.cursor()
    keys = ['body_site', 'title', 'path']
    curr.execute("SELECT {0} FROM annotations where annotation = '{1}';".format(', '.join(keys), annotation))
    references = pd.DataFrame(curr.fetchall(), columns = keys)
    columns = load_annotations(references['path'])

    return columns
Пример #4
0
def upload_references(drop_previous=True):

    table_name = 'annotations'
    if drop_previous:
        conn = butterfree.get_connection()
        drop_table(table_name, conn)
    tracker = TrackerCSV()
    writer = Writer([tracker], conn)
    writer.write_tables()
    print("Wrote references to table {0}".format(table_name))
def get_unique_phenotypes(column):
    """ Returns all unique values of a phenotype from a chosen column in the 'phenotypes' table. """
    conn = butterfree.get_connection()
    curr = conn.cursor()
    curr.execute("SELECT DISTINCT {0} FROM phenotypes;".format(column))
    fetched = curr.fetchall()
    result = []
    for f in fetched:
        if f[0]:
            result.extend(f[0].split(';'))
    result = list(set(result))
    return result
Пример #6
0
def upload_otus(datasets=butterfree.datasets, drop_previous=True):

    # NOTE: This does not work bc postgres column limit.
    print("This does not work because of the postgres column limit. Cancelling request.")
    return
    if drop_previous:
        conn = butterfree.get_connection()
        for annotation in annotations:
            drop_table(annotation, conn)

    n = len(args['path'])
    for key in args:
        assert len(args[key]) == n
    otus = []
    for i in range(n):
        otus.append(AnnotationCSV(args['path'][i], args['title'][i], args['body_site'][i], args['annotation'][i]))
    writer = Writer(otus)
    writer.write_tables()
Пример #7
0
def upload_curated_csvs(datasets=butterfree.datasets, drop_previous=True):

    if drop_previous:
        conn = butterfree.get_connection()
        drop_table('phenotypes', conn)
        drop_table('downloads', conn)

    phenotypes = []
    ncbi = []
    preprocess_all()
    for dataset in datasets:
        df = pd.read_csv(os.path.join(butterfree.interim_dir, dataset + '_phenoData.csv'))
        phenotypes.append(PhenotypeCSV(df, dataset))
        ncbi.append(DownloadsCSV(df, dataset))
    pheno_writer = Writer(phenotypes, conn, backend='postgresql')
    pheno_writer.write_tables()
    ncbi_writer = Writer(ncbi, conn)
    ncbi_writer.write_tables()
def get_row_ids(query, filters_dict, keys=['body_site', 'title', 'path'], filter_keys=['sampleID', 'title', 'body_site']):
    """
    args:
        query: A SQL query that returns a table containing filepaths along with file ids (paper title)
        filters_dict: A dict of the form { label : q}, where the results of SQL query q will have label 'label'. Additionally, the result
            of this query should have file_id as one of the columns.
    returns:
        Returns a dict mapping { filepaths : row ids }
    """
    conn = butterfree.get_connection()
    curr = conn.cursor()
    # Execute query
    curr.execute(query)
    qf = pd.DataFrame(curr.fetchall(), columns=keys)
    # Execute filter queries
    curr.close()
    filters = pd.DataFrame(columns=filter_keys)
    for f in filters_dict.values():
        curr = conn.cursor()
        curr.execute(f)
        filters = filters.append(pd.DataFrame(curr.fetchall(), columns=filter_keys))
        curr.close()
    # Match columns
    conn.close()
    filters['path'] = ['' for _ in range(len(filters))]
    merged = pd.DataFrame(columns=['body_site', 'title', 'path', 'sampleID'])
    for _, row in qf.iterrows():
        matched = filters.loc[filters['title'] == row['title']]
        matched['path'] = row['path']
        matched['body_site'] = row['body_site']
        merged = merged.append(matched)

    row_ids = {path: [] for path in set(merged.path)}
    for _, row in merged.iterrows():
        row_ids[row['path']].append(row['sampleID'])

    return row_ids