Example No. 1
def main(args):
    if len(args.files) > 0:  # Data files passed on command line.
        if len(args.files) > 1:  # Demultiplexed data files.
            ac, oc, meta = process_fastq_files(args)
            expdefs = {None: [(file2col(f),) for f in args.files]}  # Use file names.
        else:  # Single multiplexed data file.
            if args.expdefs is None:
                print "Experiment definitions required."
                return 1
            with open(args.expdefs, 'rb') as f:
                expdefs = pik.load(f)
            ac, oc, meta = process_fastq_files(args, [v[0][0] for v in expdefs.values()])
        if args.cache is not None:  # Use cache to store parsed data.
            store = pytables.HDFStore(args.cache)
            store['metadata'] = meta
            store['allele_counts'] = ac
            store['other_counts'] = oc
            store.close()
    elif args.cache is not None:  # Load data from cache.
        cache = pytables.HDFStore(args.cache)
        ac = cache['allele_counts']
        oc = cache['other_counts']
        meta = cache['metadata']
        cache.close()
        if args.expdefs is None:
            expdefs = {None: [(c,) for c in list(ac.columns)]}
        else:
            with open(args.expdefs, 'rb') as f:
                expdefs = pik.load(f)
    else:
        print "Specify at least one data source."
        return 1

    data = convert_data(ac, meta)

    if args.cache is not None:
        cache = pytables.HDFStore(args.cache)
        if 'fitness' in cache and not args.recompute:  # Load cached fitness data.
            fitness = cache['fitness']
        else:  # Recompute and cache fitness data.
            fitness = pd.Panel(compute_fitness(args, data, expdefs))
            cache['fitness'] = fitness
        cache.close()
    else:  # Compute fitness data on-the-fly.
        fitness = compute_fitness(args, data, expdefs)

    if args.out is not None:  # Draw figures in output directory.
        draw_hist(fitness, os.path.join(args.out, 'fitmap_hist.png'))
        idx = meta.reset_index().set_index(['Pos', 'AA', 'Codon'])
        aa = list(set(meta['AA']) - {None, np.nan})
        cod = list(set(meta['Codon']) - {None, np.nan})
        pos = list(set(meta['Pos']) - {None, np.nan, 0})
        aamaps = compute_hmap(fitness['slope'], pos, aa, 'Pos', 'AA', idx, [np.nanmedian, np.nanstd])
        codmaps = compute_hmap(fitness['slope'], pos, cod, 'Pos', 'Codon', idx, [np.nanmedian, np.nanstd])
        draw_hmap(aamaps[0], aa, os.path.join(args.out, 'fitmap_aa_median.png'))
        draw_hmap(codmaps[0], cod, os.path.join(args.out, 'fitmap_cod_median.png'))

    return 0
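
The caching above follows HDFStore's dict-style protocol: assignment writes a frame under a key, indexing reads it back, and an 'in' test detects a previously cached result. A minimal sketch of that pattern (the file name and keys here are illustrative, not the original code):

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3]})

# Write: dict-style assignment stores the frame under a key.
store = pd.HDFStore('cache_demo.h5')
store['allele_counts'] = df
store.close()

# Read back, mirroring the "'fitness' in cache" check above.
store = pd.HDFStore('cache_demo.h5')
if 'allele_counts' in store:
    cached = store['allele_counts']
store.close()
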
Example No. 2
def time_data_retrieval(number_of_tests=1):
    """Runs timeit tests on one day of GitHub data stored as
    - a CSV file
    - a Python pickle file
    - a PyTables HDF5 store
    - a MySQL table
    - a MongoDB collection
    """
    gh_csv = 'data/oneday.csv'
    gh_pick = 'data/oneday.pyd'
    gh_hd5 = 'data/git.h5'

    csv_timer = timeit.Timer(lambda: load_csv_df(gh_csv))
    print('csv:', csv_timer.timeit(number_of_tests))

    pickle_timer = timeit.Timer(lambda: load_pickle_df(gh_pick))
    print('pickle:', pickle_timer.timeit(number_of_tests))

    con = mysql_setup()
    query = 'select * from git.oneday;'
    mysql_timer = timeit.Timer(lambda: load_mysql_df(con, query))
    print('mysql:', mysql_timer.timeit(number_of_tests))

    store = pyt.HDFStore(gh_hd5)
    hd5_timer = timeit.Timer(lambda: load_hd5_df(store))
    print('hd5:', hd5_timer.timeit(number_of_tests))

    client = MongoClient()
    dbm = client['git']
    mong_timer = timeit.Timer(lambda: load_mongo_df(dbm))
    print('mongodb:', mong_timer.timeit(number_of_tests))
    
    test_df = load_hd5_df(store)
    store.close()
    print('Test data size:', test_df.shape)
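
The timing code assumes one small loader per backend; none are shown here. Plausible minimal versions with current pandas (these are assumptions matching the call signatures above, not the original helpers):

import pandas as pd

def load_csv_df(path):
    return pd.read_csv(path)

def load_pickle_df(path):
    return pd.read_pickle(path)

def load_hd5_df(store):
    return store['oneday']  # key written by setup_test_data()

def load_mysql_df(con, query):
    return pd.read_sql(query, con)

def load_mongo_df(dbm):
    return pd.DataFrame(list(dbm['gittest'].find()))
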
Example No. 3
def setup_test_data():
    """Replicates sample githubarchive data 100 times and saves it as a
    CSV file, a Python pickle file, an HDF5 store, a MySQL table, and a
    MongoDB collection. Run this first if you haven't run the timing
    tests before.
    """
    print('use one hour of sample data and replicate it 100 times')
    # Use only the repository data.
    onehr_df = ghd.load_local_archive_dataframe()
    onehr_json = ghd.load_local_archive_json()
    one_hr_repo_df = ghd.unnest_git_json(onehr_df)['repository']
    many_hr_repo_df = pn.DataFrame()
    for _ in range(100):
        many_hr_repo_df = many_hr_repo_df.append(one_hr_repo_df)
    print('saving dataframe with shape', many_hr_repo_df.shape)
    print('saving data to a csv file')
    many_hr_repo_df.to_csv('data/oneday.csv', encoding='utf-8')
    print('dumping data to python pickle')
    with open('data/oneday.pyd', 'wb') as f:
        pickle.dump(many_hr_repo_df, f)
    print('dumping data to mysql database')
    con = mysql_setup()
    many_hr_repo_df_clean = many_hr_repo_df.fillna('')
    sql.write_frame(many_hr_repo_df_clean, 'oneday', con, 'mysql')
    print('saving data to hdf5 filestore')
    store = pyt.HDFStore('data/git.h5')
    store.put('oneday', many_hr_repo_df)
    store.close()
    print('saving data to mongodb')
    client = MongoClient()
    dbm = client['git']
    collection = dbm['gittest']
    for _ in range(100):
        # Insert fresh copies so each batch gets new _id values.
        collection.insert(copy.deepcopy(onehr_json))
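
Both functions also call a mysql_setup() helper that is not shown. One plausible minimal version, assuming the MySQLdb driver and a local 'git' database (credentials are placeholders):

import MySQLdb

def mysql_setup():
    # Placeholder credentials; adjust for your server.
    return MySQLdb.connect(host='localhost', user='root', passwd='', db='git')
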
Example No. 4
@classmethod
def load_saved(cls, save_path):
    obj = cls()
    s = pt.HDFStore(save_path)
    # HDFStore keys come back with a leading '/', so strip it for the dict.
    obj.dframes = {k[1:]: s[k] for k in s.keys()}
    s.close()
    return obj
Example No. 5
def save(self, save_path):
    s = pt.HDFStore(save_path)
    for k, df in self.dframes.items():
        s[k] = df
    s.close()
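
Examples 4 and 5 are two halves of a round trip: save() writes every frame in self.dframes under its own key, and load_saved() rebuilds the dict from the store. A hypothetical usage sketch, assuming the two methods live on a small class (here called Frames) and that pt is the pandas alias:

import pandas as pt  # assumption: pt aliases pandas in these snippets

class Frames(object):
    """Hypothetical carrier for the save/load_saved methods above."""
    def __init__(self):
        self.dframes = {}

    def save(self, save_path):
        s = pt.HDFStore(save_path)
        for k, df in self.dframes.items():
            s[k] = df
        s.close()

    @classmethod
    def load_saved(cls, save_path):
        obj = cls()
        s = pt.HDFStore(save_path)
        obj.dframes = {k[1:]: s[k] for k in s.keys()}
        s.close()
        return obj

a = Frames()
a.dframes['prices'] = pt.DataFrame({'x': [1.0, 2.0]})
a.save('frames_demo.h5')

b = Frames.load_saved('frames_demo.h5')
assert list(b.dframes) == ['prices']
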
Example No. 6
#!/usr/bin/env python2.7
from pandas.io import pytables
import sys

store = pytables.HDFStore(sys.argv[1])
print(store)
store.close()
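
Run it as, e.g., "python show_store.py data/git.h5" (the script name is illustrative); printing the store shows its backing file path, and older pandas versions also listed the stored keys.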
Example No. 7
def get_hdf5(self):
    # Read prices back out and close the store instead of leaking the handle.
    store = tab.HDFStore('datastore.h5')
    df_prices = store['prices']
    store.close()
    return df_prices
Example No. 8
def store_hdf5(self, l_sym, start_date, end_date):
    # Fetch prices from Yahoo and persist them under the 'prices' key.
    df_prices = self.get_yahoo_data(l_sym, start_date, end_date)
    store = tab.HDFStore('datastore.h5')
    store['prices'] = df_prices
    store.close()
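
Examples 7 and 8 pair the same way: store_hdf5() fetches quotes and persists them, and get_hdf5() reloads them later. A self-contained sketch with a stubbed fetch (the class name QuoteStore and the fake data are illustrative, not the original code):

import pandas as tab  # assumption: tab aliases pandas in these snippets

class QuoteStore(object):
    """Hypothetical carrier for the two methods above."""

    def get_yahoo_data(self, l_sym, start_date, end_date):
        # Stand-in for the real Yahoo fetch: return a tiny fake frame.
        return tab.DataFrame({sym: [1.0, 2.0] for sym in l_sym})

    def store_hdf5(self, l_sym, start_date, end_date):
        df_prices = self.get_yahoo_data(l_sym, start_date, end_date)
        store = tab.HDFStore('datastore.h5')
        store['prices'] = df_prices
        store.close()

    def get_hdf5(self):
        store = tab.HDFStore('datastore.h5')
        df_prices = store['prices']
        store.close()
        return df_prices

qs = QuoteStore()
qs.store_hdf5(['AAPL', 'MSFT'], '2013-01-01', '2013-06-30')
print(qs.get_hdf5().head())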