def saveDf(data, filename):
    '''Pickle a DataFrame to a *.df file (old pandas DataFrame.save API).'''
    data.save(filename)  # DataFrame.save writes the pickle and returns None
    print "saved DataFrame to", filename
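# DataFrame.save/.load were deprecated and later removed from pandas.  A
# minimal modern sketch of the same helper, assuming pandas >= 0.12 and
# Python 3 (the name save_df is invented here for illustration):
import pandas as pd

def save_df(data, filename):
    '''Pickle a DataFrame using the current pandas API.'''
    data.to_pickle(filename)  # to_pickle replaces the removed DataFrame.save
    print("saved DataFrame to", filename)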
import hashlib

from pandas import DataFrame, Series
from qiime2 import Artifact


def save_result(sample_id, result, exp_filename, abund_filename, seqs_filename):
    '''Save the results in three QIIME 2 Artifacts.'''
    svs, taxa, abundances = zip(*result)
    hashes = [hashlib.md5((s + t).encode('utf-8')).hexdigest()
              for s, t, _ in result]

    # Expected taxonomy, indexed by the per-feature hash.
    expected = DataFrame({'Taxon': taxa}, index=hashes, columns=['Taxon'])
    expected.index.name = 'Feature ID'
    expected = Artifact.import_data('FeatureData[Taxonomy]', expected)
    expected.save(exp_filename)

    # Per-sample abundances: one row (the sample), one column per feature.
    abundance = DataFrame({h: a for h, a in zip(hashes, abundances)},
                          index=[sample_id], columns=hashes)
    abundance = Artifact.import_data('FeatureTable[Frequency]', abundance)
    abundance.save(abund_filename)

    # The sequence variants themselves, keyed by the same hashes.
    sequences = Series(svs, index=hashes)
    sequences = Artifact.import_data('FeatureData[Sequence]', sequences)
    sequences.save(seqs_filename)
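# The feature IDs above are MD5 digests of sequence + taxon.  A standalone
# sketch of just that step (toy tuples invented for illustration):
import hashlib

result = [('ACGT', 'k__Bacteria', 3), ('TTGA', 'k__Archaea', 1)]
hashes = [hashlib.md5((s + t).encode('utf-8')).hexdigest()
          for s, t, _ in result]
print(hashes)  # two 32-character hex digests, one per feature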
import os

import boto
import numpy as np
import pandas
from boto.dynamodb2.table import Table  # assumed source of Table (has .scan())
from boto.s3.key import Key
from pandas import DataFrame


def mapNewData(working_bucket, data, meta_data, anno_data, syn_file,
               agilent_file, network_table):
    """
    Given local file locations for the source data, metadata, annotation
    data, synonyms file and the Agilent (probe -> gene) file, create a new
    DataFrame containing only genes present in the network table, indexed
    by gene name, with sample ids as columns.

    Returns the DataFrame pickle file location and the DataFrame.
    """
    anno = pandas.io.parsers.read_table(anno_data)
    data = pandas.io.parsers.read_table(data)
    metadata = pandas.io.parsers.read_table(meta_data)
    agl = pandas.io.parsers.read_table(agilent_file)

    # Get rid of control probes.
    data.index = anno['ProbeName']
    control_probe_names = anno['ProbeName'][anno['ControlType'] != 0]
    data = data.drop(control_probe_names)

    agl2 = agl[agl['GeneSymbol'].notnull()]
    agl2 = agl2.set_index('ProbeID')

    # Collect the gene symbols on the array and the genes present in the
    # network table.
    array_genes = set(agl2['GeneSymbol'].tolist())
    table = Table(network_table)
    network_genes = []
    for net in table.scan():
        network_genes += net['gene_ids'][6:].split('~:~')
    network_genes_set = set(network_genes)

    # Map array gene symbols to network genes via the synonyms file.
    mm = {}
    with open(syn_file, 'r') as synonyms:
        for line in synonyms:
            parsed = line.split()
            try:
                temp = []
                for p in parsed[:5]:
                    for t in p.split('|'):
                        if (len(t) > 2 and t in network_genes_set
                                and parsed[2] in array_genes):
                            temp.append(t)
                if temp:
                    mm.setdefault(parsed[2], [])
                    for t in temp:
                        if t not in mm[parsed[2]]:
                            mm[parsed[2]].append(t)
            except IndexError:
                pass

    # Map each network gene to the probes that measure it.
    ng2p = {}
    with open(agilent_file, 'r') as gl:
        for line in gl:
            parsed = line.split()
            try:
                if parsed[2] in mm:  # mouse gene is mapped to a network gene
                    for ng in mm[parsed[2]]:
                        ng2p.setdefault(ng, [])
                        if parsed[0] not in ng2p[ng]:
                            ng2p[ng].append(parsed[0])
            except IndexError:
                pass

    # Create the trimmed, annotated data frame: one row per network gene,
    # holding the per-sample median over that gene's probes.  Save the
    # pickle locally.
    df = DataFrame(np.zeros((len(ng2p), len(data.columns))),
                   index=ng2p.keys(), columns=data.columns)
    for gene, probes in ng2p.iteritems():
        df.ix[gene] = data.ix[probes].median()
    saved = os.path.join(os.path.split(agilent_file)[0],
                         'trimmed_dataframe.pandas')
    df.save(saved)

    # Send the pickled dataframe and the metadata to the working bucket.
    conn = boto.connect_s3()
    bucket = conn.get_bucket(working_bucket)
    k = Key(bucket)
    k.key = 'trimmed_dataframe.pandas'
    k.storage_class = 'REDUCED_REDUNDANCY'
    k.set_contents_from_filename(saved)
    k.key = 'metadata.txt'
    k.storage_class = 'REDUCED_REDUNDANCY'
    k.set_contents_from_filename(meta_data)
    return saved, df
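# The core transformation above collapses several probes onto one gene by
# taking the per-sample median.  A self-contained sketch with invented data,
# using modern pandas .loc in place of the deprecated .ix:
import numpy as np
from pandas import DataFrame

expr = DataFrame({'s1': [1.0, 3.0, 10.0], 's2': [2.0, 4.0, 20.0]},
                 index=['probe_a', 'probe_b', 'probe_c'])
ng2p = {'GENE1': ['probe_a', 'probe_b'], 'GENE2': ['probe_c']}
collapsed = DataFrame(np.zeros((len(ng2p), len(expr.columns))),
                      index=list(ng2p), columns=expr.columns)
for gene, probes in ng2p.items():
    collapsed.loc[gene] = expr.loc[probes].median()
print(collapsed)  # GENE1 gets the median of its two probes: s1=2.0, s2=3.0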
def prepare_data(path, smoothstr, smooth, exp_genotype, ctrl_genotype):
    df2 = DataFrame()
    if not os.path.exists(path + "/Speed_calculations/"):
        os.makedirs(path + "/Speed_calculations/")
    for csvfile in sorted(glob.glob(path + "/*.csv")):
        csvfilefn = os.path.basename(csvfile)
        try:
            experimentID, date, time = csvfilefn.split("_", 2)
            genotype, laser, repID = experimentID.split("-", 2)
            repID = repID + "_" + date
            print "processing: ", experimentID
        except ValueError:
            print "invalid filename:", csvfilefn
            continue

        df = pd.read_csv(csvfile, index_col=0)
        if not df.index.is_unique:
            raise Exception("CORRUPT CSV. INDEX (NANOSECONDS SINCE EPOCH) "
                            "MUST BE UNIQUE")

        # Resample to 10ms (mean) and set a proper time index on the df.
        df = flymad_analysis.fixup_index_and_resample(df, '10L')
        # Smooth the positions, and recalculate the velocities based on this.
        dt = flymad_analysis.kalman_smooth_dataframe(df, arena, smooth)

        df['laser_state'] = df['laser_state'].fillna(value=0)
        lasermask = df[df['laser_state'] == 1]
        df['tracked_t'] = df['tracked_t'] - np.min(lasermask['tracked_t'].values)

        # Maximum speed = 300; treat anything faster as a tracking error.
        df['v'][df['v'] >= 300] = np.nan

        # The resampling above, using the default rule of 'mean', will, if the
        # laser was on at any time in that bin, increase the mean > 0.
        df['laser_state'][df['laser_state'] > 0] = 1
        df['Genotype'] = genotype
        df['lasergroup'] = laser
        df['RepID'] = repID

        # Combine 60s trials together into df2.  A laser onset is a 0 -> 1
        # transition of laser_state between consecutive samples.
        dfshift = df.shift()
        laserons = df[(df['laser_state'] - dfshift['laser_state']) == 1]

        # Sometimes laserons contains incorrect timepoints.  To work around
        # this, compare each laser-on to the previous one and discard it if
        # ix - prev is less than the experimental looping time.
        prev = pd.DatetimeIndex([datetime.datetime(1986, 5, 27)])[0]  # arbitrary initial time
        for ix, row in laserons.iterrows():
            before = ix - DateOffset(seconds=9.95)
            after = ix + DateOffset(seconds=59.95)
            if (ix - prev).total_seconds() <= 59.95:
                continue
            print "ix:", ix, "\t span:", ix - prev
            prev = ix
            dftemp = df.ix[before:after][['Genotype', 'lasergroup', 'v',
                                          'laser_state']]
            dftemp['align'] = np.linspace(0, (after - before).total_seconds(),
                                          len(dftemp))
            df2 = pd.concat([df2, dftemp])

    expdf = df2[df2['Genotype'] == exp_genotype]
    ctrldf = df2[df2['Genotype'] == ctrl_genotype]

    # We no longer need to group by genotype, and lasergroup is always the
    # same here, so just drop it.
    assert len(expdf['lasergroup'].unique()) == 1, "only one lasergroup handled"

    expmean = expdf.groupby(['align'], as_index=False).mean().astype(float)
    ctrlmean = ctrldf.groupby(['align'], as_index=False).mean().astype(float)
    expstd = expdf.groupby(['align'], as_index=False).std().astype(float)
    ctrlstd = ctrldf.groupby(['align'], as_index=False).std().astype(float)
    expn = expdf.groupby(['align'], as_index=False).count().astype(float)
    ctrln = ctrldf.groupby(['align'], as_index=False).count().astype(float)

    # TODO: factor out this repeated save pattern.
    df2.save(path + "/df2" + smoothstr + ".df")
    expmean.save(path + "/expmean" + smoothstr + ".df")
    ctrlmean.save(path + "/ctrlmean" + smoothstr + ".df")
    expstd.save(path + "/expstd" + smoothstr + ".df")
    ctrlstd.save(path + "/ctrlstd" + smoothstr + ".df")
    expn.save(path + "/expn" + smoothstr + ".df")
    ctrln.save(path + "/ctrln" + smoothstr + ".df")

    return expmean, ctrlmean, expstd, ctrlstd, expn, ctrln, df2
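# Laser onsets above are found as 0 -> 1 transitions between consecutive
# samples.  A minimal sketch of that detection on an invented series:
import pandas as pd

laser_state = pd.Series([0, 0, 1, 1, 0, 0, 1, 1])
onsets = laser_state[(laser_state - laser_state.shift()) == 1]
print(onsets.index.tolist())  # -> [2, 6], where the laser switched on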
def profile_comparative(benchmarks):

    from vbench.api import BenchmarkRunner
    from vbench.db import BenchmarkDB
    from vbench.git import GitRepo
    from suite import BUILD, DB_PATH, PREPARE, dependencies

    TMP_DIR = tempfile.mkdtemp()

    try:
        prprint("Opening DB at '%s'...\n" % DB_PATH)
        db = BenchmarkDB(DB_PATH)

        prprint("Initializing Runner...")

        # all in a good cause...
        GitRepo._parse_commit_log = _parse_wrapper(args.base_commit)

        runner = BenchmarkRunner(
            benchmarks, REPO_PATH, REPO_PATH, BUILD, DB_PATH,
            TMP_DIR, PREPARE, always_clean=True,
            # run_option='eod',
            start_date=START_DATE,
            module_dependencies=dependencies)

        repo = runner.repo  # (steal the parsed git repo used by runner)

        h_head = args.target_commit or repo.shas[-1]
        h_baseline = args.base_commit

        # ARGH. reparse the repo, without discarding any commits,
        # then overwrite the previous parse results
        # prprint("Slaughtering kittens...")
        (repo.shas, repo.messages,
         repo.timestamps, repo.authors) = _parse_commit_log(None, REPO_PATH,
                                                            args.base_commit)

        prprint('Target [%s] : %s\n' % (h_head, repo.messages.get(h_head, "")))
        prprint('Baseline [%s] : %s\n' % (h_baseline,
                                          repo.messages.get(h_baseline, "")))

        prprint("Removing any previous measurements for the commits.")
        db.delete_rev_results(h_baseline)
        db.delete_rev_results(h_head)

        # TODO: we could skip this, but we need to make sure all
        # results are in the DB, which is a little tricky with
        # start dates and so on.
        prprint("Running benchmarks for baseline [%s]" % h_baseline)
        runner._run_and_write_results(h_baseline)

        prprint("Running benchmarks for target [%s]" % h_head)
        runner._run_and_write_results(h_head)

        prprint('Processing results...')

        head_res = get_results_df(db, h_head)
        baseline_res = get_results_df(db, h_baseline)
        ratio = head_res['timing'] / baseline_res['timing']
        totals = DataFrame({HEAD_COL: head_res['timing'],
                            BASE_COL: baseline_res['timing'],
                            'ratio': ratio,
                            'name': baseline_res.name},
                           columns=[HEAD_COL, BASE_COL, "ratio", "name"])
        totals = totals.ix[totals[HEAD_COL] > args.min_duration]  # ignore below threshold
        totals = totals.dropna().sort("ratio").set_index('name')  # sort in ascending order

        h_msg = repo.messages.get(h_head, "")
        b_msg = repo.messages.get(h_baseline, "")

        print_report(totals, h_head=h_head, h_msg=h_msg,
                     h_baseline=h_baseline, b_msg=b_msg)

        if args.outdf:
            prprint("The results DataFrame was written to '%s'\n" % args.outdf)
            totals.save(args.outdf)

    finally:
        # print("Disposing of TMP_DIR: %s" % TMP_DIR)
        shutil.rmtree(TMP_DIR)
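# The head/baseline comparison boils down to a ratio of timings.  A toy
# illustration with invented numbers, using the modern sort_values in place
# of the long-removed DataFrame.sort:
from pandas import DataFrame

head = DataFrame({'timing': [1.2, 3.0]}, index=['bench_a', 'bench_b'])
base = DataFrame({'timing': [1.0, 4.0]}, index=['bench_a', 'bench_b'])
totals = DataFrame({'t_head': head['timing'],
                    't_base': base['timing'],
                    'ratio': head['timing'] / base['timing']})
print(totals.sort_values('ratio'))  # ratio < 1 means the target is faster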
data = [('Atlanta', 'Georgia', 1.25, 6),
        ('Tallahassee', 'Florida', 2.6, 3),
        ('Sacramento', 'California', 1.7, 5)]
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"
con.executemany(stmt, data)
con.commit()

# Fetch the rows back out of the table:
cursor = con.execute('select * from test')
rows = cursor.fetchall()

# You can pass the list of tuples to the DataFrame constructor, but you also
# need the column names, contained in the cursor's description attribute.
print cursor.description
print DataFrame(rows, columns=zip(*cursor.description)[0])

# This is a bit of munging that you'd rather not repeat each time you query
# the database.  pandas has a read_frame function in its pandas.io.sql module
# that simplifies the process: just pass the select statement and the
# connection object.
import pandas.io.sql as sql
print sql.read_frame('select * from test', con)

# Storing and Loading Data in MongoDB
import pymongo
con = pymongo.Connection('localhost', port=27017)

# Documents stored in MongoDB are found in collections inside databases.
# Each running instance of the MongoDB server can have multiple databases,
# and each database can have multiple collections.  As an example, store the
# Twitter API data from earlier in the chapter.
# Access the (currently empty) tweets collection:
tweets = con.db.tweets

# Load the list of tweets and write each of them to the collection using
# tweets.save (which writes the Python dict to MongoDB):
import requests, json
url = 'http://search.twitter.com/search.json?q=python%20pandas'
data = json.loads(requests.get(url).text)
for tweet in data['results']:
    tweets.save(tweet)

# To retrieve all of my tweets from the collection, query it with:
cursor = tweets.find({'from_user': '******'})

# The cursor returned is an iterator that yields each document as a dict.
# You can convert this into a DataFrame, optionally extracting a subset of
# the data fields in each tweet:
tweet_fields = ['created_at', 'from_user', 'id', 'text']
result = DataFrame(list(cursor), columns=tweet_fields)
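# The same list(cursor) -> DataFrame conversion works for any list of dicts,
# no MongoDB required.  A minimal sketch with invented documents:
from pandas import DataFrame

docs = [{'created_at': '2012-06-01', 'from_user': 'user_a', 'id': 1, 'text': 'hi'},
        {'created_at': '2012-06-02', 'from_user': 'user_b', 'id': 2, 'text': 'pandas'}]
print(DataFrame(docs, columns=['created_at', 'from_user', 'id', 'text']))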