Example #1
    def saveDf(data, filename):
        '''
        Save a DataFrame to a *.df pickle file.

        data : DataFrame to persist
        filename : destination path
        '''
        data.save(filename)
        print 'saved DataFrame to', filename
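DataFrame.save was deprecated and later removed from pandas; on current releases the same round trip is done with to_pickle/read_pickle. A minimal sketch, assuming a recent pandas:

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3]})
df.to_pickle('example.df')            # replacement for the removed df.save('example.df')
restored = pd.read_pickle('example.df')
print(restored.equals(df))            # True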
Example #2
def save_result(sample_id, result, exp_filename, abund_filename,
                seqs_filename):
    'Save the results in three Artifacts.'
    svs, taxa, abundances = zip(*result)
    hashes = [
        hashlib.md5((s + t).encode('utf-8')).hexdigest() for s, t, c in result
    ]

    expected = DataFrame({'Taxon': taxa}, index=hashes, columns=['Taxon'])
    expected.index.name = 'Feature ID'
    expected = Artifact.import_data('FeatureData[Taxonomy]', expected)
    expected.save(exp_filename)

    abundanced = DataFrame({h: a
                            for h, a in zip(hashes, abundances)},
                           index=[sample_id],
                           columns=hashes)
    abundanced = Artifact.import_data('FeatureTable[Frequency]', abundanced)
    abundanced.save(abund_filename)

    sequences = Series(svs, index=hashes)
    sequences = Artifact.import_data('FeatureData[Sequence]', sequences)
    sequences.save(seqs_filename)
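save_result expects result to be an iterable of (sequence, taxon, count) triples; a minimal sketch of that input shape, using invented sample values, would be:

import hashlib

# invented sample triples, purely illustrative
result = [
    ('ACGTACGT', 'k__Bacteria; p__Firmicutes', 12),
    ('TTGGCCAA', 'k__Bacteria; p__Bacteroidetes', 7),
]
svs, taxa, abundances = zip(*result)
# feature ids are md5 digests of sequence + taxon, exactly as in the function above
hashes = [hashlib.md5((s + t).encode('utf-8')).hexdigest() for s, t, c in result]
print(hashes)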
Example #3
def mapNewData(working_bucket, data, meta_data, anno_data, syn_file, agilent_file, network_table):
    """
    Given local file locations for the source data, metadata, annotation data,
        synonyms file and the Agilent (probe -> gene) file,
    creates a new DataFrame containing only gene information for genes
        present in the network table, indexed by gene name, with sample ids as columns.
    Returns the DataFrame pickle file location and the DataFrame.
    """
    anno = pandas.io.parsers.read_table(anno_data)
    data = pandas.io.parsers.read_table(data)
    metadata = pandas.io.parsers.read_table(meta_data)
    agl = pandas.io.parsers.read_table(agilent_file)
    
    #get rid of control probes

    data.index = anno['ProbeName']
    control_probe_names = anno['ProbeName'][anno['ControlType'] != 0]
    data = data.drop(control_probe_names)

    agl2 = agl[agl['GeneSymbol'].notnull()]
    agl2 = agl2.set_index('ProbeID')

    #map probes to genes from network

    a = agl2['GeneSymbol'].tolist()
    b = set(a)
    table = Table(network_table)
    temp_nets = table.scan()
    network_genes = []
    for net in temp_nets:
        network_genes += net['gene_ids'][6:].split('~:~')
    network_genes_set = set(network_genes)


    mm = {}
    added = []
    with open(syn_file,'r') as synonyms:
        for line in synonyms:
            parsed = line.split()
            try:
                temp = []
                for p in parsed[:5]:
                    tt = p.split('|')
                    for t in tt:
                        if len(t) > 2 and t in network_genes_set and parsed[2] in b:
                            added.append(t)
                            temp.append(t)
                if len(temp) > 0:
                    if parsed[2] not in mm:
                        mm[parsed[2]] = []
                    for t in temp:
                        if t not in mm[parsed[2]]:
                            mm[parsed[2]].append(t)
                
            except IndexError:
                pass
    ng2p = {}
    probes = []
    with open(agilent_file, 'r') as gl:
        for line in gl:
            parsed = line.split()
            try:
                if parsed[2] in mm: #mouse gene is mapped to network gene
                    for ng in mm[parsed[2]]:
                        if ng not in ng2p:
                            ng2p[ng] = []
                        if parsed[0] not in ng2p[ng]:
                            ng2p[ng].append(parsed[0])
                            probes.append(parsed[0])          
            except IndexError:
                pass
    #create newly trimmed and annotated data frame
    #save pickle locally

    df = DataFrame(np.zeros((len(ng2p), len(data.columns))), index=ng2p.keys(), columns=data.columns)
    for k,v in ng2p.iteritems():
        df.ix[k] = data.ix[v].median()
    saved = os.path.join(os.path.split(agilent_file)[0],'trimmed_dataframe.pandas')
    df.save(saved)
    
    #send pickled dataframe to working bucket
    conn = boto.connect_s3()
    bucket = conn.get_bucket(working_bucket)
    k = Key(bucket)
    k.key = 'trimmed_dataframe.pandas'
    k.storage_class = 'REDUCED_REDUNDANCY'
    k.set_contents_from_filename(saved)

    k.key = 'metadata.txt'
    k.storage_class = 'REDUCED_REDUNDANCY'
    k.set_contents_from_filename(meta_data)

    return saved, df
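DataFrame.save, .ix and iteritems above come from an old pandas, and the Key-based upload is boto2-era; a rough modern equivalent of the final save-and-upload step, assuming pandas with to_pickle and boto3 (bucket and key names reused from the function above), might look like:

import boto3

def save_and_upload(df, local_path, bucket, key):
    # to_pickle replaces the removed DataFrame.save()
    df.to_pickle(local_path)
    # boto3 equivalent of the boto2 Key / set_contents_from_filename upload above
    s3 = boto3.client('s3')
    s3.upload_file(local_path, bucket, key,
                   ExtraArgs={'StorageClass': 'REDUCED_REDUNDANCY'})

# e.g. save_and_upload(df, saved, working_bucket, 'trimmed_dataframe.pandas')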
Example #4
def prepare_data(path, smoothstr, smooth, exp_genotype, ctrl_genotype):

    df2 = DataFrame()
    if not os.path.exists(path + "/Speed_calculations/"):
        os.makedirs(path + "/Speed_calculations/")
    for csvfile in sorted(glob.glob(path + "/*.csv")):
        csvfilefn = os.path.basename(csvfile)
        try:
            experimentID, date, time = csvfilefn.split("_", 2)
            genotype, laser, repID = experimentID.split("-", 2)
            repID = repID + "_" + date
            print "processing: ", experimentID
        except ValueError:
            print "invalid filename:", csvfilefn
            continue
        df = pd.read_csv(csvfile, index_col=0)

        if not df.index.is_unique:
            raise Exception(
                "CORRUPT CSV. INDEX (NANOSECONDS SINCE EPOCH) MUST BE UNIQUE")

        #resample to 10ms (mean) and set a proper time index on the df
        df = flymad_analysis.fixup_index_and_resample(df, '10L')

        #smooth the positions, and recalculate the velocitys based on this.
        dt = flymad_analysis.kalman_smooth_dataframe(df, arena, smooth)

        df['laser_state'] = df['laser_state'].fillna(value=0)

        lasermask = df[df['laser_state'] == 1]
        df['tracked_t'] = df['tracked_t'] - np.min(
            lasermask['tracked_t'].values)

        #MAXIMUM SPEED = 300:
        df['v'][df['v'] >= 300] = np.nan

        #the resampling above, using the default rule of 'mean' will, if the laser
        #was on any time in that bin, increase the mean > 0.
        df['laser_state'][df['laser_state'] > 0] = 1

        df['Genotype'] = genotype
        df['lasergroup'] = laser
        df['RepID'] = repID

        #combine 60s trials together into df2:
        dfshift = df.shift()
        laserons = df[(df['laser_state'] - dfshift['laser_state']) == 1]
        #SILLY BUG. Sometimes laserons contains incorrect timepoints. To fix this, I compare each laseron to the
        # previous and discard if ix - prev is less than the experimental looping time.
        prev = pd.DatetimeIndex([datetime.datetime(1986, 5, 27)])[0]  # arbitrary time far in the past
        for ix, row in laserons.iterrows():
            before = ix - DateOffset(seconds=9.95)
            after = ix + DateOffset(seconds=59.95)
            if (ix - prev).total_seconds() <= 59.95:
                continue
            else:
                print "ix:", ix, "\t span:", ix - prev
                prev = ix
                dftemp = df.ix[before:after][[
                    'Genotype', 'lasergroup', 'v', 'laser_state'
                ]]
                dftemp['align'] = np.linspace(0,
                                              (after - before).total_seconds(),
                                              len(dftemp))
                df2 = pd.concat([df2, dftemp])

    expdf = df2[df2['Genotype'] == exp_genotype]
    ctrldf = df2[df2['Genotype'] == ctrl_genotype]

    #we no longer need to group by genotype, and lasergroup is always the same here
    #so just drop it.
    assert len(
        expdf['lasergroup'].unique()) == 1, "only one lasergroup handled"

    expmean = expdf.groupby(['align'], as_index=False).mean().astype(float)
    ctrlmean = ctrldf.groupby(['align'], as_index=False).mean().astype(float)

    expstd = expdf.groupby(['align'], as_index=False).std().astype(float)
    ctrlstd = ctrldf.groupby(['align'], as_index=False).std().astype(float)

    expn = expdf.groupby(['align'], as_index=False).count().astype(float)
    ctrln = ctrldf.groupby(['align'], as_index=False).count().astype(float)

    ####AAAAAAAARRRRRRRRRRRGGGGGGGGGGGGGGHHHHHHHHHH so much copy paste here
    df2.save(path + "/df2" + smoothstr + ".df")
    expmean.save(path + "/expmean" + smoothstr + ".df")
    ctrlmean.save(path + "/ctrlmean" + smoothstr + ".df")
    expstd.save(path + "/expstd" + smoothstr + ".df")
    ctrlstd.save(path + "/ctrlstd" + smoothstr + ".df")
    expn.save(path + "/expn" + smoothstr + ".df")
    ctrln.save(path + "/ctrln" + smoothstr + ".df")

    return expmean, ctrlmean, expstd, ctrlstd, expn, ctrln, df2
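The *.df files written above are plain pickles, so they can be loaded back later for plotting; a small sketch, assuming the same path and smoothstr values and a pandas able to read those pickles:

import pandas as pd

# read_pickle is the modern counterpart of the .save() calls used above
df2 = pd.read_pickle(path + "/df2" + smoothstr + ".df")
expmean = pd.read_pickle(path + "/expmean" + smoothstr + ".df")
print(df2.head())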
Example #5
def profile_comparative(benchmarks):

    from vbench.api import BenchmarkRunner
    from vbench.db import BenchmarkDB
    from vbench.git import GitRepo
    from suite import BUILD, DB_PATH, PREPARE, dependencies
    TMP_DIR = tempfile.mkdtemp()

    try:

        prprint("Opening DB at '%s'...\n" % DB_PATH)
        db = BenchmarkDB(DB_PATH)

        prprint("Initializing Runner...")

        # all in a good cause...
        GitRepo._parse_commit_log = _parse_wrapper(args.base_commit)

        runner = BenchmarkRunner(
            benchmarks, REPO_PATH, REPO_PATH, BUILD, DB_PATH,
            TMP_DIR, PREPARE, always_clean=True,
            # run_option='eod', start_date=START_DATE,
            module_dependencies=dependencies)

        repo = runner.repo  # (steal the parsed git repo used by runner)
        h_head = args.target_commit or repo.shas[-1]
        h_baseline = args.base_commit

        # ARGH. reparse the repo, without discarding any commits,
        # then overwrite the previous parse results
        # prprint ("Slaughtering kittens..." )
        (repo.shas, repo.messages,
         repo.timestamps, repo.authors) = _parse_commit_log(None,REPO_PATH,
                                                                args.base_commit)

        prprint('Target [%s] : %s\n' % (h_head, repo.messages.get(h_head, "")))
        prprint('Baseline [%s] : %s\n' % (h_baseline,
                repo.messages.get(h_baseline, "")))

        prprint("Removing any previous measurements for the commits.")
        db.delete_rev_results(h_baseline)
        db.delete_rev_results(h_head)

        # TODO: we could skip this, but we need to make sure all
        # results are in the DB, which is a little tricky with
        # start dates and so on.
        prprint("Running benchmarks for baseline [%s]" % h_baseline)
        runner._run_and_write_results(h_baseline)

        prprint("Running benchmarks for target [%s]" % h_head)
        runner._run_and_write_results(h_head)

        prprint('Processing results...')

        head_res = get_results_df(db, h_head)
        baseline_res = get_results_df(db, h_baseline)
        ratio = head_res['timing'] / baseline_res['timing']
        totals = DataFrame({HEAD_COL: head_res['timing'],
                            BASE_COL: baseline_res['timing'],
                            'ratio': ratio,
                            'name': baseline_res.name},
                           columns=[HEAD_COL, BASE_COL, "ratio", "name"])
        # ignore benchmarks below the duration threshold
        totals = totals[totals[HEAD_COL] > args.min_duration]
        totals = totals.dropna().sort("ratio").set_index('name')  # sort in ascending order

        h_msg = repo.messages.get(h_head, "")
        b_msg = repo.messages.get(h_baseline, "")

        print_report(totals, h_head=h_head, h_msg=h_msg,
                     h_baseline=h_baseline, b_msg=b_msg)

        if args.outdf:
            prprint("The results DataFrame was written to '%s'\n" %  args.outdf)
            totals.save(args.outdf)
    finally:
        #        print("Disposing of TMP_DIR: %s" % TMP_DIR)
        shutil.rmtree(TMP_DIR)
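If --outdf is given, the saved comparison frame can be inspected afterwards. A hypothetical follow-up, assuming a pandas new enough for read_pickle/sort_values (the file name below is made up):

import pandas as pd

# ratio = target timing / baseline timing, so ratio > 1 means the target commit is slower
totals = pd.read_pickle('vbench_results.df')   # hypothetical --outdf path
slower = totals[totals['ratio'] > 1.25]
print(slower.sort_values('ratio', ascending=False).head(10))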
Example #6
def profile_comparative(benchmarks):

    from vbench.api import BenchmarkRunner
    from vbench.db import BenchmarkDB
    from vbench.git import GitRepo
    from suite import BUILD, DB_PATH, PREPARE, dependencies
    TMP_DIR = tempfile.mkdtemp()

    try:

        prprint("Opening DB at '%s'...\n" % DB_PATH)
        db = BenchmarkDB(DB_PATH)

        prprint("Initializing Runner...")

        # all in a good cause...
        GitRepo._parse_commit_log = _parse_wrapper(args.base_commit)

        runner = BenchmarkRunner(
            benchmarks, REPO_PATH, REPO_PATH, BUILD, DB_PATH,
            TMP_DIR, PREPARE, always_clean=True,
            # run_option='eod', start_date=START_DATE,
            module_dependencies=dependencies)

        repo = runner.repo  # (steal the parsed git repo used by runner)
        h_head = args.target_commit or repo.shas[-1]
        h_baseline = args.base_commit

        # ARGH. reparse the repo, without discarding any commits,
        # then overwrite the previous parse results
        # prprint ("Slaughtering kittens..." )
        (repo.shas, repo.messages,
         repo.timestamps, repo.authors) = _parse_commit_log(None,REPO_PATH,
                                                                args.base_commit)

        prprint('Target [%s] : %s\n' % (h_head, repo.messages.get(h_head, "")))
        prprint('Baseline [%s] : %s\n' % (h_baseline,
                repo.messages.get(h_baseline, "")))

        prprint("removing any previous measurements for the commits.")
        db.delete_rev_results(h_baseline)
        db.delete_rev_results(h_head)

        # TODO: we could skip this, but we need to make sure all
        # results are in the DB, which is a little tricky with
        # start dates and so on.
        prprint("Running benchmarks for baseline [%s]" % h_baseline)
        runner._run_and_write_results(h_baseline)

        prprint("Running benchmarks for target [%s]" % h_head)
        runner._run_and_write_results(h_head)

        prprint('Processing results...')

        head_res = get_results_df(db, h_head)
        baseline_res = get_results_df(db, h_baseline)
        ratio = head_res['timing'] / baseline_res['timing']
        totals = DataFrame({HEAD_COL: head_res['timing'],
                            BASE_COL: baseline_res['timing'],
                            'ratio': ratio,
                            'name': baseline_res.name},
                           columns=[HEAD_COL, BASE_COL, "ratio", "name"])
        # ignore benchmarks below the duration threshold
        totals = totals[totals[HEAD_COL] > args.min_duration]
        totals = totals.dropna().sort("ratio").set_index('name')  # sort in ascending order

        h_msg = repo.messages.get(h_head, "")
        b_msg = repo.messages.get(h_baseline, "")

        print_report(totals, h_head=h_head, h_msg=h_msg,
                     h_baseline=h_baseline, b_msg=b_msg)

        if args.outdf:
            prprint("The results DataFrame was written to '%s'\n" %  args.outdf)
            totals.save(args.outdf)
    finally:
        #        print("Disposing of TMP_DIR: %s" % TMP_DIR)
        shutil.rmtree(TMP_DIR)
Example #7
# Setup assumed from context: an in-memory SQLite table with four columns
# matching the INSERT statement below.
import sqlite3
from pandas import DataFrame
con = sqlite3.connect(':memory:')
con.execute('CREATE TABLE test (a VARCHAR(20), b VARCHAR(20), c REAL, d INTEGER)')
con.commit()

data = [('Atlanta', 'Georgia', 1.25, 6), ('Tallahassee', 'Florida', 2.6, 3),
        ('Sacramento', 'California', 1.7, 5)]
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"
con.executemany(stmt, data)
con.commit()
# Pull the rows back out. You can pass the list of tuples to the DataFrame constructor,
# but you also need the column names, contained in the cursor's description attribute.
cursor = con.execute('select * from test')
rows = cursor.fetchall()
print cursor.description
print DataFrame(rows, columns=zip(*cursor.description)[0])
# This is a bit of munging that you'd rather not repeat each time you query the
# database. pandas has a read_frame function in its pandas.io.sql module that
# simplifies the process: just pass the SELECT statement and the connection object.
import pandas.io.sql as sql
print sql.read_frame('select * from test', con)
# Storing and Loading Data in MongoDB
import pymongo
con = pymongo.Connection('localhost', port=27017)
# Documents stored in MongoDB are found in collections inside databases. Each running
# instance of the MongoDB server can have multiple databases, and each database can
# have multiple collections. As an example, store the Twitter API data from earlier
# in the chapter.
# Access the (currently empty) tweets collection:
tweets = con.db.tweets
# Load the list of tweets and write each of them to the collection using tweets.save
# (which writes the Python dict to MongoDB):
import requests, json
url = 'http://search.twitter.com/search.json?q=python%20pandas'
data = json.loads(requests.get(url).text)
for tweet in data['results']:
    tweets.save(tweet)
# If you wanted all of the tweets from a particular user, you could query the collection with the following:
cursor = tweets.find({
    'from_user': '******'
})  # the cursor returned is an iterator that yields each document as a dict.
# You can convert this into a DataFrame, optionally extracting a subset of the data fields in each tweet:
tweet_fields = ['created_at', 'from_user', 'id', 'text']
result = DataFrame(list(cursor), columns=tweet_fields)
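pymongo.Connection and Collection.save above come from an old pymongo release; both were deprecated and later removed. With pymongo 3+, the equivalent of the connection and write loop, reusing the data parsed above, is roughly:

import pymongo

# MongoClient replaced Connection, and insert_one replaced Collection.save
client = pymongo.MongoClient('localhost', port=27017)
tweets = client.db.tweets
for tweet in data['results']:
    tweets.insert_one(tweet)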
Example #8
def prepare_data(path, smoothstr, smooth, exp_genotype, ctrl_genotype):

    df2 = DataFrame()
    if not os.path.exists(path + "/Speed_calculations/"):
        os.makedirs(path + "/Speed_calculations/")
    for csvfile in sorted(glob.glob(path + "/*.csv")):
        csvfilefn = os.path.basename(csvfile)
        try:
            experimentID, date, time = csvfilefn.split("_", 2)
            genotype, laser, repID = experimentID.split("-", 2)
            repID = repID + "_" + date
            print "processing: ", experimentID
        except ValueError:
            print "invalid filename:", csvfilefn
            continue
        df = pd.read_csv(csvfile, index_col=0)

        if not df.index.is_unique:
            raise Exception("CORRUPT CSV. INDEX (NANOSECONDS SINCE EPOCH) MUST BE UNIQUE")


        #resample to 10ms (mean) and set a proper time index on the df
        df = flymad_analysis.fixup_index_and_resample(df, '10L')

        #smooth the positions, and recalculate the velocitys based on this.
        dt = flymad_analysis.kalman_smooth_dataframe(df, arena, smooth)

        df['laser_state'] = df['laser_state'].fillna(value=0)

        lasermask = df[df['laser_state'] == 1]
        df['tracked_t'] = df['tracked_t'] - np.min(lasermask['tracked_t'].values)

        #MAXIMUM SPEED = 300:
        df['v'][df['v'] >= 300] = np.nan
        
        #the resampling above, using the default rule of 'mean' will, if the laser
        #was on any time in that bin, increase the mean > 0.
        df['laser_state'][df['laser_state'] > 0] = 1
            
        df['Genotype'] = genotype
        df['lasergroup'] = laser
        df['RepID'] = repID

        #combine 60s trials together into df2:
        dfshift = df.shift()
        laserons = df[(df['laser_state'] - dfshift['laser_state']) == 1]
        #SILLY BUG. Sometimes laserons contains incorrect timepoints. To fix this, I compare each laseron to the
        # previous and discard if ix - prev is less than the experimental looping time.
        prev = pd.DatetimeIndex([datetime.datetime(1986, 5, 27)])[0]  # arbitrary time far in the past
        for ix,row in laserons.iterrows():
            before = ix - DateOffset(seconds=9.95)
            after = ix + DateOffset(seconds=59.95)
            if (ix - prev).total_seconds() <= 59.95:
                continue
            else:
                print "ix:", ix, "\t span:", ix - prev
                prev = ix
                dftemp = df.ix[before:after][['Genotype', 'lasergroup','v', 'laser_state']]
                dftemp['align'] = np.linspace(0,(after-before).total_seconds(),len(dftemp))
                df2 = pd.concat([df2, dftemp])

    expdf = df2[df2['Genotype'] == exp_genotype]
    ctrldf = df2[df2['Genotype'] == ctrl_genotype]

    #we no longer need to group by genotype, and lasergroup is always the same here
    #so just drop it. 
    assert len(expdf['lasergroup'].unique()) == 1, "only one lasergroup handled"

    expmean = expdf.groupby(['align'], as_index=False).mean().astype(float)
    ctrlmean = ctrldf.groupby(['align'], as_index=False).mean().astype(float)

    expstd = expdf.groupby(['align'], as_index=False).std().astype(float)
    ctrlstd = ctrldf.groupby(['align'], as_index=False).std().astype(float)

    expn = expdf.groupby(['align'], as_index=False).count().astype(float)
    ctrln = ctrldf.groupby(['align'], as_index=False).count().astype(float)

    ####AAAAAAAARRRRRRRRRRRGGGGGGGGGGGGGGHHHHHHHHHH so much copy paste here
    df2.save(path + "/df2" + smoothstr + ".df")
    expmean.save(path + "/expmean" + smoothstr + ".df")
    ctrlmean.save(path + "/ctrlmean" + smoothstr + ".df")
    expstd.save(path + "/expstd" + smoothstr + ".df")
    ctrlstd.save(path + "/ctrlstd" + smoothstr + ".df")
    expn.save(path + "/expn" + smoothstr + ".df")
    ctrln.save(path + "/ctrln" + smoothstr + ".df")

    return expmean, ctrlmean, expstd, ctrlstd, expn, ctrln, df2