Example No. 1
    def __init__(self, workbench, data_path = "/home/moritz/DataBases/genomes/RefSeq/", clean = False):
        Database.__init__(self,workbench = workbench, data_path = data_path)

        if not os.path.exists(self.metadata_file) or clean:
            ftp =  FTP(ncbi)

            print "Getting metadata from ncbi"

            FNULL = open(os.devnull, 'w')
            ftp.login()
            ftp.cwd('genomes/refseq/bacteria/')
            info = StringIO.StringIO()
            ftp.retrbinary("RETR " + "assembly_summary.txt", info.write)
            info.seek(0)
            self.metadata = DataFrame.from_csv(info, sep="\t", header=1)
            ftp.close()
            self.metadata['assembly_level'] = self.metadata['assembly_level'].apply(lambda x: x.replace(" ","_"))
            self.metadata = self.metadata.transpose().to_dict()

            DataFrame.from_dict(self.metadata).to_csv(self.metadata_file)

        else :
            print "Loading metadata"
            self.metadata = DataFrame.from_csv(self.metadata_file).to_dict()

        print "Loading genomes"
        for k,v in tqdm(self.metadata.items()):
            genome_path = pjoin(self.data_path, v['assembly_level'].replace(" ","_"), k)
            genome_file = pjoin(genome_path, k + ".fna")
            self.genomes += [Genome(k, genome_path, ref=genome_file, manual_metadata = v, taxDb = self.taxDb, workbench = self.workbench)]
def testGBCLoss(db, count):
    train = db[:, 0 : count * 5]
    target = db[:, count * 5]
    testDataLoss = ["deviance", "exponential"]
    kfold = 5
    itog_val = {}
    for i in testDataLoss:
        scores = cross_validation.cross_val_score(
            GradientBoostingClassifier(
                loss=i,
                n_estimators=8,
                learning_rate=1,
                max_depth=3,
                min_samples_split=4,
                min_samples_leaf=2,
                min_weight_fraction_leaf=0,
                subsample=1,
                max_features="auto",
                random_state=3200,
            ),
            train,
            target,
            cv=kfold,
        )
        itog_val[i] = scores.mean()
    DataFrame.from_dict(data=itog_val, orient="index").plot(kind="barh", legend=False)
    plt.show()
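
A minimal sketch of the same loss comparison against the current sklearn.model_selection API (the legacy cross_validation module used above was removed); the synthetic data, shapes and hyperparameters below are assumptions, not the original setup:

import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

rng = np.random.RandomState(0)
X = rng.rand(200, 10)                      # hypothetical feature matrix
y = rng.randint(0, 2, size=200)            # hypothetical binary target

itog_val = {}
for loss in ("log_loss", "exponential"):   # "deviance" is spelled "log_loss" in recent scikit-learn releases
    clf = GradientBoostingClassifier(loss=loss, n_estimators=8, random_state=3200)
    itog_val[loss] = cross_val_score(clf, X, y, cv=5).mean()

DataFrame.from_dict(itog_val, orient="index").plot(kind="barh", legend=False)
plt.show()
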
Example No. 3
def mash_matrix(gs, file, clean = False, proc=4):
    if os.path.exists(file) and not clean:
        pre_mat = DataFrame.from_csv(file)
        done = [g for g in gs if g.name in pre_mat.index]
        to_do = [g for g in gs if not g.name in pre_mat.index]
        if len(to_do) == 0:
            out_mat = pre_mat
        else:
            mat_small = DataFrame.from_dict({g : g.mash_compare_many(done, proc) for g in tqdm(to_do)})
            mat_small.index = Index([m.name for m in mat_small.index])
            mat_small.columns = Index([m.name for m in mat_small.columns])
            mat_small = mat_small.transpose()

            mat_big = DataFrame.from_dict({g : g.mash_compare_many(to_do + done, proc) for g in tqdm(to_do)})
            mat_big.index = Index([m.name for m in mat_big.index])
            mat_big.columns = Index([m.name for m in mat_big.columns])

            out_mat = concat([mat_big,concat([mat_small, pre_mat[mat_small.columns]], axis  = 0 ).loc[mat_big.index]], axis=1)
            out_mat = out_mat[out_mat.index]
            out_mat.to_csv(file)
    else :
        out_mat = DataFrame.from_dict({g : g.mash_compare_many(gs, proc) for g in tqdm(gs)})
        out_mat.index = Index([m.name for m in out_mat.index])
        out_mat.columns = Index([m.name for m in out_mat.columns])

        out_mat.to_csv(file)

    return out_mat.apply(lambda x : [ast.literal_eval(xx) if isinstance(xx,basestring) else xx for xx in x])
Example No. 4
def get_document_mapping():
    data = json.load(open('/Users/pcravich/repo/personal-agents/search/nlctaglist.json'))
    labels = list(map(lambda x: x['labels'], data))
    df = DataFrame.from_dict(labels[0], orient='index').transpose()
    for i in range(1, len(labels)):
        df = df.append(DataFrame.from_dict(labels[i], orient='index').transpose(), ignore_index=True)
    df['url'] = list(map(lambda x: x['url'], data))
    return df
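
DataFrame.append used above has since been removed from pandas; a hedged sketch of the same record-stacking idea with pandas.concat (the labels records here are made up):

from pandas import DataFrame, concat

labels = [{"tag": "news", "score": 1}, {"tag": "sports", "score": 2}]   # hypothetical label dicts
frames = [DataFrame.from_dict(lbl, orient="index").transpose() for lbl in labels]
df = concat(frames, ignore_index=True)
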
def testKNNNeingh(db, count):
    train = db[:, 0 : count * 5]
    target = db[:, count * 5]
    testData = [i for i in range(1, 21, 2)]
    kfold = 5
    itog_val = {}
    for i in testData:
        scores = cross_validation.cross_val_score(KNeighborsClassifier(n_neighbors=i), train, target, cv=kfold)
        itog_val[i.__str__()] = scores.mean()
    DataFrame.from_dict(data=itog_val, orient="index").plot(kind="barh", legend=False)
    plt.show()
def testKNNMetric(db, count):
    train = db[:, 0 : count * 5]
    target = db[:, count * 5]
    testData = ["euclidean", "manhattan", "chebyshev", "minkowski"]
    kfold = 5
    itog_val = {}
    for i in testData:
        scores = cross_validation.cross_val_score(
            KNeighborsClassifier(metric=i, n_neighbors=3), train, target, cv=kfold
        )
        itog_val[i] = scores.mean()
    DataFrame.from_dict(data=itog_val, orient="index").plot(kind="barh", legend=False)
    plt.show()
Example No. 7
    def test_to_dict_index_dtypes(self, into, expected):
        # GH 18580
        # When using to_dict(orient='index') on a dataframe with int
        # and float columns only the int columns were cast to float

        df = DataFrame({'int_col': [1, 2, 3],
                        'float_col': [1.0, 2.0, 3.0]})

        result = df.to_dict(orient='index', into=into)
        cols = ['int_col', 'float_col']
        result = DataFrame.from_dict(result, orient='index')[cols]
        expected = DataFrame.from_dict(expected, orient='index')[cols]
        tm.assert_frame_equal(result, expected)
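
The behaviour the test guards, as a quick stand-alone round trip (a hedged sketch, not part of the original test):

from pandas import DataFrame

df = DataFrame({"int_col": [1, 2, 3], "float_col": [1.0, 2.0, 3.0]})
roundtrip = DataFrame.from_dict(df.to_dict(orient="index"), orient="index")
print(roundtrip.dtypes)   # int_col is expected to stay int64, float_col float64
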
def get_boxplot(root):
    """
    get boxplot data
    :param root: Root Server (in alphabet)
    :return:
    """
    container4 = {}
    container6 = {}

    for file in sorted(os.listdir('datasets/{}/'.format(root))):
        timestamp = int(file.split('-')[0])
        filename = 'datasets/{0}/{1}'.format(root, file)
        opened_file = DataFrame.from_csv(filename, sep='\t')
        if not opened_file.empty:
            res4 = opened_file['len4']
            container4[timestamp] = res4
            res6 = opened_file['len6']
            container6[timestamp] = res6
        else:
            container4[timestamp] = pd.Series()
            container6[timestamp] = pd.Series()

    df4 = DataFrame.from_dict(container4)
    df6 = DataFrame.from_dict(container6)

    dict4 = defaultdict()
    dict6 = defaultdict()

    #######
    # IPv4
    #######
    for ts in df4:
        dict4[ts] = {
            'name': datetime.fromtimestamp(ts).strftime('%Y-%m-%d'),
            'type': 'box',
            'y': [int(i) for i in df4[ts].dropna()]
        }
    result4 = [dict4[i] for i in dict4]

    #######
    # IPv6
    #######
    for ts in df6:
        dict6[ts] = {
            'name': datetime.fromtimestamp(ts).strftime('%Y-%m-%d'),
            'type': 'box',
            'y': [int(i) for i in df6[ts].dropna()]
        }
    result6 = [dict6[i] for i in dict6]

    return jsonify({'ipv4': result4, 'ipv6': result6})
Example No. 9
def sf_data(query):
    """
    Get opportunity data using supplied query.
    Get account data.

    Return both as dataframes.

    """

    USER = SALESFORCE['USERNAME']
    PASS = SALESFORCE['PASSWORD']
    TOKEN = SALESFORCE['TOKEN']
    HOST = SALESFORCE['HOST']

    sf = Salesforce(username=USER, password=PASS, security_token=TOKEN)

    bulk = SalesforceBulk(sessionId=sf.session_id, host=HOST)

    print "Creating Opportunity job..."
    job = bulk.create_query_job("Opportunity", contentType='CSV')
    print "Issuing query..."

    batch = bulk.query(job, query)
    while not bulk.is_batch_done(job, batch):
        print "waiting for query to complete..."
        sleep(3)
    bulk.close_job(job)

    rows = bulk.get_batch_result_iter(job, batch, parse_csv=True)
    all = list(rows)

    opps = DataFrame.from_dict(all)

    job = bulk.create_query_job("Account", contentType='CSV')
    print "Creating Account job..."

    batch = bulk.query(job,
            "SELECT Id, Website, Text_For_Donor_Wall__c FROM Account")
    print "Issuing query..."
    while not bulk.is_batch_done(job, batch):
        print "waiting for query to complete..."
        sleep(3)
    bulk.close_job(job)

    rows = bulk.get_batch_result_iter(job, batch, parse_csv=True)

    accts = DataFrame.from_dict(list(rows))
    accts.rename(columns={'Id': 'AccountId'}, inplace=True)

    return opps, accts
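
A natural follow-up to sf_data is joining the two frames on AccountId; a hedged, self-contained sketch with invented rows:

from pandas import DataFrame

opps = DataFrame([{"Id": "006A", "AccountId": "001X", "Amount": 100}])    # hypothetical opportunities
accts = DataFrame([{"AccountId": "001X", "Website": "example.org"}])      # hypothetical accounts
merged = opps.merge(accts, on="AccountId", how="left")
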
Example No. 10
def getlinks(region, vendeur):

    # initialise variables
    pageSuivante = True
    if vendeur=="Particulier":
        url = "http://www.leboncoin.fr/voitures/offres/"+region+"/?o=1&q=renault%20captur&it=1&f=p"
    if vendeur=="Pros":
        url = "http://www.leboncoin.fr/voitures/offres/"+region+"/?o=1&q=renault%20captur&it=1&f=c"
    tableau = pd.DataFrame()
    liens = {}
    vente = {}
    reg = {}
    i=0
    
    # collect the links and return the result in a DataFrame
    while pageSuivante:
             
        soup = getSoupFromUrl(url)	
        
        # use a regex on the link to be extracted
        if region == "ile_de_france":       
            balises = soup.find_all(href=re.compile(r'(http:\/\/)(www\.leboncoin\.fr\/)(voitures\/)([\d]{9})\.(htm\?ca\=12\_s)$'))
        if region == "aquitaine":
            balises = soup.find_all(href=re.compile(r'(http:\/\/)(www\.leboncoin\.fr\/)(voitures\/)([\d]{9})\.(htm\?ca\=2\_s)$')) 
        if region == "provence_alpes_cote_d_azur":
            balises = soup.find_all(href=re.compile(r'(http:\/\/)(www\.leboncoin\.fr\/)(voitures\/)([\d]{9})\.(htm\?ca\=21\_s)$'))
        
        for lien in balises:
            liens[i]=lien.get('href')
            vente[i]=vendeur
            reg[i]=region
            i=i+1
        
        # check whether there is a next page
        nav = soup.find_all("a", text = "Page suivante")
        
        if nav:
            url = nav[0].get('href')
        else:
            pageSuivante = False
            tableau = DataFrame.from_dict(liens,'index')
            tableau.columns = ['Lien']
            Vendeurs = DataFrame.from_dict(vente,'index')
            Vendeurs.columns = ['Vendeur']
            Regions = DataFrame.from_dict(reg,'index')
            Regions.columns = ['Region']
            tableau = pd.merge(tableau, Vendeurs, left_index=True, right_index=True)
            tableau = pd.merge(tableau, Regions, left_index=True, right_index=True)
    
    return tableau
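
Instead of building three one-column frames and merging them, the same table could be assembled from a single dict of row dicts; a hedged sketch with invented rows:

from pandas import DataFrame

rows = {
    0: {"Lien": "http://www.leboncoin.fr/voitures/123456789.htm", "Vendeur": "Particulier", "Region": "ile_de_france"},
    1: {"Lien": "http://www.leboncoin.fr/voitures/234567890.htm", "Vendeur": "Pros", "Region": "aquitaine"},
}
tableau = DataFrame.from_dict(rows, orient="index")
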
def get_as_path_avg_length(root):
    # read this: http://matplotlib.org/examples/pylab_examples/subplots_demo.html
    directory = '{}{}/'.format(csv_dir, root)
    result4 = {}
    result6 = {}
    for file in sorted(os.listdir(directory)):
        timestamp = int(file.split('-')[0])
        filename = '{}{}'.format(directory, file)
        opened_file = DataFrame.from_csv(filename, sep='\t')
        if not opened_file.empty:
            res4 = opened_file['len4']
            res6 = opened_file['len6']
            result4[timestamp] = res4
            result6[timestamp] = res6
        else:
            result4[timestamp] = pd.Series()
            result6[timestamp] = pd.Series()

    plot_result4 = DataFrame.from_dict(result4)
    plot_result6 = DataFrame.from_dict(result6)

    ################
    # Plot
    ################
    fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True)

    plot4 = plot_result4.plot.box(figsize=(14, 5), ax=axes[0], ylim=(1.5, 9.5))
    plot6 = plot_result6.plot.box(figsize=(14, 5), ax=axes[1], ylim=(1.5, 9.5))

    n = 6

    # ticks = plot4.xaxis.get_ticklocs()
    # ticklabels = [datetime.fromtimestamp(int(l.get_text())).strftime('%d/%m/%y') for l in plot4.xaxis.get_ticklabels()]
    # plot4.xaxis.set_ticks(ticks[::n])
    # plot4.xaxis.set_ticklabels(ticklabels[::n], rotation=25)
    axes[0].text(3, 8, 'IPv4', fontsize=20, bbox={'facecolor': 'white', 'pad': 5})
    axes[1].text(3, 8, 'IPv6', fontsize=20, bbox={'facecolor': 'white', 'pad': 5})
    axes[0].grid(True)
    axes[1].grid(True)

    ticks = axes[1].xaxis.get_ticklocs()
    ticklabels = [datetime.fromtimestamp(int(l.get_text())).strftime('%d/%m/%y') for l in axes[1].xaxis.get_ticklabels()]
    axes[1].xaxis.set_ticks(ticks[::n])
    axes[1].xaxis.set_ticklabels(ticklabels[::n], rotation=25)

    plt.tight_layout()
    plt.savefig('figs/eps/path_avg_dist_{}.eps'.format(root), format='eps', dpi=1000)
    plt.savefig('figs/png/path_avg_dist_{}.png'.format(root))

    print('finish: path average {}-Root Server'.format(root))
Example No. 12
    def _as_dataframe(self, gene_obj, df_index=False):
        """
        converts gene object to DataFrame (pandas)
        """
        if not df_avail:
            print("Error: pandas module must be installed for as_dataframe option.")
            return

        if 'hits' in gene_obj:
            df = DataFrame.from_dict(gene_obj['hits'])
        else:
            df = DataFrame.from_dict(gene_obj)
        if df_index:
            df = df.set_index('_id')
        return df
Example No. 13
def gen_data(size, seed):
    data = {
        'a': generate_uniform_float_column(size, 0., 1., seed + 1),
        'b': generate_uniform_float_column(size, 0., 1., seed + 2),
        'c': generate_uniform_float_column(size, 0., 1., seed + 3)
    }
    return DataFrame.from_dict(data)
Example No. 14
def collector2table(collector):
    """
    collector2table return a station table as a DataFrame.
    columns are station, sensor, lon, lat, and the index is the station
    number.

    This is a substitute for `sos_request`.

    """
    # This accepts only 1-day request, but since we only want the
    # stations available we try again with end=start.
    c = copy.copy(collector)
    try:
        response = c.raw(responseFormat="text/csv")
    except ExceptionReport:
        response = c.filter(end=c.start_time).raw(responseFormat="text/csv")
    df = read_csv(BytesIO(response.encode('utf-8')),
                  parse_dates=True)
    columns = {'sensor_id': 'sensor',
               'station_id': 'station',
               'latitude (degree)': 'lat',
               'longitude (degree)': 'lon'}
    df.rename(columns=columns, inplace=True)
    df['sensor'] = [s.split(':')[-1] for s in df['sensor']]
    df['station'] = [s.split(':')[-1] for s in df['station']]

    df = df[['station', 'sensor', 'lon', 'lat']]
    g = df.groupby('station')
    df = dict()
    for station in g.groups.keys():
        df.update({station: g.get_group(station).iloc[0]})
    return DataFrame.from_dict(df).T
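
The groupby loop at the end keeps only the first row per station; a hedged stand-alone sketch of the same effect with drop_duplicates (station ids and coordinates are made up):

from pandas import DataFrame

df = DataFrame({"station": ["8418150", "8418150", "8443970"],
                "sensor": ["B1", "C1", "A1"],
                "lon": [-70.2, -70.2, -71.0],
                "lat": [43.6, 43.6, 42.3]})
first_rows = df.drop_duplicates(subset="station").set_index("station")
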
Example No. 15
def apply_skill(dfs, function, remove_mean=True, filter_tides=False):
    skills = dict()
    for station, df in dfs.iteritems():
        if filter_tides:
            df = df.apply(low_pass)
        skill = dict()
        obs = df.pop('OBS_DATA')
        if obs.isnull().all():
            # No observations.
            skills.update({station: np.NaN})
            continue
        for model, y in df.iteritems():
            # No models.
            if y.isnull().all():
                skills.update({station: np.NaN})
                continue
            mask = both_valid(obs, y)
            x, y = obs[mask], y[mask]
            if remove_mean:
                x, y = x-x.mean(), y-y.mean()
            if x.size:
                ret = function(x, y)
            else:
                ret = np.NaN
            skill.update({model: ret})
        skills.update({station: skill})
    return DataFrame.from_dict(skills)
Example No. 16
def test_csv_read_files():
    df = DataFrame.from_dict({0:['01',2], 1:['x', 12]}, orient='index')
    df.columns = ['a','b']
    
    df2 = csv_read_files(tempfilename, index_col=0)
    assert_frame_equal(df, df2)
    
Example No. 17
def compare_assemblies(assemblies, chunk_size = 2000, identity_threshold = 0.40):
    """
    compares a set of assemblies:
    assemblies is a dictionary with names of the assemblies as keys and fasta-files of the assemblies as values
    """
    similarities = {}


    print "make blast dbs"
    for subject_name, subject in tqdm(assemblies.iteritems()):
        blast_db_cmd = ["makeblastdb" ,"-in", subject, "-dbtype", "nucl", "-out", subject]
        with open("/dev/null") as null:
            blastdb_return = call(blast_db_cmd, stdout=null)

    print "Run the hell out of it"
    for scaff_name, scaff in tqdm(assemblies.iteritems()):
        similarities[scaff_name] = {}
        chopped_up_query = "tmp.fasta"
        nb_chunks = len(cut_up_fasta(scaff, chopped_up_query, chunk_size))
        for subject_name, subject in assemblies.iteritems():
            nics = find_NICs(chopped_up_query, subject, identity_threshold, blast_db = False)
#            print scaff_name, "vs", subject_name
            similarities[scaff_name][subject_name] = len(nics.keys())/nb_chunks
    os.remove(chopped_up_query)

    print "clean up"
    for subject_name, subject in tqdm(assemblies.iteritems()):
        blast_db_files = [subject + ".nhr", subject + ".nin",  subject + ".nsq"]
        for f in blast_db_files:
            os.remove(f)


    similars =  DataFrame.from_dict(similarities)
    return similars
Example No. 18
    def dataFrame(self):
        from pandas import DataFrame

        items = self.execute()
        if len(items) == 0:
            return DataFrame()
        return DataFrame.from_dict(items)
Example No. 19
def Get_Test_Data_YQL():
    # Will have to change conver_objects to specific numeric calls in the future
    result = load(urlopen("https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.historicaldata%20where%20symbol%20%3D%20%22YHOO%22%20and%20startDate%20%3D%20%222010-01-11%22%20and%20endDate%20%3D%20%222010-05-10%22&format=json&diagnostics=true&env=store%3A%2F%2Fdatatables.org%2Falltableswithkeys&callback="))
    x = DataFrame.from_dict(result['query']['results']['quote'])
    x["Date"] = to_datetime(x["Date"])
    x = x.convert_objects(convert_numeric=True)
    return x
def formatChecker(filename):
  reader = csv.DictReader(open(filename))
  result = {}
  key = -1
  for row in reader:
    key += 1
    # key = row.pop('url')
    if key in result:
      pass
    if row['longitude'] == '' or row['latitude'] == '':
      continue
    row['longitude'] = float(row['longitude'])
    row['latitude'] = float(row['latitude'])
    result[key] = row

  for k, v, in result.iteritems():
    if 'location' not in v:
      raise NameError('Missing ["location"] header')
    if 'day' not in v or 'month' not in v or 'year' not in v:
      raise NameError('Missing ["day"], ["month"], or ["year"] header')
    tempDate = validDateToJulianDate(v['month'] +'-'+ v['day'] +'-'+ v['year'])
    result[k].update({'concatDate':tempDate[0]})
    result[k].update({'julianDay':tempDate[1]})
    result[k].update({'julianDate':tempDate[2]})
  df = DataFrame.from_dict(result, orient='index', dtype=None)
  if 'latitude' not in df.columns and 'longitude' not in df.columns:
    raise NameError('Missing ["latitude"], or ["longitude"] or header')
  # df = df.convert_objects(convert_numeric=True).dtypes
  # df[['latitude', 'longitude']] = df[['latitude', 'longitude']].astype(float)

  # except:
  #   sys.exit('Date field contains non-digits.')
  return df
Example No. 21
def _project(dataframe, project_q):
    if not project_q:
        return dataframe

    assert_list("project", project_q)

    if project_q == [["count"]]:
        # Special case for count only, ~equal to SQL count(*)
        return DataFrame.from_dict({"count": [len(dataframe)]})

    aggregate_fns, alias_expressions = classify_expressions(project_q)

    if aggregate_fns and alias_expressions:
        raise_malformed("Cannot mix aliasing and aggregation functions", project_q)

    if isinstance(dataframe, DataFrameGroupBy):
        dataframe = _aggregate(dataframe, project_q, aggregate_fns)
    elif aggregate_fns:
        return _aggregate_without_group_by(dataframe, project_q, aggregate_fns)
    elif alias_expressions:
        dataframe = _alias(dataframe, alias_expressions)
    else:
        # Nothing to do here
        pass

    columns = [e if type(e) is not list else e[1] for e in project_q]

    try:
        return dataframe[columns]
    except KeyError:
        missing_columns = set(columns) - set(dataframe.columns.values)
        raise_malformed("Selected columns not in table", list(missing_columns))
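
The count-only branch in isolation, as a hedged sketch:

from pandas import DataFrame

dataframe = DataFrame({"a": [1, 2, 3]})
counts = DataFrame.from_dict({"count": [len(dataframe)]})   # one row, one 'count' column
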
Example No. 22
    def make_cluster_bmft(self):
        cluster_table = DataFrame.from_dict({i : {k : len(v) for k,v in c.to_dict()['genes'].iteritems()} for i,c in enumerate(self)}, orient='index')
        cluster_table = cluster_table.apply(nan_to_num)
        cluster_table['annotations'] = [c.annotation for c in self]
        cluster_table['qual_annot'] = [c.annot_fraction for c in self]
        cluster_table['genes'] = [";".join(c.genes) for c in self]
        return cluster_table
Example No. 23
def cross_validation_test():
    data = get_analyze_data()
    target = data["hand"]
    train = data.drop(["id"], axis = 1)
    kfold = 5
    cross_val_test = {}

    print "Cross validation test..."
    model_rfc = RandomForestClassifier(n_estimators = 100)
    model_knc = KNeighborsClassifier(n_neighbors = 15)
    model_lr = LogisticRegression(penalty='l1', tol=0.01)

    scores = cross_validation.cross_val_score(model_rfc, train, target, cv = kfold)
    cross_val_test['RFC'] = scores.mean()

    scores = cross_validation.cross_val_score(model_knc, train, target, cv = kfold)
    cross_val_test['KNC'] = scores.mean()

    scores = cross_validation.cross_val_score(model_lr, train, target, cv = kfold)
    cross_val_test['LR'] = scores.mean()

    f = plt.figure(figsize = (8, 6))
    p = DataFrame.from_dict(data = cross_val_test, orient='index').plot(kind='barh', legend=False, ax = f.gca())
    f.savefig('./%s/cross_validation_test.png' % dirs[1])

    for k,v in cross_val_test.iteritems():
        print "%s : %s" % (k,str(v))
def cross_validation_test():
    data = get_train_data()
    target = data.Cover_Type
    train = data.drop(['Cover_Type'], axis = 1)
    kfold = 10
    cross_val_final = {}

    print 'Cross validation test...'
    model_rfc = RandomForestClassifier(n_estimators = 1024, criterion='entropy', n_jobs = -1)
    model_knc = KNeighborsClassifier(n_neighbors = 128)
    model_lr = LogisticRegression(penalty='l1', C=1e5)

    scores = cross_validation.cross_val_score(model_rfc, train, target, cv = kfold)
    cross_val_final['RFC'] = scores.mean()
    print 'RFC: ', scores.mean()

    scores = cross_validation.cross_val_score(model_knc, train, target, cv = kfold)
    cross_val_final['KNC'] = scores.mean()
    print 'KNC: ', scores.mean()


    scores = cross_validation.cross_val_score(model_lr, train, target, cv = kfold)
    cross_val_final['LR'] = scores.mean()
    print 'LR: ', scores.mean()

    f = plt.figure(figsize = (8, 6))
    p = DataFrame.from_dict(data = cross_val_final, orient='index').plot(kind='barh', legend=False, ax = f.gca())
    f.savefig('./test_plot/cross_validation_rfc_1024.png')
Example No. 25
def read_umi_tools(filename: PathLike) -> AnnData:
    """Read a gzipped condensed count matrix from umi_tools.

    Parameters
    ----------
    filename
        File name to read from.
    """
    # import pandas for conversion of a dict of dicts into a matrix
    # import gzip to read a gzipped file :-)
    import gzip
    from pandas import DataFrame

    dod = {}  # this will contain basically everything
    fh = gzip.open(fspath(filename))
    header = fh.readline()  # read the first line

    for line in fh:
        t = line.decode('ascii').split('\t')  # gzip read bytes, hence the decoding
        try:
            dod[t[1]].update({t[0]:int(t[2])})
        except KeyError:
            dod[t[1]] = {t[0]:int(t[2])}

    df = DataFrame.from_dict(dod, orient='index')  # build the matrix
    df.fillna(value=0., inplace=True)  # many NaN, replace with zeros
    return AnnData(np.array(df), {'obs_names': df.index}, {'var_names': df.columns})
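
The core dict-of-dicts to dense-matrix step from read_umi_tools, as a hedged stand-alone sketch (cell and gene names are made up):

from pandas import DataFrame

dod = {"cell_1": {"geneA": 3, "geneB": 1},
       "cell_2": {"geneA": 2}}                      # hypothetical sparse counts
df = DataFrame.from_dict(dod, orient="index").fillna(0.0)
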
Example No. 26
    def _collect_requests(query, request_limit):
        """Collects the string-casted results of a query.

        Args:
            query: (aflow.control.Query) A query with unprocessed requests.
            request_limit: (int) Maximum number of requests to submit.

        Returns: (DataFrame) Results collected from the query.
        """

        # requests the first page of results to determine number of pages
        query._request(1, query.k)
        page_limit = (query._N // query.k) + 1
        if request_limit and (page_limit > request_limit):
            page_limit = request_limit

        # requests the remaining pages
        for page in range(2, page_limit + 1):
            query._request(page, query.k)

        # collects request responses
        records = {}
        for page in range(1, page_limit + 1):
            records.update(query.responses[page])
        return DataFrame.from_dict(data=records, orient='index')
def creat_table_base(records):
    # saved caculated variable names and descriptions in json format
    # currently only includes 16 most used variables
    calculated_vars = {"_iitax": "Federal income tax liability",
                       "_fica": "FICA taxes  (ee+er) for OASDI+HI",
                       "c00100": "Federal AGI",
                       "c02500": "OASDI benefits in AGI",
                       "c04600": "Post-phase-out personal exemption",
                       "_prexmp": "Pre-phase-out personal exemption",
                       "c21040": "Itemized deduction that is phased out",
                       "c04470": "Post-phase-out itemized deduction",
                       "c04800": "Federal regular taxable income",
                       "c05200": "Regular tax on taxable income",
                       "c07220": "Child tax credit (adjusted)",
                       "c11070": "Extra child tax credit (refunded)",
                       "c07180": "Child care credit",
                       "_eitc": "Federal EITC",
                       "c62100_everyone": "federal AMT taxable income",
                       "c09600": "federal AMT liability"}

    cal = DataFrame.from_dict(calculated_vars, orient='index')
    cal.columns = ['description']

    puf_ecodes_info = pd.read_csv(EVAR_PATH)

    # Use all variable list minus unused variable list
    # to get used variable list
    VALID_READ_VARS = records.VALID_READ_VARS

    CODES_IMP = set(['AGIR1', 'DSI', 'EFI', 'EIC', 'ELECT', 'FDED',
                     'FLPDYR', 'FLPDMO', 'f2441', 'f3800', 'f6251',
                     'f8582', 'f8606', 'f8829', 'f8910', 'f8936', 'n20',
                     'n24', 'n25', 'n30', 'PREP', 'SCHB', 'SCHCF', 'SCHE',
                     'TFORM', 'IE', 'TXST', 'XFPT', 'XFST', 'XOCAH',
                     'XOCAWH', 'XOODEP', 'XOPAR', 'XTOT', 'MARS', 'MIDR',
                     'RECID', 'gender', 'wage_head', 'wage_spouse',
                     'earnsplit', 'age', 'agedp1', 'agedp2', 'agedp3',
                     'AGERANGE', 's006', 's008', 's009', 'WSAMP', 'TXRT',
                     'filer', 'matched_weight', 'e00200p', 'e00200s',
                     'e00900p', 'e00900s', 'e02100p', 'e02100s'])

    UNUSED_READ_VARS = records.UNUSED_READ_VARS

    USED_VARS = list(VALID_READ_VARS - CODES_IMP - UNUSED_READ_VARS)

    # read variable description from e_variable_info.csv
    table = {}
    for i in range(0, len(USED_VARS) - 1):
        # use variable names as keys of dictionary
        var_name = USED_VARS[i]
        f = (puf_ecodes_info.Input_Name == var_name)
        description = puf_ecodes_info.Definition_2014[f].values[0]
        table[var_name] = description

    table = pd.DataFrame.from_dict(table, orient='index')
    table.columns = ["description"]

    table = table.append(cal)
    return table
Example No. 28
def pct_students_first_choice(to_compare):
	results = []
	for i in to_compare:
		results.append(float(sum([1 if len(s.assigned) > 0 and (s.preference[0] in s.assigned) else 0 for s in si_students[i]])) / float(nstudents))
	results = map(lambda x: x * 100, results)

	df = DataFrame.from_dict({'mechanism': sublist(mechanisms, to_compare), 'pct_students': results})
	return (results, bar_graph(df, "% Students Matched With Top Choice\n"))
Example No. 29
    def test_get_dummies_dont_sparsify_all_columns(self, sparse):
        # GH18914
        df = DataFrame.from_dict(OrderedDict([('GDP', [1, 2]),
                                              ('Nation', ['AB', 'CD'])]))
        df = get_dummies(df, columns=['Nation'], sparse=sparse)
        df2 = df.reindex(columns=['GDP'])

        tm.assert_frame_equal(df[['GDP']], df2)
Example No. 30
    def calculate(self):
        self.data = None
        cursor = connection.cursor()
        cursor.execute(self.get_query(),
                       dict(year=self.parameters.registry_year,
                            period=self.parameters.registry_period))
        self.data = DataFrame.from_dict(self.prepare_data(dictfetchall(cursor)), orient='columns')
        cursor.close()
Example No. 31
def df(country, f=None, **kwargs):
    date = kwargs['date'] if 'date' in kwargs else None
    response = search(country, f=f, date=date)

    if date is None:
        obj = {}
        obj[30] = {}
        for ur in response.aggs.ur30.buckets:
            obj[30][round(ur.key, 2)] = ur.doc_count
        obj[60] = {}
        for ur in response.aggs.ur60.buckets:
            obj[60][round(ur.key, 2)] = ur.doc_count
        obj[90] = {}
        for ur in response.aggs.ur90.buckets:
            obj[90][round(ur.key, 2)] = ur.doc_count
        obj[180] = {}
        for ur in response.aggs.ur180.buckets:
            obj[180][round(ur.key, 2)] = ur.doc_count
        obj[360] = {}
        for ur in response.aggs.ur360.buckets:
            obj[360][round(ur.key, 2)] = ur.doc_count

    else:
        obj = {}
        obj[30] = {}
        for ur in response.aggs.ur30.buckets:
            obj[30][round(ur.key, 2)] = ur.doc_count

    df = DataFrame.from_dict(obj).T

    if df.empty:
        return df

    df.index.name = 'period'
    df = df.fillna(0).astype('int64')
    df['total'] = df.sum(axis=1)
    df = df.reset_index()

    return df
Example No. 32
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:
        cases_and_deaths = {
            "date": "date",
            "areaCode": "areaCode",
            "newCasesByPublishDate": "newCasesByPublishDate",
            "cumCasesByPublishDate": "cumCasesByPublishDate",
            "newDeaths28DaysByPublishDate": "newDeaths28DaysByPublishDate",
            "cumDeaths28DaysByPublishDate": "cumDeaths28DaysByPublishDate",
            "cumPillarOneTestsByPublishDate": "cumPillarOneTestsByPublishDate",
        }
        api = Cov19API(filters=["areaType=overview"],
                       structure=cases_and_deaths)
        data_json = api.get_json()
        data = DataFrame.from_dict(data_json["data"])

        data = table_rename(
            data,
            {
                "areaCode": "country_code",
                "newCasesByPublishDate": "new_confirmed",
                "cumCasesByPublishDate": "total_confirmed",
                "newDeaths28DaysByPublishDate": "new_deceased",
                "cumDeaths28DaysByDeathDate": "total_deceased",
                "cumPillarOneTestsByPublishDate": "total_tested",
                "date": "date",
            },
            drop=True,
        )

        data.date = data.date.apply(
            lambda x: datetime_isoformat(x, "%Y-%m-%d"))

        # Make sure all records have country code and no subregion code
        data["key"] = "GB"
        data["country_code"] = "GB"
        data["subregion2_code"] = None

        return data
Example No. 33
def df(country, f=None, **kwargs):
    today = dt.datetime.today().strftime('%Y-%m-%d')
    date = kwargs['date'] if 'date' in kwargs else today
    response = search(country, f=f, date=date)

    obj = {}
    for model in response.aggs.model.buckets:
        obj[model.key] = {
            'taza 30': round(model.ur30.value, 4),
            'total': model.doc_count
        }

    df = DataFrame.from_dict(obj).T

    if df.empty:
        return df

    df.index.name = 'modelo'
    df = df[['taza 30', 'total']]
    df = df.reset_index()

    return df
Example No. 34
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:
        # Regions have case data on a "by specimen date" basis.
        # This means they don't add up to the counts for nation which are on a
        # "by publish date" basis.
        cases_and_deaths = {
            "date": "date",
            "areaCode": "areaCode",
            "newCasesBySpecimenDate": "newCasesBySpecimenDate",
            "cumCasesBySpecimenDate": "cumCasesBySpecimenDate",
            "newDeaths28DaysByDeathDate": "newDeaths28DaysByDeathDate",
            "cumDeaths28DaysByDeathDate": "cumDeaths28DaysByDeathDate",
        }

        api = Cov19API(filters=["areaType=region"], structure=cases_and_deaths)
        regions_json = api.get_json()
        data = DataFrame.from_dict(regions_json["data"])
        data = table_rename(
            data,
            {
                "areaCode": "match_string",
                "newCasesBySpecimenDate": "new_confirmed",
                "cumCasesBySpecimenDate": "total_confirmed",
                "newDeaths28DaysByDeathDate": "new_deceased",
                "cumDeaths28DaysByDeathDate": "total_deceased",
                "date": "date",
            },
            drop=True,
        )

        data.date = data.date.apply(
            lambda x: datetime_isoformat(x, "%Y-%m-%d"))
        _fix_bad_total_deceased(data)

        # Make sure all records have country code and no subregion code
        data["country_code"] = "GB"
        data["subregion2_code"] = None

        return data
Example No. 35
    def _post_process(self) -> DataFrame:
        columns = self.columns
        results = defaultdict(list)
        for row in self.results:
            for column in row:
                column_value = column.value
                if column_value.success is True:
                    column_data = column_value.data
                    if isinstance(column_data, Number):
                        format_: ColumnFormat = columns[
                            column.column_index].format_
                        results[column.name].append(
                            round(column_data, format_.precision))
                    else:
                        results[column.name].append(column_data)
                else:
                    results[column.name].append(np.NaN)

        df = DataFrame.from_dict(results)
        df = self.__handle_filters(df)
        df = self.__handle_sorts(df)
        return df
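
The collect-then-build pattern used by _post_process, reduced to a hedged stand-alone sketch (column names and values are invented):

from collections import defaultdict
import numpy as np
from pandas import DataFrame

results = defaultdict(list)
for name, value in [("price", 10.0), ("price", 12.5), ("qty", 3), ("qty", np.nan)]:
    results[name].append(value)

df = DataFrame.from_dict(results)   # every list must end up the same length
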
Example No. 36
def calcsupp(df, filePath):
    '''
    Compute each supplier's vlt coefficient of variation (vltcv), fill rate (actual_origin_rate) and relative fill rate
    :param df:
    :param filePath:
    :return:
    '''
    # df = data; filePath = analysis_path
    grouped = df.groupby('supp_name')
    summary = {}
    for supp_name, group in grouped:
        # item_sku_id = grouped.groups.keys()[1]
        # group = grouped.get_group(item_sku_id)
        sample = group[group.pur_bill_id.isnull() == False]
        for i in sample.index:
            # i = sample.index[0]
            vlt = sample.vlt[i]
            # supp_brevity_cd: supplier short code
            supp_brevity_cd = sample.supp_brevity_cd[i]
            if np.isnan(vlt):
                # fill missing vlt with the mean vlt
                vlt = np.nanmean(df.vlt)  # rows with a purchase order but missing vlt are rare
            vlt = int(vlt)
            if vlt > 28:
                # cap vlt values above 28 days at 28 days
                vlt = 28
            actual_plan_rate = sample.actual_pur_qtty[i] / float(sample.plan_pur_qtty[i])  # when this comes out empty the order is probably invalid
            # actual_origin_rate = actual / original
            actual_origin_rate = sample.actual_pur_qtty[i] / float(sample.originalnum[i])
            summary[i] = {'supp_name': supp_name, 'day_string': sample.day_string[i],
                          'band': sample.org_nation_sale_num_band[i], 'item_sku_id': sample.item_sku_id[i],
                          'vlt': vlt, 'supp_brevity_cd': supp_brevity_cd,
                          'actual_pur_qtty': sample.actual_pur_qtty[i],'plan_pur_qtty': sample.plan_pur_qtty[i],
                          'originalnum': sample.originalnum[i],
                          'pur_bill_id': sample.pur_bill_id[i],
                          'actual_plan_rate': actual_plan_rate, 'actual_origin_rate': actual_origin_rate}
    z_value_frame = DataFrame.from_dict(summary).T
    z_value_frame.to_csv(filePath + '\\supp_value_frame.csv', index=False)
    return z_value_frame
Example No. 37
    def process(self, num_cores=10):
        to_dl = []
        for k, v in tqdm(self.metadata.items()):
            genome_path = pjoin(self.data_path,
                                v['assembly_level'].replace(" ", "_"), k)
            genome_file = pjoin(genome_path, k + ".fna")
            if not os.path.exists(genome_path):
                os.makedirs(genome_path)
            if not os.path.exists(genome_file) or (not os.path.exists(
                    genome_file.replace(".fna", ".faa"))):
                to_dl += [(v['ftp_path'], genome_path, genome_file)]
        dlstuff = Parallel(n_jobs=num_cores)(delayed(download)(i)
                                             for i in tqdm(to_dl))

        Database.process(self)

        self.taxos = DataFrame.from_dict({
            g.name: g.get_taxo(self.taxDb)
            for g in self.genomes
            if g.metadata['assembly_level'] == "Complete_Genome"
        }).transpose().to_csv(
            pjoin(self.metadata_path, "complete_genomes_pretty_taxo.csv"))
Example No. 38
def addToDB(
    cnx, info
):  #Adds all songs, attributes, and artists to the appropriate tables in the database
    stdout.write("Adding to database...\n")
    stdout.flush()
    frame = DataFrame.from_dict(
        info
    )  # Convert the list data structure from getLikedSongs to a dataframe for ease of use
    count = 0  # Track progress
    for index, song in frame.iterrows():
        insertArtist(song["artist"], cnx)  # Put artist in the artists table
        songTuple = (song["id"], song["track"])  # Info for song table
        insertSong(songTuple, song["artist"], cnx)  # Put song in songs table
        insertSHA(song["artist"], song["id"],
                  cnx)  # Link the artist and song in database
        # Convert all 0's to major and 1's to minor to match database
        if song['mode'] == 0:
            song['mode'] = 'major'
        elif song['mode'] == 1:
            song['mode'] = 'minor'
        # Get all attributes for the song in one place
        attributeTuple = (round(song["acousticness"],
                                9), round(song["danceability"],
                                          9), song["duration_ms"],
                          round(song["energy"],
                                9), song["instrumentalness"], song["key"],
                          round(song["liveness"],
                                9), round(song["loudness"], 9), song["mode"],
                          round(song["speechiness"],
                                9), round(song["tempo"],
                                          4), round(song["valence"], 9))
        insertAttributes(attributeTuple, song["id"],
                         cnx)  # Put song attributes in the tracks table

        # Keep track of progress
        count += 1
        if count % 50 == 0:
            stdout.write(str(count) + '\n')
            stdout.flush()
Example No. 39
    def test_int64_overflow_moar(self):

        # GH9096
        values = range(55109)
        data = DataFrame.from_dict({"a": values, "b": values, "c": values, "d": values})
        grouped = data.groupby(["a", "b", "c", "d"])
        assert len(grouped) == len(values)

        arr = np.random.randint(-1 << 12, 1 << 12, (1 << 15, 5))
        i = np.random.choice(len(arr), len(arr) * 4)
        arr = np.vstack((arr, arr[i]))  # add some duplicate rows

        i = np.random.permutation(len(arr))
        arr = arr[i]  # shuffle rows

        df = DataFrame(arr, columns=list("abcde"))
        df["jim"], df["joe"] = np.random.randn(2, len(df)) * 10
        gr = df.groupby(list("abcde"))

        # verify this is testing what it is supposed to test!
        assert is_int64_overflow_possible(gr.grouper.shape)

        # manually compute groupings
        jim, joe = defaultdict(list), defaultdict(list)
        for key, a, b in zip(map(tuple, arr), df["jim"], df["joe"]):
            jim[key].append(a)
            joe[key].append(b)

        assert len(gr) == len(jim)
        mi = MultiIndex.from_tuples(jim.keys(), names=list("abcde"))

        def aggr(func):
            f = lambda a: np.fromiter(map(func, a), dtype="f8")
            arr = np.vstack((f(jim.values()), f(joe.values()))).T
            res = DataFrame(arr, columns=["jim", "joe"], index=mi)
            return res.sort_index()

        tm.assert_frame_equal(gr.mean(), aggr(np.mean))
        tm.assert_frame_equal(gr.median(), aggr(np.median))
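
A scaled-down, hedged version of the first assertion in the test above (unique keys, so every group holds exactly one row):

from pandas import DataFrame

values = range(10)
data = DataFrame.from_dict({"a": values, "b": values, "c": values, "d": values})
assert len(data.groupby(["a", "b", "c", "d"])) == len(values)
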
Example No. 40
def test_get_ingress_results(serializer, options, tx_port, rx_port, api):
    """UDP Flow test traffic configuration
    """
    udp_endpoint = PortTxRx(tx_port_name=tx_port.name,
                            rx_port_names=[rx_port.name])
    udp_header = Udp(src_port=Pattern(Counter(start="12001",
                                              step="2",
                                              count=100),
                                      ingress_result_name='UDP SRC PORT'),
                     dst_port=Pattern("20"))
    udp_flow = Flow(name='UDP Flow',
                    tx_rx=TxRx(udp_endpoint),
                    packet=[
                        Header(Ethernet()),
                        Header(Vlan()),
                        Header(Ipv4()),
                        Header(udp_header)
                    ],
                    size=Size(128),
                    rate=Rate(unit='pps', value=1000),
                    duration=Duration(FixedPackets(packets=10000)))
    config = Config(ports=[tx_port, rx_port],
                    flows=[udp_flow],
                    options=options)
    state = State(ConfigState(config=config, state='set'))
    print(serializer.json(state))
    api.set_state(state)
    state = State(FlowTransmitState(state='start'))
    api.set_state(state)

    from pandas import DataFrame
    request = FlowRequest(ingress_result_names=['UDP SRC PORT'])
    while True:
        results = api.get_flow_results(request)
        df = DataFrame.from_dict(results)
        print(df)
        if df.frames_tx.sum() >= 10000 and df.frames_tx_rate.sum() == 0:
            break
def get_happinesses_by_method(pop_iterator, fast=False):

    num_sim, current_sim = 1500, 0
    utils_by_scf = Dict()
    dataframe_dict = Dict()
    test_num_candidates = [3, 4, 6, 9, 13, 18, 24]

    # modify each sim to run in parallel
    while current_sim < num_sim:
        print(current_sim)
        # simulate for various numbers of candidates
        for n_candidates in test_num_candidates:
            n_voters = n_candidates * 750

            for pop, param in pop_iterator(n_voters, n_candidates):
                n_pref_by_rk, pref_ij = ls.fast_gen_pref_summ(pop.preferences_rk)
                weights = ls.get_weights_from_counts(n_pref_by_rk)
                utils = social_util_by_cand(weights)
                winners_by_scf = simulate_all_elections(pop, fast=fast,
                    n_pref_by_rank=n_pref_by_rk, pref_i_to_j=pref_ij)

                utils_by_scf[param][n_candidates][current_sim] = \
                    {k: utils[v] for k, v in winners_by_scf.items()}
        current_sim += 1
    save_directory = 'Population_type_sim=' + pop_iterator.__name__
    archive_old_sims(save_directory, 'Previous_sims_all_methods')
    # utils_by_scf[pop_param][n_candidates][sim_number][scf]
    # now make dict of DataFrames by parameters, n_candidates
    os.mkdir(save_directory)
    for param, v_upper in utils_by_scf.items():
        for n_cand, scf_by_sim_num in v_upper.items():
            dataframe_dict[param][n_cand] = DataFrame.from_dict(scf_by_sim_num,
                                                                orient='index')
            dataframe_dict[param][n_cand].boxplot(rot=90)  # labels? by axis?
            plt.tight_layout()
            plt.savefig(save_directory + '/plot_p=' + str(param) +
                        '_n_cand=' + str(n_cand) + '.png')
            plt.close()
Example No. 42
def getData(forDays, dropExtraCols=True):

    # subtract forDays in terms of seconds from now
    now = int(time.time()) * 1000
    timeToSubtract = int(forDays * 24 * 60 * 60 * 1000)
    effective_time = now - timeToSubtract
    print("now:", now)
    print("timeToSubtract:", timeToSubtract)
    print("effective_time :", effective_time)

    mongoClient = mongo.MongoClient(os.environ['mongodbhost'], 27017)
    db = mongoClient.admin
    sensorDataCollection = db.sensors
    sensorData = sensorDataCollection.find({
        "source": "composite",
        "unixtime": {
            "$gte": effective_time
        }
    }).sort([("timestamp", mongo.DESCENDING)])

    #temporary to hold data
    sensorDataArray = []

    #populate temp
    for dataRow in sensorData:
        if len(dataRow) == 17:
            sensorDataArray.append(dataRow)
        else:
            print("bad data", dataRow)

    #create df
    df = DataFrame.from_dict(sensorDataArray)

    #drop unnecessary cols
    if dropExtraCols:
        df = df.drop(['_id', 'timestamp', 'unixtime', 'source'], axis=1)

    return df
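
The document-list to frame step on its own, as a hedged sketch with invented sensor documents:

from pandas import DataFrame

docs = [{"_id": 1, "source": "composite", "unixtime": 1600000000000, "timestamp": "t0", "temp": 21.5},
        {"_id": 2, "source": "composite", "unixtime": 1600000060000, "timestamp": "t1", "temp": 21.7}]
df = DataFrame(docs).drop(["_id", "timestamp", "unixtime", "source"], axis=1)
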
Example No. 43
def compare_assemblies(assemblies, chunk_size=2000, identity_threshold=0.40):
    """
    compares a set of assemblies:
    assemblies is a dictionary with names of the assemblies as keys and fasta-files of the assemblies as values
    """
    similarities = {}

    print "make blast dbs"
    for subject_name, subject in tqdm(assemblies.iteritems()):
        blast_db_cmd = [
            "makeblastdb", "-in", subject, "-dbtype", "nucl", "-out", subject
        ]
        with open("/dev/null") as null:
            blastdb_return = call(blast_db_cmd, stdout=null)

    print "Run the hell out of it"
    for scaff_name, scaff in tqdm(assemblies.iteritems()):
        similarities[scaff_name] = {}
        chopped_up_query = "tmp.fasta"
        nb_chunks = len(cut_up_fasta(scaff, chopped_up_query, chunk_size))
        for subject_name, subject in assemblies.iteritems():
            nics = find_NICs(chopped_up_query,
                             subject,
                             identity_threshold,
                             blast_db=False)
            #            print scaff_name, "vs", subject_name
            similarities[scaff_name][subject_name] = len(
                nics.keys()) / nb_chunks
    os.remove(chopped_up_query)

    print "clean up"
    for subject_name, subject in tqdm(assemblies.iteritems()):
        blast_db_files = [subject + ".nhr", subject + ".nin", subject + ".nsq"]
        for f in blast_db_files:
            os.remove(f)

    similars = DataFrame.from_dict(similarities)
    return similars
Example No. 44
def generateChart(algo, traderData):
    try:
        chart = DataFrame.from_dict(traderData['candles'])
        stock = StockDataFrame.retype(chart)
        value = CCI(chart['close'], chart['high'], chart['low'], 4, 0.0109)[-1]

        if (time.clock() - traderData['startTime'] > traderData['time']):
            traderData['buyingEnabled'] = False
        if algo == "cci":
            print(traderData['tradingSymbol'], value,
                  traderData['candles'][-1]['close'],
                  traderData['candles'][-1]['date'])
            cci_value = value
            if cci_value < -100 and traderData[
                    'bought'] == False and traderData['updating'] == False:
                traderData['waiting'] = True
            if cci_value > -100 and traderData[
                    'bought'] == False and traderData[
                        'waiting'] and traderData['buyingEnabled']:
                buy(traderData)
                traderData['waiting'] = False
            elif cci_value > 100 and traderData['bought']:
                sell(traderData)

        if algo == "macd":
            macd_value = stock['macdh'][-1]
            print(macd_value)
            if macd_value > 0 and traderData['bought'] == False:
                buy(traderData)
            elif macd_value < 0 and traderData['bought']:
                sell(traderData)
    #get cci value for current time
    #if greater than 100 and not bought in yet, buy
    #if less than 100(NOT -100) and bought in, sell
    except:
        print("Unexpected error, trying again")
        time.sleep(.3)
        generateChart(algo, traderData)
Example No. 45
def df_weighted(country, start=None, end=None, f=None, interval='month'):
    response = search_weighted(country,
                               start=start,
                               end=end,
                               f=f,
                               interval=interval)

    dates = [
        x.key_as_string
        for x in response.aggs.stats.date_filter.kingo.dates.buckets
    ]
    obj = {x: {0} for x in dates}

    for date in response.aggs.stats.date_filter.kingo.dates.buckets:
        obj[date.key_as_string] = {}
        for model_version in date.model_version.buckets:
            obj[date.key_as_string][
                model_version.key] = model_version.doc_count

    df = DataFrame.from_dict(obj, orient='index', dtype='int64')

    if df.empty:
        return df

    df.index.name = 'date'
    df = df.reindex(df.index.astype('datetime64')).sort_index()
    df = df.fillna(0).astype('int64')
    bucket_len = [x.days for x in diff(df.index.tolist())]

    if end is not None:
        bucket_len.append((date_dt(end) - df.index[-1].date()).days)
    else:
        bucket_len.append((local_date_dt(country) - df.index[-1].date()).days)

    df = df.div(bucket_len, axis='index')
    df['total'] = df.sum(axis=1)

    return df.astype('int64')
Example No. 46
def get_report(session, args):

    # Arguments
    start, end = parse_date_range(args['report_range'])

    exists = report_exists(session, start, end, 'sla_report')

    if exists:
        # No work to do
        success = True
    else:
        # Celery worker
        task_results = report_task.delay(start, end)
        task_results.wait()
        success = task_results.get(timeout=1)

    if success:
        cached_results = get_cached_report(session, start, end, 'sla_report')

        frame = DataFrame.from_dict(
            cached_results.report
        )

        frame.name = '{rpt} {frm} {to}'.format(
            rpt=cached_results.name,
            frm=start,
            to=end
        )

        # Convert pyexcel table into dataframe
        index = Series(['{ext} {name}'.format(ext=client_ext, name=client_info['CLIENT_NAME'])
                        for client_ext, client_info in current_app.config['CLIENTS'].items()] + ['Summary'])
        frame.insert(0, "Client", index)
        total = len(index)
    else:
        frame, total = empty_frame()

    return frame, total
Example No. 47
    def test_loc_modify_datetime(self):
        # see gh-28837
        df = DataFrame.from_dict({
            "date":
            [1485264372711, 1485265925110, 1540215845888, 1540282121025]
        })

        df["date_dt"] = pd.to_datetime(df["date"], unit="ms", cache=True)

        df.loc[:, "date_dt_cp"] = df.loc[:, "date_dt"]
        df.loc[[2, 3], "date_dt_cp"] = df.loc[[2, 3], "date_dt"]

        expected = DataFrame(
            [
                [
                    1485264372711, "2017-01-24 13:26:12.711",
                    "2017-01-24 13:26:12.711"
                ],
                [
                    1485265925110, "2017-01-24 13:52:05.110",
                    "2017-01-24 13:52:05.110"
                ],
                [
                    1540215845888, "2018-10-22 13:44:05.888",
                    "2018-10-22 13:44:05.888"
                ],
                [
                    1540282121025, "2018-10-23 08:08:41.025",
                    "2018-10-23 08:08:41.025"
                ],
            ],
            columns=["date", "date_dt", "date_dt_cp"],
        )

        columns = ["date_dt", "date_dt_cp"]
        expected[columns] = expected[columns].apply(pd.to_datetime)

        tm.assert_frame_equal(df, expected)
Example No. 48
    def get_index_daily(self,
                        code: str,
                        start_date: str = '',
                        end_date: str = ''):
        """ 获取指数日线数据

        :param code: [description]
        :type code: str
        :param start_date: [description], defaults to ''
        :type start_date: str, optional
        :param end_date: [description], defaults to ''
        :type end_date: str, optional
        :raises QuantzException: [description]
        :return: [description]
        :rtype: [type]
        """
        if code is None or '' == code:
            raise QuantzException(
                'Failed to get_index_daily(Index code is empty)')
        if end_date == '':
            end_date = utils.now_2_YYYYmmdd()
        if (start_date != '' and int(start_date) > int(end_date)):
            raise QuantzException(
                'Failed to get_index_daily(start_date must not be later than end_date)')
        if not self._is_data_available(code, start_date, end_date):
            _logd('Not All data available')
            if not self._obtain_delta_data(code, start_date, end_date):
                _logw('Failed to obtain data for %s' % (code))
                raise QuantzException(
                    'Could not get full data for %s from %s to %s' %
                    (code, start_date, end_date))
        index_objects = IndexDailyItem.objects(
            ts_code=code, trade_date__gte=start_date,
            trade_date__lte=end_date).order_by('-trade_date')
        index_df = DataFrame.from_dict(json.loads(index_objects.to_json()))
        if index_df.shape[1] > 1:
            index_df = index_df.drop('_id', axis=1)
        return index_df
Example No. 49
def df_open_now(country, end=None, f=None, interval='month'):
    response = search_open_now(country, end=end, f=f)

    models = [x.key for x in response.aggs.models.buckets]

    obj = {x: {0} for x in models}

    for model in response.aggs.models.buckets:
        obj[model.key] = {model.doc_count}

    df = DataFrame.from_dict(obj,
                             orient='index',
                             dtype='int64',
                             columns=['now'])
    df.loc['total'] = df.sum()

    if df.empty:
        return df

    df.index.name = 'date'
    df = df.sort_index()

    return df
Example No. 50
def df(country, f=None, **kwargs):
    today = dt.datetime.today().strftime('%Y-%m-%d')
    date = kwargs['date'] if 'date' in kwargs else today
    response = search(country, f=f, date=date)

    obj = {}
    for model in response.aggs.model.buckets:
        obj[model.key] = {}
        for ur in model.ur30.buckets:
            obj[model.key][round(ur.key,2)] = ur.doc_count

    df = DataFrame.from_dict(obj).T

    if df.empty:
        return df

    df.index.name = 'modelo'
    df = df.fillna(0).astype('int64')
    df['total'] = df.sum(axis=1)
    df.loc['total'] = df.sum()
    df = df.reset_index()

    return df
def train_new():
    global ensemble
    content = request.json
    label = content['labels']
    event = content['events']
    label_df = DataFrame([[1,label]],columns=['Index','Label'] )
    # label_df = DataFrame.from_dict(label)
    event_df = DataFrame.from_dict(event)

    # print(event_df)
    # print(label_df)
    with open('events_new.csv', 'a') as f:
        event_df.to_csv(f, header=False) # True if first event
    with open('labels_new.csv', 'a') as f:
        label_df.to_csv(f, header=False) # True if first event
    # new_set = test_set.append(event , ignore_index=True)
    # preds = ensemble.predict(new_set)
    # preds = DataFrame(preds)
    # preds = preds.replace(1.0,'Normal')
    # preds = preds.replace(-1.0,'Malicious')
    # js = preds.to_json()
    # print(preds)
    return "OK"
Example No. 52
    def list_privileges(self,
                        to_dataframe: bool = False) -> Union[dict, DataFrame]:
        """List ALL privileges for Security Role. Optionally return a
        `DataFrame` object.

        Args:
            to_dataframe: If True, return a `DataFrame` object containing
                privileges
        """
        self.fetch()
        priv_dict = {
            int(v[1]): k[1]
            for k, v in [x.items() for x in self.privileges]
        }

        if to_dataframe:
            df = DataFrame.from_dict(priv_dict,
                                     orient='index',
                                     columns=['Name'])
            df.index.name = 'ID'
            return df
        else:
            return priv_dict
Exemplo n.º 53
0
def df(country, start=None, end=None, f=None, interval='month'):
    if country not in COUNTRY_LIST:
        raise Exception(f'{country} is not a valid country')

    response = search(country, start=start, end=end, f=f, interval=interval)

    obj = {}
    for bucket in response.aggs.dates.buckets:
        obj[bucket.key_as_string] = {}
        for status in bucket.status.buckets:
            obj[bucket.key_as_string][status.key] = status.doc_count

    df = DataFrame.from_dict(obj, orient='index')

    if df.empty:
        return df

    df.index = df.index.astype('datetime64[ns]')
    df.index.name = 'date'
    df = df.fillna(0).astype('int64')
    df['total'] = df.sum(axis=1)

    return df
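The same orient='index' construction with date strings as the outer keys (the dates and status names below are invented) gives one row per interval, ready for the datetime conversion and totals applied above:

from pandas import DataFrame

obj = {'2021-01-01': {'open': 5, 'closed': 2}, '2021-02-01': {'closed': 4}}
df = DataFrame.from_dict(obj, orient='index')
df.index = df.index.astype('datetime64[ns]')   # parse the string keys as timestamps
df = df.fillna(0).astype('int64')
df['total'] = df.sum(axis=1)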
Exemplo n.º 54
0
def predict(data, model=load_model()):
    """
    Returns prediction given the model and data to predict
    Parameters
    ----------
    model: Model instance returned by load_model API
    data: Data format as expected by the predict API of the core estimator. For eg. in case of sckit models it could be numpy array/List of list/Panda DataFrame
    Returns
    -------
    predictions: Output from scoring server
        Format: {'prediction':output from model.predict method}
    """

    # This is the default implementation of the predict() function specific to this score.py template only.
    if model == "default_model" or len(data) == 0:
        return {'prediction': 'Hello world!'}

    from pandas import read_json, DataFrame
    from io import StringIO
    data = read_json(StringIO(data)) if isinstance(
        data, str) else DataFrame.from_dict(data)
    pred = model.predict(data).tolist()
    return {'prediction': pred}
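One possible call of the function above, assuming a small scikit-learn estimator fitted on two made-up feature columns:

from sklearn.linear_model import LogisticRegression

# Hypothetical model and feature names, only to exercise the dict branch of predict()
my_model = LogisticRegression().fit([[0.0, 1.0], [1.0, 0.0]], [0, 1])
result = predict({'feature_a': [1.0, 2.0], 'feature_b': [0.5, 0.1]}, model=my_model)
# result == {'prediction': [...]}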
Exemplo n.º 55
0
def parse_many_spectra(spectra_dir: str,
                       prefix: str = '',
                       wave_info=(501, 3996, 1715)) -> DataFrame:
    all_spectra = {}
    unparsed_spectra = 0
    spectra_names = listdir(spectra_dir)
    spectra_dir = Path(spectra_dir)

    for spectrum_name in tqdm(spectra_names):
        try:
            spectrum_path = spectra_dir / spectrum_name
            spectrum = opus_reader(spectrum_path)
            absorbance = spectrum.interpolate(*wave_info)[1]
            all_spectra[change_ssn(spectrum_name, prefix)] = absorbance
        except Exception:
            unparsed_spectra += 1

    print(f'Parsing finished. {unparsed_spectra} spectra not parsed')
    columns = np.linspace(*wave_info).astype(str)
    spectra_df = DataFrame.from_dict(all_spectra, orient='index')
    spectra_df.columns = columns
    return spectra_df
Exemplo n.º 56
0
def make_criteria_csv():
    """
    Make criteria tables:
        _build/csv/{all,axes,coords}_criteria.csv
    """

    csv_dir = "_build/csv"
    os.makedirs(csv_dir, exist_ok=True)

    # Criteria tables
    df = DataFrame.from_dict(coordinate_criteria)
    df = df.dropna(axis=1, how="all")
    df = df.applymap(lambda x: ", ".join(sorted(x)) if isinstance(x, tuple) else x)
    df = df.sort_index(axis=0).sort_index(axis=1)

    # All criteria
    df.to_csv(os.path.join(csv_dir, "all_criteria.csv"))

    # Axes and coordinates
    for keys, name in zip([_AXIS_NAMES, _COORD_NAMES], ["axes", "coords"]):
        subdf = df[sorted(keys)].dropna(axis=1, how="all").transpose()
        subdf.to_csv(os.path.join(csv_dir, f"{name}_criteria.csv"))
Exemplo n.º 57
0
def index_models_bigg():
    try:
        response = requests.get('http://bigg.ucsd.edu/api/v2/models',
                                timeout=3)
    except requests.ConnectionError as e:
        logger.error(
            "Cannot reach http://bigg.ucsd.edu. Are you sure that you are connected to the internet?"
        )
        raise e
    if response.ok:
        try:
            json = response.json()
        except Exception as e:
            logger.error(
                'No json could be decoded from server response coming from http://bigg.ucsd.edu.'
            )
            raise e
        else:
            return DataFrame.from_dict(json['results'])
    else:
        raise Exception(
            "Could not index available models. bigg.ucsd.edu returned status code {}"
            .format(response.status_code))
Exemplo n.º 58
0
    def describe(self) -> DataFrame:
        """Describe a TimeSeriesDataset with the describe function from Pandas

        TODO Define what describe should do on TimeSeriesDataset (see issue 56)

        Returns:
            TODO Define return type
        """
        min = self.min()
        max = self.max()
        mean = self.mean()
        median = self.median()
        kurtosis = self.kurtosis()
        skewness = self.skewness()

        return DataFrame.from_dict({
            'minimum': min,
            'maximum': max,
            'mean': mean,
            'median': median,
            'kurtosis': kurtosis,
            'skewness': skewness
        })
Exemplo n.º 59
0
Arquivo: lib.py Projeto: endrebak/kg
def get_pathway_to_definition_map(species):
    """Map kegg paths to their definition."""

    kegg_list = REST.kegg_list("pathway", species)

    clean_kegg_path = re.compile(r"path:{}|\n".format(species))

    rowdicts = []
    for kegg_path_line in kegg_list:

        try:
            kegg_path_line = kegg_path_line.decode("utf-8")
        except AttributeError:
            pass

        kegg_info = re.sub(clean_kegg_path, "", kegg_path_line)
        pathway, definition = kegg_info.split("\t")
        definition = definition.split(" - ")[0]  # Remove species info
        rowdict = {"kegg_pathway": pathway,
                   "kegg_pathway_definition": definition}
        rowdicts.append(rowdict)

    return DataFrame.from_dict(rowdicts)
Exemplo n.º 60
0
def user_top_recommended_stories(
        context, recommender_model: TruncatedSVD,
        user_story_matrix: IndexedCooMatrix) -> DataFrame:
    """The top stories for each commenter (user)."""
    # Compute XV, which has a row for each user and a column for each component
    XV = recommender_model.transform(user_story_matrix.matrix)

    # Now we want to project XV back into story-space. As a dense matrix, the product would be
    # far too big (# users x # stories entries), so we sparsify both multiplicands to keep it
    # manageable.
    XV[np.abs(XV) < 1] = 0
    sparse_XV = csr_matrix(XV)
    context.log.info(f"sparse_XV shape: {sparse_XV.shape}")
    context.log.info(f"sparse_XV non-zero: {sparse_XV.count_nonzero()}")

    recommender_model.components_[
        np.abs(recommender_model.components_) < 1e-2] = 0
    sparse_components = csc_matrix(recommender_model.components_)
    context.log.info(
        f"recommender_model.components_ shape: {recommender_model.components_.shape}"
    )
    context.log.info(
        f"sparse_components non-zero: {sparse_components.count_nonzero()}")

    # A matrix with the same dimensions as user_story_matrix, but reduced in rank
    X_hat = sparse_XV @ sparse_components

    coo = coo_matrix(X_hat)
    story_ids = user_story_matrix.col_index[coo.col].values
    user_ids = user_story_matrix.row_index[coo.row].values
    context.log.info(f"recommendations: {len(story_ids)}")

    return DataFrame.from_dict({
        "user_id": user_ids,
        "story_id": story_ids,
        "relevance": coo.data
    })
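The closing from_dict call assembles a long-format frame from three parallel arrays; a standalone sketch with a tiny invented matrix (the raw row and column indices stand in for the looked-up user and story ids) of the same COO-to-DataFrame step:

import numpy as np
from pandas import DataFrame
from scipy.sparse import coo_matrix

X_hat = coo_matrix(np.array([[0.0, 0.9], [0.7, 0.0]]))
df = DataFrame.from_dict({
    'user_id': X_hat.row,       # row index of each non-zero entry
    'story_id': X_hat.col,      # column index of each non-zero entry
    'relevance': X_hat.data,    # the non-zero value itself
})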