Example #1
File: bike.py Project: kowalczewski/Bike
def plots_workingTrends():

	# holiday = 0 and workday = 0 => weekend
	# let's see if holidays and weekends give the same trends

	# Day trends -- working vs. non-working day
	hours = np.linspace(0,23,24)

	days_average = DataFrame({'Hour': hours})

	# workdays
	mean_vec = []
	for hour in hours:
		mean_vec.append(bike_data[ (bike_data["workingday"] == 1) & (bike_data["time"] == hour) ].mean()['count'])
	days_average = days_average.join(DataFrame({'Working day': mean_vec}))

	# holidays or weekends
	mean_vec = []
	for hour in hours:
		mean_vec.append(bike_data[ (bike_data["workingday"] == 0) & (bike_data["time"] == hour) ].mean()['count'])
	days_average = days_average.join(DataFrame({'Non-working day': mean_vec}))

	days_average.drop('Hour',axis=1).plot(figsize=(12, 6), linewidth=3, fontsize=16)
	plt.xlabel('Hour', fontsize=16)
	plt.ylabel('Average counts', fontsize=16)
	plt.legend(loc='best', fontsize=16)
	plt.show()
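
The two per-hour loops above can be expressed as a single groupby. A minimal sketch on made-up data; the bike_data columns 'time', 'workingday' and 'count' simply mirror the ones used above:

import numpy as np
import pandas as pd

# Toy stand-in for bike_data: hourly counts with a working-day flag.
rng = np.random.default_rng(0)
bike_data = pd.DataFrame({
    "time": np.tile(np.arange(24), 20),
    "workingday": np.repeat(rng.integers(0, 2, 20), 24),
    "count": rng.integers(0, 500, 24 * 20),
})

# Same hourly averages as the loops above, computed in one groupby/unstack.
days_average = (
    bike_data.groupby(["time", "workingday"])["count"].mean()
    .unstack("workingday")
    .rename(columns={1: "Working day", 0: "Non-working day"})
)
print(days_average.head())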
    def test_drop_multiindex_not_lexsorted(self):
        # GH 11640

        # define the lexsorted version
        lexsorted_mi = MultiIndex.from_tuples(
            [('a', ''), ('b1', 'c1'), ('b2', 'c2')], names=['b', 'c'])
        lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi)
        self.assertTrue(lexsorted_df.columns.is_lexsorted())

        # define the non-lexsorted version
        not_lexsorted_df = DataFrame(columns=['a', 'b', 'c', 'd'],
                                     data=[[1, 'b1', 'c1', 3],
                                           [1, 'b2', 'c2', 4]])
        not_lexsorted_df = not_lexsorted_df.pivot_table(
            index='a', columns=['b', 'c'], values='d')
        not_lexsorted_df = not_lexsorted_df.reset_index()
        self.assertFalse(not_lexsorted_df.columns.is_lexsorted())

        # compare the results
        tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)

        expected = lexsorted_df.drop('a', axis=1)
        with tm.assert_produces_warning(PerformanceWarning):
            result = not_lexsorted_df.drop('a', axis=1)

        tm.assert_frame_equal(result, expected)
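
The PerformanceWarning asserted above appears because drop() on a non-lexsorted column MultiIndex falls back to a slower path. A small illustrative sketch (not part of the test) showing that sorting the column index restores the sorted layout:

import pandas as pd

# Columns in non-lexicographic order trigger slower label lookups (and, on
# older pandas, a PerformanceWarning) for operations such as drop().
cols = pd.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("a", "z")])
df = pd.DataFrame([[1, 2, 3]], columns=cols)

# Sorting the column index restores the fast, lexsorted layout.
df_sorted = df.sort_index(axis=1)
print(df_sorted.columns.is_monotonic_increasing)  # True
print(df_sorted.drop("a", axis=1))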
def returns_customer_product_time(pwunsale_tidy, pw_ytdcust, pw_cusattr):
    '''
    Meant to feed into a Pivot requested by Mitch Turner.
    
    Aggregates the same as above but includes time and product data.
    '''
    dat = pwunsale_tidy['Date'].tolist()
    pwunsale_tidy['Month'] = [d.strftime('%B') for d in dat]    
    
    print('Aggregating custom pivot for Mitch.')
    len_unique = lambda x: len(pd.unique(x))
    agg_funcs_returns = {'ExtCost': {'DollarsReturned|avg':np.mean, 'DollarsReturned|sum':np.sum},
                         'CasesReturned': {'CasesReturned|avg':np.mean, 'CasesReturned|sum':np.sum},
                         'Invoice':len_unique }
    
    custom_cols = ['Month','CustomerId','Customer','ProductId','Product']    
    
    customer_returns = DataFrame(pwunsale_tidy.groupby(custom_cols)[['ExtCost','CasesReturned']].agg(agg_funcs_returns)).reset_index(drop=False)
    customer_returns.rename(columns={'<lambda>':'Returns|count'}, inplace=True) 
    customer_returns.drop('Customer', inplace=True, axis=1)
    
    print('Merging in YTD sales by Customer')
    customer_returns = customer_returns.merge(pw_ytdcust, on='CustomerId', how='left')
    
    print('Deriving returns as a percent of sales for each Customer.')
    customer_returns['PercentSales'] = np.divide(customer_returns['DollarsReturned|sum'], customer_returns['DollarSales|bycustomer'])
    
    print('Merge in customer attributes.')
    customer_returns = customer_returns.merge(pw_cusattr, on='CustomerId', how='left')
    
    print('Sorting in descending order on Dollars returned.')
    customer_returns.sort_values('DollarsReturned|sum', ascending=False, inplace=True)

    return customer_returns
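
The dict-of-dicts passed to agg() above was deprecated in pandas 0.25. A hedged sketch of the equivalent named aggregation on toy data; the column names here just mirror the ones used in the function:

import pandas as pd

# Toy stand-in for pwunsale_tidy with the columns the function expects.
pwunsale_tidy = pd.DataFrame({
    "Month": ["January", "January", "February"],
    "CustomerId": [1, 1, 2],
    "ProductId": [10, 10, 20],
    "ExtCost": [5.0, 7.0, 3.0],
    "CasesReturned": [1, 2, 1],
    "Invoice": ["A1", "A2", "B1"],
})

# Named aggregation replaces the deprecated dict-of-dicts agg spec.
customer_returns = (
    pwunsale_tidy.groupby(["Month", "CustomerId", "ProductId"], as_index=False)
    .agg(**{
        "DollarsReturned|avg": ("ExtCost", "mean"),
        "DollarsReturned|sum": ("ExtCost", "sum"),
        "CasesReturned|avg": ("CasesReturned", "mean"),
        "CasesReturned|sum": ("CasesReturned", "sum"),
        "Returns|count": ("Invoice", "nunique"),
    })
)
print(customer_returns)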
def datatype_records_to_subset_and_migrate(likechars):
    stmt_for_pkeys = conn_popler_2.execute(
        select(
            from_obj=Maintable,
            columns=[
                column('lter_proj_site'),
                column('samplingprotocol')
            ]).
        where(
            column('samplingprotocol').like(
                '%{}%'.format(likechars))
        )
    )
    data = DataFrame(stmt_for_pkeys.fetchall())
    data.columns = stmt_for_pkeys.keys()

    records_to_get = data['lter_proj_site'].values.tolist()

    stmt_for_records = conn_popler_2.execute(
        select(
            from_table=Rawtable,
        ).
        where(column('lter_proj_site').in_(records_to_get)).
        order_by('sampleid')
    )
    data2 = DataFrame(stmt_for_records.fetchall())
    data2.columns = stmt_for_records.keys()
    data2.drop('individ', axis=1, inplace=True)
Example #5
    def test_mixed_depth_drop(self):
        arrays = [[  'a', 'top', 'top', 'routine1', 'routine1', 'routine2'],
                  [   '',  'OD',  'OD', 'result1',   'result2',  'result1'],
                  [   '',  'wx',  'wy',        '',          '',         '']]

        tuples = zip(*arrays)
        tuples.sort()
        index = MultiIndex.from_tuples(tuples)
        df = DataFrame(randn(4,6),columns = index)

        result = df.drop('a',axis=1)
        expected = df.drop([('a','','')],axis=1)
        assert_frame_equal(expected, result)

        result = df.drop(['top'],axis=1)
        expected = df.drop([('top','OD','wx')], axis=1)
        expected = expected.drop([('top','OD','wy')], axis=1)
        assert_frame_equal(expected, result)

        result = df.drop(('top', 'OD', 'wx'), axis=1)
        expected = df.drop([('top','OD','wx')], axis=1)
        assert_frame_equal(expected, result)

        expected = df.drop([('top','OD','wy')], axis=1)
        expected = df.drop('top', axis=1)

        result = df.drop('result1', level=1, axis=1)
        expected = df.drop([('routine1', 'result1', ''),
                            ('routine2', 'result1', '')], axis=1)
        assert_frame_equal(expected, result)
Example #6
def process_recarray_pandas(data, endog_idx=0, exog_idx=None, dtype=None):
    from pandas import DataFrame

    data = DataFrame(data, dtype=dtype)
    names = data.columns

    if isinstance(endog_idx, int):
        endog_name = names[endog_idx]
        endog = data[endog_name]
        if exog_idx is None:
            exog = data.drop([endog_name], axis=1)
        else:
            exog = data.filter(names[exog_idx])
    else:
        endog = data.ix[:, endog_idx]
        endog_name = list(endog.columns)
        if exog_idx is None:
            exog = data.drop(endog_name, axis=1)
        elif isinstance(exog_idx, int):
            exog = data.filter([names[exog_idx]])
        else:
            exog = data.filter(names[exog_idx])

    exog_name = list(exog.columns)
    dataset = Dataset(data=data, names=list(names), endog=endog, exog=exog,
                      endog_name=endog_name, exog_name=exog_name)
    return dataset
Example #7
def process_recarray_pandas(data, endog_idx=0, exog_idx=None, dtype=None,
                            index_idx=None):

    data = DataFrame(data, dtype=dtype)
    names = data.columns

    if isinstance(endog_idx, (int, long)):
        endog_name = names[endog_idx]
        endog = data[endog_name]
        if exog_idx is None:
            exog = data.drop([endog_name], axis=1)
        else:
            exog = data.filter(names[exog_idx])
    else:
        endog = data.loc[:, endog_idx]
        endog_name = list(endog.columns)
        if exog_idx is None:
            exog = data.drop(endog_name, axis=1)
        elif isinstance(exog_idx, (int, long)):
            exog = data.filter([names[exog_idx]])
        else:
            exog = data.filter(names[exog_idx])

    if index_idx is not None:  # NOTE: will have to be improved for dates
        endog.index = Index(data.iloc[:, index_idx])
        exog.index = Index(data.iloc[:, index_idx])
        data = data.set_index(names[index_idx])

    exog_name = list(exog.columns)
    dataset = Dataset(data=data, names=list(names), endog=endog, exog=exog,
                      endog_name=endog_name, exog_name=exog_name)
    return dataset
def clicksDataframe(clicks_data):
    clicks_dataframe = DataFrame(clicks_data, columns=['date', 'cardName', 'position', 'totalClicks', 'uniqueClicks'])
    clicks_dataframe = clicks_dataframe.apply(to_numeric, errors='ignore')
    clicks_dataframe.drop('date', axis=1, inplace=True)
    clicks_dataframe = clicks_dataframe.groupby(['cardName','position']).sum().sort_values(by='uniqueClicks',ascending=0)
    clicks_dataframe.reset_index(inplace=True)

    return clicks_dataframe
Example #9
def scale_features(df: DataFrame):
    spec_features = ['Fare']
    scaler = StandardScaler()
    for sf in spec_features:
        scale_param = scaler.fit(df[sf].values.reshape(-1, 1))
        df[sf + '_scaled'] = scaler.fit_transform(df[sf].values.reshape(-1, 1), scale_param)
    df.drop(labels=spec_features, axis=1, inplace=True)
    return df
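
A minimal sketch of the same scale-then-drop step on a toy frame, using scikit-learn's StandardScaler; only the 'Fare' column name is taken from the code above:

import pandas as pd
from sklearn.preprocessing import StandardScaler

df = pd.DataFrame({"Fare": [7.25, 71.28, 8.05, 53.10], "Survived": [0, 1, 1, 1]})

# Fit and transform the column, then drop the raw feature,
# the same pattern scale_features() applies above.
scaler = StandardScaler()
df["Fare_scaled"] = scaler.fit_transform(df[["Fare"]]).ravel()
df = df.drop(labels=["Fare"], axis=1)
print(df)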
Example #10
def gonzales(data , k):
    #transform the data numpy array to data frame using the id as index
    points_list = DataFrame(data[:, 1:] , index = data[ : , 0])
    #adding two columns in the points data frame for saving the centers and distance
    points_list["distance"] = np.nan
    points_list["center"] = np.nan
    distance_column_index = points_list.columns.get_loc("distance")
    #choosing a random point as the first center

    #center0 =     points_list.sample(n=1 , random_state = randint(0,100) , axis=0)
    center0 =     points_list.head(1)
    centers_list = DataFrame(center0.drop(['distance' , 'center'] , axis = 1))
    centers_list['color'] = 'r'
    colors = "bgcmykw"
    #===========================================================================
    # print(centers_list)
    # print("==============Initialization finished===========")
    #===========================================================================
    #loop to pick the remaining centers (the extra center added in the last cycle is dropped after the loop)
    for k_cycle in range(1,k+1):
        # variables tracking the next center to be chosen, based on the maximum distance a point has to its closest current center
        max_distance = 0 
        next_cluster = np.nan
        #loop on all the points to assign them to their closest center 
        for indexp, p in points_list.iterrows():
            #variables tracking the closest center for this point
            min_cluster_distance = math.inf
            closest_cluster = None
            for indexc, center in centers_list.iterrows():
                dis = spatial.distance.euclidean(center.as_matrix(columns=[0 ,1]) , p.as_matrix(columns=[0 ,1]))
                if dis < min_cluster_distance:
                    min_cluster_distance = dis
                    closest_cluster = indexc
            p["distance"] = min_cluster_distance
            p["center"] = closest_cluster               
            if min_cluster_distance > max_distance:
                max_distance = min_cluster_distance
                next_cluster = indexp 
            
        centers_list = centers_list.append(points_list.ix[[next_cluster], :distance_column_index   ])
        centers_list.set_value(next_cluster, 'color', colors[k_cycle])
        #=======================================================================
        # print(centers_list)
        # print("==============Cycle finished===========")
        #=======================================================================
    centers_list.drop(centers_list.tail(1).index, inplace=True)
    centers_list.drop(['color'], axis=1 ,inplace=True)


    #===========================================================================
    # centers_list.plot(kind='scatter', x=0, y=1 , c='r'   )
    # points_list.plot(kind='scatter', x=0, y=1 , c='center' , s= points_list['center'] *2   )
    # plt.show()
    #===========================================================================

    #print(points_list)
    return centers_list.as_matrix(columns=[0 ,1])
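
The loop above implements Gonzalez-style farthest-point center selection. A compact NumPy-only sketch of the same idea on made-up points, avoiding the deprecated .ix/.as_matrix/.set_value calls; it is an illustration, not a drop-in replacement for the function above:

import numpy as np

def gonzalez_centers(points, k, seed_index=0):
    """Greedy farthest-point selection: each new center is the point
    farthest from its nearest already-chosen center."""
    centers = [seed_index]
    for _ in range(k - 1):
        # distance of every point to its nearest current center
        d = np.min(
            np.linalg.norm(points[:, None, :] - points[centers][None, :, :], axis=2),
            axis=1,
        )
        centers.append(int(np.argmax(d)))
    return points[centers]

pts = np.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0], [5.1, 5.2], [0.0, 9.0]])
print(gonzalez_centers(pts, 3))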
Example #11
def set_dummy_vars(df: DataFrame):
    df.drop(labels=['Name'], axis=1, inplace=True)
    discrete_features = list(df.dtypes[df.dtypes == 'object'].index)
    discrete_features.append('Pclass')
    dummies = [pd.get_dummies(df[f], prefix=f) for f in discrete_features]
    dummies.insert(0, df)
    df = pd.concat(dummies, axis=1)
    df.drop(labels=discrete_features, axis=1, inplace=True)
    return df
Example #12
    def generateGraphData(self):
        safePrint('Generating and uploading data files')

        allData = read_table(self.combinedFile, sep='\t', na_filter=False, parse_dates=[0], infer_datetime_format=True)
        xcsList = [xcs for xcs in allData.xcs.unique() if xcs != 'ERROR' and xcs[0:4] != 'TEST' and xcs != '000-00']

        # filter type==DATA and site==wikipedia
        allData = allData[(allData['xcs'].isin(xcsList)) & (allData['site'] == 'wikipedia')]

        # By "iszero+via", e.g.  a,b,aO,bO,..., where 'a' == zero-rated, 'b' == non-zero-rated, and 'O' == Opera
        data = DataFrame(pivot_table(allData, 'count', ['date', 'xcs', 'via', 'iszero'], aggfunc=np.sum))
        data.reset_index(inplace=True)
        data['via'] = data.apply(lambda r: ('a' if r['iszero'][:1] == 'y' else 'b') + r['via'][:1], axis=1)
        data.drop('iszero', axis=1, inplace=True)
        self.createClippedData('RawData:YearDailyViaIsZero', data)
        self.createPeriodData('RawData:WeeklyViaIsZero', data, weekly)
        self.createPeriodData('RawData:MonthlyViaIsZero', data, monthly)

        allowedSubdomains = ['m', 'zero']
        data = allData[(allData.ison == 'y') & (allData.iszero == 'y') & (allData.subdomain.isin(allowedSubdomains))]
        data = DataFrame(pivot_table(data, 'count', ['date', 'xcs', 'subdomain'], aggfunc=np.sum))
        data.reset_index(inplace=True)

        self.createClippedData('RawData:YearDailySubdomains', data)
        self.createPeriodData('RawData:WeeklySubdomains', data, weekly)
        self.createPeriodData('RawData:MonthlySubdomains', data, monthly)

        # create an artificial yes/no/opera sums
        opera = allData[(allData.via == 'OPERA') & (allData.iszero == 'y')]
        opera['str'] = 'o'
        yes = allData[allData.iszero == 'y']
        yes['str'] = 'y'
        no = allData[allData.iszero == 'n']
        no['str'] = 'n'
        combined = opera.append(yes).append(no)
        data = DataFrame(pivot_table(combined, 'count', ['date', 'xcs', 'str'], aggfunc=np.sum))
        data.reset_index(inplace=True)

        headerFields = 'date,xcs,iszero,count'  # Override "str" as "iszero"
        self.createClippedData('RawData:YearDailyTotals', data, headerFields)
        self.createPeriodData('RawData:MonthlyTotals', data, monthly, headerFields)

        data = []
        for xcsId in list(allData.xcs.unique()):
            byLang = pivot_table(allData[allData.xcs == xcsId], 'count', ['lang'], aggfunc=np.sum) \
                .order('count', ascending=False)
            top = byLang.head(5)
            vals = list(top.iteritems())
            vals.append(('other', byLang.sum() - top.sum()))
            valsTotal = sum([v[1] for v in vals]) / 100.0
            data.extend(['%s,%s,%.1f' % (l, xcsId, c / valsTotal) for l, c in vals])

        self.saveWikiPage('RawData:LangPercent', data, 'lang,xcs,count')
Example #13
 def _one_hot_encoding(df: pd.DataFrame, features: list) -> pd.DataFrame:
     """
     help method for one hot encoding
     """
     for feature in features:
         one_hot = pd.get_dummies(df[feature], feature, '_')
         # The next two statements 'replace' the existing feature with the new binary-valued dummy columns
         # First, drop the existing column
         df.drop(feature, axis=1, inplace=True)
         # Next, concatenate the new columns. This assumes no clash of column names.
         df = pd.concat([df, one_hot], axis=1)
     return df
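
A short usage sketch of the drop/concat pattern in _one_hot_encoding on a toy frame; the column names here are made up:

import pandas as pd

df = pd.DataFrame({"color": ["red", "blue", "red"],
                   "size": ["S", "M", "S"],
                   "price": [1, 2, 3]})

# Build prefixed dummies per feature, drop the source column, concatenate -
# the same drop/concat pattern as the helper above.
for feature in ["color", "size"]:
    one_hot = pd.get_dummies(df[feature], prefix=feature, prefix_sep="_")
    df = pd.concat([df.drop(feature, axis=1), one_hot], axis=1)
print(df.columns.tolist())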
Example #14
def data_prep(input_file, bad_samples_file, freq_dict=None):
    '''prepare the ibdhmm file by removing sites that are too close to each other and calculating the major and minor allele.
    If specified, freq_dict should be a JSON file that contains the frequencies; it is created by freq_parse.py'''
    min_snpD = 10
    tri_allele= 0
    
    output_file = ('.').join(input_file.split('.')[0:-2]) + '_cleaned.txt'
    
    
    #relaxing conditions because we only have 3000 SNPs to begin with
    bad_samples = [sample.strip() for sample in open(bad_samples_file)]                                              
    df = DataFrame(read_csv(input_file, sep = '\t'))
    #remove bad samples
    df.drop(bad_samples, inplace = True, axis =1)
    #remove non-biallelic alleles
    #df.drop(df[df.apply(allele_count, axis = 1) != 2].index, inplace = True)
    
    
    #relaxing conditions because we only have 3000 SNPs to begin with
    '''#remove SNPs that are too close to one another
    df['diff'] = df.groupby('chrom')['pos'].diff()
    df.fillna('first', inplace = True)
    #df.to_csv('test_df.txt', sep = '\t')
    # BUG NOTE MUST FIX THE DAISY CHAIN PROBLEM
    df = df.query('diff > 10 or diff == "first"')
    df.drop('diff', axis = 1, inplace = True)'''
    
    if not freq_dict:
        #calculate the major and minor allele
        major = df.apply(major_find, axis =1 )
        minor = df.apply(minor_find, axis =1 )
        major_prop = df.apply(major_prop_find, axis =1 )
        minor_prop = df.apply(minor_prop_find, axis = 1)
    else:
        snp_dict = json.load(open(freq_dict))
        df['keys'] = df['chrom'].map(str) +':'+ df['pos'].map(str)        
        major = df['keys'].apply(lambda x : snp_dict[x]['major'])
        major_prop = df['keys'].apply(lambda x : snp_dict[x]['major_freq'])
        minor = df['keys'].apply(lambda x : snp_dict[x]['minor'])
        minor_prop = df['keys'].apply(lambda x : snp_dict[x]['minor_freq'])
        
        df.drop('keys', inplace= True, axis = 1)
               
        
        
    #inserting this stuff into dataframe for future use
    df.insert(3, 'minor_prop', minor_prop)
    df.insert(3, 'minor', minor)
    df.insert(3, 'major_prop', major_prop)
    df.insert(3, 'major', major)
    
    df.to_csv(output_file, sep = '\t', index= False)
    return df
Example #15
def homePageToSubjectPageDataframe(data):
    subject_dataframe = DataFrame(data,columns=['date','page_title','views','uniqueViews'])
    subject_dataframe = subject_dataframe.apply(to_numeric, errors='ignore')
    subject_dataframe.drop('date', axis=1, inplace=True)
    subject_dataframe = subject_dataframe.groupby(['page_title']).sum().sort_values(by='uniqueViews',ascending=0)
    subject_dataframe.reset_index(inplace=True)
    subject_dataframe['subject'] = subject_dataframe['page_title'].apply(lambda title: strip_edx_page_title(title))
    subject_dataframe['totalViews'] = subject_dataframe['uniqueViews'].sum()
    subject_dataframe['Pct'] = (subject_dataframe['uniqueViews'] / subject_dataframe['totalViews'])
    subject_dataframe = subject_dataframe[(subject_dataframe['Pct']>0.0001)]

    return subject_dataframe[['subject','uniqueViews','Pct']]
Example #16
def filter_tags(tag_pickle='results/material_tags.pickle', exclude_tags='results/exclude.csv', n=50):
    exclude_words, duplicate_sets = load_filter_tags(exclude_tags)
    with open(tag_pickle, 'r') as f:
        t = DataFrame(pickle.load(f)['result']).set_index('_id')
    for setn in duplicate_sets:
        t.ix[setn[0]] += sum(map(lambda x: t.ix[x] , setn[1:]))
        for tag in setn[1:]:
            t.drop(tag, inplace=True)
    for tag in exclude_words:
        t.drop(tag, inplace=True)
    t.sort(ascending=False)
    return t[:n].index
Example #17
def pd_dataframe6():
    obj = DataFrame(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
    print(obj)
    new_obj = obj.drop('c')
    print(new_obj)
    print(obj.drop(['b', 'c']))
    data = DataFrame(np.arange(16).reshape((4, 4)),
                     index=['Ohio', 'Colorado', 'Utah', 'New York'],
                     columns=['one', 'two', 'three', 'four'])
    print(data)
    print(data.drop(['Ohio', 'Colorado']))
    print(data.drop('two', axis=1))
    print(data.drop(['two', 'four'], axis=1))
Example #18
File: nbody.py Project: hummel/gadfly
 def load_velocities(self, unit=None):
     """
     Load Particle Velocities in units of km/s (default set in units class)
     unit: unit conversion from code units
     """
     if unit:
         self.units.set_velocity(unit)
     uvw = self._velocities.value * self.units.velocity_conv
     if self.units.coordinate_system == 'physical':
         a = self._header.ScaleFactor
         uvw *= numpy.sqrt(a)
     uvw = DataFrame(uvw, index=self._particleIDs.value, columns=['u', 'v', 'w'])
     if self._drop_ids is not None:
         uvw.drop(self._drop_ids, inplace=True)
     self[['u', 'v', 'w']] = uvw
def dataset_transformation(df : pd.DataFrame, testData : pd.DataFrame):

    #Age Transform
    df['AgeuponOutcome'] =  df['AgeuponOutcome'].apply(calculateAge)
    testData['AgeuponOutcome'] = testData['AgeuponOutcome'].apply(calculateAge)

    #name transform
    df['Name'] = df['Name'].apply(processName)
    testData['Name'] = testData['Name'].apply(processName)

    #df = df.apply(setMissingAge, axis=1)
    #testData = testData.apply(setMissingAge, axis=1)

    #encodeFeature(df,'OutcomeType')

    #Animal Type transform
    le = encodeFeature(df,testData,'AnimalType')


    #sex transform
    le =  encodeFeature(df,testData,'SexuponOutcome')
    le =  encodeFeature(df,testData,'SexuponOutcome1')


    #Breed transform
    le = encodeFeature(df,testData,'Breed')
    le = encodeFeature(df, testData, 'Breed1')
    le = encodeFeature(df, testData, 'Breed2')

    #color.
    le = encodeFeature(df,testData,'Color')
    le = encodeFeature(df, testData, 'Color1')
    le = encodeFeature(df, testData, 'Color2')


    #encodeFeature(df,'Breed1')
    #encodeFeature(df,'Breed2')
    #encodeFeature(df,'Breedcount')
    #encodeFeature(df,'Name')

    df =  df.drop(['DateTime'], axis=1)
    testData = testData.drop(['DateTime'], axis=1)

    df = df.drop(['Name'], axis=1)
    testData = testData.drop(['Name'], axis=1)


    return [df,testData]
Example #20
def thread_participation_evolution(
        pm_frame, project, n=2, skip_anon=True, research_only=False):
    """Assembles data on participation to threads in project with n as thresh.
    Returns DataFrame, index, selection and title for data for use
    by stacked bar-plot and heatmap functions."""
    if not research_only:
        thread_type = 'all threads'
        title = "Participation per thread in {} (threshold = {})".format(
            project, n)
    else:
        thread_type = 'research threads'
        title = "Participation per thread in {}\
                 (threshold = {}, only research-threads)".format(project, n)
    data = pm_frame.loc[project][['basic', thread_type]]
    data = data.dropna()
    all_authors = set().union(*data[thread_type, 'authors'])
    author_thread = DataFrame(columns=all_authors)
    for author in author_thread.columns:
        author_thread[author] = data[thread_type, 'authors'].apply(
            lambda thread, author=author: author in thread)
    author_thread = author_thread.T
    author_thread = author_thread.sort_values(by=data.index.tolist(),
                                              ascending=False)
    author_thread = author_thread.drop(
        "Anonymous") if skip_anon else author_thread
    author_thread.columns.name = "Threads"
    select = author_thread.sum(axis=1) >= n
    return author_thread, data.index, select, title
Example #21
def project_participation_evolution(
        pm_frame, all_authors, n=2, skip_anon=True, research_only=False):
    """Assembles data on participation to projects with n as thresh.
    Returns DataFrame, index, selection and title for data for use
    by stacked bar-plot and heatmap functions."""
    if not research_only:
        thread_type = 'all threads'
        data, _ = get_last(pm_frame, thread_type)
        all_authors = list(all_authors)
        title = "Participation per project in Polymath\
                 (threshold = {})".format(n)
    else:
        thread_type = 'research threads'
        data, _ = get_last(pm_frame, thread_type)
        all_authors = set().union(
            *data['research threads', 'authors (accumulated)'])
        title = "Participation per project in Polymath\
                 (threshold = {}, only research-threads)".format(n)
    data.index = data.index.droplevel(1)
    author_project = DataFrame(columns=all_authors)
    for author in author_project.columns:
        author_project[author] = data[
            thread_type, 'authors (accumulated)'].apply(
                lambda project, author=author: author in project)
    author_project = author_project.T
    author_project = author_project.sort_values(by=data.index.tolist(),
                                                ascending=False)
    author_project = author_project.drop(
        "Anonymous") if skip_anon else author_project
    select = author_project.sum(axis=1) >= n
    return author_project, data.index, select, title
Example #22
File: test_link.py Project: alexlib/mr
    def test_two_isolated_steppers_one_gapped(self):
        N = 5
        Y = 25
        # Begin second feature one frame later than the first, so the probe labeling (0, 1) is
        # established and not arbitrary.
        a = DataFrame({'x': np.arange(N), 'y': np.ones(N), 'frame': np.arange(N)})
        a = a.drop(3).reset_index(drop=True)
        b = DataFrame({'x': np.arange(1, N), 'y': Y + np.ones(N - 1), 'frame': np.arange(1, N)})
        f = pd.concat([a, b])
        actual = self.link(f, 5)
        expected = f.copy()
        expected['probe'] = np.concatenate([np.array([0, 0, 0, 2]), np.ones(N - 1)])
        expected.sort(['probe', 'frame'], inplace=True)
        expected.reset_index(drop=True, inplace=True)
        assert_frame_equal(actual, expected)

        # Sort rows by frame (normal use)
        actual = self.link(f.sort('frame'), 5)
        assert_frame_equal(actual, expected)

        # Shuffle rows (crazy!)
        np.random.seed(0)
        f1 = f.reset_index(drop=True)
        f1.reindex(np.random.permutation(f1.index))
        actual = self.link(f1, 5)
        assert_frame_equal(actual, expected)
Example #23
def training(iden, Charg, Temps, use_cache_trainingset, test, verbose):
    ''' Return the prediction function, 
    for a given site iden, history Charg and temperature Temps'''
    if use_cache_trainingset:
        if test:
            X = pickle.load(open(CACHE_DIR+"X_test_"+iden+".p", "rb"))
        else:
            X = pickle.load(open(CACHE_DIR+"X_"+iden+".p", "rb"))
    else:
        X = DataFrame(Charg[iden])
        X = X.dropna(how='any')
        X['dayofweek'] = X.index.dayofweek
        X['Temps'] = Temps[iden].ix[X.index]
        X['fracday'] = X.index.minute/60.+X.index.hour
        X['lastminutes'] = X[iden].ix[X.index-10*Minute()].values
        X['yesterday'] = X[iden].ix[X.index-Day()].values
        X['yesterdaybis'] = X[iden].ix[X.index-Day()-10*Minute()].values
        X['lastweek'] = X[iden].ix[X.index-Week()].values
        X['lastweekbis'] = X[iden].ix[X.index-Week()-10*Minute()].values
        if test:
            pickle.dump(X, open(CACHE_DIR+"X_test_"+iden+".p", "wb" ) )
        else:
            pickle.dump(X, open(CACHE_DIR+"X_test_"+iden+".p", "wb" ) )
    X = X.dropna(how='any')
    y = X[iden]
    X = X.drop(iden, 1)
    scalerX = preprocessing.StandardScaler().fit(X)
    ##############################
    clf = linear_model.SGDRegressor(alpha = 0.000001,n_iter=3000)
    ##############################
    clf.fit(scalerX.transform(X), y)
    if verbose:
        print('Function for '+iden+' computed.')
    return(lambda x :clf.predict(scalerX.transform(x)))
Example #24
def test_cythonized_aggers(op_name):
    data = {'A': [0, 0, 0, 0, 1, 1, 1, 1, 1, 1., np.nan, np.nan],
            'B': ['A', 'B'] * 6,
            'C': np.random.randn(12)}
    df = DataFrame(data)
    df.loc[2:10:2, 'C'] = np.nan

    op = lambda x: getattr(x, op_name)()

    # single column
    grouped = df.drop(['B'], axis=1).groupby('A')
    exp = {cat: op(group['C']) for cat, group in grouped}
    exp = DataFrame({'C': exp})
    exp.index.name = 'A'
    result = op(grouped)
    tm.assert_frame_equal(result, exp)

    # multiple columns
    grouped = df.groupby(['A', 'B'])
    expd = {}
    for (cat1, cat2), group in grouped:
        expd.setdefault(cat1, {})[cat2] = op(group['C'])
    exp = DataFrame(expd).T.stack(dropna=False)
    exp.index.names = ['A', 'B']
    exp.name = 'C'

    result = op(grouped)['C']
    if op_name in ['sum', 'prod']:
        tm.assert_series_equal(result, exp)
Example #25
    def test_two_nearby_steppers_one_gapped(self):
        N = 5
        Y = 2
        # Begin second feature one frame later than the first, so the particle labeling (0, 1) is
        # established and not arbitrary.
        a = DataFrame({'x': np.arange(N), 'y': np.ones(N), 'frame': np.arange(N)})
        b = DataFrame({'x': np.arange(1, N), 'y': Y + np.ones(N - 1), 'frame': np.arange(1, N)})
        a = a.drop(3).reset_index(drop=True)
        f = pd.concat([a, b])
        expected = f.copy().reset_index(drop=True)
        expected['particle'] = np.concatenate([np.array([0, 0, 0, 2]), np.ones(N - 1)])
        pandas_sort(expected, ['particle', 'frame'], inplace=True)
        expected.reset_index(drop=True, inplace=True)
        actual = self.link_df(f, 5)
        assert_frame_equal(actual, expected)
        actual_iter = self.link_df_iter(f, 5, hash_size=(50, 50))
        assert_frame_equal(actual_iter, expected)

        # Sort rows by frame (normal use)
        actual = self.link_df(pandas_sort(f, 'frame'), 5)
        assert_frame_equal(actual, expected)
        actual_iter = self.link_df_iter(pandas_sort(f, 'frame'), 5, hash_size=(50, 50))
        assert_frame_equal(actual_iter, expected)

        # Shuffle rows (crazy!)
        np.random.seed(0)
        f1 = f.reset_index(drop=True)
        f1.reindex(np.random.permutation(f1.index))
        actual = self.link_df(f1, 5)
        assert_frame_equal(actual, expected)
        actual_iter = self.link_df_iter(f1, 5, hash_size=(50, 50))
        assert_frame_equal(actual_iter, expected)
Example #26
    def load_abundances(self, tracked_species=None):
        """
        Load chemical abundances array.

        There are six abundances tracked for each particle.
        0:H2 1:HII 2:DII 3:HD 4:HeII 5:HeIII
        """
        default_species = ['H2', 'HII', 'DII', 'HD', 'HeII', 'HeIII']
        if tracked_species is None:
            tracked_species = default_species
        abundances = self._ChemicalAbundances.value
        abundances = DataFrame(abundances, index=self._particleIDs.value,
                               columns=tracked_species)
        if self._drop_ids is not None:
            abundances = abundances.drop(self._drop_ids)
        self[tracked_species] = abundances
Example #27
    def test_two_isolated_steppers_one_gapped(self):
        N = 5
        Y = 25
        # Begin second feature one frame later than the first,
        # so the particle labeling (0, 1) is established and not arbitrary.
        a = DataFrame({'x': np.arange(N), 'y': np.ones(N),
                      'frame': np.arange(N)})
        a = a.drop(3).reset_index(drop=True)
        b = DataFrame({'x': np.arange(1, N), 'y': Y + np.ones(N - 1),
                      'frame': np.arange(1, N)})
        f = pd.concat([a, b])
        expected = f.copy()
        expected['particle'] = np.concatenate([np.array([0, 0, 0, 2]), np.ones(N - 1)])
        pandas_sort(expected, ['particle', 'frame'], inplace=True)
        expected.reset_index(drop=True, inplace=True)
        actual = self.link(f, 5)
        assert_traj_equal(actual, expected)
        # link_df_iter() tests not performed, because hash_size is
        # not knowable from the first frame alone.

        # Sort rows by frame (normal use)
        actual = self.link(pandas_sort(f, 'frame'), 5)
        assert_traj_equal(actual, expected)

        # Shuffle rows (crazy!)
        np.random.seed(0)
        f1 = f.reset_index(drop=True)
        f1.reindex(np.random.permutation(f1.index))
        actual = self.link(f1, 5)
        assert_traj_equal(actual, expected)
Example #28
	def get_quote(self, symbols, dataframe = True):
		if isinstance(symbols, list) or isinstance(symbols, set) or isinstance(symbols, tuple):
			symbolList = list(symbols)
		elif isinstance(symbols, str):
			symbolList = symbols.split(',')
		symbols = util.symbols_to_string(symbols)
		url = URL_QUOTATION(symbols)
		retry = True
		while retry:
			try:
				quote  =self.session.get(
						URL_QUOTATION(symbols)
					,	timeout = 0.1
					).text
				retry = False
			except:
				pass
		quoteList = re.findall(r'\"(.*)\"', quote)
		if dataframe:
			for i in range( 0, len(quoteList) ):
				quoteList[i] = quoteList[i].split(',')
		else:
			for i in range( 0, len(quoteList) ):
				quoteList[i] = quoteList[i].split(',')
				quoteList[i].append( symbolList[i] )

		if dataframe:
			df_quote = DataFrame( quoteList, columns = SINA_QUOTE_COLUMNS )
			df_quote = df_quote.drop( 'ms', axis = 1 )
			df_quote["symbol"] = symbolList
			return df_quote
		else:
			return quoteList
Example #29
    def test_v12_compat(self):
        df = DataFrame(
            [
                [1.56808523, 0.65727391, 1.81021139, -0.17251653],
                [-0.2550111, -0.08072427, -0.03202878, -0.17581665],
                [1.51493992, 0.11805825, 1.629455, -1.31506612],
                [-0.02765498, 0.44679743, 0.33192641, -0.27885413],
                [0.05951614, -2.69652057, 1.28163262, 0.34703478],
            ],
            columns=["A", "B", "C", "D"],
            index=pd.date_range("2000-01-03", "2000-01-07"),
        )
        df["date"] = pd.Timestamp("19920106 18:21:32.12")
        df.ix[3, "date"] = pd.Timestamp("20130101")
        df["modified"] = df["date"]
        df.ix[1, "modified"] = pd.NaT

        v12_json = os.path.join(self.dirpath, "tsframe_v012.json")
        df_unser = pd.read_json(v12_json)
        assert_frame_equal(df, df_unser)

        df_iso = df.drop(["modified"], axis=1)
        v12_iso_json = os.path.join(self.dirpath, "tsframe_iso_v012.json")
        df_unser_iso = pd.read_json(v12_iso_json)
        assert_frame_equal(df_iso, df_unser_iso)
Example #30
def get_flights_from_route(cur, origin, destination):
    """
    Returns a dataframe for all flights matching origin, destination.
    """

    import time
    
    ### MySQL query
    time0 = time.time()
    cur.execute("SELECT Year, Month, DayofMonth, DayOfWeek, CRSDepTime, UniqueCarrier, ArrDelay FROM flights_100000 WHERE Origin = %s and Dest = %s;", (origin, destination))
    rows = cur.fetchall()
    td = time.time() - time0
    print('Database query took %.2f seconds.' % td)
    
    ### Convert to dataframe
    df = DataFrame(list(rows), columns=['Year', 'Month', 'DayOfMonth', 'DayOfWeek', 'CRSDepTime', 'Carrier', 'ArrDelay'])

    ### Drop columns without delays (cancellations)
    df = df.dropna()
    
    ### Create some auxiliary columns
    df['DayOfYear'] = df.apply( lambda x: datetime.datetime(x['Year'],x['Month'],x['DayOfMonth']).timetuple().tm_yday, axis=1)
    df['Week'] = df['DayOfYear'] / 7 + 1
    df['DepHour'] = df['CRSDepTime']/100

    ### Drop unused columns
    df = df.drop(['DayOfMonth','CRSDepTime'],axis=1).sort_index(axis=1)

    ## df.head()
    
    return df
Example #31
def _fix_dframe_for_libecl(dframe: pd.DataFrame) -> pd.DataFrame:
    """Fix a dataframe making it ready for EclSum.from_pandas()

    * Ensures that the index is always datetime, and sorted.
    * Removes BLOCK vectors, these are currently not supported as
      it requires knowledge of the grid dimensions. Warnings
      will be emitted for skipped columns

    Args:
        dframe: Dataframe to read. Will not be modified.

    Returns:
        Modified copy of incoming dataframe.
    """
    if dframe.empty:
        return dframe
    dframe = dframe.copy()
    if "DATE" in dframe.columns:
        # Infer datatype (Pandas cannot answer it) based on the first element:
        if isinstance(dframe["DATE"].values[0], pd.Timestamp):
            dframe["DATE"] = pd.Series(pd.to_pydatetime(dframe["DATE"]),
                                       dtype="object")
        if isinstance(dframe["DATE"].values[0], str):
            # Do not use pd.Series.apply() here, Pandas would try to convert it to
            # datetime64[ns] which is limited at year 2262.
            dframe["DATE"] = pd.Series(
                [dateutil.parser.parse(datestr) for datestr in dframe["DATE"]],
                dtype="object",
                index=dframe.index,
            )
        if isinstance(dframe["DATE"].values[0], dt.date):
            dframe["DATE"] = pd.Series(
                [
                    dt.datetime.combine(dateobj, dt.datetime.min.time())
                    for dateobj in dframe["DATE"]
                ],
                dtype="object",
                index=dframe.index,
            )

        dframe.set_index("DATE", inplace=True)
    if not isinstance(dframe.index.values[0],
                      (dt.datetime, np.datetime64, pd.Timestamp)):
        raise ValueError(
            "dataframe must have a datetime index, got %s of type %s" %
            (dframe.index.values[0], type(dframe.index.values[0])))
    dframe.sort_index(axis=0, inplace=True)

    # This column will appear if dataframes are naively written to CSV
    # files and read back in again.
    if "Unnamed: 0" in dframe:
        dframe.drop("Unnamed: 0", axis="columns", inplace=True)

    block_columns = [
        col for col in dframe.columns
        if (col.startswith("B") or col.startswith("LB"))
    ]
    if block_columns:
        dframe = dframe.drop(columns=block_columns)
        logger.warning(
            "Dropped columns with block data, not supported: %s",
            str({
                colname.partition(":")[0] + ":*"
                for colname in block_columns
            }),
        )

    return dframe
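
A self-contained sketch of the core normalization the function performs (datetime index, sorted, B*/LB* block vectors dropped), on a toy frame with assumed column names; it does not require libecl:

import datetime as dt
import pandas as pd

# Toy summary-like frame: a DATE column as strings plus one block vector.
dframe = pd.DataFrame({
    "DATE": ["2020-01-01", "2020-03-01", "2020-02-01"],
    "FOPT": [0.0, 300.0, 100.0],
    "BPR:1,1,1": [250.0, 240.0, 245.0],
})

# Normalize DATE to python datetimes kept as objects, index and sort by it.
dframe["DATE"] = pd.Series(
    [dt.datetime.combine(d.date(), dt.datetime.min.time())
     for d in pd.to_datetime(dframe["DATE"])],
    dtype="object",
    index=dframe.index,
)
dframe = dframe.set_index("DATE").sort_index(axis=0)

# Drop block vectors, which need grid dimensions that are not available here.
block_columns = [col for col in dframe.columns if col.startswith(("B", "LB"))]
dframe = dframe.drop(columns=block_columns)
print(dframe)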
Example #32
def _prepare_and_save_submit(submit: pd.DataFrame, config: InferenceConfig):
    submit['Id'] = submit['image_name']
    submit.drop(['kind', 'image_name', 'label'], axis=1, inplace=True)
    submit.to_csv(config.sumbit_name, index=False)
Example #33
def gen_interaction_df(df: pd.DataFrame, response: str, degree: int=2, inter_only: bool=False, bias: bool=False):

    x_interaction = PolynomialFeatures(degree=degree, interaction_only=inter_only, include_bias=bias).fit_transform(df.drop(columns=response))
    interaction_df = pd.DataFrame(x_interaction, columns=gen_column_names(df.drop(columns=response), inter_only))
    return interaction_df.join(df[response])
X, y = make_moons(n_samples=10000, noise=0.2)
df = DataFrame(dict(x=X[:,0], y=X[:,1], label=y))

# scatter plot, dots colored by class value
colors = {0:'red', 1:'blue'}
fig, ax = plt.subplots()
grouped = df.groupby('label')
for key, group in grouped:
    group.plot(ax=ax, kind='scatter', x='x', y='y', label=key, color=colors[key],s=1)
plt.show()


#------------------ logistic regression

log_reg = LogisticRegression()
log_reg.fit(df.drop('label',axis=1), df['label'])


# AUC
probs = log_reg.predict_proba(df.drop('label',axis=1))
probs = probs[:, 1]
roc_auc_score(df['label'], probs)


#----------------- plotting the decision boundary

#creating grid
xx, yy = np.mgrid[-3:3:.01, -3:3:.01]
grid = np.c_[xx.ravel(), yy.ravel()]
probs = log_reg.predict_proba(grid)[:, 1].reshape(xx.shape)
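
The snippet stops right after evaluating the model on the grid. A possible continuation (it assumes the xx, yy, probs and df variables defined above) that draws the probability contour and overlays the training points:

import matplotlib.pyplot as plt

# Contour of P(label=1) over the grid, with the training points on top.
fig, ax = plt.subplots(figsize=(8, 6))
contour = ax.contourf(xx, yy, probs, 25, cmap="RdBu", vmin=0, vmax=1)
fig.colorbar(contour, ax=ax)
ax.scatter(df['x'], df['y'], c=df['label'], cmap="RdBu_r", s=2, edgecolor="none")
ax.set(xlabel='x', ylabel='y', title='Logistic regression decision boundary')
plt.show()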
Example #35
def df_node_to_cooc(df_dump, context=None):
    """ converts df_dump to df_cooc + f1_set

    strategy:
    (1a) create overlapping contexts with nodes
    (1b) concatenate local contexts
    (2a) sort by abs(offset)
    (2b) deduplicate by cpos, keep first occurrences (=smallest offset)
    (3a) f1_set = (cpos where offset == 0)
    (3b) remove rows where cpos in f1_set

    NB: equivalent to UCS when switching steps 3a and 3b

    :param DataFrame df_dump: [match, matchend] + [context_id, context, contextend]
    :param int context: confine context?

    :return: deduplicated df_cooc [match, cpos, offset] + f1_set (cpos of nodes)
    :rtype: tuple(DataFrame, set)
    """

    if context == 0:
        logger.warning("can't work on 0 context")
        return DataFrame(), set()

    # reset the index to be able to work with it
    df = df_dump.reset_index()

    if context is None:
        if (df['match'].values == df['context'].values).all() and (
                df['matchend'].values == df['contextend'].values).all():
            return DataFrame(), set()
        else:
            df['start'] = df['context']
            df['end'] = df['contextend']

    else:
        logger.info("re-confine regions by given context")
        df['start'] = df['match'] - context
        df['start'] = df[['start', 'context']].max(axis=1)
        df['end'] = df['matchend'] + context
        df['end'] = df[['end', 'contextend']].min(axis=1)

    logger.info("(1a) create local contexts")
    df = DataFrame.from_records(df.apply(node2cooc, axis=1).values)

    logger.info("(1b) concatenate local contexts")
    df_infl = DataFrame({
        'match':
        list(chain.from_iterable(df['match_list'].values)),
        'cpos':
        list(chain.from_iterable(df['cpos_list'].values)),
        'offset':
        list(chain.from_iterable(df['offset_list'].values))
    })

    logger.info("(2a) sort by absolute offset")
    df_infl['abs_offset'] = df_infl.offset.abs()
    df_infl = df_infl.sort_values(by=['abs_offset', 'cpos'])
    df_infl = df_infl.drop(["abs_offset"], axis=1)

    logger.info("(2b) drop duplicates")
    df_defl = df_infl.drop_duplicates(subset='cpos')

    logger.info("(3a) identify nodes ...")
    f1_set = set(df_defl.loc[df_defl['offset'] == 0]['cpos'])

    logger.info("(3b) ... and remove them")
    df_defl = df_defl[df_defl['offset'] != 0]

    return df_defl, f1_set
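
A toy walk-through of steps (2a)-(3b) from the docstring: sort by absolute offset, deduplicate by cpos keeping the smallest offset, then split off and remove the node positions. The column names follow the function; the numbers are invented:

import pandas as pd

# Toy concatenated contexts: one cpos can appear in several matches.
df_infl = pd.DataFrame({
    "match": [0, 0, 0, 7, 7],
    "cpos": [4, 5, 6, 6, 7],
    "offset": [-1, 0, 1, -1, 0],
})

# (2a) sort by |offset| so the smallest offset per cpos comes first
df_infl = df_infl.assign(abs_offset=df_infl["offset"].abs())
df_infl = df_infl.sort_values(by=["abs_offset", "cpos"]).drop(["abs_offset"], axis=1)

# (2b) keep the first (= smallest-offset) row per corpus position
df_defl = df_infl.drop_duplicates(subset="cpos")

# (3a) node positions are those with offset 0 ... (3b) and are removed
f1_set = set(df_defl.loc[df_defl["offset"] == 0, "cpos"])
df_defl = df_defl[df_defl["offset"] != 0]
print(df_defl, f1_set)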
Example #36
cur.execute(SQL1)
gdp=cur.fetchall()
cur.execute(SQL2)
data=cur.fetchall()

#get the column headers from the cursor description, so a long list of Chinese column names does not have to be typed out
_=cur.description
columns1=[]
for x in _:
    columns1.append(x[0])
cur.close()  #close the connection

#convert the 2-D tuple into a DataFrame; note that a tuple of tuples such as ((1,2),(2,3)) cannot be turned into a DataFrame directly and must first be converted to a list
data=DataFrame(list(data),columns=columns1)
data=data.set_index(data['year'])
data=data.drop('year',axis=1)
gdp=DataFrame(list(gdp))
gdp=gdp.set_index(gdp[0])
gdp=gdp.drop(0,axis=1)
gdp.rename(columns={1:'生产总值'},inplace=True)
#gdp=gdp.rename({1:'生产总值'},inplace=True)  #note: when renaming columns you must pass them via the columns argument

data['生产总值']=gdp #merge the tables: add the GDP column from the gdp frame into data; since the index was changed to the year, the new values align on the index automatically


'''
Second way of fetching the data:
preprocess it directly in the SQL statement so that less data has to be pulled
'''
'''
#fetch the data from the database
Example #37
    def calculate_signals(self, df: pd.DataFrame, drop_extra_columns=True):
        n1 = self.n
        n2 = self.scale * n1

        df['median'] = df['close'].rolling(window=n2).mean()
        df['std'] = df['close'].rolling(n2, min_periods=1).std(
            ddof=0)  # ddof sets the delta degrees of freedom of the std
        df['z_score'] = abs(df['close'] - df['median']) / df['std']
        df['m'] = df['z_score'].rolling(window=n2).mean()
        df['upper'] = df['median'] + df['std'] * df['m']
        df['lower'] = df['median'] - df['std'] * df['m']

        condition_long = df['close'] > df['upper']
        condition_short = df['close'] < df['lower']

        df['mtm'] = df['close'] / df['close'].shift(n1) - 1
        df['mtm_mean'] = df['mtm'].rolling(window=n1, min_periods=1).mean()

        # volatility factor wd_atr based on the price ATR
        df['c1'] = df['high'] - df['low']
        df['c2'] = abs(df['high'] - df['close'].shift(1))
        df['c3'] = abs(df['low'] - df['close'].shift(1))
        df['tr'] = df[['c1', 'c2', 'c3']].max(axis=1)
        df['atr'] = df['tr'].rolling(window=n1, min_periods=1).mean()
        df['avg_price'] = df['close'].rolling(window=n1, min_periods=1).mean()
        df['wd_atr'] = df['atr'] / df['avg_price']

        # ATR-style volatility factor computed on the MTM indicator
        df['mtm_l'] = df['low'] / df['low'].shift(n1) - 1
        df['mtm_h'] = df['high'] / df['high'].shift(n1) - 1
        df['mtm_c'] = df['close'] / df['close'].shift(n1) - 1
        df['mtm_c1'] = df['mtm_h'] - df['mtm_l']
        df['mtm_c2'] = abs(df['mtm_h'] - df['mtm_c'].shift(1))
        df['mtm_c3'] = abs(df['mtm_l'] - df['mtm_c'].shift(1))
        df['mtm_tr'] = df[['mtm_c1', 'mtm_c2', 'mtm_c3']].max(axis=1)
        df['mtm_atr'] = df['mtm_tr'].rolling(window=n1, min_periods=1).mean()

        # ATR-style volatility factor computed on the MTM mean indicator
        df['mtm_l_mean'] = df['mtm_l'].rolling(window=n1, min_periods=1).mean()
        df['mtm_h_mean'] = df['mtm_h'].rolling(window=n1, min_periods=1).mean()
        df['mtm_c_mean'] = df['mtm_c'].rolling(window=n1, min_periods=1).mean()
        df['mtm_c1'] = df['mtm_h_mean'] - df['mtm_l_mean']
        df['mtm_c2'] = abs(df['mtm_h_mean'] - df['mtm_c_mean'].shift(1))
        df['mtm_c3'] = abs(df['mtm_l_mean'] - df['mtm_c_mean'].shift(1))
        df['mtm_tr'] = df[['mtm_c1', 'mtm_c2', 'mtm_c3']].max(axis=1)
        df['mtm_atr_mean'] = df['mtm_tr'].rolling(window=n1,
                                                  min_periods=1).mean()

        indicator = 'mtm_mean'

        # multiply the mtm_mean indicator by the three volatility factors
        df[indicator] = 1e5 * df['mtm_atr'] * df['mtm_atr_mean'] * df[
            'wd_atr'] * df[indicator]

        # adaptive Bollinger bands on the new strategy factor
        df['median'] = df[indicator].rolling(window=n1).mean()
        df['std'] = df[indicator].rolling(n1, min_periods=1).std(
            ddof=0)  # ddof sets the delta degrees of freedom of the std
        df['z_score'] = abs(df[indicator] - df['median']) / df['std']
        # df['m'] = df['z_score'].rolling(window=n1).max().shift(1)
        # df['m'] = df['z_score'].rolling(window=n1).mean()
        df['m'] = df['z_score'].rolling(window=n1).min().shift(1)
        df['up'] = df['median'] + df['std'] * df['m']
        df['dn'] = df['median'] - df['std'] * df['m']

        # go long on a breakout above the upper band
        condition1 = df[indicator] > df['up']
        condition2 = df[indicator].shift(1) <= df['up'].shift(1)
        condition = condition1 & condition2
        df.loc[condition, 'signal_long'] = 1

        # go short on a breakout below the lower band
        condition1 = df[indicator] < df['dn']
        condition2 = df[indicator].shift(1) >= df['dn'].shift(1)
        condition = condition1 & condition2
        df.loc[condition, 'signal_short'] = -1

        # close long positions when the indicator falls back below the median
        condition1 = df[indicator] < df['median']
        condition2 = df[indicator].shift(1) >= df['median'].shift(1)
        condition = condition1 & condition2
        df.loc[condition, 'signal_long'] = 0

        # close short positions when the indicator rises back above the median
        condition1 = df[indicator] > df['median']
        condition2 = df[indicator].shift(1) <= df['median'].shift(1)
        condition = condition1 & condition2
        df.loc[condition, 'signal_short'] = 0

        df.loc[condition_long, 'signal_short'] = 0
        df.loc[condition_short, 'signal_long'] = 0

        # === derive the actual daily position from the signal
        # the signal uses the close price, so it is generated after each bar closes; the trade is only entered at the next bar's open, which is when the position actually changes.
        df['signal_short'].fillna(method='ffill', inplace=True)
        df['signal_long'].fillna(method='ffill', inplace=True)
        df['signal'] = df[['signal_long', 'signal_short'
                           ]].sum(axis=1, min_count=1,
                                  skipna=True)  # if your pandas version is recent, use this line instead of the one above
        temp = df[df['signal'].notnull()][['signal']]
        temp = temp[temp['signal'] != temp['signal'].shift(1)]
        df['signal'] = temp['signal']

        # df.drop(['signal_long', 'signal_short'], axis=1, inplace=True)
        df.drop([
            'mtm', 'mtm_l', 'mtm_h', 'mtm_c', 'atr', 'z_score', 'c1', 'c2',
            'c3', 'tr', 'avg_price', 'wd_atr', 'mtm_c3', 'mtm_tr', 'mtm_atr',
            'mtm_l_mean', 'mtm_h_mean', 'mtm_c_mean', 'mtm_atr_mean', 'mtm_c2',
            'mtm_c1'
        ],
                axis=1,
                inplace=True)
        return df
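
A small self-contained sketch of the final signal-to-position step described in the translated comments above (forward-fill each leg, then combine with a min_count sum); the data is made up:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "signal_long": [np.nan, 1, np.nan, 0, np.nan],
    "signal_short": [np.nan, np.nan, -1, np.nan, 0],
})

# Forward-fill each leg, then sum; min_count=1 keeps NaN where neither
# leg has produced a signal yet (same idea as the end of calculate_signals).
df["signal_long"] = df["signal_long"].ffill()
df["signal_short"] = df["signal_short"].ffill()
df["signal"] = df[["signal_long", "signal_short"]].sum(axis=1, min_count=1, skipna=True)
print(df)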
Example #38
# 0   26    Ken
# 1   29  Jerry

# access a single cell by label
print(df1.at[1, 'name'])
# Jerry

# rename the columns
df1.columns = ['Age', 'Name']
print(df1)
#    Age   Name
# 0   26    Ken
# 1   29  Jerry
# 2   24    Ben

#append a row (modifies df1 in place)
df1.loc[len(df1)] = [24, 'qin']
print(df1)
#add a column (modifies df1 in place)
df1['Sex'] = [1, 1, 2, 1]
print(df1)
#drop a row (does not modify the original df)
df2 = df1.drop(1, axis=0)
print(df1)
print(df2)

#drop a column (does not modify the original df)
df3 = df1.drop('Name', axis=1)
print(df1)
print(df3)
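
drop() returns a new frame by default, as the comments above note. A minimal sketch of the two ways to make the removal stick (reassignment or inplace=True), on the same toy data:

import pandas as pd

# Either reassign the result of drop() ...
df1 = pd.DataFrame({"Age": [26, 29, 24], "Name": ["Ken", "Jerry", "Ben"]})
df1 = df1.drop("Name", axis=1)

# ... or drop in place (returns None and modifies the frame directly).
df2 = pd.DataFrame({"Age": [26, 29, 24], "Name": ["Ken", "Jerry", "Ben"]})
df2.drop(1, axis=0, inplace=True)

print(df1)
print(df2)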
Example #39
 def check_timestamp(self, d: pd.DataFrame) -> pd.DataFrame:
     if all(d["timestamp"].isna()):
         d = d.drop(columns=["timestamp"]).reset_index(drop=True)
     return d
Example #40
 def _pca_transform(self, df: pd.DataFrame, n_components: int):
     pca = PCA(n_components)
     return pca.fit_transform(df.drop('Platform', axis=1).values)
Example #41
    def general_data_processing(self, X: DataFrame, X_test: DataFrame,
                                holdout_frac: float, num_bagging_folds: int):
        """ General data processing steps used for all models. """
        X = copy.deepcopy(X)
        # TODO: We should probably uncomment the below lines, NaN label should be treated as just another value in multiclass classification -> We will have to remove missing, compute problem type, and add back missing if multiclass
        # if self.problem_type == MULTICLASS:
        #     X[self.label] = X[self.label].fillna('')

        # Remove all examples with missing labels from this dataset:
        missinglabel_inds = [
            i for i, j in enumerate(X[self.label].isna()) if j
        ]
        if len(missinglabel_inds) > 0:
            logger.warning(
                f"Warning: Ignoring {len(missinglabel_inds)} (out of {len(X)}) training examples for which the label value in column '{self.label}' is missing"
            )
            X = X.drop(missinglabel_inds, axis=0)

        if self.problem_type is None:
            self.problem_type = self.get_problem_type(X[self.label])

        if X_test is not None and self.label in X_test.columns:
            # TODO: This is not an ideal solution, instead check if bagging and X_test exists with label, then merge them prior to entering general data processing.
            #  This solution should handle virtually all cases correctly, only downside is it might cut more classes than it needs to.
            self.threshold, holdout_frac, num_bagging_folds = self.adjust_threshold_if_necessary(
                X[self.label],
                threshold=self.threshold,
                holdout_frac=1,
                num_bagging_folds=num_bagging_folds)
        else:
            self.threshold, holdout_frac, num_bagging_folds = self.adjust_threshold_if_necessary(
                X[self.label],
                threshold=self.threshold,
                holdout_frac=holdout_frac,
                num_bagging_folds=num_bagging_folds)

        if (self.objective_func
                is not None) and (self.objective_func.name
                                  == 'log_loss') and (self.problem_type
                                                      == MULTICLASS):
            X = self.augment_rare_classes(X)

        # Gets labels prior to removal of infrequent classes
        y_uncleaned = X[
            self.label].copy()  # .astype('category').cat.categories

        self.cleaner = Cleaner.construct(problem_type=self.problem_type,
                                         label=self.label,
                                         threshold=self.threshold)
        # TODO: What if all classes in X are low frequency in multiclass? Currently we would crash. Not certain how many problems actually have this property
        X = self.cleaner.fit_transform(
            X)  # TODO: Consider merging cleaner into label_cleaner
        self.label_cleaner = LabelCleaner.construct(
            problem_type=self.problem_type,
            y=X[self.label],
            y_uncleaned=y_uncleaned)
        if (self.label_cleaner.num_classes
                is not None) and (self.label_cleaner.num_classes == 2):
            self.trainer_problem_type = BINARY
        else:
            self.trainer_problem_type = self.problem_type

        X, y = self.extract_label(X)
        y = self.label_cleaner.transform(y)

        if X_test is not None and self.label in X_test.columns:
            X_test = self.cleaner.transform(X_test)
            if len(X_test) == 0:
                logger.debug(
                    'All X_test data contained low frequency classes, ignoring X_test and generating from subset of X'
                )
                X_test = None
                y_test = None
            else:
                X_test, y_test = self.extract_label(X_test)
                y_test = self.label_cleaner.transform(y_test)
        else:
            y_test = None

        # TODO: Move this up to top of data before removing data, this way our feature generator is better
        if X_test is not None:
            # Do this if working with SKLearn models, otherwise categorical features may perform very badly on the test set
            logger.log(
                15,
                'Performing general data preprocessing with merged train & validation data, so validation performance may not accurately reflect performance on new test data'
            )
            X_super = pd.concat([X, X_test], ignore_index=True)
            X_super = self.feature_generator.fit_transform(
                X_super,
                banned_features=self.submission_columns,
                drop_duplicates=False)
            X = X_super.head(len(X)).set_index(X.index)
            X_test = X_super.tail(len(X_test)).set_index(X_test.index)
            del X_super
        else:
            X = self.feature_generator.fit_transform(
                X,
                banned_features=self.submission_columns,
                drop_duplicates=False)

        return X, y, X_test, y_test, holdout_frac, num_bagging_folds
Example #42
def variables_targets_split(
        data: pd.DataFrame,
        targets: list) -> Tuple[pd.DataFrame, pd.DataFrame]:
    return data[targets], data.drop(targets, axis=1)
# In[2]:

X, y = make_blobs(n_samples=1000, centers=2, n_features=2)

# In[3]:

df = DataFrame(dict(x=X[:, 0], y=X[:, 1], label=y))

# In[4]:

df

# In[5]:

X = df.drop('label', axis=1)
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.40,
                                                    random_state=42)

# # Printing the Dataset

# In[6]:

col = {0: 'red', 1: 'blue'}
fig, ax = plt.subplots()
grouped = df.groupby('label')
for key, group in grouped:
    group.plot(ax=ax, kind='scatter', x='x', y='y', label=key, color=col[key])
        json_text = json.load(json_file)

messages = json_text['messages']
b = DataFrame(messages)
mydata = DataFrame(messages, columns=['from', 'type', 'text'])
indxs_drop = []
for ii in range(len(mydata)):
    if mydata['type'][ii] == 'service':
        indxs_drop = indxs_drop + [ii]
    elif type(mydata['text'].values[ii]) == list:
        #print(ii)
        mydata['text'].values[ii] = mydata['text'].values[ii][0]
        if type(mydata['text'].values[ii]) == dict:
            mydata['text'].values[ii] = mydata['text'].values[ii]['text']

mydata = mydata.drop(indxs_drop)
del mydata['type']

all_actors = mydata['from'].unique()
#%% engineer the text
import nltk
import string
import pattern
# Importing FreqDist library from nltk and passing token into FreqDist
from nltk.probability import FreqDist

my_stop_words = [
    "a", "abbastanza", "abbia", "abbiamo", "abbiano", "abbiate", "accidenti",
    "ad", "adesso", "affinché", "agl", "agli", "ahime", "ahimè", "ai", "al",
    "alcuna", "alcuni", "alcuno", "all", "alla", "alle", "allo", "allora",
    "altre", "altri", "altrimenti", "altro", "altrove", "altrui", "anche",
Example #45
class TestPivotTable(unittest.TestCase):

    _multiprocess_can_split_ = True

    def setUp(self):
        self.data = DataFrame({
            'A': [
                'foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', 'foo',
                'foo', 'foo'
            ],
            'B': [
                'one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', 'two',
                'two', 'one'
            ],
            'C': [
                'dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
                'dull', 'shiny', 'shiny', 'shiny'
            ],
            'D':
            np.random.randn(11),
            'E':
            np.random.randn(11),
            'F':
            np.random.randn(11)
        })

    def test_pivot_table(self):
        rows = ['A', 'B']
        cols = 'C'
        table = pivot_table(self.data, values='D', rows=rows, cols=cols)

        table2 = self.data.pivot_table(values='D', rows=rows, cols=cols)
        tm.assert_frame_equal(table, table2)

        # this works
        pivot_table(self.data, values='D', rows=rows)

        if len(rows) > 1:
            self.assertEqual(table.index.names, rows)
        else:
            self.assertEqual(table.index.name, rows[0])

        if len(cols) > 1:
            self.assertEqual(table.columns.names, cols)
        else:
            self.assertEqual(table.columns.name, cols[0])

        expected = self.data.groupby(rows + [cols])['D'].agg(np.mean).unstack()
        tm.assert_frame_equal(table, expected)

    def test_pivot_table_nocols(self):
        df = DataFrame({
            'rows': ['a', 'b', 'c'],
            'cols': ['x', 'y', 'z'],
            'values': [1, 2, 3]
        })
        rs = df.pivot_table(cols='cols', aggfunc=np.sum)
        xp = df.pivot_table(rows='cols', aggfunc=np.sum).T
        tm.assert_frame_equal(rs, xp)

        rs = df.pivot_table(cols='cols', aggfunc={'values': 'mean'})
        xp = df.pivot_table(rows='cols', aggfunc={'values': 'mean'}).T
        tm.assert_frame_equal(rs, xp)

    def test_pass_array(self):
        result = self.data.pivot_table('D', rows=self.data.A, cols=self.data.C)
        expected = self.data.pivot_table('D', rows='A', cols='C')
        tm.assert_frame_equal(result, expected)

    def test_pass_function(self):
        result = self.data.pivot_table('D',
                                       rows=lambda x: x // 5,
                                       cols=self.data.C)
        expected = self.data.pivot_table('D',
                                         rows=self.data.index // 5,
                                         cols='C')
        tm.assert_frame_equal(result, expected)

    def test_pivot_table_multiple(self):
        rows = ['A', 'B']
        cols = 'C'
        table = pivot_table(self.data, rows=rows, cols=cols)
        expected = self.data.groupby(rows + [cols]).agg(np.mean).unstack()
        tm.assert_frame_equal(table, expected)

    def test_pivot_dtypes(self):

        # can convert dtypes
        f = DataFrame({
            'a': ['cat', 'bat', 'cat', 'bat'],
            'v': [1, 2, 3, 4],
            'i': ['a', 'b', 'a', 'b']
        })
        self.assert_(f.dtypes['v'] == 'int64')

        z = pivot_table(f,
                        values='v',
                        rows=['a'],
                        cols=['i'],
                        fill_value=0,
                        aggfunc=np.sum)
        result = z.get_dtype_counts()
        expected = Series(dict(int64=2))
        tm.assert_series_equal(result, expected)

        # cannot convert dtypes
        f = DataFrame({
            'a': ['cat', 'bat', 'cat', 'bat'],
            'v': [1.5, 2.5, 3.5, 4.5],
            'i': ['a', 'b', 'a', 'b']
        })
        self.assert_(f.dtypes['v'] == 'float64')

        z = pivot_table(f,
                        values='v',
                        rows=['a'],
                        cols=['i'],
                        fill_value=0,
                        aggfunc=np.mean)
        result = z.get_dtype_counts()
        expected = Series(dict(float64=2))
        tm.assert_series_equal(result, expected)

    def test_pivot_multi_values(self):
        result = pivot_table(self.data,
                             values=['D', 'E'],
                             rows='A',
                             cols=['B', 'C'],
                             fill_value=0)
        expected = pivot_table(self.data.drop(['F'], axis=1),
                               rows='A',
                               cols=['B', 'C'],
                               fill_value=0)
        tm.assert_frame_equal(result, expected)

    def test_pivot_multi_functions(self):
        f = lambda func: pivot_table(self.data,
                                     values=['D', 'E'],
                                     rows=['A', 'B'],
                                     cols='C',
                                     aggfunc=func)
        result = f([np.mean, np.std])
        means = f(np.mean)
        stds = f(np.std)
        expected = concat([means, stds], keys=['mean', 'std'], axis=1)
        tm.assert_frame_equal(result, expected)

        # margins not supported??
        f = lambda func: pivot_table(self.data,
                                     values=['D', 'E'],
                                     rows=['A', 'B'],
                                     cols='C',
                                     aggfunc=func,
                                     margins=True)
        result = f([np.mean, np.std])
        means = f(np.mean)
        stds = f(np.std)
        expected = concat([means, stds], keys=['mean', 'std'], axis=1)
        tm.assert_frame_equal(result, expected)

    def test_pivot_index_with_nan(self):
        # GH 3588
        nan = np.nan
        df = DataFrame({
            "a": ['R1', 'R2', nan, 'R4'],
            'b': ["C1", "C2", "C3", "C4"],
            "c": [10, 15, nan, 20]
        })
        result = df.pivot('a', 'b', 'c')
        expected = DataFrame([[nan, nan, nan, nan], [nan, 10, nan, nan],
                              [nan, nan, nan, nan], [nan, nan, 15, 20]],
                             index=Index(['R1', 'R2', nan, 'R4'], name='a'),
                             columns=Index(['C1', 'C2', 'C3', 'C4'], name='b'))
        tm.assert_frame_equal(result, expected)

    def test_margins(self):
        def _check_output(res, col, rows=['A', 'B'], cols=['C']):
            cmarg = res['All'][:-1]
            exp = self.data.groupby(rows)[col].mean()
            tm.assert_series_equal(cmarg, exp)

            rmarg = res.xs(('All', ''))[:-1]
            exp = self.data.groupby(cols)[col].mean()
            tm.assert_series_equal(rmarg, exp)

            gmarg = res['All']['All', '']
            exp = self.data[col].mean()
            self.assertEqual(gmarg, exp)

        # column specified
        table = self.data.pivot_table('D',
                                      rows=['A', 'B'],
                                      cols='C',
                                      margins=True,
                                      aggfunc=np.mean)
        _check_output(table, 'D')

        # no column specified
        table = self.data.pivot_table(rows=['A', 'B'],
                                      cols='C',
                                      margins=True,
                                      aggfunc=np.mean)
        for valcol in table.columns.levels[0]:
            _check_output(table[valcol], valcol)

        # no col

        # to help with a buglet
        self.data.columns = [k * 2 for k in self.data.columns]
        table = self.data.pivot_table(rows=['AA', 'BB'],
                                      margins=True,
                                      aggfunc=np.mean)
        for valcol in table.columns:
            gmarg = table[valcol]['All', '']
            self.assertEqual(gmarg, self.data[valcol].mean())

        # this is OK
        table = self.data.pivot_table(rows=['AA', 'BB'],
                                      margins=True,
                                      aggfunc='mean')

        # no rows
        rtable = self.data.pivot_table(cols=['AA', 'BB'],
                                       margins=True,
                                       aggfunc=np.mean)
        self.assert_(isinstance(rtable, Series))
        for item in ['DD', 'EE', 'FF']:
            gmarg = table[item]['All', '']
            self.assertEqual(gmarg, self.data[item].mean())

    def test_pivot_integer_columns(self):
        # caused by upstream bug in unstack
        from pandas.util.compat import product
        import datetime
        import pandas

        d = datetime.date.min
        data = list(
            product(['foo', 'bar'], ['A', 'B', 'C'], ['x1', 'x2'],
                    [d + datetime.timedelta(i) for i in xrange(20)], [1.0]))
        df = pandas.DataFrame(data)
        table = df.pivot_table(values=4, rows=[0, 1, 3], cols=[2])

        df2 = df.rename(columns=str)
        table2 = df2.pivot_table(values='4', rows=['0', '1', '3'], cols=['2'])

        tm.assert_frame_equal(table, table2, check_names=False)

    def test_pivot_no_level_overlap(self):
        # GH #1181

        data = DataFrame({
            'a': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'] * 2,
            'b': [0, 0, 0, 0, 1, 1, 1, 1] * 2,
            'c': (['foo'] * 4 + ['bar'] * 4) * 2,
            'value': np.random.randn(16)
        })

        table = data.pivot_table('value', rows='a', cols=['b', 'c'])

        grouped = data.groupby(['a', 'b', 'c'])['value'].mean()
        expected = grouped.unstack('b').unstack('c').dropna(axis=1, how='all')
        tm.assert_frame_equal(table, expected)

    def test_pivot_columns_lexsorted(self):
        import datetime
        import numpy as np
        import pandas

        n = 10000

        dtype = np.dtype([
            ("Index", object),
            ("Symbol", object),
            ("Year", int),
            ("Month", int),
            ("Day", int),
            ("Quantity", int),
            ("Price", float),
        ])

        products = np.array([
            ('SP500', 'ADBE'),
            ('SP500', 'NVDA'),
            ('SP500', 'ORCL'),
            ('NDQ100', 'AAPL'),
            ('NDQ100', 'MSFT'),
            ('NDQ100', 'GOOG'),
            ('FTSE', 'DGE.L'),
            ('FTSE', 'TSCO.L'),
            ('FTSE', 'GSK.L'),
        ],
                            dtype=[('Index', object), ('Symbol', object)])
        items = np.empty(n, dtype=dtype)
        iproduct = np.random.randint(0, len(products), n)
        items['Index'] = products['Index'][iproduct]
        items['Symbol'] = products['Symbol'][iproduct]
        dr = pandas.date_range(datetime.date(2000, 1, 1),
                               datetime.date(2010, 12, 31))
        dates = dr[np.random.randint(0, len(dr), n)]
        items['Year'] = dates.year
        items['Month'] = dates.month
        items['Day'] = dates.day
        items['Price'] = np.random.lognormal(4.0, 2.0, n)

        df = DataFrame(items)

        pivoted = df.pivot_table('Price',
                                 rows=['Month', 'Day'],
                                 cols=['Index', 'Symbol', 'Year'],
                                 aggfunc='mean')

        self.assert_(pivoted.columns.is_monotonic)

    def test_pivot_complex_aggfunc(self):
        f = {'D': ['std'], 'E': ['sum']}
        expected = self.data.groupby(['A', 'B']).agg(f).unstack('B')
        result = self.data.pivot_table(rows='A', cols='B', aggfunc=f)

        tm.assert_frame_equal(result, expected)
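The test class above exercises an older pandas pivot_table API (rows=/cols= keywords, self.assert_, get_dtype_counts, xrange). In current pandas the same pivot is written with index=/columns=; a minimal sketch of the equivalent call on a similar frame:

import numpy as np
import pandas as pd

data = pd.DataFrame({
    'A': ['foo', 'foo', 'bar', 'bar'],
    'B': ['one', 'two', 'one', 'two'],
    'C': ['dull', 'shiny', 'dull', 'shiny'],
    'D': np.random.randn(4),
})

# modern equivalent of pivot_table(data, values='D', rows=['A', 'B'], cols='C')
table = data.pivot_table(values='D', index=['A', 'B'], columns='C', aggfunc='mean')
print(table)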
示例#46
0
def filter_dataframe(df_raw: pd.DataFrame) -> pd.DataFrame:
    df_raw = df_raw.drop(['Hogwarts House'], axis=1)
    df = df_raw.select_dtypes([np.number])
    return df
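A small usage sketch for filter_dataframe above (the extra columns are illustrative):

import numpy as np
import pandas as pd

raw = pd.DataFrame({
    'Hogwarts House': ['Gryffindor', 'Slytherin'],
    'Best Hand': ['Left', 'Right'],
    'Charms': [3.1, -2.4],
    'Flying': [12.0, 7.5],
})
print(filter_dataframe(raw))  # only the numeric columns 'Charms' and 'Flying' remain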
示例#47
0
def plot_activity_hours(
    images: pd.DataFrame,
    names: Union[list, str, pd.Series],
    species_col: str = "scientific_name",
    remove_duplicates: bool = False,
    remove_duplicates_kws: dict = None,
    kind: str = "kde",
    hist_kws: dict = None,
    kde_kws: dict = None,
) -> matplotlib.axes.Axes:
    """
    Plots the activity hours of one or multiple species by grouping all
    observations into a 24-hour range.

    Parameters
    ----------
    images : DataFrame
        DataFrame with the project's images.
    names : list, str or Series
        List of names to plot activity hours for.
    species_col : str
        Label of the scientific name column in the images DataFrame.
    remove_duplicates : bool
        Whether to remove duplicates. Wrapper for the
        wiutils.remove_duplicates function.
    remove_duplicates_kws : dict
        Keyword arguments for the wiutils.remove_duplicates function.
    kind : str
        Type of plot. Values can be:

        - 'hist' for histogram.
        - 'kde' for kernel density estimate plot.
    hist_kws : dict
        Keyword arguments passed to the seaborn.histplot() function. Only
        has effect if kind is 'hist'.
    kde_kws : dict
        Keyword arguments passed to the seaborn.kdeplot() function. Only
        has effect if kind is 'kde'.

    Returns
    -------
    Axes
        Plot axes.

    """
    if isinstance(names, str):
        names = [names]

    if hist_kws is None:
        hist_kws = {}
    if kde_kws is None:
        kde_kws = {}

    inconsistent_names = set(names) - set(images[species_col])
    if len(inconsistent_names):
        raise ValueError(
            f"{list(inconsistent_names)} were not found in images.")

    images = images.copy()

    if remove_duplicates:
        images = _remove_wrapper(images,
                                 duplicates=True,
                                 duplicates_kws=remove_duplicates_kws)

    images = images.loc[images[species_col].isin(names), :].reset_index(
        drop=True)
    images[_labels.date] = pd.to_datetime(images[_labels.date])
    images["hour"] = images[_labels.date].dt.round("H").dt.hour
    images = images.drop(columns=_labels.date)

    if kind == "hist":
        ax = sns.histplot(
            data=images,
            x="hour",
            hue=species_col,
            binwidth=1,
            binrange=(-0.5, 23.5),
            discrete=False,
            **hist_kws,
        )
    elif kind == "kde":
        ax = sns.kdeplot(data=images, x="hour", hue=species_col, **kde_kws)
    else:
        raise ValueError("kind must be one of ['hist', 'kde']")

    ax.set_xlim(-1, 24)
    ax.set_xticks(range(0, 24, 2),
                  labels=[f"{h:02}:00" for h in range(0, 24, 2)])

    return ax
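The core of the function above is binning timestamps into a 24-hour cycle and plotting a per-species density; a self-contained sketch of that idea (the column names here are illustrative, not the wiutils _labels):

import pandas as pd
import seaborn as sns

demo = pd.DataFrame({
    "scientific_name": ["Panthera onca"] * 4 + ["Cuniculus paca"] * 4,
    "timestamp": pd.to_datetime([
        "2023-01-01 05:10", "2023-01-01 06:45", "2023-01-02 18:20", "2023-01-03 19:05",
        "2023-01-01 22:30", "2023-01-02 23:10", "2023-01-03 01:40", "2023-01-04 02:15",
    ]),
})
# round each observation to the nearest hour and keep only the hour of day
demo["hour"] = demo["timestamp"].dt.round("H").dt.hour
ax = sns.kdeplot(data=demo, x="hour", hue="scientific_name")
ax.set_xlim(-1, 24)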
示例#48
0
def forecast(file, column):
    # Read the data to a DataFrame
    df = DataFrame()
    df = read_csv(file)

    # Delete remaining columns
    columns_list = list(df.columns.values)
    columns_list.remove(column)
    for i in range(len(columns_list)):
        df.drop(columns_list[i], axis=1, inplace=True)

    series = Series(list(df[column]), index=list(df.index))
    series.index.name = 'Data'

    # Split data into train and test-data frames
    df_test = df[(len(df) - 12):]
    df_train = df[:-12]
    y_true = list(df_test[column])

    # Transform data to be stationary
    raw_values = series.values
    print(type(raw_values))
    diff_values = difference(raw_values, 1)

    # Transform data to be supervised learning
    supervised = timeseries_to_supervised(diff_values, 1)
    supervised_values = supervised.values

    # Split data into train and test-sets
    train, test = supervised_values[0:-12], supervised_values[-12:]

    # Transform the scale of the data
    scaler, train_scaled, test_scaled = scale(train, test)

    # Building the model - parametrization:
    # 1. train set
    # 2. batch_size
    # 3. number_epochs
    # 4. number_neurons
    start = time.time()
    lstm_model = fit_lstm(train_scaled, 1, 250, 24)
    # Forecast the entire training dataset to build up state for forecasting
    train_reshaped = train_scaled[:, 0].reshape(len(train_scaled), 1, 1)
    lstm_model.predict(train_reshaped, batch_size=1)
    end = time.time()
    print("Execution time: " + str(end - start) + " s")

    # Walk-forward validation on the test data
    y_pred = list()
    for i in range(len(test_scaled)):
        # Make one-step forecast
        X, y = test_scaled[i, 0:-1], test_scaled[i, -1]
        yhat = forecast_lstm(lstm_model, 1, X)
        # Invert scaling
        yhat = invert_scale(scaler, X, yhat)
        # Invert differencing
        yhat = inverse_difference(raw_values, yhat, len(test_scaled) + 1 - i)
        # Store forecast
        y_pred.append(yhat)
        expected = raw_values[len(train) + i + 1]
        print('Month=%d, Predicted=%f, Expected=%f' % (i + 1, yhat, expected))

    common_methods.model_evaluation(y_true, y_pred)

    common_methods.plot(column, df_train, y_true, y_pred)
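Several helpers used above (difference, inverse_difference, scale, fit_lstm, ...) are not shown in this snippet. A minimal sketch of the two differencing helpers, consistent with how they are called here (1-step differencing of a 1-D array, inversion against the raw history), might be:

from pandas import Series

def difference(dataset, interval=1):
    # value[i] - value[i - interval]; the first `interval` points are dropped
    diff = [dataset[i] - dataset[i - interval] for i in range(interval, len(dataset))]
    return Series(diff)

def inverse_difference(history, yhat, interval=1):
    # add the forecasted difference back onto the matching raw observation
    return yhat + history[-interval]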
示例#49
0
def plot_graphs(granularity: str) -> None:
    push_data_db = (
        test_scheduling.PUSH_DATA_GROUP_DB
        if granularity == "group"
        else test_scheduling.PUSH_DATA_CONFIG_GROUP_DB
    )
    assert db.download(push_data_db)

    regressions_by_rev = {}
    for revisions, _, _, possible_regressions, likely_regressions in db.read(
        push_data_db
    ):
        regressions_by_rev[revisions[0]] = get_regressions(
            granularity, likely_regressions, possible_regressions
        )

    scheduled_data = []
    caught_data = []

    for scheduler_stat in db.read(SHADOW_SCHEDULER_STATS_DB):
        if len(scheduler_stat["schedulers"]) == 0:
            continue

        if scheduler_stat["id"] not in regressions_by_rev:
            continue

        obj: dict[str, Any] = {
            "date": datetime.utcfromtimestamp(scheduler_stat["date"]),
        }

        for scheduler in scheduler_stat["schedulers"]:
            obj[scheduler["name"]] = len(get_scheduled(granularity, scheduler))

        scheduled_data.append(obj)

        regressions = regressions_by_rev[scheduler_stat["id"]]

        obj = {
            "date": datetime.utcfromtimestamp(scheduler_stat["date"]),
            "regressions": len(regressions),
        }

        for scheduler in scheduler_stat["schedulers"]:
            scheduled = get_scheduled(granularity, scheduler)

            obj[scheduler["name"]] = len(regressions & scheduled)

        caught_data.append(obj)

    scheduled_df = DataFrame(scheduled_data)
    scheduled_df.index = scheduled_df["date"]
    del scheduled_df["date"]

    caught_df = DataFrame(caught_data)
    caught_df.index = caught_df["date"]
    del caught_df["date"]

    df = scheduled_df.resample("W").mean()

    plot_graph(
        df,
        f"Average number of scheduled {granularity}s",
        f"average_{granularity}_scheduled.svg",
    )

    df = (
        caught_df[caught_df.regressions > 0]
        .drop(columns=["regressions"])
        .clip(0, 1)
        .resample("W")
        .mean()
    )

    plot_graph(
        df,
        "Percentage of regressing pushes where we caught at least one regression",
        f"percentage_{granularity}_caught_at_least_one.svg",
    )

    plot_graph(
        caught_df.drop(columns=["regressions"])
        .div(caught_df.regressions, axis=0)
        .resample("W")
        .mean(),
        "Percentage of regressions we caught",
        f"percentage_{granularity}_caught.svg",
    )
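The "caught at least one" percentage above comes from clipping per-push regression counts to 0/1 before the weekly resample, so the weekly mean becomes a share of pushes. A toy sketch of that step (dates and counts are illustrative):

import pandas as pd

caught = pd.DataFrame(
    {"caught": [0, 2, 1, 0, 3]},
    index=pd.to_datetime(["2023-01-02", "2023-01-03", "2023-01-04",
                          "2023-01-09", "2023-01-10"]),
)
# clip(0, 1): 1 if at least one regression was caught for that push, else 0;
# the weekly mean is then the share of pushes with at least one catch
weekly_share = caught.clip(0, 1).resample("W").mean()
print(weekly_share)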
示例#50
0
copy  defaults to True, i.e. always copy; if False, do not copy when the new index equals the old one
'''
print(frame.reindex(index=['a', 'b', 'c', 'd'], method='ffill', copy=False))
print('\n5.2.2. Dropping entries from an axis\n')
# Dropping one or more entries from an axis is easy: all you need is an index array or list.
# Because some data tidying and set logic has to be applied, the drop method returns a new
# object with the indicated values deleted from the specified axis.
obj = Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
new_obj = obj.drop('c')
print(new_obj)
print(obj.drop(['c', 'd']))

# For a DataFrame, index values can be deleted from either axis
data = DataFrame(np.arange(16).reshape(4, 4),
                 index=['Ohio', 'Colorado', 'Utah', 'New York'],
                 columns=['one', 'two', 'three', 'four'])
print(data, data.drop(['Colorado', 'Ohio']),
      data.drop('two', axis=1), data.drop(['two', 'four'], axis=1),
      '', sep='\n')

print('\n5.2.3. Indexing, selection, and filtering\n')
# Series indexing (obj[...]) works much like NumPy array indexing,
# except that the Series index values need not be integers
obj = Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
print(obj['b'], obj[1], obj[2: 4], obj[['b', 'a', 'd']], obj[[1, 3]], obj[obj < 2], sep='\n')

# Unlike normal Python slicing, slicing with labels includes the endpoint
print(obj['b': 'c'])
obj['b': 'c'] = 5
print(obj)

# Indexing into a DataFrame retrieves one or more columns
data = DataFrame(np.arange(16).reshape(4, 4),
示例#51
0
from pandas import Series, DataFrame

import google_auth as ga
import ranking

gc = ga.gauth()

# Get the spreadsheet
SPREADSHEET_KEY = 'YOUR_SHEETID_HERE'  # ID of the sheet that Google Forms writes its responses to
worksheet = gc.open_by_key(SPREADSHEET_KEY).sheet1

# --- Data processing starts here ---
df = DataFrame(worksheet.get_all_values())
initial_columns_list = list(df.iloc[0, :])
df.columns = initial_columns_list
df.drop(0, inplace=True)
df.reset_index(inplace=True)
df.drop('index', axis=1, inplace=True)
df['氏名'] = df['学年'].astype(int)
df['練習回数'] = df['希望する練習回数'].astype(int)

days = ['月', '火', '水', '木', '金']
times = ['朝', '夜']
daytime_list = []

for day in days:
    for time in times:
        daytime_list.append(day + time)

for daytime in daytime_list:
    df[daytime] = 0
示例#52
0
def most_popular_wines(adjacency_matrix: pd.DataFrame) -> List[int]:
    # total interactions per wine, excluding the user id column
    popularity = adjacency_matrix.drop("user_id", axis=1).sum(axis=0)
    # wine columns ordered from most to least popular
    most_popular = popularity.sort_values(ascending=False).index.tolist()
    return most_popular
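A toy usage sketch for the helper above (wine ids and interaction counts are illustrative):

import pandas as pd

ratings = pd.DataFrame({
    "user_id": [1, 2, 3],
    101: [1, 1, 1],
    102: [0, 1, 0],
    103: [1, 0, 1],
})
print(most_popular_wines(ratings))  # [101, 103, 102]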
示例#53
0
    def generate_new_datetime_features(self, X: pd.DataFrame) -> pd.DataFrame:
        '''
        Method generates new datetime features generated based on datetime features.

        Args:
            X: a dataset to add new features to

        Returns:
            Dataset with new generated features
        '''
        days_before_next_weekend = {0: 5, 1: 4, 2: 3, 3: 2, 4: 1, 5: 0, 6: 0}
        days_before_next_weekdays = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 2, 6: 1}
        month_to_season = dict(
            zip(range(1, 13), [1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 1]))
        self.holidays = pd.to_datetime(pd.Series(ru_holidays))
        min_dt = X[self.dt_features].min().min()

        for col in self.dt_features:
            if self.task_type['is_timeseries'] or len(
                    self.f_spaces['datetime']) < 3:
                X[col] = X[col].fillna(datetime.datetime(1970, 1, 1))

                X['TS_{}_year'.format(col)] = X[col].apply(
                    lambda x: x.year).astype(np.int16)

                X['TS_{}_month'.format(col)] = X[col].apply(
                    lambda x: x.month).astype(np.int8)

                X['TS_{}_day'.format(col)] = X[col].apply(
                    lambda x: x.day).astype(np.int8)

                X['TS_{}_hour'.format(col)] = X[col].apply(
                    lambda x: x.hour).astype(np.int8)

                X['TS_{}_minute'.format(col)] = X[col].apply(
                    lambda x: x.minute).astype(np.int8)

                X['TS_{}_weekday'.format(col)] = \
                    X[col].apply(lambda x: x.weekday()).astype(np.int8)

                X['TS_{}_season'.format(col)] = \
                    X['TS_{}_month'.format(col)].map(month_to_season).astype(np.int8)

                X['TS_{}_hour_of_week'.format(col)] = \
                    X[col].apply(lambda x: x.weekday() * 24).astype(np.int16)

                X['TS_{}_is_holiday'.format(col)] = \
                    X[col].dt.date.isin(self.holidays.dt.date).astype(np.int8)

                X['TS_{}_is_weekend'.format(col)] = \
                    X['TS_{}_weekday'.format(col)].map({5: 1, 6: 1}).fillna(0).astype(np.int8)

                X['TS_{}_is_weekend'.format(col)] = (X['TS_{}_is_weekend'.format(col)] +\
                    X['TS_{}_is_holiday'.format(col)]).astype(np.int8)

                X['TS_{}_days_before_weekend'.format(col)] = \
                    X['TS_{}_weekday'.format(col)].map(days_before_next_weekend).astype(np.int8)

                year_month_func = lambda x: (x.year - 2010) * 12 + x.month
                X['TS_{}_year_month'.format(col)] = X[col].apply(
                    year_month_func).astype(np.int16)

        if self.verbose:
            n_created = len([col for col in X.columns if 'TS_' in col])
            print('FEATURE GENERATOR: {} timeseries datetime features created'.
                  format(n_created))

        for col1, col2 in self.new_datetime_features:
            X['new_datetime_diff' + col1 + '_' + col2] = \
                (X[col1] - X[col2]).astype('timedelta64[D]').fillna(-1).astype(np.int16)

        if self.verbose:
            n_created = len(self.new_datetime_features)
            print(
                'FEATURE GENERATOR: {} datetime diff features created'.format(
                    n_created))
        X.drop(self.f_spaces['datetime'], axis=1, inplace=True)
        return X
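The season and weekend features above are plain dictionary lookups applied with Series.map; a standalone sketch of that pattern:

import pandas as pd

month_to_season = dict(zip(range(1, 13), [1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 1]))
months = pd.Series([1, 4, 7, 10, 12])
print(months.map(month_to_season))  # 1, 2, 3, 4, 1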
示例#54
0
 def _clean(df: pd.DataFrame) -> pd.DataFrame:
     """Ensure no duplicates and ascending sorting of diven df."""
     df = df.sort_index(ascending=True)
     df.drop(index=df[df.index.duplicated()].index, inplace=True)
     return df
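A toy usage sketch for _clean above; note that dropping by index label removes every row sharing a duplicated label:

import pandas as pd

raw = pd.DataFrame(
    {"value": [3, 2, 1]},
    index=pd.to_datetime(["2023-01-03", "2023-01-01", "2023-01-01"]),
)
cleaned = _clean(raw)
# both 2023-01-01 rows carry a duplicated label and are removed; only 2023-01-03 remains
print(cleaned)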
示例#55
0
 def pipe_add_metadata(self, df: pd.DataFrame) -> pd.DataFrame:
     return df.drop(columns="Region").assign(location=self.location,
                                             source_url=self.source_url_ref)
示例#56
0
def test_multiple_date_col_custom(all_parsers, keep_date_col):
    data = """\
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
    parser = all_parsers

    def date_parser(*date_cols):
        """
        Test date parser.

        Parameters
        ----------
        date_cols : args
            The list of data columns to parse.

        Returns
        -------
        parsed : Series
        """
        return parsing.try_parse_dates(parsers._concat_date_cols(date_cols))

    result = parser.read_csv(StringIO(data),
                             header=None,
                             date_parser=date_parser,
                             prefix="X",
                             parse_dates={
                                 "actual": [1, 2],
                                 "nominal": [1, 3]
                             },
                             keep_date_col=keep_date_col)
    expected = DataFrame([
        [
            datetime(1999, 1, 27, 19, 0),
            datetime(1999, 1, 27, 18, 56), "KORD", "19990127", " 19:00:00",
            " 18:56:00", 0.81, 2.81, 7.2, 0.0, 280.0
        ],
        [
            datetime(1999, 1, 27, 20, 0),
            datetime(1999, 1, 27, 19, 56), "KORD", "19990127", " 20:00:00",
            " 19:56:00", 0.01, 2.21, 7.2, 0.0, 260.0
        ],
        [
            datetime(1999, 1, 27, 21, 0),
            datetime(1999, 1, 27, 20, 56), "KORD", "19990127", " 21:00:00",
            " 20:56:00", -0.59, 2.21, 5.7, 0.0, 280.0
        ],
        [
            datetime(1999, 1, 27, 21, 0),
            datetime(1999, 1, 27, 21, 18), "KORD", "19990127", " 21:00:00",
            " 21:18:00", -0.99, 2.01, 3.6, 0.0, 270.0
        ],
        [
            datetime(1999, 1, 27, 22, 0),
            datetime(1999, 1, 27, 21, 56), "KORD", "19990127", " 22:00:00",
            " 21:56:00", -0.59, 1.71, 5.1, 0.0, 290.0
        ],
        [
            datetime(1999, 1, 27, 23, 0),
            datetime(1999, 1, 27, 22, 56), "KORD", "19990127", " 23:00:00",
            " 22:56:00", -0.59, 1.71, 4.6, 0.0, 280.0
        ],
    ],
                         columns=[
                             "actual", "nominal", "X0", "X1", "X2", "X3", "X4",
                             "X5", "X6", "X7", "X8"
                         ])

    if not keep_date_col:
        expected = expected.drop(["X1", "X2", "X3"], axis=1)
    elif parser.engine == "python":
        expected["X1"] = expected["X1"].astype(np.int64)

    # Python can sometimes be flaky about how
    # the aggregated columns are entered, so
    # this standardizes the order.
    result = result[expected.columns]
    tm.assert_frame_equal(result, expected)
示例#57
0
def modify_df(df1: pd.DataFrame, name):
    df1.rename(columns={'value': name}, inplace=True)
    df1.drop(columns=['unit'], inplace=True)

    return df1
示例#58
0
def parallel_extraction(df: pd.DataFrame, df_images: pd.DataFrame,
                        df_sources: pd.DataFrame, min_sigma: float,
                        edge_buffer: float, cluster_threshold: float,
                        allow_nan: bool, add_mode: bool,
                        p_run_path: str) -> pd.DataFrame:
    """
    Parallelize forced extraction with Dask

    Args:
        df:
            dataframe with columns 'wavg_ra', 'wavg_dec', 'img_diff',
            'detection'
        df_images:
            dataframe with the images data and columns 'id',
            'measurements_path', 'path', 'noise_path', 'beam_bmaj',
            'beam_bmin', 'beam_bpa', 'background_path', 'rms_min', 'datetime',
            'skyreg__centre_ra', 'skyreg__centre_dec', 'skyreg__xtr_radius'
            and 'name' as the index.
        df_sources:
            dataframe derived from the measurement data with columns 'source',
            'image', 'flux_peak'.
        min_sigma:
            minimum sigma value to drop forced extracted measurements.
        edge_buffer:
            flag to pass to ForcedPhot.measure method.
        cluster_threshold:
            flag to pass to ForcedPhot.measure method.
        allow_nan:
            flag to pass to ForcedPhot.measure method.
        add_mode:
            True when the pipeline is running in add image mode.
        p_run_path:
            The system path of the pipeline run output.

    Returns:
        Dataframe with forced extracted measurements data, columns are
        'source_tmp_id', 'ra', 'dec', 'image', 'flux_peak', 'island_id',
        'component_id', 'name', 'flux_int', 'flux_int_err'
    """
    # explode the lists in 'img_diff' column (this will make a copy of the df)
    out = (
        df.rename(columns={
            'img_diff': 'image',
            'source': 'source_tmp_id'
        })
        # merge the rms_min column from df_images
        .merge(df_images[['rms_min']],
               left_on='image',
               right_on='name',
               how='left').rename(columns={'rms_min': 'image_rms_min'})
        # merge the measurements columns 'source', 'image', 'flux_peak'
        .merge(df_sources,
               left_on=['source_tmp_id', 'detection'],
               right_on=['source', 'image'],
               how='left').drop(columns=['image_y', 'source']).rename(
                   columns={'image_x': 'image'}))

    # drop the source for which we would have no hope of detecting
    predrop_shape = out.shape[0]
    out['max_snr'] = out['flux_peak'].values / out['image_rms_min'].values
    out = out[out['max_snr'] > min_sigma].reset_index(drop=True)
    logger.debug("Min forced sigma dropped %i sources",
                 predrop_shape - out.shape[0])

    # drop some columns that are no longer needed; `out` should now look like:
    # |   | source_tmp_id | wavg_ra | wavg_dec | image_name       | flux_peak |
    # |--:|--------------:|--------:|---------:|:-----------------|----------:|
    # | 0 |            81 | 317.607 | -8.66952 | VAST_2118-06A... |    11.555 |
    # | 1 |           894 | 323.803 | -2.6899  | VAST_2118-06A... |     2.178 |
    # | 2 |          1076 | 316.147 | -3.11408 | VAST_2118-06A... |     6.815 |
    # | 3 |          1353 | 322.094 | -4.44977 | VAST_2118-06A... |     1.879 |
    # | 4 |          1387 | 321.734 | -6.82934 | VAST_2118-06A... |     1.61  |

    out = (out.drop(['max_snr', 'image_rms_min', 'detection'],
                    axis=1).rename(columns={'image': 'image_name'}))

    # get the unique images to extract from
    unique_images_to_extract = out['image_name'].unique().tolist()
    # create a list of dictionaries with image file paths and dataframes
    # with data related to each images
    image_data_func = lambda x: {
        'image': df_images.at[x, 'path'],
        'background': df_images.at[x, 'background_path'],
        'noise': df_images.at[x, 'noise_path'],
        'df': out[out['image_name'] == x]
    }
    list_to_map = list(map(image_data_func, unique_images_to_extract))
    # create a list of all the measurements parquet files to extract data from,
    # such as prefix and max_id
    list_meas_parquets = list(
        map(lambda el: df_images.at[el, 'measurements_path'],
            unique_images_to_extract))
    del out, unique_images_to_extract, image_data_func

    # get a map of the columns that have a fixed value
    mapping = (db.from_sequence(list_meas_parquets,
                                npartitions=len(list_meas_parquets)).map(
                                    get_data_from_parquet, p_run_path,
                                    add_mode).compute())
    mapping = pd.DataFrame(mapping)
    # remove not used columns from images_df and merge into mapping
    col_to_drop = list(
        filter(lambda x: ('path' in x) or ('skyreg' in x),
               df_images.columns.values.tolist()))
    mapping = (mapping.merge(df_images.drop(col_to_drop, axis=1).reset_index(),
                             on='id',
                             how='left').drop('rms_min',
                                              axis=1).set_index('name'))
    del col_to_drop

    n_cpu = cpu_count() - 1
    bags = db.from_sequence(list_to_map, npartitions=len(list_to_map))
    forced_dfs = (bags.map(
        lambda x: extract_from_image(edge_buffer=edge_buffer,
                                     cluster_threshold=cluster_threshold,
                                     allow_nan=allow_nan,
                                     **x)).compute())
    del bags
    # create intermediates dfs combining the mapping data and the forced
    # extracted data from the images
    intermediate_df = list(
        map(lambda x: {
            **(mapping.loc[x['image'], :].to_dict()),
            **x
        }, forced_dfs))

    # compute the rest of the columns
    intermediate_df = (db.from_sequence(intermediate_df).map(
        lambda x: finalise_forced_dfs(**x)).compute())
    df_out = (pd.concat(intermediate_df, axis=0,
                        sort=False).rename(columns={
                            'wavg_ra': 'ra',
                            'wavg_dec': 'dec',
                            'image_name': 'image'
                        }))

    return df_out
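parallel_extraction fans the per-image work out with dask.bag; a minimal standalone sketch of that from_sequence/map/compute pattern (the work function and items here are illustrative):

import dask.bag as db

def extract_one(item):
    # stand-in for extract_from_image: do the per-image work here
    return {"image": item["image"], "n_sources": len(item["df"])}

work_items = [
    {"image": "img_A.fits", "df": list(range(3))},
    {"image": "img_B.fits", "df": list(range(5))},
]

bags = db.from_sequence(work_items, npartitions=len(work_items))
results = bags.map(extract_one).compute()
print(results)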
示例#59
0
def ens_mouse_to_ens_human(df_unmapped: pd.DataFrame, drop_unmapped: bool=False, verbose: bool=False) -> None:
    """
    Maps mouse ensembl gene id's to human ensembl gene id's.

    Args:
        df_unmapped:    a dataframe in tidy-format.
        drop_unmapped:  True: remove unmapped genes (rows) from df, False: keep original index
        verbose:        explicitly print status or not

    Returns:
        None
    
    Todo:
        * modify drop_unmapped to unmapped: {"drop", "keep", "na"}
        * make one mapping-function for all cases
        * support for custom mapping file
        * handle case for empty df
    """
    assert (len(df_unmapped) > 0), "Empty dataframe."
    
    PREFIX = "ENSMUSG"
    
    if verbose:
        print("Mapping: mouse ensembl gene id's --> human ensembl gene id's ...")
    
    # Check that genes are correct format
    mask_peek = np.array([PREFIX in str(idx) for idx in df_unmapped.index.values])

    if not mask_peek.all():
        print("Dataframe index contains values that are not in ensembl format or are not mouse ensembl ids: ",
              df_unmapped.index.values[~mask_peek])
    resource_package = __name__
    resource_path = 'maps/hsapiens_mmusculus_unique_orthologs.GRCh37.ens_v91.txt.gz'  # Do not use os.path.join()
    resource_stream = pkg_resources.resource_stream(resource_package, resource_path)    
    df_map = pd.read_csv(resource_stream, compression='gzip', delim_whitespace=True)
    # create dictionary for mapping mouse ensemble gene id's to human ensembl gene id's
    map_dict = dict(zip(df_map["mmusculus_homolog_ensembl_gene"].ravel(), \
                            df_map["ensembl_gene_id"].ravel()))
    
    # map genes in-place,
    # i.e. indexes are replaced directly in df
    df_unmapped.rename(index=map_dict, inplace=True)

    if verbose or drop_unmapped:
        # check for unmapped genes
        # note the tilde ~ to get genes NOT mapped
        mask_unmapped = ~df_unmapped.index.isin(df_map["ensembl_gene_id"])
        label_unmapped = df_unmapped.index.values[mask_unmapped]
    
        # create report
        n_unmapped = len(label_unmapped)
        
        if verbose:
            n_total = len(df_unmapped)
            pct = n_unmapped / n_total * 100
            print("%.2f pct of genes are unmapped ..." % pct)
        
        if drop_unmapped:
            df_unmapped.drop(index=label_unmapped, inplace=True)
            n_mapped = len(df_unmapped)
            if verbose:
                print("Removed {} unmapped genes ...".format(n_unmapped))
    
    return None
示例#60
0
def get_data():
    df = DataFrame()
    df = (ts.get_hist_data('hs300',
                           start='2013-01-01',
                           end='2017-01-01',
                           ktype='D'))[index[:6] + ['price_change']]
    df = df.sort_index()
    #ReturnRate=ln(s(t)/s(t-1))  lag=1
    df.insert(0, 'ReturnRate', df['close'])
    temp = 1
    for i in df.index:
        df['ReturnRate'][i] = np.log(df['close'][i] / temp)
        temp = df['close'][i]

    #alpha#6 lag=10
    df.insert(7, 'alpha#6', df['open'])
    for i in xrange(10, len(df.index)):
        df['alpha#6'][i] = np.corrcoef(df['open'][i - 10:i],
                                       df['volume'][i - 10:i])[0][1]

    #alpha#23 lag=20
    df.insert(8, 'alpha#23', df['high'])
    # mean of the highest price over the past 20 days
    df['alpha#23'][20] = df['high'][:20].sum() / 20.0
    for i in xrange(21, len(df.index)):
        df['alpha#23'][i] = (20 * df['alpha#23'][i - 1] - df['high'][i - 21] +
                             df['high'][i - 1]) / 20.0
    '''
    plot(df['alpha#23'],df['date'],label='20_high_avg')
    plot(df['high'],df['date'],label='high')
    '''
    for i in xrange(20, len(df.index)):
        if df['high'][i] > df['alpha#23'][i]:  # today's high is above the 20-day average: an uptrend
            df['alpha#23'][i] = -1 * (df['high'][i - 2] - df['high'][i])
        else:
            df['alpha#23'][i] = 0
    '''
    plot(df['alpha#23'],df['date'],label='alpha#23')
    legend(loc='upper left')
    show()
    '''
    #alpha#28 lag=5
    df.insert(9, 'alpha#28', df['high'])
    temp = 0
    for i in xrange(5, len(df.index)):
        df['alpha#28'][i] = np.corrcoef(
            df['v_ma20'][i - 5:i], df['low'][i - 5:i])[0][1] + (
                df['high'][i] + df['low'][i]) / 2.0 - df['close'][i]

    temp = abs(df['alpha#28'][20:]).sum()
    for i in xrange(lag, len(df.index)):
        df['alpha#28'][i] = df['alpha#28'][i] / temp

    #alpha#54 lag=0
    df.insert(10, 'alpha#54', df['high'])
    for i in xrange(len(df.index)):
        df['alpha#54'][i] = (
            -1 * (df['low'][i] - df['close'][i]) * pow(df['open'][i], 5)) / (
                (df['low'][i] - df['high'][i]) * pow(df['close'][i], 5))

    #alpha#101 lag=0
    df.insert(11, 'alpha#101', df['high'])
    for i in xrange(len(df.index)):
        df['alpha#101'][i] = (df['close'][i] - df['open'][i]) / (
            df['high'][i] - df['low'][i] + 0.001)

    df = df[lag:]

    # plotting
    if True:
        plot(df['alpha#6'], label='alpha#6')
        #plot(df['alpha#23'],label='alpha#23')
        plot(df['alpha#28'], label='alpha#28')
        plot(df['alpha#54'], label='alpha#54')
        plot(df['alpha#101'], label='alpha#101')
        legend(loc='upper left')
        show()

    df = df.drop(['price_change'], axis=1)
    output = open('raw_data.pkl', 'wb')
    pickle.dump(df, output)
    pickle.dump(index, output)
    output.close()
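raw_data.pkl written above holds two pickled objects in sequence; a minimal sketch of reading them back in the same order:

import pickle

with open('raw_data.pkl', 'rb') as f:
    df_loaded = pickle.load(f)      # the feature DataFrame dumped first
    index_loaded = pickle.load(f)   # the `index` object dumped second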