def plots_workingTrends():
    # holiday = 0 and workday = 0 => weekend
    # let's see if holidays and weekends give the same trends
    # Day trends -- working vs. non-working day
    hours = np.linspace(0, 23, 24)
    days_average = DataFrame({'Hour': hours})
    # workdays
    mean_vec = []
    for hour in hours:
        mean_vec.append(bike_data[(bike_data["workingday"] == 1) &
                                  (bike_data["time"] == hour)].mean()['count'])
    days_average = days_average.join(DataFrame({'Working day': mean_vec}))
    # holidays or weekends
    mean_vec = []
    for hour in hours:
        mean_vec.append(bike_data[(bike_data["workingday"] == 0) &
                                  (bike_data["time"] == hour)].mean()['count'])
    days_average = days_average.join(DataFrame({'Non-working day': mean_vec}))
    days_average.drop('Hour', axis=1).plot(figsize=(12, 6), linewidth=3, fontsize=16)
    plt.xlabel('Hour', fontsize=16)
    plt.ylabel('Average counts', fontsize=16)
    plt.legend(loc='best', fontsize=16)
    plt.show()
def test_drop_multiindex_not_lexsorted(self):
    # GH 11640
    # define the lexsorted version
    lexsorted_mi = MultiIndex.from_tuples(
        [('a', ''), ('b1', 'c1'), ('b2', 'c2')], names=['b', 'c'])
    lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi)
    self.assertTrue(lexsorted_df.columns.is_lexsorted())

    # define the non-lexsorted version
    not_lexsorted_df = DataFrame(columns=['a', 'b', 'c', 'd'],
                                 data=[[1, 'b1', 'c1', 3], [1, 'b2', 'c2', 4]])
    not_lexsorted_df = not_lexsorted_df.pivot_table(
        index='a', columns=['b', 'c'], values='d')
    not_lexsorted_df = not_lexsorted_df.reset_index()
    self.assertFalse(not_lexsorted_df.columns.is_lexsorted())

    # compare the results
    tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)

    expected = lexsorted_df.drop('a', axis=1)
    with tm.assert_produces_warning(PerformanceWarning):
        result = not_lexsorted_df.drop('a', axis=1)
    tm.assert_frame_equal(result, expected)
def returns_customer_product_time(pwunsale_tidy, pw_ytdcust, pw_cusattr):
    '''
    Meant to feed into a Pivot requested by Mitch Turner.

    Aggregates the same as above but includes time and product data.
    '''
    dat = pwunsale_tidy['Date'].tolist()
    pwunsale_tidy['Month'] = [d.strftime('%B') for d in dat]

    print('Aggregating custom pivot for Mitch.')
    len_unique = lambda x: len(pd.unique(x))
    agg_funcs_returns = {'ExtCost': {'DollarsReturned|avg': np.mean,
                                     'DollarsReturned|sum': np.sum},
                         'CasesReturned': {'CasesReturned|avg': np.mean,
                                           'CasesReturned|sum': np.sum},
                         'Invoice': len_unique}

    custom_cols = ['Month', 'CustomerId', 'Customer', 'ProductId', 'Product']
    customer_returns = DataFrame(pwunsale_tidy.groupby(custom_cols)[['ExtCost', 'CasesReturned']].agg(agg_funcs_returns)).reset_index(drop=False)
    customer_returns.rename(columns={'<lambda>': 'Returns|count'}, inplace=True)
    customer_returns.drop('Customer', inplace=True, axis=1)

    print('Merging in YTD sales by Customer')
    customer_returns = customer_returns.merge(pw_ytdcust, on='CustomerId', how='left')

    print('Deriving returns as a percent of sales for each Customer.')
    customer_returns['PercentSales'] = np.divide(customer_returns['DollarsReturned|sum'],
                                                 customer_returns['DollarSales|bycustomer'])

    print('Merge in customer attributes.')
    customer_returns = customer_returns.merge(pw_cusattr, on='CustomerId', how='left')

    print('Sorting in descending order on Dollars returned.')
    customer_returns.sort_values('DollarsReturned|sum', ascending=False, inplace=True)

    return customer_returns
def datatype_records_to_subset_and_migrate(likechars):
    stmt_for_pkeys = conn_popler_2.execute(
        select(
            from_obj=Maintable,
            columns=[
                column('lter_proj_site'),
                column('samplingprotocol')
            ]).
        where(
            column('samplingprotocol').like(
                '%{}%'.format(likechars))
        )
    )
    data = DataFrame(stmt_for_pkeys.fetchall())
    data.columns = stmt_for_pkeys.keys()
    records_to_get = data['lter_proj_site'].values.tolist()

    stmt_for_records = conn_popler_2.execute(
        select(
            from_table=Rawtable,
        ).
        where(column('lter_proj_site').in_(records_to_get)).
        order_by('sampleid')
    )
    data2 = DataFrame(stmt_for_records.fetchall())
    data2.columns = stmt_for_records.keys()
    data2.drop('individ', axis=1, inplace=True)
def test_mixed_depth_drop(self):
    arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'],
              ['', 'OD', 'OD', 'result1', 'result2', 'result1'],
              ['', 'wx', 'wy', '', '', '']]

    tuples = sorted(zip(*arrays))  # sorted() instead of list.sort() so this also runs on Python 3
    index = MultiIndex.from_tuples(tuples)
    df = DataFrame(randn(4, 6), columns=index)

    result = df.drop('a', axis=1)
    expected = df.drop([('a', '', '')], axis=1)
    assert_frame_equal(expected, result)

    result = df.drop(['top'], axis=1)
    expected = df.drop([('top', 'OD', 'wx')], axis=1)
    expected = expected.drop([('top', 'OD', 'wy')], axis=1)
    assert_frame_equal(expected, result)

    result = df.drop(('top', 'OD', 'wx'), axis=1)
    expected = df.drop([('top', 'OD', 'wx')], axis=1)
    assert_frame_equal(expected, result)

    expected = df.drop([('top', 'OD', 'wy')], axis=1)
    expected = df.drop('top', axis=1)

    result = df.drop('result1', level=1, axis=1)
    expected = df.drop([('routine1', 'result1', ''),
                        ('routine2', 'result1', '')], axis=1)
    assert_frame_equal(expected, result)
def process_recarray_pandas(data, endog_idx=0, exog_idx=None, dtype=None): from pandas import DataFrame data = DataFrame(data, dtype=dtype) names = data.columns if isinstance(endog_idx, int): endog_name = names[endog_idx] endog = data[endog_name] if exog_idx is None: exog = data.drop([endog_name], axis=1) else: exog = data.filter(names[exog_idx]) else: endog = data.ix[:, endog_idx] endog_name = list(endog.columns) if exog_idx is None: exog = data.drop(endog_name, axis=1) elif isinstance(exog_idx, int): exog = data.filter([names[exog_idx]]) else: exog = data.filter(names[exog_idx]) exog_name = list(exog.columns) dataset = Dataset(data=data, names=list(names), endog=endog, exog=exog, endog_name=endog_name, exog_name=exog_name) return dataset
def process_recarray_pandas(data, endog_idx=0, exog_idx=None, dtype=None, index_idx=None): data = DataFrame(data, dtype=dtype) names = data.columns if isinstance(endog_idx, (int, long)): endog_name = names[endog_idx] endog = data[endog_name] if exog_idx is None: exog = data.drop([endog_name], axis=1) else: exog = data.filter(names[exog_idx]) else: endog = data.loc[:, endog_idx] endog_name = list(endog.columns) if exog_idx is None: exog = data.drop(endog_name, axis=1) elif isinstance(exog_idx, (int, long)): exog = data.filter([names[exog_idx]]) else: exog = data.filter(names[exog_idx]) if index_idx is not None: # NOTE: will have to be improved for dates endog.index = Index(data.iloc[:, index_idx]) exog.index = Index(data.iloc[:, index_idx]) data = data.set_index(names[index_idx]) exog_name = list(exog.columns) dataset = Dataset(data=data, names=list(names), endog=endog, exog=exog, endog_name=endog_name, exog_name=exog_name) return dataset
def clicksDataframe(clicks_data):
    clicks_dataframe = DataFrame(clicks_data,
                                 columns=['date', 'cardName', 'position', 'totalClicks', 'uniqueClicks'])
    clicks_dataframe = clicks_dataframe.apply(to_numeric, errors='ignore')
    clicks_dataframe.drop('date', axis=1, inplace=True)
    clicks_dataframe = clicks_dataframe.groupby(['cardName', 'position']).sum().sort_values(by='uniqueClicks', ascending=0)
    clicks_dataframe.reset_index(inplace=True)
    return clicks_dataframe
def scale_features(df: DataFrame):
    spec_features = ['Fare']
    scaler = StandardScaler()
    for sf in spec_features:
        # Reshape via .values: calling .reshape() directly on a Series is no longer supported,
        # and .ravel() keeps the assigned column one-dimensional.
        values = df[sf].values.reshape(-1, 1)
        scale_param = scaler.fit(values)
        df[sf + '_scaled'] = scaler.fit_transform(values, scale_param).ravel()
    df.drop(labels=spec_features, axis=1, inplace=True)
    return df
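# A minimal usage sketch (not part of the original source) for scale_features above. It assumes
# the function, pandas, and sklearn.preprocessing.StandardScaler are already in scope; the toy
# fare values below are made up.
import pandas as pd

toy = pd.DataFrame({'Fare': [7.25, 71.28, 8.05, 53.10], 'Pclass': [3, 1, 3, 1]})
scaled = scale_features(toy)
print(scaled.columns.tolist())  # ['Pclass', 'Fare_scaled'] -- 'Fare' is replaced by its scaled copy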
def gonzales(data , k): #transform the data numpy array to data frame using the id as index points_list = DataFrame(data[:, 1:] , index = data[ : , 0]) #adding two columns in the points data frame for saving the centers and distance points_list["distance"] = np.nan points_list["center"] = np.nan distance_column_index = points_list.columns.get_loc("distance") #choosing a random point as the first center #center0 = points_list.sample(n=1 , random_state = randint(0,100) , axis=0) center0 = points_list.head(1) centers_list = DataFrame(center0.drop(['distance' , 'center'] , axis = 1)) centers_list['color'] = 'r' colors = "bgcmykw" #=========================================================================== # print(centers_list) # print("==============Initialization finished===========") #=========================================================================== #looping k-1 time to have k centers for k_cycle in range(1,k+1): # varibles to save the next center to be chosen based on the maximum distance a point makes within its cluster max_distance = 0 next_cluster = np.nan #loop on all the points to assign them to their closest center for indexp, p in points_list.iterrows(): #variables to save the choose the closest center min_cluster_distance = math.inf closest_cluster = None for indexc, center in centers_list.iterrows(): dis = spatial.distance.euclidean(center.as_matrix(columns=[0 ,1]) , p.as_matrix(columns=[0 ,1])) if dis < min_cluster_distance: min_cluster_distance = dis closest_cluster = indexc p["distance"] = min_cluster_distance p["center"] = closest_cluster if min_cluster_distance > max_distance: max_distance = min_cluster_distance next_cluster = indexp centers_list = centers_list.append(points_list.ix[[next_cluster], :distance_column_index ]) centers_list.set_value(next_cluster, 'color', colors[k_cycle]) #======================================================================= # print(centers_list) # print("==============Cycle finished===========") #======================================================================= centers_list.drop(centers_list.tail(1).index, inplace=True) centers_list.drop(['color'], axis=1 ,inplace=True) #=========================================================================== # centers_list.plot(kind='scatter', x=0, y=1 , c='r' ) # points_list.plot(kind='scatter', x=0, y=1 , c='center' , s= points_list['center'] *2 ) # plt.show() #=========================================================================== #print(points_list) return centers_list.as_matrix(columns=[0 ,1])
def set_dummy_vars(df: DataFrame):
    df.drop(labels=['Name'], axis=1, inplace=True)
    discrete_features = list(df.dtypes[df.dtypes == 'object'].index)
    discrete_features.append('Pclass')
    dummies = [pd.get_dummies(df[f], prefix=f) for f in discrete_features]
    dummies.insert(0, df)
    df = pd.concat(dummies, axis=1)
    df.drop(labels=discrete_features, axis=1, inplace=True)
    return df
def generateGraphData(self): safePrint('Generating and uploading data files') allData = read_table(self.combinedFile, sep='\t', na_filter=False, parse_dates=[0], infer_datetime_format=True) xcsList = [xcs for xcs in allData.xcs.unique() if xcs != 'ERROR' and xcs[0:4] != 'TEST' and xcs != '000-00'] # filter type==DATA and site==wikipedia allData = allData[(allData['xcs'].isin(xcsList)) & (allData['site'] == 'wikipedia')] # By "iszero+via", e.g. a,b,aO,bO,..., where 'a' == zero-rated, 'b' == non-zero-rated, and 'O' == Opera data = DataFrame(pivot_table(allData, 'count', ['date', 'xcs', 'via', 'iszero'], aggfunc=np.sum)) data.reset_index(inplace=True) data['via'] = data.apply(lambda r: ('a' if r['iszero'][:1] == 'y' else 'b') + r['via'][:1], axis=1) data.drop('iszero', axis=1, inplace=True) self.createClippedData('RawData:YearDailyViaIsZero', data) self.createPeriodData('RawData:WeeklyViaIsZero', data, weekly) self.createPeriodData('RawData:MonthlyViaIsZero', data, monthly) allowedSubdomains = ['m', 'zero'] data = allData[(allData.ison == 'y') & (allData.iszero == 'y') & (allData.subdomain.isin(allowedSubdomains))] data = DataFrame(pivot_table(data, 'count', ['date', 'xcs', 'subdomain'], aggfunc=np.sum)) data.reset_index(inplace=True) self.createClippedData('RawData:YearDailySubdomains', data) self.createPeriodData('RawData:WeeklySubdomains', data, weekly) self.createPeriodData('RawData:MonthlySubdomains', data, monthly) # create an artificial yes/no/opera sums opera = allData[(allData.via == 'OPERA') & (allData.iszero == 'y')] opera['str'] = 'o' yes = allData[allData.iszero == 'y'] yes['str'] = 'y' no = allData[allData.iszero == 'n'] no['str'] = 'n' combined = opera.append(yes).append(no) data = DataFrame(pivot_table(combined, 'count', ['date', 'xcs', 'str'], aggfunc=np.sum)) data.reset_index(inplace=True) headerFields = 'date,xcs,iszero,count' # Override "str" as "iszero" self.createClippedData('RawData:YearDailyTotals', data, headerFields) self.createPeriodData('RawData:MonthlyTotals', data, monthly, headerFields) data = [] for xcsId in list(allData.xcs.unique()): byLang = pivot_table(allData[allData.xcs == xcsId], 'count', ['lang'], aggfunc=np.sum) \ .order('count', ascending=False) top = byLang.head(5) vals = list(top.iteritems()) vals.append(('other', byLang.sum() - top.sum())) valsTotal = sum([v[1] for v in vals]) / 100.0 data.extend(['%s,%s,%.1f' % (l, xcsId, c / valsTotal) for l, c in vals]) self.saveWikiPage('RawData:LangPercent', data, 'lang,xcs,count')
def _one_hot_encoding(df: pd.DataFrame, features: list) -> pd.DataFrame:
    """ help method for one hot encoding """
    for feature in features:
        one_hot = pd.get_dummies(df[feature], feature, '_')
        # The next two statements 'replace' the existing feature by the new binary-valued features.
        # First, drop the existing column.
        df.drop(feature, axis=1, inplace=True)
        # Next, concatenate the new columns. This assumes no clash of column names.
        df = pd.concat([df, one_hot], axis=1)
    return df
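# A minimal usage sketch (not part of the original source) for _one_hot_encoding above, assuming
# the function and pandas are in scope; 'color' and 'size' are made-up feature names for illustration.
import pandas as pd

toy = pd.DataFrame({'color': ['red', 'blue', 'red'],
                    'size': ['S', 'M', 'L'],
                    'price': [1.0, 2.0, 3.0]})
encoded = _one_hot_encoding(toy, ['color', 'size'])
print(encoded.columns.tolist())
# ['price', 'color_blue', 'color_red', 'size_L', 'size_M', 'size_S']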
def data_prep(input_file, bad_samples_file, freq_dict=None):
    '''prepare the ibdhmm file by removing sites that are too close to each other and
    calculating the major and minor alleles. If specified, freq_dict should be a json file
    that contains the frequencies. This is created from freq_parse.py'''
    min_snpD = 10
    tri_allele = 0

    output_file = ('.').join(input_file.split('.')[0:-2]) + '_cleaned.txt'
    # relaxing conditions because we only have 3000 SNPs to begin with
    bad_samples = [sample.strip() for sample in open(bad_samples_file)]
    df = DataFrame(read_csv(input_file, sep='\t'))
    # remove bad samples
    df.drop(bad_samples, inplace=True, axis=1)

    # remove non-biallelic alleles
    # df.drop(df[df.apply(allele_count, axis = 1) != 2].index, inplace = True)
    # relaxing conditions because we only have 3000 SNPs to begin with

    '''#remove SNPs that are too close to one another
    df['diff'] = df.groupby('chrom')['pos'].diff()
    df.fillna('first', inplace = True)
    #df.to_csv('test_df.txt', sep = '\t')
    # BUG NOTE MUST FIX THE DAISY CHAIN PROBLEM
    df = df.query('diff > 10 or diff == "first"')
    df.drop('diff', axis = 1, inplace = True)'''

    if not freq_dict:
        # calculate the major and minor allele
        major = df.apply(major_find, axis=1)
        minor = df.apply(minor_find, axis=1)
        major_prop = df.apply(major_prop_find, axis=1)
        minor_prop = df.apply(minor_prop_find, axis=1)
    else:
        snp_dict = json.load(open(freq_dict))
        df['keys'] = df['chrom'].map(str) + ':' + df['pos'].map(str)
        major = df['keys'].apply(lambda x: snp_dict[x]['major'])
        major_prop = df['keys'].apply(lambda x: snp_dict[x]['major_freq'])
        minor = df['keys'].apply(lambda x: snp_dict[x]['minor'])
        minor_prop = df['keys'].apply(lambda x: snp_dict[x]['minor_freq'])
        df.drop('keys', inplace=True, axis=1)

    # inserting this stuff into dataframe for future use
    df.insert(3, 'minor_prop', minor_prop)
    df.insert(3, 'minor', minor)
    df.insert(3, 'major_prop', major_prop)
    df.insert(3, 'major', major)
    df.to_csv(output_file, sep='\t', index=False)
    return df
def homePageToSubjectPageDataframe(data):
    subject_dataframe = DataFrame(data, columns=['date', 'page_title', 'views', 'uniqueViews'])
    subject_dataframe = subject_dataframe.apply(to_numeric, errors='ignore')
    subject_dataframe.drop('date', axis=1, inplace=True)
    subject_dataframe = subject_dataframe.groupby(['page_title']).sum().sort_values(by='uniqueViews', ascending=0)
    subject_dataframe.reset_index(inplace=True)
    subject_dataframe['subject'] = subject_dataframe['page_title'].apply(lambda title: strip_edx_page_title(title))
    subject_dataframe['totalViews'] = subject_dataframe['uniqueViews'].sum()
    subject_dataframe['Pct'] = (subject_dataframe['uniqueViews'] / subject_dataframe['totalViews'])
    subject_dataframe = subject_dataframe[(subject_dataframe['Pct'] > 0.0001)]
    return subject_dataframe[['subject', 'uniqueViews', 'Pct']]
def filter_tags(tag_pickle='results/material_tags.pickle',
                exclude_tags='results/exclude.csv', n=50):
    exclude_words, duplicate_sets = load_filter_tags(exclude_tags)
    with open(tag_pickle, 'rb') as f:  # pickles must be opened in binary mode
        t = DataFrame(pickle.load(f)['result']).set_index('_id')
    for setn in duplicate_sets:
        t.loc[setn[0]] += sum(map(lambda x: t.loc[x], setn[1:]))  # .loc replaces the removed .ix
        for tag in setn[1:]:
            t.drop(tag, inplace=True)
    for tag in exclude_words:
        t.drop(tag, inplace=True)
    # The original called the removed DataFrame.sort() and discarded its result; sort by the count
    # column and keep the reordered frame so the top-n slice below is meaningful.
    t = t.sort_values(by=t.columns[0], ascending=False)
    return t[:n].index
def pd_dataframe6():
    obj = DataFrame(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
    print(obj)
    new_obj = obj.drop('c')
    print(new_obj)
    print(obj.drop(['b', 'c']))
    data = DataFrame(np.arange(16).reshape((4, 4)),
                     index=['Ohio', 'Colorado', 'Utah', 'New York'],
                     columns=['one', 'two', 'three', 'four'])
    print(data)
    print(data.drop(['Ohio', 'Colorado']))
    print(data.drop('two', axis=1))
    print(data.drop(['two', 'four'], axis=1))
def load_velocities(self, unit=None):
    """
    Load Particle Velocities in units of km/s (default set in units class)

    unit: unit conversion from code units
    """
    if unit:
        self.units.set_velocity(unit)
    uvw = self._velocities.value * self.units.velocity_conv
    if self.units.coordinate_system == 'physical':
        a = self._header.ScaleFactor
        uvw *= numpy.sqrt(a)
    uvw = DataFrame(uvw, index=self._particleIDs.value, columns=['u', 'v', 'w'])
    if self._drop_ids is not None:
        uvw.drop(self._drop_ids, inplace=True)
    self[['u', 'v', 'w']] = uvw
def dataset_transformation(df: pd.DataFrame, testData: pd.DataFrame):
    # Age transform
    df['AgeuponOutcome'] = df['AgeuponOutcome'].apply(calculateAge)
    testData['AgeuponOutcome'] = testData['AgeuponOutcome'].apply(calculateAge)

    # Name transform
    df['Name'] = df['Name'].apply(processName)
    testData['Name'] = testData['Name'].apply(processName)
    #df = df.apply(setMissingAge, axis=1)
    #testData = testData.apply(setMissingAge, axis=1)
    #encodeFeature(df,'OutcomeType')

    # Animal type transform
    le = encodeFeature(df, testData, 'AnimalType')

    # Sex transform
    le = encodeFeature(df, testData, 'SexuponOutcome')
    le = encodeFeature(df, testData, 'SexuponOutcome1')

    # Breed transform
    le = encodeFeature(df, testData, 'Breed')
    le = encodeFeature(df, testData, 'Breed1')
    le = encodeFeature(df, testData, 'Breed2')

    # Color transform
    le = encodeFeature(df, testData, 'Color')
    le = encodeFeature(df, testData, 'Color1')
    le = encodeFeature(df, testData, 'Color2')
    #encodeFeature(df,'Breed1')
    #encodeFeature(df,'Breed2')
    #encodeFeature(df,'Breedcount')
    #encodeFeature(df,'Name')

    df = df.drop(['DateTime'], axis=1)
    testData = testData.drop(['DateTime'], axis=1)
    df = df.drop(['Name'], axis=1)
    testData = testData.drop(['Name'], axis=1)

    return [df, testData]
def thread_participation_evolution(
        pm_frame, project, n=2, skip_anon=True, research_only=False):
    """Assembles data on participation to threads in project with n as thresh.
    Returns DataFrame, index, selection and title for data
    for use by stacked bar-plot and heatmap functions."""
    if not research_only:
        thread_type = 'all threads'
        title = "Participation per thread in {} (threshold = {})".format(
            project, n)
    else:
        thread_type = 'research threads'
        title = "Participation per thread in {} (threshold = {}, only research-threads)".format(project, n)
    data = pm_frame.loc[project][['basic', thread_type]]
    data = data.dropna()
    all_authors = set().union(*data[thread_type, 'authors'])
    author_thread = DataFrame(columns=all_authors)
    for author in author_thread.columns:
        author_thread[author] = data[thread_type, 'authors'].apply(
            lambda thread, author=author: author in thread)
    author_thread = author_thread.T
    author_thread = author_thread.sort_values(by=data.index.tolist(),
                                              ascending=False)
    author_thread = author_thread.drop(
        "Anonymous") if skip_anon else author_thread
    author_thread.columns.name = "Threads"
    select = author_thread.sum(axis=1) >= n
    return author_thread, data.index, select, title
def project_participation_evolution(
        pm_frame, all_authors, n=2, skip_anon=True, research_only=False):
    """Assembles data on participation to projects with n as thresh.
    Returns DataFrame, index, selection and title for data
    for use by stacked bar-plot and heatmap functions."""
    if not research_only:
        thread_type = 'all threads'
        data, _ = get_last(pm_frame, thread_type)
        all_authors = list(all_authors)
        title = "Participation per project in Polymath (threshold = {})".format(n)
    else:
        thread_type = 'research threads'
        data, _ = get_last(pm_frame, thread_type)
        all_authors = set().union(
            *data['research threads', 'authors (accumulated)'])
        title = "Participation per project in Polymath (threshold = {}, only research-threads)".format(n)
    data.index = data.index.droplevel(1)
    author_project = DataFrame(columns=all_authors)
    for author in author_project.columns:
        author_project[author] = data[
            thread_type, 'authors (accumulated)'].apply(
                lambda project, author=author: author in project)
    author_project = author_project.T
    author_project = author_project.sort_values(by=data.index.tolist(),
                                                ascending=False)
    author_project = author_project.drop(
        "Anonymous") if skip_anon else author_project
    select = author_project.sum(axis=1) >= n
    return author_project, data.index, select, title
def test_two_isolated_steppers_one_gapped(self): N = 5 Y = 25 # Begin second feature one frame later than the first, so the probe labeling (0, 1) is # established and not arbitrary. a = DataFrame({'x': np.arange(N), 'y': np.ones(N), 'frame': np.arange(N)}) a = a.drop(3).reset_index(drop=True) b = DataFrame({'x': np.arange(1, N), 'y': Y + np.ones(N - 1), 'frame': np.arange(1, N)}) f = pd.concat([a, b]) actual = self.link(f, 5) expected = f.copy() expected['probe'] = np.concatenate([np.array([0, 0, 0, 2]), np.ones(N - 1)]) expected.sort(['probe', 'frame'], inplace=True) expected.reset_index(drop=True, inplace=True) assert_frame_equal(actual, expected) # Sort rows by frame (normal use) actual = self.link(f.sort('frame'), 5) assert_frame_equal(actual, expected) # Shuffle rows (crazy!) np.random.seed(0) f1 = f.reset_index(drop=True) f1.reindex(np.random.permutation(f1.index)) actual = self.link(f1, 5) assert_frame_equal(actual, expected)
def training(iden, Charg, Temps, use_cache_trainingset, test, verbose):
    '''Return the prediction function, for a given site iden,
    history Charg and temperature Temps'''
    if use_cache_trainingset:
        if test:
            X = pickle.load(open(CACHE_DIR+"X_test_"+iden+".p", "rb"))
        else:
            X = pickle.load(open(CACHE_DIR+"X_"+iden+".p", "rb"))
    else:
        X = DataFrame(Charg[iden])
        X = X.dropna(how='any')
        X['dayofweek'] = X.index.dayofweek
        X['Temps'] = Temps[iden].ix[X.index]
        X['fracday'] = X.index.minute/60. + X.index.hour
        X['lastminutes'] = X[iden].ix[X.index - 10*Minute()].values
        X['yesterday'] = X[iden].ix[X.index - Day()].values
        X['yesterdaybis'] = X[iden].ix[X.index - Day() - 10*Minute()].values
        X['lastweek'] = X[iden].ix[X.index - Week()].values
        X['lastweekbis'] = X[iden].ix[X.index - Week() - 10*Minute()].values
        if test:
            pickle.dump(X, open(CACHE_DIR+"X_test_"+iden+".p", "wb"))
        else:
            # write the non-test cache under its own name
            # (the original dumped to the test path in both branches)
            pickle.dump(X, open(CACHE_DIR+"X_"+iden+".p", "wb"))
    X = X.dropna(how='any')
    y = X[iden]
    X = X.drop(iden, 1)
    scalerX = preprocessing.StandardScaler().fit(X)
    ##############################
    clf = linear_model.SGDRegressor(alpha=0.000001, n_iter=3000)
    ##############################
    clf.fit(scalerX.transform(X), y)
    if verbose:
        print('Function for '+iden+' computed.')
    return lambda x: clf.predict(scalerX.transform(x))
def test_cythonized_aggers(op_name): data = {'A': [0, 0, 0, 0, 1, 1, 1, 1, 1, 1., np.nan, np.nan], 'B': ['A', 'B'] * 6, 'C': np.random.randn(12)} df = DataFrame(data) df.loc[2:10:2, 'C'] = np.nan op = lambda x: getattr(x, op_name)() # single column grouped = df.drop(['B'], axis=1).groupby('A') exp = {cat: op(group['C']) for cat, group in grouped} exp = DataFrame({'C': exp}) exp.index.name = 'A' result = op(grouped) tm.assert_frame_equal(result, exp) # multiple columns grouped = df.groupby(['A', 'B']) expd = {} for (cat1, cat2), group in grouped: expd.setdefault(cat1, {})[cat2] = op(group['C']) exp = DataFrame(expd).T.stack(dropna=False) exp.index.names = ['A', 'B'] exp.name = 'C' result = op(grouped)['C'] if op_name in ['sum', 'prod']: tm.assert_series_equal(result, exp)
def test_two_nearby_steppers_one_gapped(self): N = 5 Y = 2 # Begin second feature one frame later than the first, so the particle labeling (0, 1) is # established and not arbitrary. a = DataFrame({'x': np.arange(N), 'y': np.ones(N), 'frame': np.arange(N)}) b = DataFrame({'x': np.arange(1, N), 'y': Y + np.ones(N - 1), 'frame': np.arange(1, N)}) a = a.drop(3).reset_index(drop=True) f = pd.concat([a, b]) expected = f.copy().reset_index(drop=True) expected['particle'] = np.concatenate([np.array([0, 0, 0, 2]), np.ones(N - 1)]) pandas_sort(expected, ['particle', 'frame'], inplace=True) expected.reset_index(drop=True, inplace=True) actual = self.link_df(f, 5) assert_frame_equal(actual, expected) actual_iter = self.link_df_iter(f, 5, hash_size=(50, 50)) assert_frame_equal(actual_iter, expected) # Sort rows by frame (normal use) actual = self.link_df(pandas_sort(f, 'frame'), 5) assert_frame_equal(actual, expected) actual_iter = self.link_df_iter(pandas_sort(f, 'frame'), 5, hash_size=(50, 50)) assert_frame_equal(actual_iter, expected) # Shuffle rows (crazy!) np.random.seed(0) f1 = f.reset_index(drop=True) f1.reindex(np.random.permutation(f1.index)) actual = self.link_df(f1, 5) assert_frame_equal(actual, expected) actual_iter = self.link_df_iter(f1, 5, hash_size=(50, 50)) assert_frame_equal(actual_iter, expected)
def load_abundances(self, tracked_species=None):
    """
    Load chemical abundances array.

    There are six abundances tracked for each particle.
    0:H2 1:HII 2:DII 3:HD 4:HeII 5:HeIII
    """
    default_species = ['H2', 'HII', 'DII', 'HD', 'HeII', 'HeIII']
    if tracked_species is None:
        tracked_species = default_species
    abundances = self._ChemicalAbundances.value
    abundances = DataFrame(abundances,
                           index=self._particleIDs.value,
                           columns=tracked_species)
    if self._drop_ids is not None:
        # drop() is not in-place by default; without inplace=True the result was silently discarded
        abundances.drop(self._drop_ids, inplace=True)
    self[tracked_species] = abundances
def test_two_isolated_steppers_one_gapped(self): N = 5 Y = 25 # Begin second feature one frame later than the first, # so the particle labeling (0, 1) is established and not arbitrary. a = DataFrame({'x': np.arange(N), 'y': np.ones(N), 'frame': np.arange(N)}) a = a.drop(3).reset_index(drop=True) b = DataFrame({'x': np.arange(1, N), 'y': Y + np.ones(N - 1), 'frame': np.arange(1, N)}) f = pd.concat([a, b]) expected = f.copy() expected['particle'] = np.concatenate([np.array([0, 0, 0, 2]), np.ones(N - 1)]) pandas_sort(expected, ['particle', 'frame'], inplace=True) expected.reset_index(drop=True, inplace=True) actual = self.link(f, 5) assert_traj_equal(actual, expected) # link_df_iter() tests not performed, because hash_size is # not knowable from the first frame alone. # Sort rows by frame (normal use) actual = self.link(pandas_sort(f, 'frame'), 5) assert_traj_equal(actual, expected) # Shuffle rows (crazy!) np.random.seed(0) f1 = f.reset_index(drop=True) f1.reindex(np.random.permutation(f1.index)) actual = self.link(f1, 5) assert_traj_equal(actual, expected)
def get_quote(self, symbols, dataframe=True):
    if isinstance(symbols, list) or isinstance(symbols, set) or isinstance(symbols, tuple):
        symbolList = list(symbols)
    elif isinstance(symbols, str):
        symbolList = symbols.split(',')
    symbols = util.symbols_to_string(symbols)
    url = URL_QUOTATION(symbols)
    retry = True
    while retry:
        try:
            quote = self.session.get(URL_QUOTATION(symbols), timeout=0.1).text
            retry = False
        except:
            pass
    quoteList = re.findall(r'\"(.*)\"', quote)
    if dataframe:
        for i in range(0, len(quoteList)):
            quoteList[i] = quoteList[i].split(',')
    else:
        for i in range(0, len(quoteList)):
            quoteList[i] = quoteList[i].split(',')
            quoteList[i].append(symbolList[i])
    if dataframe:
        df_quote = DataFrame(quoteList, columns=SINA_QUOTE_COLUMNS)
        df_quote = df_quote.drop('ms', axis=1)
        df_quote["symbol"] = symbolList
        return df_quote
    else:
        return quoteList
def test_v12_compat(self): df = DataFrame( [ [1.56808523, 0.65727391, 1.81021139, -0.17251653], [-0.2550111, -0.08072427, -0.03202878, -0.17581665], [1.51493992, 0.11805825, 1.629455, -1.31506612], [-0.02765498, 0.44679743, 0.33192641, -0.27885413], [0.05951614, -2.69652057, 1.28163262, 0.34703478], ], columns=["A", "B", "C", "D"], index=pd.date_range("2000-01-03", "2000-01-07"), ) df["date"] = pd.Timestamp("19920106 18:21:32.12") df.ix[3, "date"] = pd.Timestamp("20130101") df["modified"] = df["date"] df.ix[1, "modified"] = pd.NaT v12_json = os.path.join(self.dirpath, "tsframe_v012.json") df_unser = pd.read_json(v12_json) assert_frame_equal(df, df_unser) df_iso = df.drop(["modified"], axis=1) v12_iso_json = os.path.join(self.dirpath, "tsframe_iso_v012.json") df_unser_iso = pd.read_json(v12_iso_json) assert_frame_equal(df_iso, df_unser_iso)
def get_flights_from_route(cur, origin, destination):
    """ Returns a dataframe for all flights matching origin, destination. """
    import time

    ### MySQL query
    time0 = time.time()
    cur.execute("SELECT Year, Month, DayofMonth, DayOfWeek, CRSDepTime, UniqueCarrier, ArrDelay "
                "FROM flights_100000 WHERE Origin = %s and Dest = %s;", (origin, destination))
    rows = cur.fetchall()
    td = time.time() - time0
    print('Database query took %.2f seconds.' % td)

    ### Convert to dataframe
    df = DataFrame(list(rows), columns=['Year', 'Month', 'DayOfMonth', 'DayOfWeek',
                                        'CRSDepTime', 'Carrier', 'ArrDelay'])

    ### Drop columns without delays (cancellations)
    df = df.dropna()

    ### Create some auxiliary columns
    df['DayOfYear'] = df.apply(
        lambda x: datetime.datetime(x['Year'], x['Month'], x['DayOfMonth']).timetuple().tm_yday, axis=1)
    df['Week'] = df['DayOfYear'] / 7 + 1
    df['DepHour'] = df['CRSDepTime'] / 100

    ### Drop unused columns
    df = df.drop(['DayOfMonth', 'CRSDepTime'], axis=1).sort_index(axis=1)
    ## df.head()

    return df
def _fix_dframe_for_libecl(dframe: pd.DataFrame) -> pd.DataFrame:
    """Fix a dataframe making it ready for EclSum.from_pandas()

    * Ensures that the index is always datetime, and sorted.
    * Removes BLOCK vectors, these are currently not supported as
      it requires knowledge of the grid dimensions. Warnings
      will be emitted for skipped columns

    Args:
        dframe: Dataframe to read. Will not be modified.

    Returns:
        Modified copy of incoming dataframe.
    """
    if dframe.empty:
        return dframe
    dframe = dframe.copy()
    if "DATE" in dframe.columns:
        # Infer datatype (Pandas cannot answer it) based on the first element:
        if isinstance(dframe["DATE"].values[0], pd.Timestamp):
            # Convert to python datetime objects held in an object column:
            dframe["DATE"] = pd.Series(
                pd.to_datetime(dframe["DATE"]).dt.to_pydatetime(), dtype="object"
            )
        if isinstance(dframe["DATE"].values[0], str):
            # Do not use pd.Series.apply() here, Pandas would try to convert it to
            # datetime64[ns] which is limited at year 2262.
            dframe["DATE"] = pd.Series(
                [dateutil.parser.parse(datestr) for datestr in dframe["DATE"]],
                dtype="object",
                index=dframe.index,
            )
        if isinstance(dframe["DATE"].values[0], dt.date):
            dframe["DATE"] = pd.Series(
                [
                    dt.datetime.combine(dateobj, dt.datetime.min.time())
                    for dateobj in dframe["DATE"]
                ],
                dtype="object",
                index=dframe.index,
            )

        dframe.set_index("DATE", inplace=True)
    if not isinstance(
        dframe.index.values[0], (dt.datetime, np.datetime64, pd.Timestamp)
    ):
        raise ValueError(
            "dataframe must have a datetime index, got %s of type %s"
            % (dframe.index.values[0], type(dframe.index.values[0]))
        )
    dframe.sort_index(axis=0, inplace=True)

    # This column will appear if dataframes are naively written to CSV
    # files and read back in again.
    if "Unnamed: 0" in dframe:
        dframe.drop("Unnamed: 0", axis="columns", inplace=True)

    block_columns = [
        col for col in dframe.columns if (col.startswith("B") or col.startswith("LB"))
    ]
    if block_columns:
        dframe = dframe.drop(columns=block_columns)
        logger.warning(
            "Dropped columns with block data, not supported: %s",
            str({colname.partition(":")[0] + ":*" for colname in block_columns}),
        )

    return dframe
def _prepare_and_save_submit(submit: pd.DataFrame, config: InferenceConfig):
    submit['Id'] = submit['image_name']
    submit.drop(['kind', 'image_name', 'label'], axis=1, inplace=True)
    submit.to_csv(config.sumbit_name, index=False)
def gen_interaction_df(df: pd.DataFrame,
                       response: str,
                       degree: int = 2,
                       inter_only: bool = False,
                       bias: bool = False):
    # Pass the function's own arguments through; the original hard-coded degree=2 and
    # include_bias=False, silently ignoring the 'degree' and 'bias' parameters.
    x_interaction = PolynomialFeatures(degree=degree,
                                       interaction_only=inter_only,
                                       include_bias=bias).fit_transform(df.drop(columns=response))
    interaction_df = pd.DataFrame(x_interaction,
                                  columns=gen_column_names(df.drop(columns=response), inter_only))
    return interaction_df.join(df[response])
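# A minimal usage sketch (not part of the original source) for gen_interaction_df above. It assumes
# the function, pandas, sklearn's PolynomialFeatures, and the project's own gen_column_names helper
# are all in scope; the column names below are made up.
import pandas as pd

toy = pd.DataFrame({'x1': [1.0, 2.0, 3.0], 'x2': [4.0, 5.0, 6.0], 'y': [0, 1, 0]})
inter = gen_interaction_df(toy, response='y')
# Expect the original columns, their products/powers (as named by gen_column_names),
# and the response appended back on the right.
print(inter.shape)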
X, y = make_moons(n_samples=10000, noise=0.2)
df = DataFrame(dict(x=X[:, 0], y=X[:, 1], label=y))

# scatter plot, dots colored by class value
colors = {0: 'red', 1: 'blue'}
fig, ax = plt.subplots()
grouped = df.groupby('label')
for key, group in grouped:
    group.plot(ax=ax, kind='scatter', x='x', y='y', label=key, color=colors[key], s=1)
plt.show()

# ------------------ logistic regression
log_reg = LogisticRegression()
log_reg.fit(df.drop('label', axis=1), df['label'])

# AUC
probs = log_reg.predict_proba(df.drop('label', axis=1))
probs = probs[:, 1]
roc_auc_score(df['label'], probs)

# ----------------- plotting the decision boundary
# creating the grid
xx, yy = np.mgrid[-3:3:.01, -3:3:.01]
grid = np.c_[xx.ravel(), yy.ravel()]
probs = log_reg.predict_proba(grid)[:, 1].reshape(xx.shape)
def df_node_to_cooc(df_dump, context=None): """ converts df_dump to df_cooc + f1_set strategy: (1a) create overlapping contexts with nodes (1b) concatenate local contexts (2a) sort by abs(offset) (2b) deduplicate by cpos, keep first occurrences (=smallest offset) (3a) f1_set = (cpos where offset == 0) (3b) remove rows where cpos in f1_set NB: equivalent to UCS when switching steps 3a and 3b :param DataFrame df_dump: [match, matchend] + [context_id, context, contextend] :param int context: confine context? :return: deduplicated df_cooc [match, cpos, offset] + f1_set (cpos of nodes) :rtype: tuple(DataFrame, set) """ if context == 0: logger.warning("can't work on 0 context") return DataFrame(), set() # reset the index to be able to work with it df = df_dump.reset_index() if context is None: if (df['match'].values == df['context'].values).all() and ( df['matchend'].values == df['contextend'].values).all(): return DataFrame(), set() else: df['start'] = df['context'] df['end'] = df['contextend'] else: logger.info("re-confine regions by given context") df['start'] = df['match'] - context df['start'] = df[['start', 'context']].max(axis=1) df['end'] = df['matchend'] + context df['end'] = df[['end', 'contextend']].min(axis=1) logger.info("(1a) create local contexts") df = DataFrame.from_records(df.apply(node2cooc, axis=1).values) logger.info("(1b) concatenate local contexts") df_infl = DataFrame({ 'match': list(chain.from_iterable(df['match_list'].values)), 'cpos': list(chain.from_iterable(df['cpos_list'].values)), 'offset': list(chain.from_iterable(df['offset_list'].values)) }) logger.info("(2a) sort by absolute offset") df_infl['abs_offset'] = df_infl.offset.abs() df_infl = df_infl.sort_values(by=['abs_offset', 'cpos']) df_infl = df_infl.drop(["abs_offset"], axis=1) logger.info("(2b) drop duplicates") df_defl = df_infl.drop_duplicates(subset='cpos') logger.info("(3a) identify nodes ...") f1_set = set(df_defl.loc[df_defl['offset'] == 0]['cpos']) logger.info("(3b) ... and remove them") df_defl = df_defl[df_defl['offset'] != 0] return df_defl, f1_set
cur.execute(SQL1)
gdp = cur.fetchall()
cur.execute(SQL2)
data = cur.fetchall()

# Grab the column headers from the cursor description to avoid typing them out by hand
_ = cur.description
columns1 = []
for x in _:
    columns1.append(x[0])
cur.close()  # close the connection

# Convert the 2-D tuples into DataFrames. Note that a tuple of tuples such as ((1, 2), (2, 3))
# cannot be passed to DataFrame directly; it has to be converted to a list first.
data = DataFrame(list(data), columns=columns1)
data = data.set_index(data['year'])
data = data.drop('year', axis=1)

gdp = DataFrame(list(gdp))
gdp = gdp.set_index(gdp[0])
gdp = gdp.drop(0, axis=1)
gdp.rename(columns={1: '生产总值'}, inplace=True)
# gdp = gdp.rename({1: '生产总值'}, inplace=True)  # note: when renaming column labels, columns= must be given
# Merge the tables: add the gross-product column from gdp to data. Since the index was set to
# the year, the values align on the index automatically.
data['生产总值'] = gdp

'''
Second way of pulling the data:
preprocess directly with SQL statements so that less data has to be fetched
'''
'''
# pull the data from the database
def calculate_signals(self, df: pd.DataFrame, drop_extra_columns=True):
    n1 = self.n
    n2 = self.scale * n1

    df['median'] = df['close'].rolling(window=n2).mean()
    df['std'] = df['close'].rolling(n2, min_periods=1).std(ddof=0)  # ddof sets the delta degrees of freedom of the std
    df['z_score'] = abs(df['close'] - df['median']) / df['std']
    df['m'] = df['z_score'].rolling(window=n2).mean()
    df['upper'] = df['median'] + df['std'] * df['m']
    df['lower'] = df['median'] - df['std'] * df['m']
    condition_long = df['close'] > df['upper']
    condition_short = df['close'] < df['lower']

    df['mtm'] = df['close'] / df['close'].shift(n1) - 1
    df['mtm_mean'] = df['mtm'].rolling(window=n1, min_periods=1).mean()

    # Compute the volatility factor wd_atr from the price ATR
    df['c1'] = df['high'] - df['low']
    df['c2'] = abs(df['high'] - df['close'].shift(1))
    df['c3'] = abs(df['low'] - df['close'].shift(1))
    df['tr'] = df[['c1', 'c2', 'c3']].max(axis=1)
    df['atr'] = df['tr'].rolling(window=n1, min_periods=1).mean()
    df['avg_price'] = df['close'].rolling(window=n1, min_periods=1).mean()
    df['wd_atr'] = df['atr'] / df['avg_price']

    # Following the ATR idea, compute a volatility factor for the MTM indicator
    df['mtm_l'] = df['low'] / df['low'].shift(n1) - 1
    df['mtm_h'] = df['high'] / df['high'].shift(n1) - 1
    df['mtm_c'] = df['close'] / df['close'].shift(n1) - 1
    df['mtm_c1'] = df['mtm_h'] - df['mtm_l']
    df['mtm_c2'] = abs(df['mtm_h'] - df['mtm_c'].shift(1))
    df['mtm_c3'] = abs(df['mtm_l'] - df['mtm_c'].shift(1))
    df['mtm_tr'] = df[['mtm_c1', 'mtm_c2', 'mtm_c3']].max(axis=1)
    df['mtm_atr'] = df['mtm_tr'].rolling(window=n1, min_periods=1).mean()

    # Following the ATR idea, compute a volatility factor for the MTM mean indicator
    df['mtm_l_mean'] = df['mtm_l'].rolling(window=n1, min_periods=1).mean()
    df['mtm_h_mean'] = df['mtm_h'].rolling(window=n1, min_periods=1).mean()
    df['mtm_c_mean'] = df['mtm_c'].rolling(window=n1, min_periods=1).mean()
    df['mtm_c1'] = df['mtm_h_mean'] - df['mtm_l_mean']
    df['mtm_c2'] = abs(df['mtm_h_mean'] - df['mtm_c_mean'].shift(1))
    df['mtm_c3'] = abs(df['mtm_l_mean'] - df['mtm_c_mean'].shift(1))
    df['mtm_tr'] = df[['mtm_c1', 'mtm_c2', 'mtm_c3']].max(axis=1)
    df['mtm_atr_mean'] = df['mtm_tr'].rolling(window=n1, min_periods=1).mean()

    indicator = 'mtm_mean'

    # Multiply the mtm_mean indicator by the three volatility factors
    df[indicator] = 1e5 * df['mtm_atr'] * df['mtm_atr_mean'] * df['wd_atr'] * df[indicator]

    # Compute adaptive Bollinger bands for the new strategy factor
    df['median'] = df[indicator].rolling(window=n1).mean()
    df['std'] = df[indicator].rolling(n1, min_periods=1).std(ddof=0)  # ddof sets the delta degrees of freedom of the std
    df['z_score'] = abs(df[indicator] - df['median']) / df['std']
    # df['m'] = df['z_score'].rolling(window=n1).max().shift(1)
    # df['m'] = df['z_score'].rolling(window=n1).mean()
    df['m'] = df['z_score'].rolling(window=n1).min().shift(1)
    df['up'] = df['median'] + df['std'] * df['m']
    df['dn'] = df['median'] - df['std'] * df['m']

    # Go long on a breakout above the upper band
    condition1 = df[indicator] > df['up']
    condition2 = df[indicator].shift(1) <= df['up'].shift(1)
    condition = condition1 & condition2
    df.loc[condition, 'signal_long'] = 1

    # Go short on a breakout below the lower band
    condition1 = df[indicator] < df['dn']
    condition2 = df[indicator].shift(1) >= df['dn'].shift(1)
    condition = condition1 & condition2
    df.loc[condition, 'signal_short'] = -1

    # Close long positions when crossing back below the mean line
    condition1 = df[indicator] < df['median']
    condition2 = df[indicator].shift(1) >= df['median'].shift(1)
    condition = condition1 & condition2
    df.loc[condition, 'signal_long'] = 0

    # Close short positions when crossing back above the mean line
    condition1 = df[indicator] > df['median']
    condition2 = df[indicator].shift(1) <= df['median'].shift(1)
    condition = condition1 & condition2
    df.loc[condition, 'signal_short'] = 0

    df.loc[condition_long, 'signal_short'] = 0
    df.loc[condition_short, 'signal_long'] = 0

    # === Derive the actual daily position from the signal.
    # The signal is computed from the close, i.e. it is generated after each candle closes;
    # the trade is only entered at the next candle's open, which is when the position changes.
    df['signal_short'].fillna(method='ffill', inplace=True)
    df['signal_long'].fillna(method='ffill', inplace=True)
    df['signal'] = df[['signal_long', 'signal_short']].sum(axis=1, min_count=1, skipna=True)  # if your pandas version is recent, use this line in place of the previous one
    temp = df[df['signal'].notnull()][['signal']]
    temp = temp[temp['signal'] != temp['signal'].shift(1)]
    df['signal'] = temp['signal']
    # df.drop(['signal_long', 'signal_short'], axis=1, inplace=True)

    df.drop([
        'mtm', 'mtm_l', 'mtm_h', 'mtm_c', 'atr', 'z_score', 'c1', 'c2', 'c3',
        'tr', 'avg_price', 'wd_atr', 'mtm_c3', 'mtm_tr', 'mtm_atr',
        'mtm_l_mean', 'mtm_h_mean', 'mtm_c_mean', 'mtm_atr_mean', 'mtm_c2',
        'mtm_c1'
    ], axis=1, inplace=True)

    return df
#    0  26    Ken
#    1  29  Jerry

# Access a specific cell
print(df1.at[1, 'name'])  # Jerry

# Rename the columns
df1.columns = ['Age', 'Name']
print(df1)
#    Age   Name
# 0   26    Ken
# 1   29  Jerry
# 2   24    Ben

# Add a row (modifies the original df)
df1.loc[len(df1)] = [24, 'qin']
print(df1)

# Add a column (modifies the original df)
df1['Sex'] = [1, 1, 2, 1]
print(df1)

# Drop a row (does not modify the original df)
df2 = df1.drop(1, axis=0)
print(df1)
print(df2)

# Drop a column (does not modify the original df)
df3 = df1.drop('Name', axis=1)
print(df1)
print(df3)
def check_timestamp(self, d: pd.DataFrame) -> pd.DataFrame:
    if all(d["timestamp"].isna()):
        d = d.drop(columns=["timestamp"]).reset_index(drop=True)
    return d
def _pca_transform(self, df: pd.DataFrame, n_components: int):
    pca = PCA(n_components)
    return pca.fit_transform(df.drop('Platform', axis=1).values)
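# A minimal usage sketch (not part of the original source): _pca_transform above is a method, so the
# equivalent call is shown directly; 'Platform' and the numeric sales columns are made-up assumptions.
import pandas as pd
from sklearn.decomposition import PCA

toy = pd.DataFrame({'Platform': ['PS4', 'XOne', 'PC'],
                    'NA_Sales': [1.0, 2.0, 3.0],
                    'EU_Sales': [0.5, 1.5, 2.5]})
components = PCA(n_components=1).fit_transform(toy.drop('Platform', axis=1).values)
print(components.shape)  # (3, 1)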
def general_data_processing(self, X: DataFrame, X_test: DataFrame, holdout_frac: float, num_bagging_folds: int): """ General data processing steps used for all models. """ X = copy.deepcopy(X) # TODO: We should probably uncomment the below lines, NaN label should be treated as just another value in multiclass classification -> We will have to remove missing, compute problem type, and add back missing if multiclass # if self.problem_type == MULTICLASS: # X[self.label] = X[self.label].fillna('') # Remove all examples with missing labels from this dataset: missinglabel_inds = [ i for i, j in enumerate(X[self.label].isna()) if j ] if len(missinglabel_inds) > 0: logger.warning( f"Warning: Ignoring {len(missinglabel_inds)} (out of {len(X)}) training examples for which the label value in column '{self.label}' is missing" ) X = X.drop(missinglabel_inds, axis=0) if self.problem_type is None: self.problem_type = self.get_problem_type(X[self.label]) if X_test is not None and self.label in X_test.columns: # TODO: This is not an ideal solution, instead check if bagging and X_test exists with label, then merge them prior to entering general data processing. # This solution should handle virtually all cases correctly, only downside is it might cut more classes than it needs to. self.threshold, holdout_frac, num_bagging_folds = self.adjust_threshold_if_necessary( X[self.label], threshold=self.threshold, holdout_frac=1, num_bagging_folds=num_bagging_folds) else: self.threshold, holdout_frac, num_bagging_folds = self.adjust_threshold_if_necessary( X[self.label], threshold=self.threshold, holdout_frac=holdout_frac, num_bagging_folds=num_bagging_folds) if (self.objective_func is not None) and (self.objective_func.name == 'log_loss') and (self.problem_type == MULTICLASS): X = self.augment_rare_classes(X) # Gets labels prior to removal of infrequent classes y_uncleaned = X[ self.label].copy() # .astype('category').cat.categories self.cleaner = Cleaner.construct(problem_type=self.problem_type, label=self.label, threshold=self.threshold) # TODO: What if all classes in X are low frequency in multiclass? Currently we would crash. 
Not certain how many problems actually have this property X = self.cleaner.fit_transform( X) # TODO: Consider merging cleaner into label_cleaner self.label_cleaner = LabelCleaner.construct( problem_type=self.problem_type, y=X[self.label], y_uncleaned=y_uncleaned) if (self.label_cleaner.num_classes is not None) and (self.label_cleaner.num_classes == 2): self.trainer_problem_type = BINARY else: self.trainer_problem_type = self.problem_type X, y = self.extract_label(X) y = self.label_cleaner.transform(y) if X_test is not None and self.label in X_test.columns: X_test = self.cleaner.transform(X_test) if len(X_test) == 0: logger.debug( 'All X_test data contained low frequency classes, ignoring X_test and generating from subset of X' ) X_test = None y_test = None else: X_test, y_test = self.extract_label(X_test) y_test = self.label_cleaner.transform(y_test) else: y_test = None # TODO: Move this up to top of data before removing data, this way our feature generator is better if X_test is not None: # Do this if working with SKLearn models, otherwise categorical features may perform very badly on the test set logger.log( 15, 'Performing general data preprocessing with merged train & validation data, so validation performance may not accurately reflect performance on new test data' ) X_super = pd.concat([X, X_test], ignore_index=True) X_super = self.feature_generator.fit_transform( X_super, banned_features=self.submission_columns, drop_duplicates=False) X = X_super.head(len(X)).set_index(X.index) X_test = X_super.tail(len(X_test)).set_index(X_test.index) del X_super else: X = self.feature_generator.fit_transform( X, banned_features=self.submission_columns, drop_duplicates=False) return X, y, X_test, y_test, holdout_frac, num_bagging_folds
def variables_targets_split(
        data: pd.DataFrame, targets: list) -> Tuple[pd.DataFrame, pd.DataFrame]:
    # Use the 'targets' argument; the original referenced an undefined 'available_targets' name.
    return data[targets], data.drop(targets, axis=1)
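# A minimal usage sketch (not part of the original source) for variables_targets_split above,
# assuming it and pandas are in scope; the column names are made up. Note that the function
# returns the target frame first and the remaining variables second.
import pandas as pd

toy = pd.DataFrame({'f1': [1, 2], 'f2': [3, 4], 'target': [0, 1]})
targets_df, variables_df = variables_targets_split(toy, targets=['target'])
print(list(targets_df.columns), list(variables_df.columns))  # ['target'] ['f1', 'f2']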
# In[2]:

X, y = make_blobs(n_samples=1000, centers=2, n_features=2)

# In[3]:

df = DataFrame(dict(x=X[:, 0], y=X[:, 1], label=y))

# In[4]:

df

# In[5]:

X = df.drop('label', axis=1)
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=42)

# # Printing the Dataset

# In[6]:

col = {0: 'red', 1: 'blue'}
fig, ax = plt.subplots()
grouped = df.groupby('label')
for key, group in grouped:
    group.plot(ax=ax, kind='scatter', x='x', y='y', label=key, color=col[key])
json_text = json.load(json_file)
messages = json_text['messages']
b = DataFrame(messages)
mydata = DataFrame(messages, columns=['from', 'type', 'text'])

indxs_drop = []
for ii in range(len(mydata)):
    if mydata['type'][ii] == 'service':
        indxs_drop = indxs_drop + [ii]
    elif type(mydata['text'].values[ii]) == list:
        #print(ii)
        mydata['text'].values[ii] = mydata['text'].values[ii][0]
        if type(mydata['text'].values[ii]) == dict:
            mydata['text'].values[ii] = mydata['text'].values[ii]['text']

mydata = mydata.drop(indxs_drop)
del mydata['type']
all_actors = mydata['from'].unique()

#%% engineer the text
import nltk
import string
import pattern
# Importing FreqDist library from nltk and passing token into FreqDist
from nltk.probability import FreqDist

my_stop_words = [
    "a", "abbastanza", "abbia", "abbiamo", "abbiano", "abbiate", "accidenti",
    "ad", "adesso", "affinché", "agl", "agli", "ahime", "ahimè", "ai", "al",
    "alcuna", "alcuni", "alcuno", "all", "alla", "alle", "allo", "allora",
    "altre", "altri", "altrimenti", "altro", "altrove", "altrui", "anche",
class TestPivotTable(unittest.TestCase): _multiprocess_can_split_ = True def setUp(self): self.data = DataFrame({ 'A': [ 'foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', 'foo', 'foo', 'foo' ], 'B': [ 'one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', 'two', 'two', 'one' ], 'C': [ 'dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', 'dull', 'shiny', 'shiny', 'shiny' ], 'D': np.random.randn(11), 'E': np.random.randn(11), 'F': np.random.randn(11) }) def test_pivot_table(self): rows = ['A', 'B'] cols = 'C' table = pivot_table(self.data, values='D', rows=rows, cols=cols) table2 = self.data.pivot_table(values='D', rows=rows, cols=cols) tm.assert_frame_equal(table, table2) # this works pivot_table(self.data, values='D', rows=rows) if len(rows) > 1: self.assertEqual(table.index.names, rows) else: self.assertEqual(table.index.name, rows[0]) if len(cols) > 1: self.assertEqual(table.columns.names, cols) else: self.assertEqual(table.columns.name, cols[0]) expected = self.data.groupby(rows + [cols])['D'].agg(np.mean).unstack() tm.assert_frame_equal(table, expected) def test_pivot_table_nocols(self): df = DataFrame({ 'rows': ['a', 'b', 'c'], 'cols': ['x', 'y', 'z'], 'values': [1, 2, 3] }) rs = df.pivot_table(cols='cols', aggfunc=np.sum) xp = df.pivot_table(rows='cols', aggfunc=np.sum).T tm.assert_frame_equal(rs, xp) rs = df.pivot_table(cols='cols', aggfunc={'values': 'mean'}) xp = df.pivot_table(rows='cols', aggfunc={'values': 'mean'}).T tm.assert_frame_equal(rs, xp) def test_pass_array(self): result = self.data.pivot_table('D', rows=self.data.A, cols=self.data.C) expected = self.data.pivot_table('D', rows='A', cols='C') tm.assert_frame_equal(result, expected) def test_pass_function(self): result = self.data.pivot_table('D', rows=lambda x: x // 5, cols=self.data.C) expected = self.data.pivot_table('D', rows=self.data.index // 5, cols='C') tm.assert_frame_equal(result, expected) def test_pivot_table_multiple(self): rows = ['A', 'B'] cols = 'C' table = pivot_table(self.data, rows=rows, cols=cols) expected = self.data.groupby(rows + [cols]).agg(np.mean).unstack() tm.assert_frame_equal(table, expected) def test_pivot_dtypes(self): # can convert dtypes f = DataFrame({ 'a': ['cat', 'bat', 'cat', 'bat'], 'v': [1, 2, 3, 4], 'i': ['a', 'b', 'a', 'b'] }) self.assert_(f.dtypes['v'] == 'int64') z = pivot_table(f, values='v', rows=['a'], cols=['i'], fill_value=0, aggfunc=np.sum) result = z.get_dtype_counts() expected = Series(dict(int64=2)) tm.assert_series_equal(result, expected) # cannot convert dtypes f = DataFrame({ 'a': ['cat', 'bat', 'cat', 'bat'], 'v': [1.5, 2.5, 3.5, 4.5], 'i': ['a', 'b', 'a', 'b'] }) self.assert_(f.dtypes['v'] == 'float64') z = pivot_table(f, values='v', rows=['a'], cols=['i'], fill_value=0, aggfunc=np.mean) result = z.get_dtype_counts() expected = Series(dict(float64=2)) tm.assert_series_equal(result, expected) def test_pivot_multi_values(self): result = pivot_table(self.data, values=['D', 'E'], rows='A', cols=['B', 'C'], fill_value=0) expected = pivot_table(self.data.drop(['F'], axis=1), rows='A', cols=['B', 'C'], fill_value=0) tm.assert_frame_equal(result, expected) def test_pivot_multi_functions(self): f = lambda func: pivot_table(self.data, values=['D', 'E'], rows=['A', 'B'], cols='C', aggfunc=func) result = f([np.mean, np.std]) means = f(np.mean) stds = f(np.std) expected = concat([means, stds], keys=['mean', 'std'], axis=1) tm.assert_frame_equal(result, expected) # margins not supported?? 
f = lambda func: pivot_table(self.data, values=['D', 'E'], rows=['A', 'B'], cols='C', aggfunc=func, margins=True) result = f([np.mean, np.std]) means = f(np.mean) stds = f(np.std) expected = concat([means, stds], keys=['mean', 'std'], axis=1) tm.assert_frame_equal(result, expected) def test_pivot_index_with_nan(self): # GH 3588 nan = np.nan df = DataFrame({ "a": ['R1', 'R2', nan, 'R4'], 'b': ["C1", "C2", "C3", "C4"], "c": [10, 15, nan, 20] }) result = df.pivot('a', 'b', 'c') expected = DataFrame([[nan, nan, nan, nan], [nan, 10, nan, nan], [nan, nan, nan, nan], [nan, nan, 15, 20]], index=Index(['R1', 'R2', nan, 'R4'], name='a'), columns=Index(['C1', 'C2', 'C3', 'C4'], name='b')) tm.assert_frame_equal(result, expected) def test_margins(self): def _check_output(res, col, rows=['A', 'B'], cols=['C']): cmarg = res['All'][:-1] exp = self.data.groupby(rows)[col].mean() tm.assert_series_equal(cmarg, exp) rmarg = res.xs(('All', ''))[:-1] exp = self.data.groupby(cols)[col].mean() tm.assert_series_equal(rmarg, exp) gmarg = res['All']['All', ''] exp = self.data[col].mean() self.assertEqual(gmarg, exp) # column specified table = self.data.pivot_table('D', rows=['A', 'B'], cols='C', margins=True, aggfunc=np.mean) _check_output(table, 'D') # no column specified table = self.data.pivot_table(rows=['A', 'B'], cols='C', margins=True, aggfunc=np.mean) for valcol in table.columns.levels[0]: _check_output(table[valcol], valcol) # no col # to help with a buglet self.data.columns = [k * 2 for k in self.data.columns] table = self.data.pivot_table(rows=['AA', 'BB'], margins=True, aggfunc=np.mean) for valcol in table.columns: gmarg = table[valcol]['All', ''] self.assertEqual(gmarg, self.data[valcol].mean()) # this is OK table = self.data.pivot_table(rows=['AA', 'BB'], margins=True, aggfunc='mean') # no rows rtable = self.data.pivot_table(cols=['AA', 'BB'], margins=True, aggfunc=np.mean) self.assert_(isinstance(rtable, Series)) for item in ['DD', 'EE', 'FF']: gmarg = table[item]['All', ''] self.assertEqual(gmarg, self.data[item].mean()) def test_pivot_integer_columns(self): # caused by upstream bug in unstack from pandas.util.compat import product import datetime import pandas d = datetime.date.min data = list( product(['foo', 'bar'], ['A', 'B', 'C'], ['x1', 'x2'], [d + datetime.timedelta(i) for i in xrange(20)], [1.0])) df = pandas.DataFrame(data) table = df.pivot_table(values=4, rows=[0, 1, 3], cols=[2]) df2 = df.rename(columns=str) table2 = df2.pivot_table(values='4', rows=['0', '1', '3'], cols=['2']) tm.assert_frame_equal(table, table2, check_names=False) def test_pivot_no_level_overlap(self): # GH #1181 data = DataFrame({ 'a': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'] * 2, 'b': [0, 0, 0, 0, 1, 1, 1, 1] * 2, 'c': (['foo'] * 4 + ['bar'] * 4) * 2, 'value': np.random.randn(16) }) table = data.pivot_table('value', rows='a', cols=['b', 'c']) grouped = data.groupby(['a', 'b', 'c'])['value'].mean() expected = grouped.unstack('b').unstack('c').dropna(axis=1, how='all') tm.assert_frame_equal(table, expected) def test_pivot_columns_lexsorted(self): import datetime import numpy as np import pandas n = 10000 dtype = np.dtype([ ("Index", object), ("Symbol", object), ("Year", int), ("Month", int), ("Day", int), ("Quantity", int), ("Price", float), ]) products = np.array([ ('SP500', 'ADBE'), ('SP500', 'NVDA'), ('SP500', 'ORCL'), ('NDQ100', 'AAPL'), ('NDQ100', 'MSFT'), ('NDQ100', 'GOOG'), ('FTSE', 'DGE.L'), ('FTSE', 'TSCO.L'), ('FTSE', 'GSK.L'), ], dtype=[('Index', object), ('Symbol', object)]) items = np.empty(n, dtype=dtype) 
iproduct = np.random.randint(0, len(products), n) items['Index'] = products['Index'][iproduct] items['Symbol'] = products['Symbol'][iproduct] dr = pandas.date_range(datetime.date(2000, 1, 1), datetime.date(2010, 12, 31)) dates = dr[np.random.randint(0, len(dr), n)] items['Year'] = dates.year items['Month'] = dates.month items['Day'] = dates.day items['Price'] = np.random.lognormal(4.0, 2.0, n) df = DataFrame(items) pivoted = df.pivot_table('Price', rows=['Month', 'Day'], cols=['Index', 'Symbol', 'Year'], aggfunc='mean') self.assert_(pivoted.columns.is_monotonic) def test_pivot_complex_aggfunc(self): f = {'D': ['std'], 'E': ['sum']} expected = self.data.groupby(['A', 'B']).agg(f).unstack('B') result = self.data.pivot_table(rows='A', cols='B', aggfunc=f) tm.assert_frame_equal(result, expected)
def filter_dataframe(df_raw: pd.DataFrame) -> pd.DataFrame:
    df_raw = df_raw.drop(['Hogwarts House'], axis=1)
    df = df_raw.select_dtypes([np.number])
    return df
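# A minimal usage sketch (not part of the original source) for filter_dataframe above, assuming it,
# pandas and numpy are in scope; the toy frame mirrors the expected 'Hogwarts House' column plus one
# non-numeric and one numeric feature.
import numpy as np
import pandas as pd

toy = pd.DataFrame({'Hogwarts House': ['Gryffindor', 'Slytherin'],
                    'Best Hand': ['Left', 'Right'],
                    'Charms': [3.1, -2.4]})
numeric_only = filter_dataframe(toy)
print(numeric_only.columns.tolist())  # ['Charms'] -- only numeric columns survive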
def plot_activity_hours( images: pd.DataFrame, names: Union[list, str, pd.Series], species_col: str = "scientific_name", remove_duplicates: bool = False, remove_duplicates_kws: dict = None, kind: str = "kde", hist_kws: dict = None, kde_kws: dict = None, ) -> matplotlib.axes.Axes: """ Plots the activity hours of one or multiple species by grouping all observations into a 24-hour range. Parameters ---------- images : DataFrame DataFrame with the project's images. names : list, str or Series List of names to plot activity hours for. species_col : str Label of the scientific name column in the images DataFrame. remove_duplicates : bool Whether to remove duplicates. Wrapper for the wiutils.remove_duplicates function. remove_duplicates_kws : dict Keyword arguments for the wiutils.remove_duplicates function. kind : str Type of plot. Values can be: - 'hist' for histogram. - 'kde' for kernel density estimate plot. hist_kws : dict Keyword arguments passed to the seaborn.histplot() function. Only has effect if kind is 'hist'. kde_kws : dict Keyword arguments passed to the seaborn.kde() function. Only has effect if kind is 'kde'. Returns ------- Axes Plot axes. """ if isinstance(names, str): names = [names] if hist_kws is None: hist_kws = {} if kde_kws is None: kde_kws = {} inconsistent_names = set(names) - set(images[species_col]) if len(inconsistent_names): raise ValueError( f"{list(inconsistent_names)} were not found in images.") images = images.copy() if remove_duplicates: images = _remove_wrapper(images, duplicates=True, duplicates_kws=remove_duplicates_kws) images = images.loc[images[species_col].isin(names), :].reset_index( drop=True) images[_labels.date] = pd.to_datetime(images[_labels.date]) images["hour"] = images[_labels.date].dt.round("H").dt.hour images = images.drop(columns=_labels.date) if kind == "hist": ax = sns.histplot( data=images, x="hour", hue=species_col, binwidth=1, binrange=(-0.5, 23.5), discrete=False, **hist_kws, ) elif kind == "kde": ax = sns.kdeplot(data=images, x="hour", hue=species_col, **kde_kws) else: raise ValueError("kind must be one of ['hist', 'kde']") ax.set_xlim(-1, 24) ax.set_xticks(range(0, 24, 2), labels=[f"{h:02}:00" for h in range(0, 24, 2)]) return ax
def forecast(file, column): # Read the data to a DataFrame df = DataFrame() df = read_csv(file) # Delete remaining columns columns_list = list(df.columns.values) columns_list.remove(column) for i in range(len(columns_list)): df.drop(columns_list[i], axis=1, inplace=True) series = Series(list(df[column]), index=list(df.index)) series.index.name = 'Data' # Split data into train and test-data frames df_test = df[(len(df) - 12):] df_train = df[:-12] y_true = list(df_test[column]) # Transform data to be stationary raw_values = series.values print(type(raw_values)) diff_values = difference(raw_values, 1) # Transform data to be supervised learning supervised = timeseries_to_supervised(diff_values, 1) supervised_values = supervised.values # Split data into train and test-sets train, test = supervised_values[0:-12], supervised_values[-12:] # Transform the scale of the data scaler, train_scaled, test_scaled = scale(train, test) # Building the model - parametrization: # 1. train set # 2. batch_size # 3. number_epochs # 4. number_neurons start = time.time() lstm_model = fit_lstm(train_scaled, 1, 250, 24) # Forecast the entire training dataset to build up state for forecasting train_reshaped = train_scaled[:, 0].reshape(len(train_scaled), 1, 1) lstm_model.predict(train_reshaped, batch_size=1) end = time.time() print("Execution time: " + str(end - start) + " s") # Walk-forward validation on the test data y_pred = list() for i in range(len(test_scaled)): # Make one-step forecast X, y = test_scaled[i, 0:-1], test_scaled[i, -1] yhat = forecast_lstm(lstm_model, 1, X) # Invert scaling yhat = invert_scale(scaler, X, yhat) # Invert differencing yhat = inverse_difference(raw_values, yhat, len(test_scaled) + 1 - i) # Store forecast y_pred.append(yhat) expected = raw_values[len(train) + i + 1] print('Month=%d, Predicted=%f, Expected=%f' % (i + 1, yhat, expected)) common_methods.model_evaluation(y_true, y_pred) common_methods.plot(column, df_train, y_true, y_pred)
def plot_graphs(granularity: str) -> None: push_data_db = ( test_scheduling.PUSH_DATA_GROUP_DB if granularity == "group" else test_scheduling.PUSH_DATA_CONFIG_GROUP_DB ) assert db.download(push_data_db) regressions_by_rev = {} for revisions, _, _, possible_regressions, likely_regressions in db.read( push_data_db ): regressions_by_rev[revisions[0]] = get_regressions( granularity, likely_regressions, possible_regressions ) scheduled_data = [] caught_data = [] for scheduler_stat in db.read(SHADOW_SCHEDULER_STATS_DB): if len(scheduler_stat["schedulers"]) == 0: continue if scheduler_stat["id"] not in regressions_by_rev: continue obj: dict[str, Any] = { "date": datetime.utcfromtimestamp(scheduler_stat["date"]), } for scheduler in scheduler_stat["schedulers"]: obj[scheduler["name"]] = len(get_scheduled(granularity, scheduler)) scheduled_data.append(obj) regressions = regressions_by_rev[scheduler_stat["id"]] obj = { "date": datetime.utcfromtimestamp(scheduler_stat["date"]), "regressions": len(regressions), } for scheduler in scheduler_stat["schedulers"]: scheduled = get_scheduled(granularity, scheduler) obj[scheduler["name"]] = len(regressions & scheduled) caught_data.append(obj) scheduled_df = DataFrame(scheduled_data) scheduled_df.index = scheduled_df["date"] del scheduled_df["date"] caught_df = DataFrame(caught_data) caught_df.index = caught_df["date"] del caught_df["date"] df = scheduled_df.resample("W").mean() plot_graph( df, f"Average number of scheduled {granularity}s", f"average_{granularity}_scheduled.svg", ) df = ( caught_df[caught_df.regressions > 0] .drop(columns=["regressions"]) .clip(0, 1) .resample("W") .mean() ) plot_graph( df, "Percentage of regressing pushes where we caught at least one regression", f"percentage_{granularity}_caught_at_least_one.svg", ) plot_graph( caught_df.drop(columns=["regressions"]) .div(caught_df.regressions, axis=0) .resample("W") .mean(), "Percentage of regressions we caught", f"percentage_{granularity}_caught.svg", )
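# --- Illustrative sketch (not part of the original module) ---
# Minimal demonstration of the resampling logic used above: given per-push
# counts indexed by date, a weekly mean of clip(0, 1) turns "how many
# regressions were caught" into "fraction of pushes where at least one was".
# The scheduler name and values are placeholders.
import pandas as pd

caught_toy = pd.DataFrame(
    {"regressions": [2, 1, 3, 1], "scheduler_a": [0, 1, 2, 0]},
    index=pd.to_datetime(
        ["2021-03-01", "2021-03-02", "2021-03-08", "2021-03-09"]),
)
at_least_one = (
    caught_toy[caught_toy.regressions > 0]
    .drop(columns=["regressions"])
    .clip(0, 1)
    .resample("W")
    .mean()
)
print(at_least_one)  # one row per week, values in [0, 1]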
copy defaults to True, in which case a copy is always made; if False, no copy is made
when the new and old objects are identical
'''
print(frame.reindex(index=['a', 'b', 'c', 'd'], method='ffill', copy=False))

print('\n5.2.2. Dropping entries from an axis\n')
# Dropping one or more entries from an axis is easy: all you need is an index
# array or list of labels.
# Because some data munging and set logic has to be performed, the drop method
# returns a new object with the indicated values deleted from the given axis.
obj = Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
new_obj = obj.drop('c')
print(new_obj)
print(obj.drop(['c', 'd']))
# With a DataFrame, index values can be deleted from either axis
data = DataFrame(np.arange(16).reshape(4, 4),
                 index=['Ohio', 'Colorado', 'Utah', 'New York'],
                 columns=['one', 'two', 'three', 'four'])
print(data,
      data.drop(['Colorado', 'Ohio']),
      data.drop('two', axis=1),
      data.drop(['two', 'four'], axis=1),
      '', sep='\n')

print('\n5.2.3. Indexing, selection, and filtering\n')
# Series indexing (obj[...]) works analogously to NumPy array indexing,
# except that the Series index values do not have to be integers
obj = Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
print(obj['b'], obj[1], obj[2:4], obj[['b', 'a', 'd']], obj[[1, 3]],
      obj[obj < 2], sep='\n')
# Slicing with labels behaves differently from normal Python slicing in that
# the endpoint is inclusive
print(obj['b':'c'])
obj['b':'c'] = 5
print(obj)
# Indexing into a DataFrame retrieves one or more columns
data = DataFrame(np.arange(16).reshape(4, 4),
from pandas import Series, DataFrame
import google_auth as ga
import ranking

gc = ga.gauth()

# Get the spreadsheet
SPREADSHEET_KEY = 'YOUR_SHEETID_HERE'  # ID of the sheet Google Forms writes its responses to
worksheet = gc.open_by_key(SPREADSHEET_KEY).sheet1

# --- Data processing starts here ---
df = DataFrame(worksheet.get_all_values())
initial_columns_list = list(df.iloc[0, :])
df.columns = initial_columns_list
df.drop(0, inplace=True)
df.reset_index(inplace=True)
df.drop('index', axis=1, inplace=True)
df['氏名'] = df['学年'].astype(int)
df['練習回数'] = df['希望する練習回数'].astype(int)
days = ['月', '火', '水', '木', '金']
times = ['朝', '夜']
daytime_list = []
for day in days:
    for time in times:
        daytime_list.append(day + time)
for daytime in daytime_list:
    df[daytime] = 0
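# --- Illustrative sketch (not part of the original script) ---
# worksheet.get_all_values() returns a list of rows where row 0 holds the
# column headers; this toy example reproduces the same header-promotion steps
# without needing Google credentials. The row values are made up.
from pandas import DataFrame

rows = [["学年", "希望する練習回数"], ["2", "3"], ["1", "5"]]
df_toy = DataFrame(rows)
df_toy.columns = list(df_toy.iloc[0, :])     # promote first row to headers
df_toy.drop(0, inplace=True)                 # drop the header row itself
df_toy.reset_index(drop=True, inplace=True)  # same effect as reset + drop('index')
print(df_toy)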
def most_popular_wines(adjacency_matrix: pd.DataFrame) -> List[int]:
    # Sum the interactions for each wine (column) and sort in descending
    # order of popularity, returning the wine labels rather than positions.
    popularity = adjacency_matrix.drop("user_id", axis=1).sum(axis=0)
    most_popular = popularity.sort_values(ascending=False).index.tolist()
    return most_popular
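# --- Illustrative usage sketch (not from the original source) ---
# A toy user-by-wine interaction matrix; the wine with the largest column
# sum comes back first. The wine ids and counts are placeholders.
import pandas as pd

toy_matrix = pd.DataFrame({
    "user_id": [1, 2, 3],
    10: [1, 1, 1],   # wine 10: 3 interactions
    20: [0, 1, 0],   # wine 20: 1 interaction
    30: [1, 0, 1],   # wine 30: 2 interactions
})
print(most_popular_wines(toy_matrix))  # expected order: [10, 30, 20]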
def generate_new_datetime_features(self, X: pd.DataFrame) -> pd.DataFrame:
    '''
    Generates new features derived from the existing datetime features.

    Args:
        X: a dataset to add new features to

    Returns:
        Dataset with new generated features
    '''
    days_before_next_weekend = {0: 5, 1: 4, 2: 3, 3: 2, 4: 1, 5: 0, 6: 0}
    days_before_next_weekdays = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 2, 6: 1}
    month_to_season = dict(
        zip(range(1, 13), [1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 1]))
    self.holidays = pd.to_datetime(pd.Series(ru_holidays))
    min_dt = X[self.dt_features].min().min()
    for col in self.dt_features:
        if self.task_type['is_timeseries'] or len(
                self.f_spaces['datetime']) < 3:
            X[col] = X[col].fillna(datetime.datetime(1970, 1, 1))
            X['TS_{}_year'.format(col)] = X[col].apply(
                lambda x: x.year).astype(np.int16)
            X['TS_{}_month'.format(col)] = X[col].apply(
                lambda x: x.month).astype(np.int8)
            X['TS_{}_day'.format(col)] = X[col].apply(
                lambda x: x.day).astype(np.int8)
            X['TS_{}_hour'.format(col)] = X[col].apply(
                lambda x: x.hour).astype(np.int8)
            X['TS_{}_minute'.format(col)] = X[col].apply(
                lambda x: x.minute).astype(np.int8)
            X['TS_{}_weekday'.format(col)] = \
                X[col].apply(lambda x: x.weekday()).astype(np.int8)
            X['TS_{}_season'.format(col)] = \
                X['TS_{}_month'.format(col)].map(month_to_season).astype(np.int8)
            # hour of week = weekday * 24 + hour of day, i.e. a value in 0..167
            X['TS_{}_hour_of_week'.format(col)] = \
                X[col].apply(lambda x: x.weekday() * 24 + x.hour).astype(np.int16)
            X['TS_{}_is_holiday'.format(col)] = \
                X[col].dt.date.isin(self.holidays.dt.date).astype(np.int8)
            X['TS_{}_is_weekend'.format(col)] = \
                X['TS_{}_weekday'.format(col)].map({5: 1, 6: 1}).fillna(0).astype(np.int8)
            X['TS_{}_is_weekend'.format(col)] = (X['TS_{}_is_weekend'.format(col)] +
                X['TS_{}_is_holiday'.format(col)]).astype(np.int8)
            X['TS_{}_days_before_weekend'.format(col)] = \
                X['TS_{}_weekday'.format(col)].map(days_before_next_weekend).astype(np.int8)
            year_month_func = lambda x: (x.year - 2010) * 12 + x.month
            X['TS_{}_year_month'.format(col)] = X[col].apply(
                year_month_func).astype(np.int16)

    if self.verbose:
        n_created = len([col for col in X.columns if 'TS_' in col])
        print('FEATURE GENERATOR: {} timeseries datetime features created'.
              format(n_created))

    for col1, col2 in self.new_datetime_features:
        X['new_datetime_diff' + col1 + '_' + col2] = \
            (X[col1] - X[col2]).astype('timedelta64[D]').fillna(-1).astype(np.int16)

    if self.verbose:
        n_created = len(self.new_datetime_features)
        print(
            'FEATURE GENERATOR: {} datetime diff features created'.format(
                n_created))

    X.drop(self.f_spaces['datetime'], axis=1, inplace=True)

    return X
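# --- Illustrative sketch (not part of the original class) ---
# The same weekday/hour/hour-of-week features can be computed without
# row-wise apply() using the pandas .dt accessor; shown on a toy datetime
# column as a point of comparison, not as the project's API.
import numpy as np
import pandas as pd

toy = pd.DataFrame(
    {"dt": pd.to_datetime(["2021-05-03 09:30", "2021-05-08 22:10"])})
toy["weekday"] = toy["dt"].dt.weekday.astype(np.int8)        # Monday = 0
toy["hour"] = toy["dt"].dt.hour.astype(np.int8)
toy["hour_of_week"] = (toy["weekday"] * 24 + toy["hour"]).astype(np.int16)
toy["is_weekend"] = (toy["weekday"] >= 5).astype(np.int8)
print(toy)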
def _clean(df: pd.DataFrame) -> pd.DataFrame:
    """Ensure no duplicates and ascending sorting of the given df."""
    df = df.sort_index(ascending=True)
    df.drop(index=df[df.index.duplicated()].index, inplace=True)
    return df
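# --- Illustrative usage sketch (not from the original source) ---
# A toy frame with an unsorted, partially duplicated index; _clean sorts it
# and drops every row whose index label occurs more than once.
import pandas as pd

toy = pd.DataFrame(
    {"price": [3.0, 1.0, 2.0, 2.5]},
    index=pd.to_datetime(
        ["2021-01-03", "2021-01-01", "2021-01-02", "2021-01-02"]),
)
print(_clean(toy))  # only 2021-01-01 and 2021-01-03 remain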
def pipe_add_metadata(self, df: pd.DataFrame) -> pd.DataFrame: return df.drop(columns="Region").assign(location=self.location, source_url=self.source_url_ref)
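# --- Illustrative sketch (not part of the original class) ---
# The drop/assign chain used by pipe_add_metadata, on a toy frame; the
# 'Region' column name comes from the method above, while the location and
# URL values are placeholders.
import pandas as pd

toy = pd.DataFrame({"Region": ["North"], "total_vaccinations": [100]})
out = toy.drop(columns="Region").assign(
    location="Placeholder", source_url="https://example.org")
print(out)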
def test_multiple_date_col_custom(all_parsers, keep_date_col): data = """\ KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 """ parser = all_parsers def date_parser(*date_cols): """ Test date parser. Parameters ---------- date_cols : args The list of data columns to parse. Returns ------- parsed : Series """ return parsing.try_parse_dates(parsers._concat_date_cols(date_cols)) result = parser.read_csv(StringIO(data), header=None, date_parser=date_parser, prefix="X", parse_dates={ "actual": [1, 2], "nominal": [1, 3] }, keep_date_col=keep_date_col) expected = DataFrame([ [ datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56), "KORD", "19990127", " 19:00:00", " 18:56:00", 0.81, 2.81, 7.2, 0.0, 280.0 ], [ datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56), "KORD", "19990127", " 20:00:00", " 19:56:00", 0.01, 2.21, 7.2, 0.0, 260.0 ], [ datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 20, 56), "KORD", "19990127", " 21:00:00", " 20:56:00", -0.59, 2.21, 5.7, 0.0, 280.0 ], [ datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 18), "KORD", "19990127", " 21:00:00", " 21:18:00", -0.99, 2.01, 3.6, 0.0, 270.0 ], [ datetime(1999, 1, 27, 22, 0), datetime(1999, 1, 27, 21, 56), "KORD", "19990127", " 22:00:00", " 21:56:00", -0.59, 1.71, 5.1, 0.0, 290.0 ], [ datetime(1999, 1, 27, 23, 0), datetime(1999, 1, 27, 22, 56), "KORD", "19990127", " 23:00:00", " 22:56:00", -0.59, 1.71, 4.6, 0.0, 280.0 ], ], columns=[ "actual", "nominal", "X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8" ]) if not keep_date_col: expected = expected.drop(["X1", "X2", "X3"], axis=1) elif parser.engine == "python": expected["X1"] = expected["X1"].astype(np.int64) # Python can sometimes be flaky about how # the aggregated columns are entered, so # this standardizes the order. result = result[expected.columns] tm.assert_frame_equal(result, expected)
def modify_df(df1: pd.DataFrame, name): df1.rename(columns={'value': name}, inplace=True) df1.drop(columns=['unit'], inplace=True) return df1
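# --- Illustrative usage sketch (not from the original source) ---
# modify_df renames the 'value' column and drops 'unit' in place; the toy
# column contents below are placeholders matching the names the helper expects.
import pandas as pd

toy = pd.DataFrame({"value": [42.0], "unit": ["MW"]})
print(modify_df(toy, "wind_generation"))  # -> single 'wind_generation' column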
def parallel_extraction(df: pd.DataFrame, df_images: pd.DataFrame, df_sources: pd.DataFrame, min_sigma: float, edge_buffer: float, cluster_threshold: float, allow_nan: bool, add_mode: bool, p_run_path: str) -> pd.DataFrame: """ Parallelize forced extraction with Dask Args: df: dataframe with columns 'wavg_ra', 'wavg_dec', 'img_diff', 'detection' df_images: dataframe with the images data and columns 'id', 'measurements_path', 'path', 'noise_path', 'beam_bmaj', 'beam_bmin', 'beam_bpa', 'background_path', 'rms_min', 'datetime', 'skyreg__centre_ra', 'skyreg__centre_dec', 'skyreg__xtr_radius' and 'name' as the index. df_sources: dataframe derived from the measurement data with columns 'source', 'image', 'flux_peak'. min_sigma: minimum sigma value to drop forced extracted measurements. edge_buffer: flag to pass to ForcedPhot.measure method. cluster_threshold: flag to pass to ForcedPhot.measure method. allow_nan: flag to pass to ForcedPhot.measure method. add_mode: True when the pipeline is running in add image mode. p_run_path: The system path of the pipeline run output. Returns: Dataframe with forced extracted measurements data, columns are 'source_tmp_id', 'ra', 'dec', 'image', 'flux_peak', 'island_id', 'component_id', 'name', 'flux_int', 'flux_int_err' """ # explode the lists in 'img_diff' column (this will make a copy of the df) out = ( df.rename(columns={ 'img_diff': 'image', 'source': 'source_tmp_id' }) # merge the rms_min column from df_images .merge(df_images[['rms_min']], left_on='image', right_on='name', how='left').rename(columns={'rms_min': 'image_rms_min'}) # merge the measurements columns 'source', 'image', 'flux_peak' .merge(df_sources, left_on=['source_tmp_id', 'detection'], right_on=['source', 'image'], how='left').drop(columns=['image_y', 'source']).rename( columns={'image_x': 'image'})) # drop the source for which we would have no hope of detecting predrop_shape = out.shape[0] out['max_snr'] = out['flux_peak'].values / out['image_rms_min'].values out = out[out['max_snr'] > min_sigma].reset_index(drop=True) logger.debug("Min forced sigma dropped %i sources", predrop_shape - out.shape[0]) # drop some columns that are no longer needed and the df should look like # out # | | source_tmp_id | wavg_ra | wavg_dec | image_name | flux_peak | # |--:|--------------:|--------:|---------:|:-----------------|----------:| # | 0 | 81 | 317.607 | -8.66952 | VAST_2118-06A... | 11.555 | # | 1 | 894 | 323.803 | -2.6899 | VAST_2118-06A... | 2.178 | # | 2 | 1076 | 316.147 | -3.11408 | VAST_2118-06A... | 6.815 | # | 3 | 1353 | 322.094 | -4.44977 | VAST_2118-06A... | 1.879 | # | 4 | 1387 | 321.734 | -6.82934 | VAST_2118-06A... 
    #   | 1.61      |
    out = (out.drop(['max_snr', 'image_rms_min', 'detection'],
                    axis=1).rename(columns={'image': 'image_name'}))

    # get the unique images to extract from
    unique_images_to_extract = out['image_name'].unique().tolist()

    # create a list of dictionaries with image file paths and dataframes
    # with data related to each image
    image_data_func = lambda x: {
        'image': df_images.at[x, 'path'],
        'background': df_images.at[x, 'background_path'],
        'noise': df_images.at[x, 'noise_path'],
        'df': out[out['image_name'] == x]
    }
    list_to_map = list(map(image_data_func, unique_images_to_extract))

    # create a list of all the measurements parquet files to extract data
    # from, such as prefix and max_id
    list_meas_parquets = list(
        map(lambda el: df_images.at[el, 'measurements_path'],
            unique_images_to_extract))

    del out, unique_images_to_extract, image_data_func

    # get a map of the columns that have a fixed value
    mapping = (db.from_sequence(list_meas_parquets,
                                npartitions=len(list_meas_parquets)).map(
                                    get_data_from_parquet, p_run_path,
                                    add_mode).compute())
    mapping = pd.DataFrame(mapping)
    # remove unused columns from df_images and merge into mapping
    col_to_drop = list(
        filter(lambda x: ('path' in x) or ('skyreg' in x),
               df_images.columns.values.tolist()))
    mapping = (mapping.merge(df_images.drop(col_to_drop,
                                            axis=1).reset_index(),
                             on='id',
                             how='left').drop('rms_min',
                                              axis=1).set_index('name'))
    del col_to_drop

    n_cpu = cpu_count() - 1
    bags = db.from_sequence(list_to_map, npartitions=len(list_to_map))
    forced_dfs = (bags.map(
        lambda x: extract_from_image(edge_buffer=edge_buffer,
                                     cluster_threshold=cluster_threshold,
                                     allow_nan=allow_nan,
                                     **x)).compute())
    del bags
    # create intermediate dfs combining the mapping data and the forced
    # extracted data from the images
    intermediate_df = list(
        map(lambda x: {
            **(mapping.loc[x['image'], :].to_dict()),
            **x
        }, forced_dfs))

    # compute the rest of the columns
    intermediate_df = (db.from_sequence(intermediate_df).map(
        lambda x: finalise_forced_dfs(**x)).compute())

    df_out = (pd.concat(intermediate_df, axis=0,
                        sort=False).rename(columns={
                            'wavg_ra': 'ra',
                            'wavg_dec': 'dec',
                            'image_name': 'image'
                        }))

    return df_out
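# --- Illustrative sketch (not part of the original pipeline) ---
# The parallelisation pattern used above, reduced to a toy example: a Dask
# bag maps a function over a list of per-image payloads and compute()
# collects the results. The payload contents and fake_extract() are
# placeholders standing in for extract_from_image().
import dask.bag as db

payloads = [{"image": "img_A", "df_rows": 3}, {"image": "img_B", "df_rows": 5}]

def fake_extract(payload):
    # stand-in for the real extraction; just echoes the work it was given
    return {"image": payload["image"], "n_measurements": payload["df_rows"]}

results = (db.from_sequence(payloads, npartitions=len(payloads))
             .map(fake_extract)
             .compute())
print(results)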
def ens_mouse_to_ens_human(df_unmapped: pd.DataFrame, drop_unmapped: bool=False, verbose: bool=False) -> None:
    """
    Maps mouse ensembl gene id's to human ensembl gene id's.

    Args:
        df_unmapped: a dataframe in tidy-format.
        drop_unmapped: True: remove unmapped genes (rows) from df, False: keep original index
        verbose: explicitly print status or not

    Returns:
        None

    Todo:
        * modify drop_unmapped to unmapped: {"drop", "keep", "na"}
        * make one mapping-function for all cases
        * support for custom mapping file
        * handle case for empty df
    """
    assert (len(df_unmapped) > 0), "Empty dataframe."

    PREFIX = "ENSMUSG"

    if verbose:
        print("Mapping: mouse ensembl gene id's --> human ensembl gene id's ...")

    # Check that the index values are in the expected format
    mask_peek = np.array([PREFIX in str(idx) for idx in df_unmapped.index.values])
    if not (mask_peek.any()):
        print("Dataframe index contains values that are not in ensembl format or are not mouse ensembl ids: ",
              df_unmapped.index.values[~mask_peek])

    resource_package = __name__
    resource_path = 'maps/hsapiens_mmusculus_unique_orthologs.GRCh37.ens_v91.txt.gz'  # Do not use os.path.join()
    resource_stream = pkg_resources.resource_stream(resource_package, resource_path)
    df_map = pd.read_csv(resource_stream, compression='gzip', delim_whitespace=True)

    # create dictionary for mapping mouse ensembl gene ids to human ensembl gene ids
    map_dict = dict(zip(df_map["mmusculus_homolog_ensembl_gene"].ravel(),
                        df_map["ensembl_gene_id"].ravel()))

    # map genes in-place,
    # i.e. indexes are replaced directly in df
    df_unmapped.rename(index=map_dict, inplace=True)

    if verbose or drop_unmapped:
        # check for unmapped genes
        # note the tilde ~ to get genes NOT mapped
        mask_unmapped = ~df_unmapped.index.isin(df_map["ensembl_gene_id"])
        label_unmapped = df_unmapped.index.values[mask_unmapped]

        # create report
        n_unmapped = len(label_unmapped)

        if verbose:
            n_total = len(df_unmapped)
            pct = n_unmapped / n_total * 100
            print("%.2f pct of genes are unmapped ..." % pct)

        if drop_unmapped:
            df_unmapped.drop(index=label_unmapped, inplace=True)
            n_mapped = len(df_unmapped)

            if verbose:
                print("Removed {} unmapped genes ...".format(n_unmapped))

    return None
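# --- Illustrative sketch (not part of the original package) ---
# The mapping mechanics in miniature: an index of mouse IDs is renamed via a
# dict, and anything not covered by the map can then be dropped. The gene IDs
# and the mapping below are made up for illustration only.
import pandas as pd

toy = pd.DataFrame({"sample1": [5, 2, 7]},
                   index=["ENSMUSG0000001", "ENSMUSG0000002", "ENSMUSG0000999"])
map_dict = {"ENSMUSG0000001": "ENSG0000011", "ENSMUSG0000002": "ENSG0000022"}
toy.rename(index=map_dict, inplace=True)
unmapped = toy.index[~toy.index.isin(list(map_dict.values()))]
toy.drop(index=unmapped, inplace=True)
print(toy)  # only the two mapped genes remain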
def get_data():
    df = (ts.get_hist_data('hs300', start='2013-01-01', end='2017-01-01',
                           ktype='D'))[index[:6] + ['price_change']]
    df = df.sort_index()

    # ReturnRate = ln(s(t) / s(t-1))    lag=1
    df.insert(0, 'ReturnRate', df['close'])
    temp = 1
    for i in df.index:
        df['ReturnRate'][i] = np.log(df['close'][i] / temp)
        temp = df['close'][i]

    # alpha#6    lag=10
    df.insert(7, 'alpha#6', df['open'])
    for i in range(10, len(df.index)):
        df['alpha#6'][i] = np.corrcoef(df['open'][i - 10:i],
                                       df['volume'][i - 10:i])[0][1]

    # alpha#23    lag=20
    df.insert(8, 'alpha#23', df['high'])
    # mean of the highest price over the past 20 days
    df['alpha#23'][20] = df['high'][:20].sum() / 20.0
    for i in range(21, len(df.index)):
        df['alpha#23'][i] = (20 * df['alpha#23'][i - 1] - df['high'][i - 21] +
                             df['high'][i - 1]) / 20.0
    '''
    plot(df['alpha#23'],df['date'],label='20_high_avg')
    plot(df['high'],df['date'],label='high')
    '''
    for i in range(20, len(df.index)):
        if df['high'][i] > df['alpha#23'][i]:
            # today's high is above the 20-day average, i.e. an upward trend
            df['alpha#23'][i] = -1 * (df['high'][i - 2] - df['high'][i])
        else:
            df['alpha#23'][i] = 0
    '''
    plot(df['alpha#23'],df['date'],label='alpha#23')
    legend(loc='upper left')
    show()
    '''

    # alpha#28    lag=5
    df.insert(9, 'alpha#28', df['high'])
    temp = 0
    for i in range(5, len(df.index)):
        df['alpha#28'][i] = np.corrcoef(
            df['v_ma20'][i - 5:i], df['low'][i - 5:i])[0][1] + (
                df['high'][i] + df['low'][i]) / 2.0 - df['close'][i]
    temp = abs(df['alpha#28'][20:]).sum()
    for i in range(lag, len(df.index)):
        df['alpha#28'][i] = df['alpha#28'][i] / temp

    # alpha#54    lag=0
    df.insert(10, 'alpha#54', df['high'])
    for i in range(len(df.index)):
        df['alpha#54'][i] = (
            -1 * (df['low'][i] - df['close'][i]) * pow(df['open'][i], 5)) / (
                (df['low'][i] - df['high'][i]) * pow(df['close'][i], 5))

    # alpha#101    lag=0
    df.insert(11, 'alpha#101', df['high'])
    for i in range(len(df.index)):
        df['alpha#101'][i] = (df['close'][i] - df['open'][i]) / (
            df['high'][i] - df['low'][i] + 0.001)

    df = df[lag:]

    # plotting
    if True:
        plot(df['alpha#6'], label='alpha#6')
        # plot(df['alpha#23'], label='alpha#23')
        plot(df['alpha#28'], label='alpha#28')
        plot(df['alpha#54'], label='alpha#54')
        plot(df['alpha#101'], label='alpha#101')
        legend(loc='upper left')
        show()

    df = df.drop(['price_change'], axis=1)
    output = open('raw_data.pkl', 'wb')
    pickle.dump(df, output)
    pickle.dump(index, output)
    output.close()
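# --- Illustrative sketch (not part of the original script) ---
# The log-return and rolling-correlation loops above can also be written in
# vectorised pandas; shown on a toy frame purely as a point of comparison,
# with a shorter window (3 instead of 10) and made-up prices.
import numpy as np
import pandas as pd

toy = pd.DataFrame({
    "open":   [10.0, 10.2, 10.1, 10.4, 10.3],
    "close":  [10.1, 10.0, 10.3, 10.2, 10.5],
    "volume": [120, 150, 90, 200, 170],
})
toy["ReturnRate"] = np.log(toy["close"] / toy["close"].shift(1))
# rolling correlation between open and volume, analogous to the alpha#6 loop
toy["alpha6_like"] = toy["open"].rolling(3).corr(toy["volume"])
print(toy)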