def _setTopHit(self):
    # Only 1 hit
    if len(self.structuralHit) == 1:
        [(self.topHit, v)] = self.structuralHit.items()
        return

    # Check overlap between every pair of hits
    df = DataFrame(index=self.structuralHit, columns=self.structuralHit)
    for hitName1, hit1 in self.structuralHit.items():
        for hitName2, hit2 in self.structuralHit.items():
            if hitName1 == hitName2:
                continue
            if hit1.location.overlaps(hit2.location):
                dif = hit1.location - hit2.location
                if df[hitName2][hitName1] != dif:
                    df[hitName1][hitName2] = dif

    maxID1, maxID2 = df.max(axis=1).idxmax(), df.max(axis=0).idxmax()

    # If no overlap at all, idxmax returns NaN (and NaN != NaN is True)
    if maxID1 != maxID1:
        print(df)
        return
        # This case is interesting: the blast hit overlaps a tracr,
        # but the tracrs do not overlap each other.

    # Otherwise there is overlap: keep the hit with the longer location
    if len(self.structuralHit[maxID1].location) > len(self.structuralHit[maxID2].location):
        self.topHit = maxID1
    else:
        self.topHit = maxID2
def test_fillna_dict_series(self):
    df = DataFrame({'a': [nan, 1, 2, nan, nan],
                    'b': [1, 2, 3, nan, nan],
                    'c': [nan, 1, 2, 3, 4]})

    result = df.fillna({'a': 0, 'b': 5})
    expected = df.copy()
    expected['a'] = expected['a'].fillna(0)
    expected['b'] = expected['b'].fillna(5)
    assert_frame_equal(result, expected)

    # it works
    result = df.fillna({'a': 0, 'b': 5, 'd': 7})

    # Series treated same as dict
    result = df.fillna(df.max())
    expected = df.fillna(df.max().to_dict())
    assert_frame_equal(result, expected)

    # disable this for now
    with assertRaisesRegexp(NotImplementedError, 'column by column'):
        df.fillna(df.max(1), axis=1)
def test_fillna_dict_series(self): df = DataFrame({ "a": [np.nan, 1, 2, np.nan, np.nan], "b": [1, 2, 3, np.nan, np.nan], "c": [np.nan, 1, 2, 3, 4], }) result = df.fillna({"a": 0, "b": 5}) expected = df.copy() expected["a"] = expected["a"].fillna(0) expected["b"] = expected["b"].fillna(5) tm.assert_frame_equal(result, expected) # it works result = df.fillna({"a": 0, "b": 5, "d": 7}) # Series treated same as dict result = df.fillna(df.max()) expected = df.fillna(df.max().to_dict()) tm.assert_frame_equal(result, expected) # disable this for now with pytest.raises(NotImplementedError, match="column by column"): df.fillna(df.max(1), axis=1)
def ylim(zombies: DataFrame, humans: DataFrame) -> int:
    """
    Finds the limit for the y-axis of the plot, i.e. the maximum population of
    either zombies or humans throughout the whole simulation.

    :param zombies: zombie population counts over the simulation
    :param humans: human population counts over the simulation
    :return: the larger of the two population maxima
    """
    max_zombies = zombies.max().max()
    max_humans = humans.max().max()
    return max(max_zombies, max_humans)
def test_min_max_dt64_api_consistency_with_NaT(self):
    # Calling the following sum functions returned an error for dataframes but
    # returned NaT for series. These tests check that the API is consistent in
    # min/max calls on empty Series/DataFrames. See GH:33704 for more
    # information
    df = DataFrame(dict(x=pd.to_datetime([])))
    expected_dt_series = Series(pd.to_datetime([]))

    # check axis 0
    assert (df.min(axis=0).x is pd.NaT) == (expected_dt_series.min() is pd.NaT)
    assert (df.max(axis=0).x is pd.NaT) == (expected_dt_series.max() is pd.NaT)

    # check axis 1
    tm.assert_series_equal(df.min(axis=1), expected_dt_series)
    tm.assert_series_equal(df.max(axis=1), expected_dt_series)
def __init__(self, data: pd.DataFrame, features):
    self.data = data
    self.features = features

    mmin = data.min()
    mmax = data.max()
    feature_size = mmax - mmin

    margin = 0.4
    # margin = 1.4
    # margin = 0.0

    self.mins = mmin - feature_size * margin
    self.maxs = mmax + feature_size * margin
    self.limits = np.c_[self.mins, self.maxs]
    self.feature_size = self.maxs - self.mins
def calc_distance_matrix(G, max_distance=None):
    """Returns a matrix containing the shortest distance between all nodes in a network

    Parameters
    ----------
    G : graph
        A NetworkX graph

    max_distance : float or None, optional (default=None)
        The maximum possible distance value in the network. If None,
        max_distance is the longest shortest path between two nodes of the
        network (the graph diameter)

    Returns
    -------
    dist_matrix : pandas.DataFrame
        An NxN distance matrix.

    Notes
    -----
    Along the diagonal, the values are all 0.
    Unconnected nodes have a distance of max_distance to other nodes.
    """
    # Network (collaborator) Distance
    # dict() is needed because newer networkx versions return an iterator here
    dist_matrix = dict(nx.all_pairs_shortest_path_length(G))
    dist_matrix = DataFrame(dist_matrix, index=G.nodes(), columns=G.nodes())

    if max_distance is None:
        max_distance = float(dist_matrix.max().max())
    # The unconnected ones are infinitely far from the rest
    dist_matrix = dist_matrix.fillna(max_distance)

    diag_idx = np.diag_indices(len(dist_matrix), ndim=2)
    dist_matrix.values[diag_idx] = 0

    return dist_matrix
def normalize(df: DataFrame) -> DataFrame:
    """Normalizes the data"""
    ptid_col: DataFrame = get_del_ptid_col(df)
    df = (df - df.min(axis=0)) / (df.max(axis=0) - df.min(axis=0))
    df = concat([ptid_col, df], axis=1)
    return df
class LogAggregate:
    def __init__(self, dataset):
        self.dataset = DataFrame(dataset)

    def get_median(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).median()[kwarg['key']]
        else:
            return self.dataset.median()[kwarg['key']]

    def get_average(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).mean()[kwarg['key']]
        else:
            return self.dataset.mean()[kwarg['key']]

    def get_min(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).min()[kwarg['key']]
        else:
            return self.dataset.min()[kwarg['key']]

    def get_max(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).max()[kwarg['key']]
        else:
            return self.dataset.max()[kwarg['key']]

    def get_count(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).count()[kwarg['key']]
        else:
            return self.dataset.count()[kwarg['key']]
def _set_yaxis_limits(self, series: pandas.DataFrame):
    """Sets self.ylimits using the min/max of the series values"""
    y_min = series.min()
    y_max = series.max()
    y_limits = (y_min, y_max)
    self.plotter.y_value_limits = y_limits
def compute_confusion_matrix(target, predicted, normalize=True, sort=True):
    """
    Returns a confusion matrix as a data frame with labels.

    Parameters:
        target (array): the true values.
        predicted (array): predicted values.
        normalize (bool): if True, normalize each row to sum to 1.
        sort (bool): if True, sort rows/columns by their maximum value.

    Returns (DataFrame): df with the confusion matrix.
    """
    # Determine the unique values in the target list, sort them and assign as labels.
    labels = np.unique(list(target))
    labels.sort()

    # Compute the confusion matrix, place into a data frame and normalize if desired.
    confusion = metrics.confusion_matrix(target, predicted, labels=labels)
    confusion = DataFrame(confusion, index=labels, columns=labels)
    if normalize:
        confusion = confusion.apply(lambda x: x / np.sum(x), axis=1)

    # If sort is True: find the max value of each row, order by it, and reorder
    # the confusion matrix on both axes.
    if sort:
        max_values = confusion.max(axis=1)
        max_values = max_values.sort_values(ascending=False)
        order = max_values.index
        confusion = confusion.loc[order, order]

    return confusion
def scale(df: pd.DataFrame, method: str) -> pd.DataFrame:
    """
    scales features using different methods.

    Parameters
    ----------
    df: pandas.DataFrame
    method: {"autoscaling", "rescaling", "pareto"}
        Scaling method. `autoscaling` performs mean centering scaling of
        features to unitary variance. `rescaling` scales data to a 0-1 range.
        `pareto` performs mean centering and scaling using the square root of
        the standard deviation.

    Returns
    -------
    scaled: pandas.DataFrame
    """
    if method == "autoscaling":
        scaled = (df - df.mean()) / df.std()
    elif method == "rescaling":
        scaled = (df - df.min()) / (df.max() - df.min())
    elif method == "pareto":
        scaled = (df - df.mean()) / df.std().apply(np.sqrt)
    else:
        msg = "Available methods are `autoscaling`, `rescaling` and `pareto`."
        raise ValueError(msg)
    # replace nans generated when dividing by zero
    scaled[scaled.isna()] = 0
    return scaled
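# A minimal usage sketch for the `scale` helper above; the column names and
# values are illustrative only, not taken from the original source.
import pandas as pd

example = pd.DataFrame({"mz_100": [1.0, 2.0, 3.0], "mz_200": [10.0, 10.0, 10.0]})
autoscaled = scale(example, "autoscaling")   # mean-centred columns (constant columns become 0)
rescaled = scale(example, "rescaling")       # each column mapped onto [0, 1]
# The constant column produces 0/0 -> NaN, which `scale` replaces with 0.
print(rescaled)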
def clean(numpy_array):
    # load your csv data here in numpy_array
    data = ut.preprocessData(numpy_array)

    # numpy array into pandas dataframe
    df = pd.DataFrame(data)
    df = df.astype('float16')

    # generate preprocessed csv file
    # df.to_csv('preprocessed_data.csv', sep=',', index=False)

    # normalize data between [0, 1] using X_norm = (X - Xmin) / (Xmax - Xmin)
    df_norm = (df - df.min()) / (df.max() - df.min())
    df_norm = df_norm.fillna(-1)

    # generate normalized csv
    # df_norm.to_csv('normalized_data.csv', sep=',', index=False)

    # .to_numpy() replaces the removed DataFrame.as_matrix()
    return df_norm.to_numpy()
def min_max_scale_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Scales the data frame by dividing each column by its maximum, so every
    column peaks at 1; this allows for easier comparison of line shape on plots.

    :param df: data frame to be scaled
    :return: scaled dataframe
    """
    return df.div(df.max(), axis=1)
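# Illustrative use of `min_max_scale_df` above (hypothetical data): each column
# is divided by its own maximum, so every column tops out at exactly 1.
import pandas as pd

traces = pd.DataFrame({"sample_a": [2.0, 4.0, 8.0], "sample_b": [5.0, 10.0, 20.0]})
print(min_max_scale_df(traces))
# sample_a -> [0.25, 0.5, 1.0], sample_b -> [0.25, 0.5, 1.0]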
def analyze(df: pd.DataFrame):
    """Roughly analyzes the contents and returns the results packed into a DataFrame."""
    if isinstance(df, pd.DataFrame):
        df_result = pd.DataFrame(index=df.columns)
        df_result["dtype"] = df.dtypes
        df_result["null"] = df.isnull().sum()
        df_result["nunique"] = df.nunique()
        df_result["min"] = df.min()
        df_result["median"] = df.median()
        df_result["max"] = df.max()
        df_result["mode"] = df.mode().transpose()[0]
        df_result["mean"] = df.mean()
        df_result["std"] = df.std()
        # # To gauge how extreme the outliers are, look at the absolute values
        # # of the RobustScaler-transformed data.
        # numeric_columns = df.select_dtypes(include=np.number).columns
        # df_result["outlier_size"] = np.nan
        # df_result.loc[numeric_columns, "outlier_size"] = (
        #     tk.preprocessing.SafeRobustScaler(clip_range=None)
        #     .fit_transform(df.loc[:, numeric_columns])
        #     .fillna(0)
        #     .abs()
        #     .max()
        #     .round(decimals=1)
        # )
        return df_result
    else:
        raise NotImplementedError()
def test_min_max_dt64_with_NaT_skipna_false(self, tz_naive_fixture):
    # GH#36907
    tz = tz_naive_fixture
    if isinstance(tz, tzlocal) and is_platform_windows():
        pytest.xfail(
            reason="GH#37659 OSError raised within tzlocal bc Windows "
            "chokes in times before 1970-01-01"
        )

    df = DataFrame({
        "a": [
            Timestamp("2020-01-01 08:00:00", tz=tz),
            Timestamp("1920-02-01 09:00:00", tz=tz),
        ],
        "b": [Timestamp("2020-02-01 08:00:00", tz=tz), pd.NaT],
    })

    res = df.min(axis=1, skipna=False)
    expected = Series([df.loc[0, "a"], pd.NaT])
    assert expected.dtype == df["a"].dtype
    tm.assert_series_equal(res, expected)

    res = df.max(axis=1, skipna=False)
    expected = Series([df.loc[0, "b"], pd.NaT])
    assert expected.dtype == df["a"].dtype
    tm.assert_series_equal(res, expected)
def draw_bar3D(cls, title: str, data: pd.DataFrame) -> Bar3D:
    """
    Draws a 3D bar chart from the contents of a DataFrame.

    :param title: chart title
    :param data: DataFrame holding the data for all three axes; the index is
        the x axis, the columns are the y axis, and the values are the z axis
    :return:
    """
    data_list = []
    index_list = data.index.tolist()
    column_list = data.columns.tolist()

    # Get the minimum and maximum of the whole DataFrame
    min_data = data.min().min()
    max_data = data.max().max()

    # Walk the DataFrame and build the list of [x, y, z] triples
    for i in range(len(index_list)):
        for j in range(len(column_list)):
            temp_list = [index_list[i], column_list[j], data.iloc[i, j]]
            data_list.append(temp_list)

    c = (
        Bar3D(init_opts=opts.InitOpts(
            width=DEFAULT_WIDTH,
            animation_opts=opts.AnimationOpts(
                animation_delay=200, animation_easing="bounceOut"),  # startup animation
        ))
        .add(
            series_name=title,
            data=data_list,
            xaxis3d_opts=opts.Axis3DOpts(type_="category", data=index_list),
            yaxis3d_opts=opts.Axis3DOpts(type_="category", data=column_list),
            zaxis3d_opts=opts.Axis3DOpts(type_="value"),
        )
        .set_series_opts(label_opts=opts.LabelOpts(is_show=True))
        .set_global_opts(
            title_opts=opts.TitleOpts(title=title, pos_left="0%"),
            toolbox_opts=opts.ToolboxOpts(),  # show the toolbox
            tooltip_opts=opts.TooltipOpts(is_show=True),
            axispointer_opts=opts.AxisPointerOpts(
                is_show=True, type_="none"),  # show all values as the pointer moves
            legend_opts=opts.LegendOpts(
                is_show=True,
                selected_mode="multiple",
                # pos_bottom="0%",
                # pos_right="0%",
                # orient="vertical",
            ),  # show the legend
            # datazoom_opts=[
            #     opts.DataZoomOpts(
            #         range_start=0, range_end=100, orient="vertical", pos_left="2%"
            #     ),
            #     opts.DataZoomOpts(range_start=0, range_end=100, orient="horizontal"),
            # ],  # zoom config supporting both axes
            visualmap_opts=opts.VisualMapOpts(max_=max_data, min_=min_data)
            # visualmap_opts=opts.VisualMapOpts(type_="color", max_=1, min_=-1),
        )
    )
    return c
def decay(close, kind=None, length=None, mode=None, offset=None, **kwargs):
    """Indicator: Decay"""
    # Validate Arguments
    close = verify_series(close)
    length = int(length) if length and length > 0 else 5
    mode = mode.lower() if isinstance(mode, str) else "linear"
    offset = get_offset(offset)

    # Calculate Result
    _mode = "L"
    if mode == "exp" or kind == "exponential":
        _mode = "EXP"
        diff = close.shift(1) - exp(-length)
    else:  # "linear"
        diff = close.shift(1) - (1 / length)
    diff[0] = close[0]
    tdf = DataFrame({"close": close, "diff": diff, "0": 0})
    ld = tdf.max(axis=1)

    # Offset
    if offset != 0:
        ld = ld.shift(offset)

    # Handle fills
    if "fillna" in kwargs:
        ld.fillna(kwargs["fillna"], inplace=True)
    if "fill_method" in kwargs:
        ld.fillna(method=kwargs["fill_method"], inplace=True)

    # Name and Categorize it
    ld.name = f"{_mode}DECAY_{length}"
    ld.category = "trend"

    return ld
def testSingle(self, test, fold):
    devents = xgb.DMatrix(test[self.variables].values)
    prediction = DataFrame(self.models[fold].predict(devents))

    # note: this uses idxmax (the column header of the max value) and tries to
    # convert it to a float, therefore renaming of the header should be done
    # AFTER extracting the predicted_class
    df = DataFrame(dtype=float, data={
        "predicted_frac_class": prediction.idxmax(axis=1).values,
        "predicted_frac_prob": prediction.max(axis=1).values
    })

    # header renaming
    headers = []
    for i in range(0, len(prediction.columns)):
        headers.append("predicted_frac_prob_" + str(i))
    prediction.columns = headers

    # horizontal concat (adding columns)
    result = concat([prediction, df], axis=1)
    return result
def get_answer1(df: pd.DataFrame) -> int:
    Nstep = 100
    df = df2grid(df)
    df = df.astype(int)

    Nflashes = 0
    for itt in range(0, Nstep):
        # step 1: every cell gains one unit of energy
        df += 1

        # step 2: propagate flashes until no cell above 9 is left unflashed
        df_has_flashed = pd.DataFrame(False, index=df.index, columns=df.columns)
        max_val_not_flashed = df.max().max()
        while max_val_not_flashed > 9:
            df_flash = df > 9
            for x in itertools.product(list(df.index), list(df.columns)):
                if df_flash.iloc[x] and not df_has_flashed.iloc[x]:
                    # add to all neighbors
                    xn_list = get_neighbor_idx(x, df)
                    for xn in xn_list:
                        df.iloc[xn] += 1
            # update
            df_has_flashed = df_has_flashed | df_flash
            max_val_not_flashed = df[~df_has_flashed].max().max()

        # step 3: every cell that flashed resets to 0
        df[df_has_flashed] = 0
        Nflashes += df_has_flashed.sum().sum()

    return Nflashes
def linear_decay(close, length=None, offset=None, **kwargs):
    """Indicator: Linear Decay"""
    # Validate Arguments
    close = verify_series(close)
    length = int(length) if length and length > 0 else 5
    offset = get_offset(offset)

    # Calculate Result
    diff = close.shift(1) - (1 / length)
    diff[0] = close[0]
    tdf = DataFrame({"close": close, "diff": diff, "0": 0})
    ld = tdf.max(axis=1)

    # Offset
    if offset != 0:
        ld = ld.shift(offset)

    # Handle fills
    if "fillna" in kwargs:
        ld.fillna(kwargs["fillna"], inplace=True)
    if "fill_method" in kwargs:
        ld.fillna(method=kwargs["fill_method"], inplace=True)

    # Name and Categorize it
    ld.name = f"LDECAY_{length}"
    ld.category = "trend"

    return ld
def force(G: nx.Graph):
    df = DataFrame(index=G.nodes(), columns=G.nodes())
    for row, data in nx.shortest_path_length(G):
        for col, dist in data.items():
            df.loc[row, col] = dist
    df = df.fillna(df.max().max())
    return df.to_dict()
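# Small illustration of the `force` helper above using a hypothetical 3-node
# path graph; pairs with no path would be filled with the largest observed distance.
import networkx as nx

G = nx.path_graph(3)
distances = force(G)
# distances[0][1] == 1, distances[0][2] == 2, diagonal entries are 0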
def get_preds_probas(est: ClassifierMixin, X_test: DataFrame, y_test: Series,
                     mapper_dict: Dict) -> DataFrame:
    """
    Get prediction probabilities (if available) or return true and predicted labels
    """
    df_preds = DataFrame(est.predict(X_test), index=X_test.index)
    if hasattr(est.named_steps["clf"], "predict_proba"):
        # Get prediction probabilities (if available)
        df_probas = DataFrame(est.predict_proba(X_test), index=X_test.index)
        # Append prediction and prediction probabilities
        df_summ = concat([df_preds, df_probas], axis=1)
        df_summ.columns = ["predicted_label"] + [
            f"probability_of_{i}" for i in range(0, len(np.unique(y_test)))
        ]
        # Get label (class) with maximum prediction probability for each row
        df_summ["max_class_number_manually"] = df_probas.idxmax(axis=1)
        df_summ["probability_of_max_class"] = df_probas.max(axis=1)
        # Compare .predict_proba() and manually extracted prediction probability
        lhs = df_summ["max_class_number_manually"]
        rhs = df_summ["predicted_label"].replace(mapper_dict)
        assert (lhs == rhs).eq(True).all()
    else:
        df_summ = df_preds.copy()
    # Get true label
    df_summ.insert(0, "true_label", y_test)
    return df_summ
def generate_animation(df: pd.DataFrame):
    global y_max

    dates = df.date.unique()
    ls = [type(item) for item in dates]
    plot_title, ax_title, marker_col, value_col = get_titles()
    # data_filtered = data[data['date'] == dates[0]]
    marker_col = 'Kanton'
    progress_bar = st.progress(0)
    progress_timestep_inc = 1 / len(dates) * 100
    progress_timestep = 0
    anim = st.empty()
    y_max = df.max(axis=0)['value']
    value_col = 'value'
    data.fillna(0)
    i = 1
    for dt in dates:
        data_filtered = df[df['date'] == dt]
        ts = pd.to_datetime(str(dt))
        plot_title = cn.variable_dic[variables[0]] + ', Datum: ' + ts.strftime('%d.%m.%Y')
        chart = get_bar_chart(data_filtered, plot_title, ax_title, marker_col, value_col, 'Kanton')
        anim.altair_chart(chart)
        # chart.bar_chart(data_filtered)
        if dt == cn.DATE_LIST[-1] or progress_timestep > 100:
            progress_timestep = 100
        progress_bar.progress(progress_timestep)
        progress_timestep = int(i * progress_timestep_inc)
        time.sleep(0.8)
        i += 1
def create_metric_bar_chart_comparison(df: DataFrame, output_filename: str,
                                       max_y_limit: int = None):
    metric = df.columns[1]
    # fall back to the data maximum when no explicit y limit is given
    if max_y_limit is None:
        max_y_limit = df.max()[metric]
    sns.set(style="whitegrid")
    sns.set_context("paper", rc={
        "font.size": 14,
        "axes.titlesize": 32,
        "axes.labelsize": 18
    })
    g = sns.catplot(
        x="",
        y=metric,
        hue="Approach",
        data=df,
        kind="bar",
        height=5,
        aspect=1,
        palette=["skyblue", "sandybrown", "green"],
    )
    g.set(ylim=(0, max_y_limit))
    g.savefig(output_filename)
def learning():
    # data input
    dados = read_csv("lookout_histories.csv")
    data = DataFrame(dados['history'])    # Input to system
    output = DataFrame(dados['output'])   # Comparison output to system
    output = (output - output.mean()) / (output.max() - output.min())  # Normalization

    model = Sequential()
    model.add(Dense(20, input_dim=data.shape[1], activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    print("Model ready")

    model.compile(optimizer='rmsprop',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    # Train the model, iterating on the data in batches of 8 samples
    model.fit(data, output, epochs=3, batch_size=8)
    print("Model finished")

    # evaluate the model
    scores = model.evaluate(data, output)
    print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))
    return ()
def createAALstats(df_aal: pd.DataFrame) -> pd.DataFrame:
    """Group together some basic statistics from each AAL Group"""
    stat_array = np.array([df_aal.mean(), df_aal.median(), df_aal.min(), df_aal.max()]).T
    stat_cols = ['Average', 'Median', 'Minimum', 'Maximum']
    stat_index = df_aal.mean().index
    df_stats = pd.DataFrame(stat_array, columns=stat_cols, index=stat_index)
    return df_stats
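# Hypothetical input for `createAALstats` above: columns are AAL groups, rows
# are observations; the result has one row per group with the four statistics.
import numpy as np
import pandas as pd

df_aal = pd.DataFrame({"Frontal": [1.0, 2.0, 3.0], "Occipital": [2.0, 4.0, 9.0]})
print(createAALstats(df_aal))
#            Average  Median  Minimum  Maximum
# Frontal        2.0     2.0      1.0      3.0
# Occipital      5.0     4.0      2.0      9.0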
def test_ndarray_compat(self):
    # test numpy compat with Series as sub-class of NDFrame
    tsdf = DataFrame(
        np.random.randn(1000, 3),
        columns=["A", "B", "C"],
        index=date_range("1/1/2000", periods=1000),
    )

    def f(x):
        return x[x.idxmax()]

    result = tsdf.apply(f)
    expected = tsdf.max()
    tm.assert_series_equal(result, expected)

    # using an ndarray like function
    s = Series(np.random.randn(10))
    result = Series(np.ones_like(s))
    expected = Series(1, index=range(10), dtype="float64")
    tm.assert_series_equal(result, expected)

    # ravel
    s = Series(np.random.randn(10))
    tm.assert_almost_equal(s.ravel(order="F"), s.values.ravel(order="F"))
def guiyi(a, h):
    b = a.T                                                     # transpose the original matrix first
    c = DataFrame(b)                                            # turn it into a DataFrame
    d = (c - c.min(axis=0)) / (c.max(axis=0) - c.min(axis=0))   # min-max normalization
    e = np.array(d)                                             # back to a normalized matrix
    f = e.T                                                     # transpose back
    g = np.hstack((h, f))                                       # prepend the index columns to the normalized matrix
    return g
def describe(df: pd.DataFrame) -> pd.DataFrame:
    return pd.concat([
        df.mean().rename('mean'),
        df.median().rename('median'),
        df.max().rename('max'),
        df.min().rename('min')
    ], axis=1).T
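# Minimal illustration of the `describe` helper above (toy data): the result is
# a 4-row frame with mean/median/max/min, one column per input column.
import pandas as pd

stats = describe(pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [10.0, 20.0, 60.0]}))
print(stats)
#           x     y
# mean    2.0  30.0
# median  2.0  20.0
# max     3.0  60.0
# min     1.0  10.0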
def __init__(self, data: pd.DataFrame, cmap: Colormap):
    super().__init__()
    self._data = data
    self.max = data.max().max()
    self.min = data.min().min()
    self.normalize = Normalize(self.min, self.max)
    self.cmap = cmap
    self.generate_colors()
def _highlight_max(data: pd.DataFrame, color="yellow") -> pd.DataFrame:
    attr = "background-color: {}".format(color)
    data = data.astype(float)
    max_data = data.max(axis=1, level=0)
    is_max = data.eq(max_data, axis=1)
    return pd.DataFrame(
        np.where(is_max, attr, ""), index=data.index, columns=data.columns
    )
def select_signatures(W: pd.DataFrame, H: pd.DataFrame):
    """
    Scales NMF output by sample and feature totals to select Signatures.
    ------------------------
    Args:
        * W: input W matrix (K x n_features)
        * H: input H matrix (n_samples x K)

    Returns:
        * W: output W matrix with max_id, max, and max_norm columns
        * H: output H matrix with max_id, max, and max_norm columns
    """
    Wnorm = W.copy()
    Hnorm = H.copy()

    # Scale Matrix
    for j in range(W.shape[1]):
        Wnorm.iloc[:, j] *= H.sum(1).values[j]
        Hnorm.iloc[j, :] *= W.sum(0).values[j]

    # Normalize
    Wnorm = Wnorm.div(Wnorm.sum(1), axis=0)
    Hnorm = Hnorm.div(Hnorm.sum(0), axis=1)

    H = H.T
    Hnorm = Hnorm.T

    # Get Max Values
    H_max_id = H.idxmax(axis=1, skipna=True).astype('int')
    H['max'] = H.max(axis=1, skipna=True)
    H['max_id'] = H_max_id
    Hnorm['max_norm'] = Hnorm.max(axis=1, skipna=True)

    W_max_id = W.idxmax(axis=1, skipna=True).astype('int')
    W['max'] = W.max(axis=1, skipna=True)
    W['max_id'] = W_max_id
    Wnorm['max_norm'] = Wnorm.max(axis=1, skipna=True)

    H['max_norm'] = Hnorm['max_norm']
    W['max_norm'] = Wnorm['max_norm']

    _rename = {x: 'S' + x for x in list(H)[:-3]}
    H = H.rename(columns=_rename)
    W = W.rename(columns=_rename)

    return W, H
def kmeanCunt(data: DataFrame, k):
    from sklearn.cluster import KMeans
    kmodel = KMeans(n_clusters=k)                               # build the model
    kmodel.fit(data.values.reshape(len(data), 1))               # train the model
    c = pd.DataFrame(kmodel.cluster_centers_).sort_values(0)    # cluster centers, sorted
    w = c.rolling(2).mean().iloc[1:]                            # midpoints of adjacent centers become bin boundaries
    w = [0] + list(w[0]) + [data.max()]                         # add the first and last boundaries
    return pd.cut(data, w)
def __generate_trace(self, objectives: DataFrame, metadata: list = None,
                     legend: str = '', normalize: bool = False, **kwargs):
    number_of_objectives = objectives.shape[1]

    if normalize:
        objectives = (objectives - objectives.min()) / (objectives.max() - objectives.min())

    marker = dict(
        color='rgb(127, 127, 127)',
        size=3,
        symbol='x',
        line=dict(
            color='rgb(204, 204, 204)',
            width=1
        ),
        opacity=0.8
    )
    marker.update(**kwargs)

    if number_of_objectives == 2:
        trace = go.Scattergl(
            x=objectives[0],
            y=objectives[1],
            mode='markers',
            marker=marker,
            name=legend,
            customdata=metadata
        )
    elif number_of_objectives == 3:
        trace = go.Scatter3d(
            x=objectives[0],
            y=objectives[1],
            z=objectives[2],
            mode='markers',
            marker=marker,
            name=legend,
            customdata=metadata
        )
    else:
        dimensions = list()
        for column in objectives:
            dimensions.append(
                dict(range=[0, 1],
                     label=self.axis_labels[column:column + 1][0]
                     if self.axis_labels[column:column + 1] else None,
                     values=objectives[column])
            )

        trace = go.Parcoords(
            line=dict(color='blue'),
            dimensions=dimensions,
            name=legend,
        )

    return trace
def test_ndarray_compat(self):
    # test numpy compat with Series as sub-class of NDFrame
    tsdf = DataFrame(np.random.randn(1000, 3), columns=['A', 'B', 'C'],
                     index=date_range('1/1/2000', periods=1000))

    def f(x):
        return x[x.idxmax()]

    result = tsdf.apply(f)
    expected = tsdf.max()
    tm.assert_series_equal(result, expected)

    # .item()
    s = Series([1])
    result = s.item()
    assert result == 1
    assert s.item() == s.iloc[0]

    # using an ndarray like function
    s = Series(np.random.randn(10))
    result = Series(np.ones_like(s))
    expected = Series(1, index=range(10), dtype='float64')
    tm.assert_series_equal(result, expected)

    # ravel
    s = Series(np.random.randn(10))
    tm.assert_almost_equal(s.ravel(order='F'), s.values.ravel(order='F'))

    # compress
    # GH 6658
    s = Series([0, 1., -1], index=list('abc'))
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = np.compress(s > 0, s)
    tm.assert_series_equal(result, Series([1.], index=['b']))

    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = np.compress(s < -1, s)
    # result empty Index(dtype=object) as the same as original
    exp = Series([], dtype='float64', index=Index([], dtype='object'))
    tm.assert_series_equal(result, exp)

    s = Series([0, 1., -1], index=[.1, .2, .3])
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = np.compress(s > 0, s)
    tm.assert_series_equal(result, Series([1.], index=[.2]))

    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = np.compress(s < -1, s)
    # result empty Float64Index as the same as original
    exp = Series([], dtype='float64', index=Index([], dtype='float64'))
    tm.assert_series_equal(result, exp)
def _to_labels(probabilities: pd.DataFrame) -> pd.Series:
    labels = probabilities.idxmax(axis='columns')

    # Find places where there are multiple maximum values
    max_probabilities = probabilities.max(axis='columns')
    is_max: pd.DataFrame = probabilities.eq(max_probabilities, axis='rows')
    number_of_max: pd.Series = is_max.sum(axis='columns')
    multiple_max: pd.Series = number_of_max.gt(1)

    # Set those locations as an 'undecided' label
    labels[multiple_max] = 'undecided'
    # TODO: emit a warning if any are set to 'undecided'
    return labels
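# Sketch of how `_to_labels` above resolves ties (the class names are made up):
# a row whose maximum probability is shared by several classes becomes 'undecided'.
import pandas as pd

probas = pd.DataFrame({"cat": [0.7, 0.4], "dog": [0.2, 0.4], "bird": [0.1, 0.2]})
print(_to_labels(probas).tolist())  # ['cat', 'undecided']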
def test_fillna_dict_series(self):
    df = DataFrame({'a': [np.nan, 1, 2, np.nan, np.nan],
                    'b': [1, 2, 3, np.nan, np.nan],
                    'c': [np.nan, 1, 2, 3, 4]})

    result = df.fillna({'a': 0, 'b': 5})
    expected = df.copy()
    expected['a'] = expected['a'].fillna(0)
    expected['b'] = expected['b'].fillna(5)
    assert_frame_equal(result, expected)

    # it works
    result = df.fillna({'a': 0, 'b': 5, 'd': 7})

    # Series treated same as dict
    result = df.fillna(df.max())
    expected = df.fillna(df.max().to_dict())
    assert_frame_equal(result, expected)

    # disable this for now
    with pytest.raises(NotImplementedError, match='column by column'):
        df.fillna(df.max(1), axis=1)
def _extract_wpa(self, document):
    verbs = set(self._get_verbs(document, with_tags=False))
    count_verbs = len(verbs)
    count_pa_words = len(PA_WORDS)

    wpa_similarity_frame = DataFrame(
        np.empty((count_verbs, count_pa_words)),
        index=verbs,
        columns=PA_WORDS
    )

    for verb in verbs:
        for pa_word in PA_WORDS:
            synset_1 = Synset('{}.v.0'.format(Word(pa_word).lemmatize('v')))
            synset_2 = Synset('{}.v.0'.format(Word(verb).lemmatize('v')))
            wpa_similarity_frame[pa_word][verb] = synset_2.wup_similarity(synset_1)

    wpa_max_columns = wpa_similarity_frame.max()
    return max(wpa_max_columns)
def cross_validate_trades(trades, N=20, subset_fraction=0.7):
    tickers = trades.tickers
    sample_size = round(len(tickers) * subset_fraction)
    summary = DataFrame(dtype=float)

    for n in range(N):
        sample_tickers = list(random.choice(tickers, sample_size, replace=False))
        trade_subset = trades.find(lambda T: T.ticker in sample_tickers)
        summary[n] = summary_report(trade_subset)

    result = DataFrame(dtype=float)
    result['Base'] = summary_report(trades)
    result['Mean'] = summary.mean(axis=1)
    result['Std'] = summary.std(axis=1)
    result['Median'] = summary.median(axis=1)
    result['Max'] = summary.max(axis=1)
    result['Min'] = summary.min(axis=1)

    return (result, summary)
def predict(self, prediction_data):
    preds = DataFrame(prediction_data)
    col_names = prediction_data.keys()

    # one tally bucket per distinct predicted value
    tally_dict = {}
    for col_name in unique(preds):
        tally_dict[col_name] = [0 for x in range(preds.shape[0])]

    for row in preds.iterrows():
        index, data = row
        for col_name, elem in zip(col_names, data):
            tally_dict[elem][index] += self.weights[col_name]

    tally_df = DataFrame(tally_dict)
    max_val = [int(round(x)) for x in tally_df.max(1).tolist()]

    max_level = []
    for row in tally_df.index:
        # .loc replaces the removed .ix indexer
        int_vals = [int(round(x)) for x in tally_df.loc[row].tolist()]
        is_max = [x == max_val[row] for x in int_vals]
        if sum(is_max) > 1:
            max_level.append(None)
        else:
            max_level.append(tally_df.columns[is_max][0])
    return max_level
dframe1 = DataFrame(arr, index=["A", "B"], columns=["One", "Two", "Three"])
dframe1

# Sum method
dframe1.sum()          # ignores null values (treats them as 0s)
dframe1.sum(axis=1)    # sum across rows

# Min method
dframe1.min()          # finds the minimum value in each column
dframe1.min(axis=1)    # minimum value of each row
dframe1.idxmin()       # find the index of the minimum value in each column

# Max method
dframe1.max()
dframe1.idxmax()

# Cumulative sum
dframe1.cumsum()       # accumulates along each column's values

# Describe method
dframe1.describe()     # summary statistics of dataframe (by columns)

# correlation and covariance
import pandas.io.data as pdweb   # import pandas_datareader.data as pdweb on newer pandas
import datetime

prices = pdweb.get_data_yahoo(
def pca(x, y=None, ylev=None, nlab=0, lsize=10, lalpha=1,
        center="both", scale="none", legend=True, cname="variable", color=None):
    if not isinstance(color, dict):
        color = None

    # keep only columns with nonzero variance (.loc replaces the removed .ix)
    xForSvd = x.loc[:, x.std(axis=0) > 0]
    xsvd = svdForPca(xForSvd, center, scale)

    svdRowPlot = DataFrame(
        xsvd[0][:, 0:2],
        index=xForSvd.index,
        columns=["PC1", "PC2"]
    )
    svdRowPlot = svdRowPlot.divide(svdRowPlot.max(axis=0) - svdRowPlot.min(axis=0), axis=1)

    svdColPlot = DataFrame(
        numpy.transpose(xsvd[2][0:2, :]),
        index=xForSvd.columns,
        columns=["PC1", "PC2"]
    )
    svdColPlot = svdColPlot.divide(svdColPlot.max(axis=0) - svdColPlot.min(axis=0), axis=1)

    if nlab > 0:
        svdColPlotMag = (svdColPlot**2).sum(axis=1)
        svdColPlotMag.sort_values(ascending=False, inplace=True)
        svdColPlot = svdColPlot.loc[svdColPlotMag.index]
        svdColPlot["label"] = ""
        # label only the nlab variables with the largest loadings
        svdColPlot.iloc[0:nlab, svdColPlot.columns.get_loc("label")] = svdColPlot.index[0:nlab]

    if legend:
        ax = plt.subplot(111)

    plt.plot(svdColPlot["PC1"], svdColPlot["PC2"], "o",
             color=(0, 0, 0, 0.1), markersize=5, label=cname)
    if nlab > 0:
        for i in range(nlab):
            plt.text(svdColPlot["PC1"].iloc[i],
                     svdColPlot["PC2"].iloc[i],
                     svdColPlot["label"].iloc[i],
                     fontsize=lsize,
                     color=(0, 0, 0, lalpha),
                     label=None)

    if y is not None:
        if ylev is None:
            ylev = y.unique()
        for level in ylev:
            if color is not None and level in color.keys():
                plt.plot(svdRowPlot.loc[y == level, "PC1"],
                         svdRowPlot.loc[y == level, "PC2"],
                         "o", markersize=8, label=level, color=color[level])
            else:
                plt.plot(svdRowPlot.loc[y == level, "PC1"],
                         svdRowPlot.loc[y == level, "PC2"],
                         "o", markersize=8, label=level)
    else:
        plt.plot(svdRowPlot["PC1"], svdRowPlot["PC2"], "o", markersize=8)

    if legend:
        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
        ax.legend(loc="center left", bbox_to_anchor=(1, 0.5), numpoints=1)

    plt.show()
# (earlier entries of this mapping are truncated in the source)
meat_to_animal = {
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon'
}
data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
data
data['food'].map(lambda x: meat_to_animal[x.lower()])

# data normalization
datafile = 'd:/data/normalization_data.xls'   # parameter initialization
data = pd.read_excel(datafile, header=None)   # read the data

(data - data.min()) / (data.max() - data.min())    # min-max normalization
(data - data.mean()) / data.std()                  # zero-mean (z-score) normalization
data / 10**np.ceil(np.log10(data.abs().max()))     # decimal scaling normalization

### replacing values
data = Series([1., -999., 2., -999., -1000., 3.])
data
data.replace(-999, np.nan)
data.replace([-999, -1000], np.nan)
data.replace([-999, -1000], [np.nan, 0])
data.replace({-999: np.nan, -1000: 0})
# c    36  (tail of the truncated output from the previous example)

print('lambda (anonymous functions) and their application')
print(frame)
'''
   A  B  C
a  0  1  2
b  3  4  5
c  6  7  8
'''

print(frame.max())
'''
A    6
B    7
C    8
'''

f = lambda x: x.max() - x.min()
print(frame.apply(f))          # applied to each column
'''
A    6
B    6
C    6
'''
print(frame.apply(f, axis=1))  # applied to each row
def _extract_ado_loc_org(self, document):
    """
    PA word has dependent object of PO category (ADO)

    In a PI post, a purchase action is targeted towards a consumable object.
    This is reflected in the dependency structure of the text. In a PI post,
    the consumable object is usually the directly dependent object of the
    purchase action verb. If there is a PA word in the text and it has a
    dependent object belonging to a PO category, ADO = 1, otherwise ADO = 0.
    """
    # 1. Identify if there is a PA word (or a very similar one).
    # 2. Identify if this PA word has an object.
    # 3. Identify if this object belongs to the PO category.
    # 4. If the 3 statements above are true, return ADO = 1 else ADO = 0.

    s = pattern.en.parsetree(document, relations=True, lemmata=True)

    ADO = 0
    LOC = 0
    ORG = 0

    # Extract VERBs and find out if they are ACTION VERBs or not.
    # For each found, find if it has an object which is its direct dependant.
    # For each object found, find if it belongs to a PO category.
    # For each object found, find its NER tag (LOC, ORG).
    for sentence in s:
        for chunk in sentence.chunks:
            if chunk.type == 'VP':
                print('Chunk    : ', chunk)
                print('Subject  : ', chunk.subject)
                print('Object   : ', chunk.object)
                print('String   : ', chunk.string)
                print('Tagged   : ', chunk.tagged)
                print('Role     : ', chunk.role)
                print('Relation : ', chunk.relation)
                print('Related  : ', chunk.related)

                # Does it have an object?
                if chunk.object is not None:
                    # Get the verbs!
                    verbs_and_tags = [x for x in chunk.tagged if x[1] in VERB_TAGS]
                    print('Verbs : ', verbs_and_tags)
                    verbs = [verb[0] for verb in verbs_and_tags]

                    # Are they PA words?
                    count_verbs = len(verbs)
                    count_pa_words = len(PA_WORDS)
                    wpa_similarity_frame = DataFrame(
                        np.empty((count_verbs, count_pa_words)),
                        index=verbs,
                        columns=PA_WORDS
                    )
                    for verb in verbs:
                        for pa_word in PA_WORDS:
                            synset_1 = Synset('{}.v.0'.format(Word(pa_word).lemmatize('v')))
                            synset_2 = Synset('{}.v.0'.format(Word(verb).lemmatize('v')))
                            wpa_similarity_frame[pa_word][verb] = synset_2.wup_similarity(synset_1)
                    wpa_max_columns = wpa_similarity_frame.max()
                    wpa = max(wpa_max_columns)

                    if wpa >= 0.7:
                        # Get the nouns from the object
                        head_noun = chunk.object.head
                        # Do they belong to a PO category?
                        if head_noun:
                            # check if head belongs to PO Category
                            print('Head : ', head_noun)
                            ADO = 1
                            # Fix this: implement this based on determining whether
                            # head_noun belongs to a PO Category.
                            # IMPORTANT: try and compile your own list of Consumable
                            # and Non-Consumable Categories as well as the words that
                            # belong to them. Freebase isn't available and Google
                            # Knowledge base seems not applicable.

                        print('Next PP : ', chunk.object.next('PP'))
                        if chunk.object.next('PP') is not None:
                            print('Next NP : ', chunk.object.next('PP').next('NP'))
                            word = chunk.object.next('PP').next('NP').head
                            ner_tagged = stanford_tagger.tag([word.string.title()])
                            print('NER : ', ner_tagged)
                            print()

                            loc_matches = [w for w in ner_tagged if w[1] in NER_LOC_TAGS]
                            print('NER_LOC_TAGS: ', loc_matches)
                            LOC = 1 if len(loc_matches) > 0 else 0

                            org_matches = [w for w in ner_tagged if w[1] in NER_ORG_TAGS]
                            print('NER_ORG_TAGS: ', org_matches)
                            ORG = 1 if len(org_matches) > 0 else 0

                            return {'ADO': ADO, 'ORG': ORG, 'LOC': LOC}
                print()
                print()

    return {'ADO': ADO, 'ORG': ORG, 'LOC': LOC}
# after preparing the data, time to plot it:
for new_counter in range(file_counter + 1):
    Qbers = final_data[(final_data["Dataset"] == new_counter) & (final_data["Qber"] > 0)]
    x1 = Qbers.index.tolist()
    y1 = Qbers["Qber"].tolist()
    x1_average = Qbers.mean()["Qber"]
    x1_std_dev = Qbers.std()["Qber"]
    # preparing proper time:
    x1[:] = [x - quelle_initialTimestamps[new_counter] for x in x1]

    Raws = final_data[(final_data["Dataset"] == new_counter) & (final_data["Raw key"] > 0)]
    x2_average = Raws.mean()["Raw key"]
    x2_median = Raws.median()["Raw key"]
    x2_max = Raws.max()["Raw key"]
    # drop values within 20% of the maximum
    Raws = Raws[Raws["Raw key"] < (x2_max - (x2_max / 100) * 20)]
    x2 = Raws.index.tolist()
    y2 = Raws["Raw key"].tolist()
    print(x2_average)
    # x2_std_dev = 3
    # once again correcting counter:
    x2[:] = [x - quelle_initialTimestamps[new_counter] for x in x2]
    # print(x1[0], x2[0], quelle_initialTimestamps[new_counter])

    # Two subplots, the axes array is 1-d
    # http://matplotlib.org/examples/pylab_examples/subplots_demo.html
    f, axarr = plt.subplots(2, sharex=True)
    axarr[0].grid()
    axarr[0].plot(x1, y1)
df_app_cat = df_app_cat.sort_values(by="avg")

# In[286]:

plt.plot(df_app_cat["avg"])

# In[287]:

plt.plot(df_app_cat["avg"], "bo", df_app_cat["avg"], "k")

# In[288]:

df_app_cat.max()

# In[289]:

t1["app_cat_high"] = 0
t2["app_cat_high"] = 0
test["app_cat_high"] = 0
# .loc avoids chained-assignment warnings
t1.loc[t1["app_category"] == "fc6fa53d", "app_cat_high"] = 1
t2.loc[t2["app_category"] == "fc6fa53d", "app_cat_high"] = 1
test.loc[test["app_category"] == "fc6fa53d", "app_cat_high"] = 1

# In[292]:

validation_check2(feature_cols, ["app_cat_high"])