def data_frame(self):
    """Return the processed knockouts as a frame ordered by ``size``.

    The knockouts are processed lazily on first access. The result is a new
    ``DataFrame`` sorted ascending on the ``size`` column and re-labelled
    with consecutive integers starting at 0.
    """
    if self._processed_knockouts is None:
        self._process_knockouts()
    knockouts = DataFrame(self._processed_knockouts)
    knockouts.sort_values("size", inplace=True)
    # Replace the shuffled labels left by the sort with 0..n-1.
    knockouts.index = list(range(len(knockouts)))
    return knockouts
def returns_customer_product_time(pwunsale_tidy, pw_ytdcust, pw_cusattr):
    '''
    Build the returns table that feeds a pivot requested by Mitch Turner.

    Returns are aggregated per month, customer and product, then enriched
    with year-to-date customer sales and customer attributes.

    Parameters:
    ----------
    * pwunsale_tidy: frame read for 'Date', 'ExtCost', 'CasesReturned' and
      the grouping columns; NOTE: a 'Month' column is added to it in place.
    * pw_ytdcust: frame left-merged on 'CustomerId'; presumably supplies
      'DollarSales|bycustomer' -- verify against the caller.
    * pw_cusattr: frame of customer attributes left-merged on 'CustomerId'.

    Returns the merged frame sorted descending on 'DollarsReturned|sum'.
    '''
    dat = pwunsale_tidy['Date'].tolist()
    # Full month name (e.g. 'January'); this mutates the caller's frame.
    pwunsale_tidy['Month'] = [d.strftime('%B') for d in dat]

    print('Aggregating custom pivot for Mitch.')
    len_unique = lambda x: len(pd.unique(x))
    # NOTE(review): the nested {column: {new_name: func}} form relies on the
    # legacy pandas "renaming" aggregation -- confirm the pinned pandas
    # version still supports it. 'Invoice' is aggregated here but is not in
    # the column selection passed to groupby below -- confirm that is intended.
    agg_funcs_returns = {'ExtCost': {'DollarsReturned|avg':np.mean,
                                     'DollarsReturned|sum':np.sum},
                         'CasesReturned': {'CasesReturned|avg':np.mean,
                                           'CasesReturned|sum':np.sum},
                         'Invoice':len_unique }

    custom_cols = ['Month','CustomerId','Customer','ProductId','Product']
    customer_returns = DataFrame(pwunsale_tidy.groupby(custom_cols)[['ExtCost','CasesReturned']].agg(agg_funcs_returns)).reset_index(drop=False)
    # The anonymous aggregate shows up under the name '<lambda>'.
    customer_returns.rename(columns={'<lambda>':'Returns|count'}, inplace=True)
    customer_returns.drop('Customer', inplace=True, axis=1)

    print('Merging in YTD sales by Customer')
    customer_returns = customer_returns.merge(pw_ytdcust, on='CustomerId', how='left')

    print('Deriving returns as a percent of sales for each Customer.')
    customer_returns['PercentSales'] = np.divide(customer_returns['DollarsReturned|sum'], customer_returns['DollarSales|bycustomer'])

    print('Merge in customer attributes.')
    customer_returns = customer_returns.merge(pw_cusattr, on='CustomerId', how='left')

    print('Sorting in descending order on Dollars returned.')
    customer_returns.sort_values('DollarsReturned|sum', ascending=False, inplace=True)

    return customer_returns
def view_coef(model: LogisticRegression, train_df):
    """Print the model coefficients ordered by decreasing magnitude.

    The first column of ``train_df`` is skipped when naming the
    coefficients; the remaining column names are paired with the rows of
    ``model.coef_.T``. Nothing is returned.
    """
    feature_names = list(train_df.columns[1:])
    weights = list(model.coef_.T)
    table = DataFrame({'columns': feature_names, 'coef': weights})
    table['abs_coef'] = abs(table['coef'])
    # ascending=[0] means descending: largest magnitude first.
    table.sort_values(by=['abs_coef'], ascending=[0], inplace=True)
    print(table)
def compute_importances(data_set_df, user_info_df, label='gender', split_modal=False,
                        n_est=10, max_depth=None):
    """Rank features by ``ExtraTreesClassifier`` feature importance.

    Parameters
    ----------
    data_set_df : frame whose row index exposes ``levels[0]`` (the
        modalities); rows are features.
    user_info_df : frame passed to ``pc.get_filtered_x_y`` together with
        ``label`` to obtain the filtered features and the target vector.
    label : name of the target passed to ``pc.get_filtered_x_y``.
    split_modal : when exactly ``True``, one model is fitted per modality;
        otherwise a single model is fitted on all features.
    n_est : number of trees.
    max_depth : maximum tree depth; ``None`` leaves the depth unlimited.

    Returns
    -------
    DataFrame with one ``importance`` column indexed like the filtered
    features, sorted descending. Features whose model could not be fitted
    keep the initial importance of 0.
    """
    print("\t\t\tfilling nan values...")
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)
    feature_importances = DataFrame(np.zeros(len(df_filtered)), index=df_filtered.index, columns=['importance'])
    modalities = data_set_df.index.levels[0]

    def compute(x):
        x_imp = pc.fill_nan_features(x)
        try:
            # Honour the caller's max_depth (it used to be replaced by 3);
            # None is the estimator's own "no limit" default.
            m = ExtraTreesClassifier(n_estimators=n_est, max_depth=max_depth)
            print("\t\t\tfitting RF model...")
            m.fit(x_imp.T, y_v)
            # NOTE(review): with split_modal the labels of `x` come from
            # df_filtered.loc[modal]; confirm they address the intended rows
            # of feature_importances.
            for order, index in enumerate(x.index):
                feature_importances.loc[index] = m.feature_importances_[order]
                if order > 0 and order % 10000 == 0:
                    print("\t\t\t%s features are done" % order)
        except ValueError as e:
            # Best effort: keep the zero importances, but say why.
            print("\t\t\tvalue error while computing importances: %s" % e)

    if split_modal is True:
        for modal in modalities:
            x = df_filtered.loc[modal].dropna(how='all')
            compute(x)
    else:
        x = df_filtered.dropna(how='all')
        compute(x)

    feature_importances.sort_values('importance', ascending=False, inplace=True, na_position='last')
    return feature_importances
def compute_fscore(data_set_df, user_info_df, label='gender', min_not_nan=-1):
    """Rank features by their ``f_classif`` score against ``label``.

    Each row of the filtered frame is one feature. With a negative
    ``min_not_nan`` missing values are replaced by the row mean before
    scoring; otherwise missing values are dropped and rows with fewer than
    ``min_not_nan`` remaining values get NaN. Infinite scores are stored as
    NaN. Rows that raise ``ValueError`` keep their initial 0.

    Returns a frame with one ``importance`` column sorted descending, NaN
    last.
    """
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)
    feature_fs = DataFrame(np.zeros(len(df_filtered)), index=df_filtered.index, columns=['importance'])

    def finite_or_nan(score):
        # +/- infinity is not a usable ranking value.
        return score if score != np.inf and score != -np.inf else np.nan

    done = 0
    for feature, row in df_filtered.iterrows():
        try:
            if min_not_nan < 0:
                filled = row.fillna(row.mean())
                score, _ = f_classif(filled[:, np.newaxis], y_v)
                feature_fs.loc[feature] = finite_or_nan(score)
            else:
                present = row.dropna()
                if len(present) < min_not_nan:
                    feature_fs.loc[feature] = np.nan
                else:
                    score, _ = f_classif(present[:, np.newaxis], y_v[present.index.astype(int)])
                    feature_fs.loc[feature] = finite_or_nan(score)
            if done > 0 and done % 10000 == 0:
                print("\t\t\t%s features are done" % done)
            # Only successfully processed rows are counted.
            done += 1
        except ValueError:
            continue

    feature_fs.sort_values('importance', ascending=False, inplace=True, na_position='last')
    return feature_fs
def setup(self): one_count = 200000 two_count = 1000000 df1 = DataFrame( {'time': np.random.randint(0, one_count / 20, one_count), 'key': np.random.choice(list(string.ascii_uppercase), one_count), 'key2': np.random.randint(0, 25, one_count), 'value1': np.random.randn(one_count)}) df2 = DataFrame( {'time': np.random.randint(0, two_count / 20, two_count), 'key': np.random.choice(list(string.ascii_uppercase), two_count), 'key2': np.random.randint(0, 25, two_count), 'value2': np.random.randn(two_count)}) df1 = df1.sort_values('time') df2 = df2.sort_values('time') df1['time32'] = np.int32(df1.time) df2['time32'] = np.int32(df2.time) self.df1a = df1[['time', 'value1']] self.df2a = df2[['time', 'value2']] self.df1b = df1[['time', 'key', 'value1']] self.df2b = df2[['time', 'key', 'value2']] self.df1c = df1[['time', 'key2', 'value1']] self.df2c = df2[['time', 'key2', 'value2']] self.df1d = df1[['time32', 'value1']] self.df2d = df2[['time32', 'value2']] self.df1e = df1[['time', 'key', 'key2', 'value1']] self.df2e = df2[['time', 'key', 'key2', 'value2']]
def compute_mics(data_set_df, user_info_df, label='gender', min_not_nan=-1):
    """Rank features by their MIC score (``minepy.MINE``) against ``label``.

    Each row of the filtered frame is one feature. With a negative
    ``min_not_nan`` the row is scored as is; otherwise missing values are
    dropped first and rows with fewer than ``min_not_nan`` remaining values
    get NaN. Rows that raise ``ValueError`` keep their initial 0.

    Returns a frame with one ``importance`` column sorted descending, NaN
    last.
    """
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)
    feature_mics = DataFrame(np.zeros(len(df_filtered)), index=df_filtered.index, columns=['importance'])

    for feature, row in df_filtered.iterrows():
        estimator = minepy.MINE()
        try:
            if min_not_nan < 0:
                estimator.compute_score(row, y_v)
                score = estimator.mic()
            else:
                present = row.dropna()
                if len(present) < min_not_nan:
                    score = np.nan
                else:
                    estimator.compute_score(present, y_v[present.index.astype(int)])
                    score = estimator.mic()
            feature_mics.loc[feature] = score
        except ValueError:
            # Leave the initial 0 for features that cannot be scored.
            continue

    feature_mics.sort_values('importance', ascending=False, inplace=True, na_position='last')
    return feature_mics
def show_failures_prob(df, y, y_self, out_path):
    """Write the mispredicted rows of ``df`` to '<out_path>-<len(y)>.failures.csv'.

    :param df: feature frame; rows are matched to predictions by index label.
    :param y: true labels, compared against the thresholded predictions.
    :param y_self: frame with a 'hat' column of predicted scores.
    :param out_path: prefix of the CSV file name.

    NOTE(review): the final sort is on a 'hat' column, so `df` presumably
    already carries one -- confirm against the caller.
    """
    print('show_failures_prob: df=%s,y=%s,y_self=%s,out_path="%s"' % (S(df), S(y), S(y_self), out_path))
    name = '%s-%d' % (out_path, len(y))

    # floor(2 * score) as an integer class; this is 0/1 only if scores lie
    # in [0, 1) -- TODO confirm the score range.
    y_self_bool = DataFrame(np.floor(y_self['hat'].values * 2.0).astype(int), index=y_self.index, columns=['hat'])
    print('A) y_self_bool=%s,' % C(y_self_bool))
    print('B) y=%s' % y.dtype)
    diff = y_self_bool['hat'] - y
    # True where the predicted class differs from the true label.
    failures = diff != 0
    print('^' * 80)
    print(type(failures))
    print(failures.describe())
    print(failures[:5])

    # Expand the failure flags to a mask over every row of df; rows without
    # a prediction stay False.
    failures_df = Series([False] * len(df), index=df.index)
    for idx, val in failures.iteritems():
        failures_df[idx] = val

    df = df[failures_df]
    y_self_df = Series([0.0] * len(df), index=df.index, dtype=float)
    y_self_df_bool = Series([0] * len(df), index=df.index, dtype=int)
    # Copy the raw score and the predicted class for each failing row.
    for idx in y_self_df.index:
        y_self_df[idx] = y_self['hat'][idx]
        y_self_df_bool[idx] = y_self_bool['hat'][idx]
    df['probability'] = y_self_df
    df['predicted'] = y_self_df_bool
    # Move the last three columns to the front.
    columns = list(df.columns[-3:]) + list(df.columns[:-3])
    df2 = DataFrame()
    for col in columns:
        df2[col] = df[col]
    df2.sort_values('hat', ascending=False, inplace=True)
    df2.to_csv('%s.failures.csv' % name, index_label='job_id')
def test_sort_index_multicolumn(self):
    """Multi-column ``sort_values`` must agree with ``np.lexsort``.

    The deprecated ``sort_index(by=...)`` spelling must still emit a
    ``FutureWarning`` (#9816).
    """
    import random

    col_a = np.arange(5).repeat(20)
    col_b = np.tile(np.arange(5), 20)
    random.shuffle(col_a)
    random.shuffle(col_b)
    frame = DataFrame({'A': col_a, 'B': col_b, 'C': np.random.randn(100)})

    def check(by, indexer, **sort_kwargs):
        # use .sort_values #9816
        with tm.assert_produces_warning(FutureWarning):
            frame.sort_index(by=by, **sort_kwargs)
        result = frame.sort_values(by=by, **sort_kwargs)
        assert_frame_equal(result, frame.take(indexer))

    check(['A', 'B'], np.lexsort((frame['B'], frame['A'])))

    descending_ranks = (frame['B'].rank(ascending=False),
                        frame['A'].rank(ascending=False))
    check(['A', 'B'], np.lexsort(descending_ranks), ascending=False)

    check(['B', 'A'], np.lexsort((frame['A'], frame['B'])))
def test_sort_datetimes(self):
    """Sorting by a label and by a one-element list must give equal frames.

    GH 3461: argsort / lexsort differences for a datetime column.
    """
    stamps = [
        Timestamp(text)
        for text in [
            "2004-02-11",
            "2004-01-21",
            "2004-01-26",
            "2005-09-20",
            "2010-10-04",
            "2009-05-12",
            "2008-11-12",
            "2010-09-28",
            "2010-09-28",
        ]
    ]
    df = DataFrame(
        ["a", "a", "a", "b", "c", "d", "e", "f", "g"],
        columns=["A"],
        index=date_range("20130101", periods=9),
    )
    df["B"] = stamps[::2] + stamps[1::2]
    df["C"] = 2.0
    df["A1"] = 3.0

    for column in ("A", "B"):
        by_label = df.sort_values(by=column)
        by_list = df.sort_values(by=[column])
        assert_frame_equal(by_label, by_list)
def model_selection_cv(
        models, x, y, k=5, eval_func=None, random_state=None):
    """
    framework for model selection based on stratified cross-validation

    Parameters:
    ----------
    * models: {dictionary}, key: model label, value: learner object
    * x: {np.array}, predictor data
    * y: {np.array}, response variable data
    * k: {integer}, the number of folds in cross-validation
    * eval_func: {function}, called as eval_func(model, x, y); must return
      a dict-like evaluation record (a "model_name" entry is added to it)
    * random_state: {integer}, the random state set for replication

    Returns:
    -------
    (train_reports, test_reports): two DataFrames with one row per
    (model, fold), sorted by "model_name", with "model_name" as the first
    column followed by the metric columns.

    Raises:
    ------
    TypeError if eval_func is not callable.
    """
    if not callable(eval_func):
        # Fail before any model is trained instead of after the first fit.
        raise TypeError("eval_func must be a callable returning a score record")

    # stratified cross_validation
    cv = StratifiedKFold(
        y, n_folds=k, shuffle=False, random_state=random_state)

    pbar = tqdm(total=len(models) * k)

    train_reports, test_reports = [], []
    for model_name, model in models.items():
        for train_idx, test_idx in cv:
            # retrieve data for relevant usage
            x_train, y_train = x[train_idx], y[train_idx]
            x_test, y_test = x[test_idx], y[test_idx]

            # training model
            model.fit(x_train, y_train)

            # evaluation model
            train_score = eval_func(model, x_train, y_train)
            train_score["model_name"] = model_name
            test_score = eval_func(model, x_test, y_test)
            test_score["model_name"] = model_name

            train_reports.append(train_score)
            test_reports.append(test_score)
            pbar.update()
    pbar.close()

    # convert list of performance records into dataframe
    train_reports = DataFrame(train_reports)
    # The test frame used to be built from the training records.
    test_reports = DataFrame(test_reports)
    metrics_names = [feat for feat in train_reports.columns.tolist()
                     if feat != "model_name"]
    ordered_columns = ["model_name"] + metrics_names

    # sort_values returns a new frame; the result used to be discarded.
    # mergesort is stable, so folds keep their order within each model.
    train_reports = train_reports.sort_values(
        by=["model_name"], kind="mergesort")[ordered_columns]
    test_reports = test_reports.sort_values(
        by=["model_name"], kind="mergesort")[ordered_columns]

    return train_reports, test_reports
class SortValues(object):
    """Benchmark ``DataFrame.sort_values`` on one column, both directions."""

    # Each benchmark runs once per value of `ascending`.
    param_names = ['ascending']
    params = [True, False]

    def setup(self, ascending):
        """Build a 1000000 x 2 frame of standard-normal values."""
        values = np.random.randn(1000000, 2)
        self.df = DataFrame(values, columns=['A', 'B'])

    def time_frame_sort_values(self, ascending):
        """Sort by column 'A'; the sorted frame is discarded."""
        self.df.sort_values(by='A', ascending=ascending)
class SortIndexByColumns(object):
    """Benchmark ``DataFrame.sort_values`` on two string key columns."""

    def setup(self):
        """Build a frame of N * K rows with repeated string keys."""
        N = 10000
        K = 10
        # Keys are generated before the values, one call per column.
        first_keys = tm.makeStringIndex(N).values.repeat(K)
        second_keys = tm.makeStringIndex(N).values.repeat(K)
        values = np.random.randn(N * K)
        self.df = DataFrame({'key1': first_keys,
                             'key2': second_keys,
                             'value': values})

    def time_frame_sort_values_by_columns(self):
        """Sort by both key columns; the sorted frame is discarded."""
        self.df.sort_values(by=['key1', 'key2'])
def test_stable_descending_multicolumn_sort(self):
    """Check the mergesort results for two ``ascending`` combinations."""
    nan = np.nan
    df = DataFrame({"A": [1, 2, nan, 1, 6, 8, 4],
                    "B": [9, nan, 5, 2, 5, 4, 5]})

    # "A" is descending with NaN first in both cases; only the tie on
    # A == 1 depends on the direction of "B".
    sorted_a = [nan, 8, 6, 4, 2, 1, 1]
    cases = [
        ([0, 1], [5, 4, 5, 5, nan, 2, 9], [2, 5, 4, 6, 1, 3, 0]),
        ([0, 0], [5, 4, 5, 5, nan, 9, 2], [2, 5, 4, 6, 1, 0, 3]),
    ]
    for directions, sorted_b, labels in cases:
        # test stable mergesort
        expected = DataFrame({"A": sorted_a, "B": sorted_b}, index=labels)
        sorted_df = df.sort_values(["A", "B"], ascending=directions,
                                   na_position="first", kind="mergesort")
        assert_frame_equal(sorted_df, expected)
def getData(self, params):
    """Return the ``top`` most frequent tweet sources.

    ``params['top']`` is converted to ``int``. Every non-None entry of the
    'source' column returned by ``createDataframe()`` is reduced to the
    text captured by the anchor-tag regex, kept in the stringified-list
    form the original produced (with its quotes), and counted.

    Returns a frame with columns 'AppSource' and 'Count', sorted by 'Count'
    descending and cut to the first ``top`` rows.
    """
    top = int(params['top'])
    # Raw string: same pattern, without the invalid "\w" escape warning.
    regex = re.compile(r'^<.*>(\w+.*)</.>')
    df = createDataframe()
    source = [str(regex.findall(line)).strip('[]')
              for line in df['source'] if line is not None]
    counts = Counter(source)
    # Materialise the dict views; DataFrame needs real sequences.
    tweetSource = DataFrame({'AppSource': list(counts.keys()),
                             'Count': list(counts.values())})
    tweetSource = tweetSource[['AppSource', 'Count']]
    tweetSource.sort_values(by='Count', ascending=False, inplace=True)
    return tweetSource[:top]
def compute_randomized_lr_score(data_set_df, user_info_df, label='gender'):
    """Rank features by their ``RandomizedLogisticRegression`` score.

    Rows of the filtered frame are features. Rows that are entirely NaN are
    excluded from the fit and reported with a NaN importance; remaining
    NaNs are filled by ``pc.fill_nan_features`` before fitting.

    Returns a frame with one ``importance`` column indexed like the
    filtered features, sorted descending with NaN last.
    """
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)
    x = df_filtered.dropna(how='all')
    x_imp = pc.fill_nan_features(x) if x.isnull().any().any() else x.values

    clf = RandomizedLogisticRegression()
    clf.fit(x_imp.T, y_v)

    # Same rows dropna(how='all') kept, as a positional mask. The scores
    # cover only those rows, so pairing them with the full index used to
    # fail whenever an all-NaN feature had been dropped.
    kept = df_filtered.notnull().any(axis=1).values
    feature_importances = DataFrame(index=df_filtered.index, columns=['importance'], dtype=float)
    feature_importances.loc[kept, 'importance'] = clf.scores_

    feature_importances.sort_values('importance', ascending=False, inplace=True, na_position='last')
    return feature_importances
def test_timegrouper_get_group(self):
    """``get_group`` with a monthly ``Grouper`` returns the expected rows.

    GH 6914. Checked on the original frame and on a copy reordered by
    'Quantity', for grouping on a column, on column + 'Buyer', and on the
    index.
    """
    source = DataFrame({
        'Buyer': 'Carl Joe Joe Carl Joe Carl'.split(),
        'Quantity': [18, 3, 5, 1, 9, 3],
        'Date': [datetime(2013, 9, 1, 13, 0),
                 datetime(2013, 9, 1, 13, 5),
                 datetime(2013, 10, 1, 20, 0),
                 datetime(2013, 10, 3, 10, 0),
                 datetime(2013, 12, 2, 12, 0),
                 datetime(2013, 9, 2, 14, 0), ]
    })
    month_ends = ['2013-09-30', '2013-10-31', '2013-12-31']

    def check(base, make_keys, group_names, row_positions):
        # A fresh grouping key is built for every frame.
        wanted = [base.iloc[rows] for rows in row_positions]
        for frame in [base, base.sort_values(by='Quantity')]:
            grouped = frame.groupby(make_keys())
            for group_name, expected in zip(group_names, wanted):
                result = grouped.get_group(group_name)
                assert_frame_equal(result, expected)

    # single grouping
    check(source,
          lambda: pd.Grouper(freq='M', key='Date'),
          [pd.Timestamp(t) for t in month_ends],
          [[0, 1, 5], [2, 3], [4]])

    # multiple grouping
    buyer_months = [('Joe', '2013-09-30'),
                    ('Carl', '2013-10-31'),
                    ('Joe', '2013-12-31')]
    check(source,
          lambda: ['Buyer', pd.Grouper(freq='M', key='Date')],
          [(buyer, pd.Timestamp(t)) for buyer, t in buyer_months],
          [[1], [3], [4]])

    # with index
    check(source.set_index('Date'),
          lambda: pd.Grouper(freq='M'),
          [pd.Timestamp(t) for t in month_ends],
          [[0, 1, 5], [2, 3], [4]])
def test_sort_index_duplicates(self):
    """Sorting by an ambiguous column label must raise ``ValueError``.

    With #9816 every ``sort_index(by=...)`` call is translated to
    ``sort_values`` and must emit a ``FutureWarning``.
    """

    def assert_both_raise(frame, by, pattern):
        with assertRaisesRegexp(ValueError, pattern):
            # use .sort_values #9816
            with tm.assert_produces_warning(FutureWarning):
                frame.sort_index(by=by)
        with assertRaisesRegexp(ValueError, pattern):
            frame.sort_values(by=by)

    duplicated = DataFrame([lrange(5, 9), lrange(4)],
                           columns=['a', 'a', 'b', 'b'])
    # multi-column 'by' is separate codepath
    for by in ('a', ['a'], ['a', 'b']):
        assert_both_raise(duplicated, by, 'duplicate')

    # with multi-index
    # GH4370
    multi = DataFrame(np.random.randn(4, 2),
                      columns=MultiIndex.from_tuples([('a', 0), ('a', 1)]))
    assert_both_raise(multi, 'a', 'levels')

    # convert tuples to a list of tuples
    key = ('a', 1)
    # use .sort_values #9816
    with tm.assert_produces_warning(FutureWarning):
        multi.sort_index(by=[key])
    expected = multi.sort_values(by=[key])

    # use .sort_values #9816
    with tm.assert_produces_warning(FutureWarning):
        multi.sort_index(by=key)
    result = multi.sort_values(by=key)
    assert_frame_equal(result, expected)
def show_predicted_prob(df, y_test, out_path):
    """Write the rows of ``df`` selected by ``y_test`` to a CSV file.

    The rows labelled by ``y_test.index`` are taken from ``df``, ``y_test``
    is attached as the leading 'hat' column, and the rows are sorted
    descending on it. The file is '<out_path>-<len(y_test)>.predicted.csv'
    with the index written under the label 'job_id'.
    """
    print('show_predicted_prob: df=%s,y_test=%s,out_path="%s"' % (S(df), S(y_test), out_path))
    name = '%s-%d' % (out_path, len(y_test))
    print('~' * 80)

    selected = df.loc[y_test.index, :]
    selected['hat'] = y_test

    # 'hat' first, every other column in its existing order.
    ordered = ['hat']
    ordered.extend(col for col in selected.columns if col != 'hat')

    report = DataFrame()
    for col in ordered:
        report[col] = selected[col]
    report.sort_values('hat', ascending=False, inplace=True)

    target = '%s.predicted.csv' % name
    report.to_csv(target, index_label='job_id')
def parse_ticker_dataframe(ticker: list) -> DataFrame:
    """
    Analyses the trend for the given ticker history
    :param ticker: See exchange.get_ticker_history; records with the keys
        'C', 'V', 'O', 'H', 'L', 'T' and 'BV'
    :return: DataFrame with the columns close, volume, open, high, low and
        date (the 'BV' field is dropped), sorted ascending by date; dates
        are parsed as UTC timestamps
    """
    columns = {'C': 'close', 'V': 'volume', 'O': 'open', 'H': 'high', 'L': 'low', 'T': 'date'}
    # axis must be passed by keyword: the positional form was removed in
    # pandas 2.0.
    frame = DataFrame(ticker) \
        .drop('BV', axis=1) \
        .rename(columns=columns)
    # infer_datetime_format is deprecated in pandas 2.0 and only ever
    # affected parsing speed, so it is no longer passed.
    frame['date'] = to_datetime(frame['date'], utc=True)
    frame.sort_values('date', inplace=True)
    return frame
def _search_by_inchi_fuzzy(self, inchi): # TODO: use openbabel if available matches = difflib.get_close_matches(inchi, self.data_frame.InChI.dropna(), n=5, cutoff=.8) ranks = {match: i for i, match in enumerate(matches)} selection = DataFrame(self.data_frame[self.data_frame.InChI.isin(matches)]) selection['search_rank'] = selection.name.map(ranks) return selection.sort_values('search_rank')
def data_frame(self):
    """Return the processed solutions as a frame ordered by ``size``.

    Solutions are processed lazily on first access. For the "reactions"
    manipulation type they are used as they are; otherwise rows are grouped
    by ("reactions", "size") and every other column is reduced with its
    registered aggregation function, falling back to the group's first
    value. The result is sorted ascending on ``size`` and re-labelled
    0..n-1.
    """
    if self._processed_solutions is None:
        self._process_solutions()
    solutions = self._processed_solutions

    if self._manipulation_type == "reactions":
        table = DataFrame(solutions)
    else:
        def first_value(group):
            return group.values[0]

        reducers = {}
        for column in solutions.columns.difference(["reactions", "size"]):
            reducers[column] = self.__aggregation_function.get(column, first_value)
        grouped = solutions.groupby(["reactions", "size"], as_index=False)
        table = grouped.aggregate(reducers)
        # Restore the original column order.
        table = table[solutions.columns]

    table.sort_values("size", inplace=True)
    table.index = list(range(len(table)))
    return table
def test_sort_index_different_sortorder(self):
    """Mixed ascending/descending sorts must agree with ``np.lexsort``.

    Covers column sorting, MultiIndex sorting of a frame, and MultiIndex
    sorting of a Series.
    """
    shuffle = np.random.permutation(100)
    col_a = np.arange(20).repeat(5).take(shuffle)
    col_b = np.tile(np.arange(5), 20).take(shuffle)
    df = DataFrame({'A': col_a, 'B': col_b, 'C': np.random.randn(100)})

    keys = ['A', 'B']
    directions = [1, 0]

    # use .sort_values #9816
    with tm.assert_produces_warning(FutureWarning):
        df.sort_index(by=keys, ascending=directions)
    by_columns = df.sort_values(by=keys, ascending=directions)

    # Negating B around its maximum turns "descending B" into a plain
    # ascending lexsort key.
    ex_indexer = np.lexsort((df.B.max() - df.B, df.A))
    assert_frame_equal(by_columns, df.take(ex_indexer))

    # test with multiindex, too
    idf = df.set_index(keys)
    expected = idf.take(ex_indexer)
    assert_frame_equal(idf.sort_index(ascending=directions), expected)

    # also, Series!
    assert_series_equal(idf['C'].sort_index(ascending=directions),
                        expected['C'])
def thread_participation_evolution(
        pm_frame, project, n=2, skip_anon=True, research_only=False):
    """Assembles data on participation to threads in project with n as thresh.

    Parameters:
        pm_frame: frame from which ``pm_frame.loc[project]`` selects the
            project's threads; read for the 'basic' columns and the
            (thread type, 'authors') column, whose cells are collections
            of author names.
        project: label of the project to select.
        n: minimum number of threads an author must appear in to be
            selected.
        skip_anon: drop the "Anonymous" author when present.
        research_only: use 'research threads' instead of 'all threads'.

    Returns DataFrame, index, selection and title for data
    for use by stacked bar-plot and heatmap functions. The DataFrame has
    one row per author and one boolean column per thread.
    """
    if not research_only:
        thread_type = 'all threads'
        title = "Participation per thread in {} (threshold = {})".format(
            project, n)
    else:
        thread_type = 'research threads'
        # Adjacent literals instead of a backslash continuation inside the
        # string, which leaked the continuation into the title.
        title = ("Participation per thread in {} "
                 "(threshold = {}, only research-threads)").format(project, n)
    data = pm_frame.loc[project][['basic', thread_type]]
    data = data.dropna()
    authors_per_thread = data[thread_type, 'authors']
    # A list, not a set: pandas needs ordered column labels.
    all_authors = list(set().union(*authors_per_thread))
    author_thread = DataFrame(columns=all_authors)
    for author in author_thread.columns:
        author_thread[author] = authors_per_thread.apply(
            lambda thread, author=author: author in thread)
    author_thread = author_thread.T
    author_thread = author_thread.sort_values(by=data.index.tolist(),
                                              ascending=False)
    if skip_anon:
        # errors='ignore': a project without anonymous comments is fine.
        author_thread = author_thread.drop("Anonymous", errors='ignore')
    author_thread.columns.name = "Threads"
    select = author_thread.sum(axis=1) >= n
    return author_thread, data.index, select, title
def test_stable_descending_sort(self):
    """A descending mergesort must keep tied rows in their original order.

    GH #6399. The frame is already in descending order of 'sort_col', so
    the sort must return it unchanged.
    """
    rows = [[2, 'first'], [2, 'second'], [1, 'a'], [1, 'b']]
    df = DataFrame(rows, columns=['sort_col', 'order'])
    sorted_df = df.sort_values(by='sort_col', ascending=False,
                               kind='mergesort')
    assert_frame_equal(df, sorted_df)
def test_astype_categorical_to_other(self):
    """Exercise ``astype`` on categorical Series.

    Covers conversion to category (no-op), float (must raise), str, int
    and object, followed by round trips that must preserve the values and
    calls with the ``Categorical`` class itself, which must raise
    ``TypeError``.
    """

    # Seeded so the 100 sampled values are reproducible.
    value = np.random.RandomState(0).randint(0, 10000, 100)
    df = DataFrame({'value': value})
    # One label per 500-wide bin: "0 - 499", "500 - 999", ...
    labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]
    cat_labels = Categorical(labels, labels)

    df = df.sort_values(by=['value'], ascending=True)
    df['value_group'] = pd.cut(df.value, range(0, 10500, 500),
                               right=False, labels=cat_labels)

    s = df['value_group']
    # Same object as `s`: casting to category must not change anything.
    expected = s
    tm.assert_series_equal(s.astype('category'), expected)
    tm.assert_series_equal(s.astype(CategoricalDtype()), expected)
    # The labels are not numeric strings, so a float cast must fail; either
    # message wording is accepted.
    msg = (r"could not convert string to float|"
           r"invalid literal for float\(\)")
    with pytest.raises(ValueError, match=msg):
        s.astype('float64')

    cat = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']))
    exp = Series(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
    tm.assert_series_equal(cat.astype('str'), exp)
    s2 = Series(Categorical(['1', '2', '3', '4']))
    exp2 = Series([1, 2, 3, 4]).astype(int)
    tm.assert_series_equal(s2.astype('int'), exp2)

    # object don't sort correctly, so just compare that we have the same
    # values
    def cmp(a, b):
        tm.assert_almost_equal(
            np.sort(np.unique(a)), np.sort(np.unique(b)))

    expected = Series(np.array(s.values), name='value_group')
    cmp(s.astype('object'), expected)
    cmp(s.astype(np.object_), expected)

    # array conversion
    tm.assert_almost_equal(np.array(s), np.array(s.values))

    # valid conversion
    for valid in [lambda x: x.astype('category'),
                  lambda x: x.astype(CategoricalDtype()),
                  lambda x: x.astype('object').astype('category'),
                  lambda x: x.astype('object').astype(
                      CategoricalDtype())
                  ]:

        result = valid(s)
        # compare series values
        # internal .categories can't be compared because it is sorted
        tm.assert_series_equal(result, s, check_categorical=False)

    # invalid conversion (these are NOT a dtype)
    msg = (r"invalid type <class 'pandas\.core\.arrays\.categorical\."
           "Categorical'> for astype")
    for invalid in [lambda x: x.astype(Categorical),
                    lambda x: x.astype('object').astype(Categorical)]:
        with pytest.raises(TypeError, match=msg):
            invalid(s)
def land_sic_overlap_timeseries(instrument, title="Land-Sea Ice Border Variations"):
    """
    Time Series that shows the percentage variations of the land mask
    border given the expansion of sea ice in VIRS.

    For every file of the instrument, the sea-ice surface is added to the
    land-mask silhouette and the count read from the merged frequency table
    is expressed as a percentage of the count read from the silhouette's
    table. The series is plotted ordered by timestamp; nothing is returned.
    """
    files = data.file_names(instrument_id=data.INSTRUMENT_MAP.get(instrument))

    records = []
    timestamps = []
    # The generator is consumed only to step through the files in order.
    for position, _ in enumerate(data.mat_generator(files)):
        path = files[position]
        sea_ice = SIC(path)
        land_mask = LM(path)
        ice_surface = sea_ice.surface(boolean=False)
        land_surface = land_mask.silhoutte()
        border = itemfreq(land_surface)[1][1]
        overlap_count = itemfreq(np.add(ice_surface, land_surface))[2][1]
        percentage = (float(overlap_count) / border) * 100
        stamp = land_mask.title
        records.append({'timestamp': stamp, 'intercept': percentage})
        timestamps.append(stamp)

    df = DataFrame(records, index=timestamps)
    df.sort_values(by='timestamp').plot(title=title)
    plt.show()
def plot_stuff():
    """Tabulate, print and export pairwise "G" statistics per method.

    For every group of CSV paths returned by ``list_communities()`` a
    square table is filled from the "G" column of each file, extended with
    "Mean" and "Std" columns, printed, and written to an .xlsx file. The
    per-group "Mean" columns are finally combined into one comparison
    workbook.

    NOTE(review): the loop nesting below is reconstructed from flattened
    source (statistics computed once per group) -- confirm against the
    original layout.
    """
    pd_list = {}
    compare_tl = []
    compare_tl_head = []
    for vars in list_communities():
        for var in vars:
            # Key: file name without directory and extension.
            # Value: the (Name, G) rows sorted by name, descending.
            pd_list.update({var.split("/")[-1].split(".")[0]: DataFrame(
                sorted(read_csv(var)[["Name", "G"]].values, key=lambda x: x[0], reverse=True))})

        N = len(pd_list.keys())  # Find number of elements
        stats = np.zeros((N, N))  # Create a 2-D Array to hold the stats
        keys = sorted(pd_list, reverse=True)  # Find data sets (Sort alphabetically, backwards)
        for idx, key in enumerate(keys):  # Populate 2-D array
            # Column 1 of each stored frame holds the "G" values.
            for i, val in enumerate(pd_list[key][1].values):
                if not i == idx:  # Ensure self values are set to zero
                    stats[i, idx] = val

        stats = DataFrame(stats, columns=keys, index=keys)
        # stats["Mean"] = stats.median(axis=0)
        # set_trace()
        stats["Mean"] = find_mean(stats)
        stats["Std"] = find_std(stats)
        stats = stats.sort_values(by="Mean", axis=0, ascending=False, inplace=False)
        print(tabulate(stats, showindex=True, headers=stats.columns, tablefmt="fancy_grid"))
        print("\n")

        # `var` is the last path of the group: the workbook is written two
        # directory levels up and named after the path's parent directory.
        save_path = os.path.abspath("/".join(var.split("/")[:-2]))
        method = var.split("/")[-2]+".xlsx"
        stats.to_excel(os.path.join(save_path, method))
        compare_tl.append(stats.sort_index(inplace=False)["Mean"].values.tolist())
        compare_tl_head.append(method)
    # set_trace()
    # One column per method; rows follow the sorted index of the last table.
    compare_tl= DataFrame(np.array(compare_tl).T, columns=compare_tl_head, index=stats.index.sort_values())
    save_path_2 = os.path.join(os.path.abspath("/".join(var.split("/")[:-3])), os.path.abspath("".join(var.split("/")[-3]))+".xlsx")
    compare_tl.to_excel(save_path_2)
def test_numeric_like_ops(self):
    """Numeric operations on categorical data must raise ``TypeError``."""
    arithmetic = ('__add__', '__sub__', '__mul__', '__truediv__')
    reductions = ('kurt', 'skew', 'var', 'std', 'mean', 'sum', 'median')

    df = DataFrame({'value': np.random.randint(0, 10000, 100)})
    labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]
    cat_labels = Categorical(labels, labels)

    df = df.sort_values(by=['value'], ascending=True)
    df['value_group'] = pd.cut(df.value, range(0, 10500, 500),
                               right=False, labels=cat_labels)

    # numeric ops should not succeed
    for name in arithmetic:
        with pytest.raises(TypeError):
            getattr(df, name)(df)

    # reduction ops should not succeed (unless specifically defined, e.g.
    # min/max)
    groups = df['value_group']
    for name in reductions:
        with pytest.raises(TypeError):
            getattr(groups, name)(numeric_only=False)

    # mad technically works because it takes always the numeric data

    # numpy ops
    numbers = Series(Categorical([1, 2, 3, 4]))
    with pytest.raises(TypeError):
        np.sum(numbers)

    # numeric ops on a Series
    for name in arithmetic:
        with pytest.raises(TypeError):
            getattr(numbers, name)(2)

    # invalid ufunc
    with pytest.raises(TypeError):
        np.log(numbers)
def project_participation_evolution(
        pm_frame, all_authors, n=2, skip_anon=True, research_only=False):
    """Assembles data on participation to projects with n as thresh.

    Parameters:
        pm_frame: frame handed to ``get_last`` to obtain one row per
            project.
        all_authors: authors to report when ``research_only`` is false;
            with ``research_only`` the authors are taken from the
            accumulated research-thread authors instead.
        n: minimum number of projects an author must appear in to be
            selected.
        skip_anon: drop the "Anonymous" author when present.
        research_only: use 'research threads' instead of 'all threads'.

    Returns DataFrame, index, selection and title for data
    for use by stacked bar-plot and heatmap functions. The DataFrame has
    one row per author and one boolean column per project.
    """
    # Adjacent literals instead of backslash continuations inside the
    # strings, which leaked the continuation into the titles.
    if not research_only:
        thread_type = 'all threads'
        data, _ = get_last(pm_frame, thread_type)
        all_authors = list(all_authors)
        title = ("Participation per project in Polymath "
                 "(threshold = {})").format(n)
    else:
        thread_type = 'research threads'
        data, _ = get_last(pm_frame, thread_type)
        # A list, not a set: pandas needs ordered column labels.
        all_authors = list(set().union(
            *data['research threads', 'authors (accumulated)']))
        title = ("Participation per project in Polymath "
                 "(threshold = {}, only research-threads)").format(n)
    data.index = data.index.droplevel(1)
    author_project = DataFrame(columns=all_authors)
    accumulated = data[thread_type, 'authors (accumulated)']
    for author in author_project.columns:
        author_project[author] = accumulated.apply(
            lambda project, author=author: author in project)
    author_project = author_project.T
    author_project = author_project.sort_values(by=data.index.tolist(),
                                                ascending=False)
    if skip_anon:
        # errors='ignore': data without anonymous comments is fine.
        author_project = author_project.drop("Anonymous", errors='ignore')
    select = author_project.sum(axis=1) >= n
    return author_project, data.index, select, title
def group_by_booking_date(dataframe: pd.DataFrame):
    """Print the mean 'amount' per creation day and the booking-day grouping.

    ``dataframe`` must provide datetime-like 'creation_date' and
    'booking_date' columns and an 'amount' column. The input frame is not
    modified. Nothing is returned.
    """

    def day_key(stamp):
        return (stamp.year, stamp.month, stamp.day)

    # sort_values returns a new frame, leaving the caller's frame as is.
    by_creation = dataframe.sort_values(by='creation_date', ascending=True)
    creation_days = by_creation['creation_date'].apply(day_key)
    # Select 'amount' before averaging: same output, and non-numeric
    # columns elsewhere in the frame can no longer break the mean.
    print(by_creation.groupby(creation_days)['amount'].mean())
    # NOTE(review): this prints the GroupBy object itself, not an
    # aggregate -- confirm whether an aggregation was intended.
    print(dataframe.groupby(dataframe['booking_date'].apply(day_key)))
def test_left_join_index_multi_match_multiindex(self):
    """Left join on three columns against a MultiIndex with repeated keys.

    Matching left rows are repeated once per right-hand match, unmatched
    rows get NaN; with ``sort=True`` the result equals the unsorted result
    stably sorted by the join keys.
    """
    join_keys = ["cola", "colb", "colc"]

    lhs = DataFrame(
        [
            ["X", "Y", "C", "a"],
            ["W", "Y", "C", "e"],
            ["V", "Q", "A", "h"],
            ["V", "R", "D", "i"],
            ["X", "Y", "D", "b"],
            ["X", "Y", "A", "c"],
            ["W", "Q", "B", "f"],
            ["W", "R", "C", "g"],
            ["V", "Y", "C", "j"],
            ["X", "Y", "B", "d"],
        ],
        columns=join_keys + ["tag"],
        index=[3, 2, 0, 1, 7, 6, 4, 5, 9, 8],
    )

    rhs = DataFrame(
        [
            ["W", "R", "C", 0],
            ["W", "Q", "B", 3],
            ["W", "Q", "B", 8],
            ["X", "Y", "A", 1],
            ["X", "Y", "A", 4],
            ["X", "Y", "B", 5],
            ["X", "Y", "C", 6],
            ["X", "Y", "C", 9],
            ["X", "Q", "C", -6],
            ["X", "R", "C", -9],
            ["V", "Y", "C", 7],
            ["V", "R", "D", 2],
            ["V", "R", "D", -1],
            ["V", "Q", "A", -3],
        ],
        columns=["col1", "col2", "col3", "val"],
    ).set_index(["col1", "col2", "col3"])

    unsorted_expected = DataFrame(
        [
            ["X", "Y", "C", "a", 6],
            ["X", "Y", "C", "a", 9],
            ["W", "Y", "C", "e", np.nan],
            ["V", "Q", "A", "h", -3],
            ["V", "R", "D", "i", 2],
            ["V", "R", "D", "i", -1],
            ["X", "Y", "D", "b", np.nan],
            ["X", "Y", "A", "c", 1],
            ["X", "Y", "A", "c", 4],
            ["W", "Q", "B", "f", 3],
            ["W", "Q", "B", "f", 8],
            ["W", "R", "C", "g", 0],
            ["V", "Y", "C", "j", 7],
            ["X", "Y", "B", "d", 5],
        ],
        columns=join_keys + ["tag", "val"],
        index=[3, 3, 2, 0, 1, 1, 7, 6, 6, 4, 4, 5, 9, 8],
    )

    joined = lhs.join(rhs, on=join_keys, how="left")
    tm.assert_frame_equal(joined, unsorted_expected)

    joined_sorted = lhs.join(rhs, on=join_keys, how="left", sort=True)
    sorted_expected = unsorted_expected.sort_values(join_keys, kind="mergesort")
    tm.assert_frame_equal(joined_sorted, sorted_expected)
def constructEdataFromDataFrame(
        df: pd.DataFrame,
        model: AmiciModel,
        condition: pd.Series,
        by_id: Optional[bool] = False) -> amici.amici.ExpData:
    """
    Constructs an ExpData instance according to the provided Model
    and DataFrame.

    :param df:
        pd.DataFrame with Observable Names/Ids as columns and a 'time'
        column. Standard deviations may be specified by appending '_std'
        as suffix.

    :param model:
        Model instance.

    :param condition:
        pd.Series with FixedParameter Names/Ids as columns.
        Preequilibration conditions may be specified by appending
        '_preeq' as suffix. Presimulation conditions may be specified by
        appending '_presim' as suffix.

    :param by_id:
        Indicate whether in the arguments, column headers are based on ids
        or names. This should correspond to the way `df` and `condition`
        was created in the first place.

    :return:
        ExpData instance.
    """
    edata = amici.ExpData(model.get())

    # Timepoints come from the rows in ascending time order.
    df = df.sort_values(by='time', ascending=True)
    edata.setTimepoints(df['time'].values.astype(float))

    # Collect the non-NaN '_preeq' / '_presim' overrides per parameter.
    overwrite_preeq = {}
    overwrite_presim = {}
    overrides = (('_preeq', overwrite_preeq), ('_presim', overwrite_presim))
    for par in list(_get_names_or_ids(model, 'FixedParameter', by_id=by_id)):
        for suffix, target in overrides:
            entry = par + suffix
            if entry in condition.keys() \
                    and not math.isnan(condition[entry].astype(float)):
                target[par] = condition[entry].astype(float)

    edata.fixedParameters = condition[
        _get_names_or_ids(model, 'FixedParameter', by_id=by_id)
    ].astype(float).values

    def fill_specialized(attribute, overwrite):
        # A specialised vector is only needed when an override differs from
        # the regular value; otherwise the regular vector is copied.
        if any([value != condition[name]
                for name, value in overwrite.items()]):
            setattr(edata, attribute, _get_specialized_fixed_parameters(
                model, condition, overwrite, by_id=by_id))
        elif len(overwrite):
            setattr(edata, attribute,
                    copy.deepcopy(edata.fixedParameters))

    fill_specialized('fixedParametersPreequilibration', overwrite_preeq)
    fill_specialized('fixedParametersPresimulation', overwrite_presim)

    if 't_presim' in condition.keys():
        edata.t_presim = float(condition['t_presim'])

    # Measurements and their standard deviations, per observable.
    observables = _get_names_or_ids(model, 'Observable', by_id=by_id)
    for obs_index, obs in enumerate(observables):
        if obs in df.keys():
            edata.setObservedData(df[obs].values.astype(float), obs_index)
        std_column = obs + '_std'
        if std_column in df.keys():
            edata.setObservedDataStdDev(
                df[std_column].values.astype(float), obs_index)

    return edata
class ParamGA(object):
    """Genetic-algorithm search over a model's hyper-parameters.

    Each individual is a list of parameter values. An individual is scored
    by fitting the model on the training split and computing the MRR of its
    predictions on the dev split.

    Parameters
    ----------
    QID_trn, X_trn, Y_trn : query ids, features and labels of the training
        split.
    QID_dev, X_dev, Y_dev : the same for the dev split; ``X_dev.index`` is
        used as the index of the ranking table.
    model_name : expression naming the model class; resolved with ``eval``
        in this module's namespace.
    param_funcs : dict mapping a parameter name to a dict with callables
        under ``'gen'`` (initial value) and ``'mut'`` (mutated value).
    param_static : dict of fixed keyword arguments passed to every model.
    param_index : dict mapping a parameter name to its position in an
        individual.
    pair_wise : when true, fitting and prediction go through
        ``PairWiseRanker`` instead of the model itself.
    """

    def __init__(self, QID_trn, X_trn, Y_trn, QID_dev, X_dev, Y_dev,
                 model_name='RandomForestClassifier', param_funcs=None,
                 param_static=None, param_index=None, pair_wise=False):
        self.QID_trn, self.X_trn, self.Y_trn = QID_trn, X_trn, Y_trn
        self.QID_dev, self.X_dev, self.Y_dev = QID_dev, X_dev, Y_dev
        self.rank_dev = DataFrame({}, index=self.X_dev.index)
        self.rank_dev['Label'] = self.Y_dev
        self.rank_dev['QID'] = self.QID_dev
        self.counter = itertools.count()
        self.mrrs = dict()
        self.model_name = model_name
        # Fresh dicts per instance: a dict() default in the signature would
        # be shared by every instance created without these arguments.
        self.param_funcs = {} if param_funcs is None else param_funcs
        self.param_static = {} if param_static is None else param_static
        self.param_index = {} if param_index is None else param_index
        self.pair_wise = pair_wise
        if self.pair_wise:
            self.pair_ranker = PairWiseRanker(self.QID_trn, self.X_trn,
                                              self.Y_trn)
            self.pair_ranker.init_predict(self.QID_dev, self.X_dev)
        else:
            self.models = dict()

    def _gen_param(self):
        """Return a new individual with every searched parameter generated."""
        param = [None] * len(self.param_index)
        for pn, func in self.param_funcs.items():
            param[self.param_index[pn]] = func['gen']()
        return param

    def _evaluate(self, indiv):
        """Fit a model for ``indiv`` and return its dev-set MRR as a 1-tuple."""
        # Values are passed as real keyword arguments rather than being
        # rendered into source text, so any object works (not only values
        # whose str() happens to be a valid literal).
        kwargs = {pn: indiv[self.param_index[pn]] for pn in self.param_funcs}
        kwargs.update(self.param_static)
        # SECURITY: model_name is evaluated as code; it must only ever come
        # from trusted configuration, never from external input.
        model = eval(self.model_name)(**kwargs)
        model_idx = next(self.counter)
        if self.pair_wise:
            self.pair_ranker.fit(model, model_idx)
            pred = self.pair_ranker.do_predict(model_idx)
        else:
            model.fit(self.X_trn, self.Y_trn)
            self.models[model_idx] = model
            pred = Series(model.predict_proba(self.X_dev)[:, 1],
                          index=self.X_dev.index)
        # pred must be a Series, not an array, so it aligns on the index
        # even after rank_dev was re-ordered by a previous evaluation.
        self.rank_dev['pred'] = pred
        self.rank_dev.sort_values(['QID', 'pred'], inplace=True,
                                  ascending=False)
        grp = self.rank_dev.Label.groupby(self.rank_dev.QID)
        mrr = MRR(grp, keep_no_ans=False)
        self.mrrs[model_idx] = mrr
        # Single pre-formatted argument: prints the same text on Python 2
        # and Python 3.
        print(' > %s %s %s' % (model_idx, np.round(mrr, 4), indiv))
        return mrr,

    def _mut_indiv(self, indiv, indiv_pb):
        """Mutate each gene of ``indiv`` in place with probability ``indiv_pb``."""
        for pn, func in self.param_funcs.items():
            if random.random() < indiv_pb:
                indiv[self.param_index[pn]] = func['mut']()

    def run(self, NPOP=30, NGEN=10, CXPB=0.5, MUTPB=0.2):
        """Create the GA driver, initialise the population and iterate."""
        self.ga = MyGA(self._gen_param, self._evaluate, self._mut_indiv,
                       CXPB=CXPB, MUTPB=MUTPB)
        self.ga.init_pop(NPOP=NPOP)
        self.ga.iterate(NGEN=NGEN)
# Analyse the users with more than 7 page views.
times = counts1_.index[7:]
bins = [7, 100, 1000, 50000]
cats = pd.cut(times, bins, right=True,
              labels=['8~100', '101~1000', '1000以上'])
e = cats.value_counts()
e = DataFrame(e, columns=[u'用户数'])
e.index.name = u'点击次数'

# In[22]:

# Fill each bucket with the summed user counts taken from `a`.
# `.loc` replaces `.ix`, which was removed in pandas 1.0; every lookup here
# is label based, so the selection is unchanged.
e[u'用户数'] = np.nan
e.loc[u'8~100', u'用户数'] = a.loc[8:100, :][u'用户数'].sum()
e.loc['101~1000', u'用户数'] = a.loc[101:1000, :][u'用户数'].sum()
e.loc['1000以上', u'用户数'] = a.loc[1001:, :][u'用户数'].sum()
e.sort_values(by=u'用户数', ascending=False, inplace=True)
e.reset_index(inplace=True)
e

# In[23]:

#-----* 3 *----- Analyse the behaviour of users with a single page view.
# Read the data from the database in chunks of 10000 rows.
engine = create_engine(
    'mysql+pymysql://root:@127.0.0.1:3306/jing?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)

# In[24]:
df = DataFrame(obs)
num = ['min_exec', 'average', 'max_exec']
# Express every timing column per observation by dividing it by N.
df[num] = df[num].div(df['N'], axis=0)
df.head()

##########################
# Let's compute the gains.

gains = [
    dict(nb=nb, gain=parallized_gain(df[df.nb == nb]))
    for nb in set(df['nb'])
]
dfg = DataFrame(gains)
dfg = dfg.sort_values('nb').reset_index(drop=True).copy()
dfg

##########################################
# Graph.

ax = dfg.set_index('nb').plot()
ax.set_title(
    "Parallelization gain depending\non the number of trees\n(max_depth=6).")

##############################
# That does not answer the question we are looking for
# as we would like to know the best threshold *th*
# which defines the number of observations for which
# we should parallelize. This number depends on the number
# of trees. A gain > 1 means the parallelization should happen
def input_value():
    """Interactively rank news articles by reaction counts on their replies.

    Prompts for the portal ('naver' or 'daum'), the database name, a date
    range and a minimum reply count, loads the article and reply tables
    through ``db``, keeps the articles inside the date range that have more
    than the requested number of replies, and prints the top-ranked ones
    ordered by the summed likes + dislikes of their replies.

    Returns:
        (result_df, reply_df): the ranked articles (one row per article
        with 'R_count', 'R_Like', 'R_Bad' and 'R_Like+Bad') and the reply
        table with the added 'R_Like+Bad' column and truncated
        'reply_date'.

    Raises:
        ValueError: if the portal is not 'naver' or 'daum', or a numeric
            prompt is answered with a non-integer.
    """
    mode = input("naver or daum ?")
    if mode not in ('naver', 'daum'):
        # Previously an unknown mode only failed much later with a
        # NameError on start_date, after the tables had been loaded.
        raise ValueError("mode must be 'naver' or 'daum', got %r" % (mode,))
    database = input("Database ? ")
    start_year = input("Start Year ? ")
    start_month = input("Start Month ? ")
    start_day = input("Start Day ? ")
    end_year = input("End Year ? ")
    end_month = input("End Month ? ")
    end_day = input("End Day ? ")
    reply_num = int(input("댓글 수 몇 개 이상 ? "))

    # The handle itself is not used below; the call is kept because
    # read_by_table presumably relies on the connection it sets up
    # (TODO confirm against the db module).
    conn = db.make_connect(mode + '_' + database)
    news_df = DataFrame(db.read_by_table(mode + "_articles"))
    reply_df = DataFrame(db.read_by_table(mode + "_replies"))

    news_df.sort_values(by=['article_date'], axis=0, inplace=True)
    news_df = news_df[news_df.article_date != '-']
    news_df = news_df.dropna()

    # Strip the portal-specific prefixes from the date strings. The
    # replacements are literal: with regex matching (the default before
    # pandas 2.0) the '.' pattern would match every character and turn the
    # whole date into dashes.
    dates = news_df['article_date']
    if mode == 'naver':
        dates = dates.str.replace('최종수정 ', '', regex=False)
        dates = dates.str.replace('.', '-', regex=False)
        separator = '-'
    else:
        dates = dates.str.replace('수정 ', '', regex=False)
        dates = dates.str.replace('입력 ', '', regex=False)
        separator = '.'
    news_df['article_date'] = dates
    start_date = separator.join([start_year, start_month, start_day])
    end_date = separator.join([end_year, end_month, end_day])

    # NOTE(review): the range check compares strings, so month and day
    # must be typed in the same zero-padded form the stored dates use.
    news_df = news_df[news_df.article_date > start_date]
    news_df = news_df[news_df.article_date < end_date]

    reply_df['R_Like+Bad'] = reply_df['R_Like'] + reply_df['R_Bad']
    reply_df['reply_date'] = reply_df['reply_date'].str[:16]

    # Articles with more than reply_num replies.
    group_reply_df = reply_df.groupby('Article_ID').size().to_frame('R_count')
    group_reply_df = group_reply_df[group_reply_df['R_count'] > reply_num]
    group_reply_df = group_reply_df.reset_index()
    news_df = news_df[news_df['Article_ID'].isin(group_reply_df['Article_ID'])]

    # Per-article reaction totals.
    reaction_cols = ['Article_ID', 'R_Like', 'R_Bad', 'R_Like+Bad']
    temp_reply_df = reply_df[reaction_cols].groupby('Article_ID').sum()
    temp_news_df = news_df[['Article_ID', 'Title', 'article_date']]

    result_df = pd.merge(temp_news_df, group_reply_df, on='Article_ID')
    rank_value = int(input("상위 랭크 ? "))
    result_df = pd.merge(result_df, temp_reply_df, on='Article_ID')

    # Sort by likes + dislikes and print the requested number of rows.
    result_df = result_df.sort_values(by=['R_Like+Bad'], ascending=False)
    print('댓글 수가 ' + str(reply_num) + ' 개 이상인 기사들의 총 개수 : '
          + str(len(result_df)))
    print(result_df[0:rank_value])
    return result_df, reply_df
covid_pcr.append(result2) elif (result1 is not None and result2 is None): covid_pcr.append(result1) print len(covid_pcr) inputDF['covid_pcr'] = covid_pcr # threshold = len(inputDF)*0.7 print "before drop " + str(len(inputDF.columns.values)) inputDF = inputDF.dropna(axis=1, how='all') print "after drop " + str(len(inputDF.columns.values)) inputDF = inputDF.dropna(axis=0, how='any', thresh=40) threshold = len(inputDF) * 0.7 inputDF = inputDF.dropna(axis=1, how='any', thresh=threshold) percent_missing = inputDF.isnull().sum() * 100 / len(inputDF) missing_value_df = DataFrame({ 'column_name': inputDF.columns, 'percent_missing': percent_missing }) missing_value_df.sort_values('percent_missing', inplace=True) inputDF = inputDF.dropna(axis=0, how='any', thresh=len(inputDF.columns.values) * 0.8) inputDF.to_sql('treated_dataset', con=dbconn.connection, index=False)
'''
   D  C  B  A
b  2  3  1  0
a  6  7  5  4
'''

# Sorting a DataFrame by the values of its columns.
# print is called with a single parenthesised argument so the script runs
# unchanged on Python 2 and Python 3.
print('DataFrame按列的值排序')
frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
print(frame)
'''
   a  b
0  0  4
1  1  7
2  0 -3
3  1  2
'''
print(frame.sort_values(by='b'))  # sort the rows by the values of column b
'''
   a  b
2  0 -3
3  1  2
0  0  4
1  1  7
'''
print(frame.sort_values(by=['a', 'b']))  # sort by column a first, then by b
'''
   a  b
2  0 -3
0  0  4
3  1  2
1  1  7
'''
def main(offset=0):
    """Screen stocks on their 10/20-day moving averages and write a CSV.

    The reference trading date is taken from the daily rows of
    '000001.SZ' (newest first); ``offset`` selects how many trading days
    to step back from the newest one. For every stock the basic
    attributes, the two moving averages and their day-over-day slopes
    (percent), the last close, the intraday change, the joined concept
    names and the circulating market value are collected. Stocks are kept
    when MA10 > MA20, the last close is below MA10 and the intraday change
    is positive; they are ordered by descending MA20 slope and written to
    '../../logs/<date>@MA_10_20.csv'.

    Args:
        offset: index into the newest-first list of trading dates.

    Raises:
        IndexError: if fewer than ``offset + 1`` daily rows exist for the
            reference stock.
    """
    daily001 = main_session.query(models.DailyPro).filter(
        models.DailyPro.ts_code == '000001.SZ').order_by(
            models.DailyPro.trade_date.desc()).all()
    LAST_MARKET_DATE = daily001[offset].trade_date

    # Rows are gathered as plain dicts and turned into a frame once;
    # growing a DataFrame cell by cell copies it on every new row.
    rows = []
    for i, stock_basic in enumerate(
            main_session.query(models.StockBasicPro).all()):
        # Registered before filling so a stock that fails half-way still
        # contributes its partial row, as it did before.
        row = {}
        rows.append(row)
        try:
            for key in models.StockBasicPro.keys:
                row[key] = getattr(stock_basic, key)

            daily = main_session.query(models.DailyPro).filter(
                models.DailyPro.ts_code == stock_basic.ts_code,
                models.DailyPro.trade_date <= LAST_MARKET_DATE).order_by(
                    models.DailyPro.trade_date.desc()).limit(
                        sampling_count).all()
            ma_10 = api.daily_close_ma(daily=daily, step=10)
            ma_20 = api.daily_close_ma(daily=daily, step=20)
            row[COL_MA_10] = ma_10[0]
            row[COL_MA_20] = ma_20[0]
            row[COL_MA_10_SLOPE] = round((ma_10[0] / ma_10[1] - 1) * 100, 2)
            row[COL_MA_20_SLOPE] = round((ma_20[0] / ma_20[1] - 1) * 100, 2)
            row[COL_LASTPRICE] = daily[0].close
            row[COL_INDAY_CHG] = round(daily[0].close - daily[0].open, 2)

            cons = main_session.query(models.ConceptPro).join(
                models.ConceptDetailPro,
                models.ConceptPro.code == models.ConceptDetailPro.code).filter(
                    models.ConceptDetailPro.ts_code ==
                    stock_basic.ts_code).all()
            # Every name keeps its trailing ', ' so the CSV text is the
            # same as before.
            row['concept'] = ''.join(
                '{c}, '.format(c=con.name) for con in cons)

            daily_basic = main_session.query(models.DailyBasicPro).filter(
                models.DailyBasicPro.ts_code == stock_basic.ts_code).first()
            if daily_basic:
                row['circ_mv'] = '{}亿'.format(
                    round(daily_basic.circ_mv / 10000, 2))
        except Exception as e:
            # Best effort: one bad stock must not abort the whole scan,
            # but the cause is reported instead of being discarded.
            print('exception in index:{index} {code} {name}: {error!r}'.format(
                index=i, code=stock_basic.ts_code, name=stock_basic.name,
                error=e))
            continue
        print('##### {i} #####'.format(i=i))

    data_frame = DataFrame(rows)
    data_frame = data_frame[(data_frame[COL_MA_10] > data_frame[COL_MA_20])
                            & (data_frame[COL_LASTPRICE] < data_frame[COL_MA_10])
                            & (data_frame[COL_INDAY_CHG] > 0)]
    data_frame = data_frame.sort_values(
        by=COL_MA_20_SLOPE, ascending=False).reset_index(drop=True)
    data_frame = data_frame.loc[:, [
        'ts_code', 'name', 'industry', COL_LASTPRICE, 'concept', 'circ_mv'
    ]]

    file_name = '../../logs/{date}@MA_10_20.csv'.format(date=LAST_MARKET_DATE)
    with open(file_name, 'w', encoding='utf8') as file:
        data_frame.to_csv(file)
# Interactive walkthrough of pandas sorting and ranking. Every bare
# expression below is meant to be displayed by a REPL/notebook; as a plain
# script these lines compute a value and discard it.

# `frame` here is whatever was defined before this snippet.
type(frame.e)

# --- Sorting by index labels ---
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()

frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                     index=['three', 'one'],
                     columns=['d', 'a', 'b', 'c'])
frame.sort_index()  # default axis: row labels
frame.sort_index(axis=1)  # column labels
frame.sort_index(axis=1, ascending=False)  # column labels, descending

# --- Sorting by values ---
obj = pd.Series([4, 7, -3, 2])
obj.sort_values()

# Same call on data that contains missing values.
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()

frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame
frame.sort_values(by='b')  # one sort key
frame.sort_values(by=['a','b'])  # two sort keys: a first, then b

# --- Ranking ---
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()
obj.rank(method='first')
obj.rank(ascending=False, method='max') # method is tie-breaking method

frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
                      'c': [-2, 5, 8, -2.5]})
frame
frame.rank(axis='columns')  # rank across the columns of each row

# --- Index with repeated labels ---
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj
obj.index.is_unique
def extract_preds_from_test_set(gen, model, reset_data=None):
    """
    Run a model over one full pass of a sequential test-set generator.

    Batches are pulled from `gen` until a sample index shows up a second
    time, which means the generator has wrapped around to the start of the
    test set. Repeated samples are then dropped and the rows are ordered by
    sample index.

    :param gen: generator object that returns a batch of
        (test set samples, test set targets, sample indices). Generator
        should come from train.gen_seq_scans(..., test_set=True) so that
        batches are created sequentially. The indices of a batch must be a
        list (or another sequence that can extend a list).
    :param model: keras.models.Sequential model object
    :param reset_data: pandas.DataFrame with data from which generator came
        from, use to determine when to reset model states: the model is
        reset whenever the `pos` value at the start of a batch is not
        exactly one more than the `pos` value at the end of the previous
        batch. Use only with a stateful model.
    :return: pandas.DataFrame with columns 'y' (original targets), 'preds'
        (model predictions) and 'idx' (sample indices), one row per unique
        index, sorted by 'idx'.
    """
    y = list()
    preds = list()
    idx = list()
    # Every index collected so far; keeps the wrap-around check proportional
    # to the batch size instead of rebuilding a set of all indices per batch.
    seen_idx = set()
    proceed = True
    batch_ctr = 0
    prev_end_idx = None

    while proceed:
        if (batch_ctr % 10 == 0) and (batch_ctr > 0):
            print('batch number {0}, total predictions made = {1}'.format(
                batch_ctr, len(y)))

        x_batch, y_batch, idx_batch = next(gen)

        # if using a stateful model and there's a discontinuity between
        # consecutive batches, reset model before predicting.
        if (reset_data is not None) and (batch_ctr > 0):
            batch_step = (reset_data.pos.iloc[idx_batch[0]]
                          - reset_data.pos.iloc[prev_end_idx])
            if batch_step != 1:
                print('batch discontinuity found. resetting model state.')
                model.reset_states()
        # store end index from current batch.
        prev_end_idx = idx_batch[-1]

        preds_batch = model.predict(x_batch, batch_size=x_batch.shape[0])

        y += y_batch.ravel().tolist()
        preds += preds_batch.ravel().tolist()
        idx += idx_batch
        seen_idx.update(idx_batch)

        # A repeated index means the batches have wrapped to the beginning
        # of the test set, i.e. the entire test set has been covered.
        proceed = len(seen_idx) == len(idx)
        batch_ctr += 1

    # remove duplicate predictions and arrange predictions in genome order.
    preds_df = DataFrame({'y': y, 'preds': preds, 'idx': idx})
    preds_df.drop_duplicates(subset=['idx'], inplace=True)
    preds_df.sort_values(['idx'], inplace=True)
    preds_df.reset_index(drop=True, inplace=True)
    return preds_df
def summary(self, metadata=None, lower=None, upper=None, sort=None):
    """Summarize the NEPC model.

    Prints the following information:
        - Number of cross sections in the model
        - Number of cross sections matching metadata, if provided

    Returns a stylized Pandas dataframe with headers given by:

    headers = ["cs_id", "specie", "lhsA", "rhsA", "process",
               "reaction", "threshold", "E_peak", "E_upper",
               "sigma_max", "lpu", "upu"]

    Parameters
    ----------
    metadata: dict
        see :attr:`.CS.metadata`
    lower : int
        lower bound of model index to include in summary (defaults to 0)
    upper : int
        upper bound of model index to include in summary; the bound is
        inclusive because rows are selected by label (defaults to the
        number of rows)
    sort : list[str]
        headers by which the stylized Pandas table is sorted; ``None`` or
        an empty list keeps the model order

    Returns
    -------
    cs_df : pandas.io.formats.style.Styler
        A stylized Pandas DataFrame containing, for each cross section in
        the model (or subset of the model if :obj:`metadata` is provided),
        the cs_id, species, process and reaction, the threshold, the energy
        of the peak (E_peak), the highest energy with a nonzero cross
        section (E_upper), the peak cross section (sigma_max) and the
        lpu/upu values. Energies are scaled by ``units_e`` and the cross
        section by ``units_sigma``.

    Raises
    ------
    ValueError
        Raised by numpy if a cross section has no nonzero sigma value.
    """
    headers = ["cs_id", "specie", "lhsA", "rhsA", "process",
               "reaction", "threshold", "E_peak", "E_upper",
               "sigma_max", "lpu", "upu"]

    print('Number of cross sections in model: {:d}'.format(len(self.cs)))
    if metadata is not None:
        cs_subset = self.subset(metadata=metadata)
        print('Number of cross sections with '
              'matching metadata: {:d}'.format(len(cs_subset)))
    else:
        cs_subset = self.cs

    summary_list = []
    for cs in cs_subset:
        # Column 0 holds the energies, column 1 the cross sections.
        csdata = np.array(list(zip(cs.data['e'], cs.data['sigma'])))
        e_peak = csdata[np.argmax(csdata[:, 1]), 0]
        cs_peak_sigma = np.max(csdata[:, 1])
        # Highest energy at which the cross section is still nonzero.
        e_upper = np.max(csdata[csdata[:, 1] != 0.0][:, 0])
        units_e = cs.metadata["units_e"]
        summary_list.append([cs.metadata["cs_id"],
                             cs.metadata["specie"],
                             cs.metadata["lhsA"],
                             cs.metadata["rhsA"],
                             cs.metadata["process"],
                             reaction_latex(cs),
                             units_e*cs.metadata["threshold"],
                             units_e*e_peak,
                             units_e*e_upper,
                             cs.metadata["units_sigma"]*cs_peak_sigma,
                             cs.metadata["lpu"],
                             cs.metadata["upu"]])

    cs_df = DataFrame(summary_list, columns=headers)
    if sort:
        cs_df = (cs_df.sort_values(by=sort)
                 .reset_index(drop=True))

    if upper is None:
        upper = len(cs_df)
    if lower is None:
        lower = 0

    return (cs_df.loc[lower:upper]
            .style
            .background_gradient(subset=['threshold', 'E_peak', 'E_upper',
                                         'sigma_max', 'lpu', 'upu'],
                                 cmap='plasma')
            .highlight_null('red'))
def assert_frame_equal(cls, left: pd.DataFrame, right: pd.DataFrame,
                       *args: Any, **kwargs: Any) -> None:
    """Compare two frames after sorting both by their shared columns.

    Both frames are sorted by the same list of column labels (those present
    in ``left`` and in ``right``) before being handed, with all extra
    arguments, to the parent class's ``assert_frame_equal``.
    """
    shared = set(left.columns).intersection(set(right.columns))
    columns = list(shared)
    left_sorted = left.sort_values(by=columns)
    right_sorted = right.sort_values(by=columns)
    return super().assert_frame_equal(left_sorted, right_sorted,
                                      *args, **kwargs)
from pandas import Series, DataFrame

data = {
    '语文': [66, 95, 98, 90, 80],
    '数学': [65, 76, 86, 88, 90],
    '英语': [30, 98, 88, 77, 90]
}
students = ['张飞', '关羽', '刘备', '典韦', '许褚']
subjects = ['语文', '数学', '英语']

# df holds the three subject scores; df1 additionally has a total column,
# which starts out empty and is then filled with the row sums.
df = DataFrame(data, index=students, columns=subjects)
df1 = DataFrame(data, index=students, columns=subjects + ['总计'])
df1['总计'] = df1.sum(axis=1)

# Per-subject statistics, each followed by a separator line.
separator = "============================================"
statistics = (
    ('平均分\n', 'mean'),
    ('最小成绩\n', 'min'),
    ('最大成绩\n', 'max'),
    ('方差\n', 'var'),
    ('标准差\n', 'std'),
)
for title, method_name in statistics:
    print(title, getattr(df, method_name)())
    print(separator)

# Print the students ranked by total score, highest first.
print('总成绩排名如下:')
ranking = df1.sort_values('总计', ascending=False)
print(ranking)

# In[ ]:
def _with_csv_suffix(name):
    """Return name unchanged if it already contains '.csv', else add '.csv'."""
    return name if '.csv' in name else name + '.csv'


def _read_dup_list(path):
    """Return the lines of a duplicate-identity list file ([] if unreadable)."""
    try:
        with open(path, 'r', encoding='utf-8') as handle:
            return handle.read().split('\n')
    except (OSError, UnicodeError):
        # The list files are optional: report the problem and continue.
        print(path + ' 불러오기 실패')
        return []


def _merge_dup_groups(values, dup_lines):
    """Collapse identities that a duplicate list declares to be one person.

    Each non-empty line not starting with '#' is tab separated; its first
    field is skipped and the remaining fields form one group. Group members
    are removed from ``values`` and the group is appended as a single
    tab-joined entry. Returns the new list.
    """
    for line in dup_lines:
        if line == '' or line[0] == '#':
            continue
        group = line[line.find('\t') + 1:]
        members = group.split('\t')
        values = [value for value in values if value not in members]
        values.append(group)
    return values


def _extend_unique(target, extra):
    """Append to ``target`` the items of ``extra`` it does not contain yet."""
    for item in extra:
        if item not in target:
            target.append(item)


def arrange(filename, savename):
    """Aggregate scraped post/comment CSV data per author and save a ranking.

    The input CSV is treated as comment data when it has a 'Cmt ID' column
    and as post data otherwise. When the script was started with more than
    four command-line arguments and '-a' / '--a' as the first one,
    ``sys.argv[3:-1]`` are merged in as additional input files and the last
    argument replaces ``savename``.

    Authors are grouped in this order, and rows already attributed to a
    group are not counted again by later groups:

    1. account holders (HasAccount == 1) by ID, including anonymous rows
       whose IPID is listed in the same group,
    2. anonymous 'ㅇㅇ' rows on the shared mobile IP ranges of each carrier,
    3. remaining anonymous 'ㅇㅇ' rows per IP,
    4. anonymous rows per nickname (only when post data is present).

    Groups listed in the optional files 'dup_list_id.txt',
    'dup_list_ip.txt' and 'dup_list_nick.txt' (current directory) are
    counted as one author. Columns without data and rows with an empty
    IPID are dropped; the result is sorted by 'Posts' (or by 'Comments'
    when there are no posts), highest first, and written to ``savename``
    as UTF-8 with BOM.

    Args:
        filename: input CSV path; '.csv' is appended when missing.
        savename: output CSV path; '.csv' is appended when missing.
    """
    print('글 정렬 작업 준비중...')
    sys.stdout.flush()
    from pandas import DataFrame, read_csv, concat

    filename = _with_csv_suffix(filename)

    # Shared mobile IP prefixes per carrier.
    sk_ips = (['203.226', '211.234']
              + ['223.%d' % n for n in range(32, 64)]
              + ['27.%d' % n for n in range(160, 184)])
    kt_ips = [
        '39.7', '110.70', '175.223', '211.246', '118.235', '175.252',
        '175.253', '175.254', '175.255'
    ]
    # NOTE(review): '211.234' is also in the SK list, which is processed
    # first, so it never matches here.
    lg_ips = ['61.43', '211.234', '117.111', '211.36', '106.101', '106.102']

    # Load the first CSV and decide whether it holds posts or comments.
    data = read_csv(filename, dtype={'IPID': str})
    cdata = DataFrame()
    haspostdata = False
    hascmtdata = False
    if 'Cmt ID' in data:
        cdata = data
        data = DataFrame()
        hascmtdata = True
    else:
        haspostdata = True

    if len(sys.argv) > 4 and (sys.argv[1] == "-a" or sys.argv[1] == "--a"):
        print('2개 이상의 데이터가 발견되었습니다')
        sys.stdout.flush()
        savename = sys.argv[len(sys.argv) - 1]
        for i in range(3, len(sys.argv) - 1):
            print(i - 1, '번째 파일 병합중...')
            sys.stdout.flush()
            newd = read_csv(_with_csv_suffix(sys.argv[i]),
                            dtype={'IPID': str})
            if 'Cmt ID' in newd:
                cdata = concat([cdata, newd])
                hascmtdata = True
            else:
                data = concat([data, newd])
                haspostdata = True

    savename = _with_csv_suffix(savename)

    # Split into account holders ("gonic") and anonymous users ("udong").
    if haspostdata:
        gonic = data[data['HasAccount'] == 1]
        udong = data[data['HasAccount'] == 0]
    if hascmtdata:
        cgonic = cdata[cdata['HasAccount'] == 1]
        cudong = cdata[cdata['HasAccount'] == 0]

    # Identities to aggregate: account IDs, anonymous IPs, anonymous nicks.
    idList, ipList, unickList = [], [], []
    if haspostdata:
        idList = gonic.IPID.unique().tolist()
        ipList = udong.IPID.unique().tolist()
        unickList = udong.Nickname.unique().tolist()
    if hascmtdata:
        _extend_unique(idList, cgonic.IPID.unique().tolist())
        _extend_unique(ipList, cudong.IPID.unique().tolist())
        _extend_unique(unickList, cudong.Nickname.unique().tolist())

    dup_list_id = _read_dup_list('dup_list_id.txt')
    dup_list_ip = _read_dup_list('dup_list_ip.txt')
    dup_list_nick = _read_dup_list('dup_list_nick.txt')
    idList = _merge_dup_groups(idList, dup_list_id)
    ipList = _merge_dup_groups(ipList, dup_list_ip)
    unickList = _merge_dup_groups(unickList, dup_list_nick)

    def column_sum(frame, column):
        # A statistic exists only if the post data has that column.
        return frame[column].sum() if column in data.columns else None

    def make_row(posts, cmts, has_account, nick_label=None):
        # Build one output record from the selected posts and comments.
        # With nick_label the nickname is fixed; otherwise it is the list
        # of distinct nicknames found in the selection.
        names, identities = [], []
        if haspostdata:
            identities = posts.IPID.unique().tolist()
            if nick_label is None:
                names = posts.Nickname.unique().tolist()
        if hascmtdata:
            _extend_unique(identities, cmts.IPID.unique().tolist())
            if nick_label is None:
                _extend_unique(names, cmts.Nickname.unique().tolist())
        return {
            'Nick': ' '.join(names) if nick_label is None else nick_label,
            'IPID': ' '.join(identities),
            'Posts': posts.shape[0] if haspostdata else None,
            'Upvotes': column_sum(posts, 'Upvotes'),
            'Downvotes': column_sum(posts, 'Downvotes'),
            'Comments': len(cmts) if hascmtdata else None,
            'Views': column_sum(posts, 'Views'),
            'HasAccount': has_account,
        }

    # Records are collected in a list and converted once at the end;
    # DataFrame.append was removed in pandas 2.0.
    rows = []

    print('고닉 글 집계중...')
    sys.stdout.flush()
    ################### account holders ###################
    for ids in idList:
        members = ids.split('\t')
        posts, cmts = DataFrame(), None
        if haspostdata:
            # Anonymous rows whose IP is listed with this account count too.
            anon_posts = udong[udong['IPID'].isin(members)]
            posts = concat([gonic[gonic['IPID'].isin(members)], anon_posts])
        if hascmtdata:
            anon_cmts = cudong[cudong['IPID'].isin(members)]
            cmts = concat([cgonic[cgonic['IPID'].isin(members)], anon_cmts])
        # The comment count is the size of the combined selection; the
        # anonymous part used to be added a second time on top of it,
        # unlike the post count.
        rows.append(make_row(posts, cmts, 1))
        # Remove the attributed anonymous rows so they are not counted again.
        if haspostdata:
            udong = udong.drop(anon_posts.index)
        if hascmtdata:
            cudong = cudong.drop(anon_cmts.index)

    print('유동 글 집계중...')
    sys.stdout.flush()
    ################### anonymous 'ㅇㅇ' users ###################
    # First one bucket per carrier for the shared mobile IPs, then one
    # bucket per remaining IP (or listed IP group).
    anonymous_groups = [
        ('ㅇㅇ(SK통피)', sk_ips),
        ('ㅇㅇ(KT통피)', kt_ips),
        ('ㅇㅇ(U+통피)', lg_ips),
    ]
    anonymous_groups += [('ㅇㅇ', ips.split('\t')) for ips in ipList]
    for label, members in anonymous_groups:
        posts, cmts = DataFrame(), None
        if haspostdata:
            posts = udong[(udong['IPID'].isin(members))
                          & (udong['Nickname'] == 'ㅇㅇ')]
        if hascmtdata:
            cmts = cudong[(cudong['IPID'].isin(members))
                          & (cudong['Nickname'] == 'ㅇㅇ')]
        rows.append(make_row(posts, cmts, 0, nick_label=label))
        if haspostdata:
            udong = udong.drop(posts.index)
        if hascmtdata:
            cudong = cudong.drop(cmts.index)

    print('닉유동 글 집계중...')
    sys.stdout.flush()
    ################### anonymous users with a nickname ###################
    # NOTE(review): as before, these rows are only produced when post data
    # is present; with comment-only input they are left out. Confirm that
    # this is intended.
    if haspostdata:
        for nick_group in unickList:
            members = nick_group.split('\t')
            posts = udong[udong['Nickname'].isin(members)]
            cmts = None
            if hascmtdata:
                cmts = cudong[cudong['Nickname'].isin(members)]
            rows.append(make_row(posts, cmts, 0))

    print('작업 마무리중...')
    sys.stdout.flush()
    res = DataFrame(rows, columns=[
        'Nick', 'IPID', 'Posts', 'Upvotes', 'Downvotes', 'Comments', 'Views',
        'HasAccount'
    ])
    # Drop statistics that were not available, then incomplete rows.
    res = res.dropna(axis=1)
    res = res.dropna(axis=0)
    res = res[res['IPID'] != '']
    # Rank by posts, or by comments when there is no post data. The stable
    # sort keeps tied authors in collection order.
    if 'Posts' in res:
        res = res.sort_values(by='Posts', ascending=False, kind='mergesort')
    elif 'Comments' in res:
        res = res.sort_values(by='Comments', ascending=False,
                              kind='mergesort')

    res.to_csv(savename, encoding='utf-8-sig', index=False)
    print(savename, '로 저장되었습니다.')
    sys.stdout.flush()
elif frame.iat[a, 2] < frame.iat[b, 2]: frame.iat[a, 2] -= [2] frame.iat[b, 2] += [2] else: frame.iat[a, 2] += [2] frame.iat[b, 2] -= [2] if frame.iat[a, 3] + frame.iat[b, 3] == 3: if frame.iat[a, 2] < frame.iat[b, 2]: frame.iat[a, 2] -= [1] else: frame.iat[b, 2] -= [1] if frame.iat[a, 3] + frame.iat[b, 3] == 4: frame.iat[a, 2] -= [2] frame.iat[b, 2] -= [2] cgn = frame[frame['life'] < 0] pgn = frame.sort_values(by='life', ascending=False) if len(cgn.index) == 0: pass else: for x, y in cgn.index, pgn.index: frame.iat[x, 4] = pgn.iat[y, 4] frame.iat[x, 2] = 10 c += 1 counter = frame['type'].value_counts() counter.name = c recorder[c] = counter if c == 10: d = 1 recorder.plot() plt.show()
def test_astype_categorical_to_other(self):
    """Check ``astype`` conversions of a categorical Series to other dtypes."""
    rng = np.random.RandomState(0)
    df = DataFrame({"value": rng.randint(0, 10000, 100)})
    labels = [f"{i} - {i + 499}" for i in range(0, 10000, 500)]
    cat_labels = Categorical(labels, labels)

    df = df.sort_values(by=["value"], ascending=True)
    df["value_group"] = pd.cut(df.value,
                               range(0, 10500, 500),
                               right=False,
                               labels=cat_labels)
    binned = df["value_group"]

    # Casting a categorical to category is a no-op.
    expected = binned
    tm.assert_series_equal(binned.astype("category"), expected)
    tm.assert_series_equal(binned.astype(CategoricalDtype()), expected)

    # The bin labels are not numeric, so a float cast must fail.
    msg = r"could not convert string to float|invalid literal for float\(\)"
    with pytest.raises(ValueError, match=msg):
        binned.astype("float64")

    letters = Series(Categorical(["a", "b", "b", "a", "a", "c", "c", "c"]))
    letters_as_str = Series(["a", "b", "b", "a", "a", "c", "c", "c"])
    tm.assert_series_equal(letters.astype("str"), letters_as_str)

    digits = Series(Categorical(["1", "2", "3", "4"]))
    digits_as_int = Series([1, 2, 3, 4]).astype(int)
    tm.assert_series_equal(digits.astype("int"), digits_as_int)

    # object don't sort correctly, so just compare that we have the same
    # values
    def assert_same_unique_values(a, b):
        tm.assert_almost_equal(np.sort(np.unique(a)), np.sort(np.unique(b)))

    expected = Series(np.array(binned.values), name="value_group")
    assert_same_unique_values(binned.astype("object"), expected)
    assert_same_unique_values(binned.astype(np.object_), expected)

    # array conversion
    tm.assert_almost_equal(np.array(binned), np.array(binned.values))

    tm.assert_series_equal(binned.astype("category"), binned)
    tm.assert_series_equal(binned.astype(CategoricalDtype()), binned)

    # A round trip through object gives sorted categories with the unused
    # ones removed.
    sorted_categories = binned.cat.categories.sort_values()
    roundtrip_expected = binned.cat.set_categories(
        sorted_categories).cat.remove_unused_categories()
    as_object = binned.astype("object")
    tm.assert_series_equal(as_object.astype("category"), roundtrip_expected)
    tm.assert_series_equal(
        binned.astype("object").astype(CategoricalDtype()),
        roundtrip_expected)

    # invalid conversion (these are NOT a dtype)
    msg = ("dtype '<class 'pandas.core.arrays.categorical.Categorical'>' "
           "not understood")
    with pytest.raises(TypeError, match=msg):
        binned.astype(Categorical)
    with pytest.raises(TypeError, match=msg):
        binned.astype("object").astype(Categorical)
from pandas import DataFrame

# One record per stock: code, name, current price.
data = [
    ["037730", "3R", 1510],
    ["036360", "3SOFT", 1790],
    ["005760", "ACTS", 1185],
]
columns = ["종목코드", "종목명", "현재가"]

# Build the table and use the stock code as the row label.
df = DataFrame(data=data, columns=columns).set_index('종목코드')

# Order the rows by current price, lowest first.
price_column = '현재가'
df2 = df.sort_values(by=price_column)
print(df2)
oldest = titanic_df['Age'].max() fig.set(xlim=(0,oldest)) fig.add_legend() # Drop Null values from the Cabin column using dropna deck = titanic_df['Cabin'].dropna() print deck.head() # Cabin levels are categorized A,B,C,D,E,F,G, so only need first letter levels = [] for level in deck: levels.append(level[0]) # append the first letter print levels cabin_df = DataFrame(levels) cabin_df.columns = ['Cabin'] sns.factorplot(x='Cabin',kind='count',data=cabin_df.sort_values(by='Cabin',ascending=1),palette='winter_d') # but produces a T cabin... # Creating a new table dropping cases containing specific values cabin_df = cabin_df[cabin_df.Cabin != 'T'] sns.factorplot(x='Cabin',kind='count',data=cabin_df.sort_values(by='Cabin',ascending=1),palette='winter_d') # but produces a T cabin... # Factorplot to see how two categorical variables are related sns.factorplot(x='Embarked',kind='count',data=titanic_df.sort_values(by='Pclass'),hue='Pclass') # Calculate values in put in a new column (whether alone or with family) based on existing column values def alone_or_fam(passenger): sib, par = passenger if sib==0 and par==0: with_fam = 'Alone' else:
# Split the contigs into those without any SNP and those with at least one.
print('find unreped contigs')
has_snp = contig_dat['Contig'].isin(snp_df['Contig'])
unreped_contigs = contig_dat[~has_snp]

# Write the contigs without SNPs to a file.
print('writing unreped contigs')
unreped_contigs.to_csv('contigs_with_no_snps.tsv', sep='\t', index=False)

# Contigs that are represented by at least one SNP.
reped_contigs = contig_dat[has_snp]

# Build the dictionary of SNP locations.
contig_hit_dict = snp_pos_dictonary(reped_contigs, snp_df)

# Locate the gaps in the contigs.
# there are 10,432 regions.
print('scanning for gaps')
gaps_in_contigs = find_gaps(contig_hit_dict, reped_contigs)
gap_columns = ['Contig', 'leading_position', 'trailing_position']
gaps_dataframe = DataFrame(gaps_in_contigs, columns=gap_columns)

# Attach the size data of each contig, then put the columns in report order.
gaps_dataframe = gaps_dataframe.merge(contig_dat)
report_columns = ['Contig', 'Size', 'Size_rank',
                  'leading_position', 'trailing_position']
gaps_dataframe = gaps_dataframe[report_columns]

gaps_dataframe = gaps_dataframe.sort_values(['Size_rank', 'leading_position'])
gaps_dataframe.to_csv('locations_of_gaps_in_coverage.tsv',
                      sep='\t', index=False)
def f1(df: pd.DataFrame) -> pd.DataFrame:
    """Return a one-row frame: the first row of ``df`` after sorting by ``b``.

    The sort is ascending, so the row kept is the one with the smallest
    ``b``. An empty input yields an empty frame.
    """
    ordered = df.sort_values(by="b")
    return ordered.iloc[:1]
def aggregate_aligned_column_sims(
    aggsim: DataFrame,
    tableid_colids: Dict[int, Set[int]],
    align_columns: str = "greedy",
    align_width_norm: str = "jacc",
    align_use_total_width: bool = True,
) -> DataFrame:
    """Collapse column-pair similarities into one score per table pair.

    Two decisions shape the result.

    *Column alignment* (``align_columns``):

    - ``'max1'`` / ``'max2'``: fast path. For every column of the first
      (``'max1'``) or second (``'max2'``) table keep its best score against
      the other table, then average those maxima per table pair. Several
      columns may therefore match the same column on the other side.
    - ``'greedy'``: each table pair is aggregated with ``greedy_align``
      after sorting the scores in descending order, giving a kind of
      soft-Jaccard total that is then normalised by table width.

    *Width normalisation* (``align_width_norm``, greedy mode only), with
    ``n1``/``n2`` the widths of the two tables:

    - ``'jacc'``: ``total / (n1 + n2 - total)``
    - ``'wide'``: ``total / max(n1, n2)``
    - ``'narrow'``: ``total / min(n1, n2)``

    With ``align_use_total_width=True`` the widths are the full column
    counts from ``tableid_colids``, so columns without any score count as
    non-matching. With ``False`` the widths are derived from the scored
    column pairs only.

    Args:
        aggsim: Column similarities (aggregated match scores). The code
            groups it by index levels 0-3 and, in the total-width branch,
            joins on levels named ``ti1`` and ``ti2``.
            NOTE(review): it is sorted without a ``by`` argument, so this
            appears to be a Series despite the annotation -- confirm.
        tableid_colids: Global column IDs per table ID.
        align_columns ({'max1', 'max2', 'greedy'}): Column alignment method.
        align_width_norm ({'wide', 'narrow', 'jacc'}): Width normalisation.
        align_use_total_width: Whether to use the total table width.

    Returns:
        Table similarities, indexed by the first two index levels.

    Note:
        In greedy mode ``aggsim`` is sorted **in place**, so the caller's
        object is reordered.
    """
    assert align_columns in {"max1", "max2", "greedy"}
    assert align_width_norm in {"wide", "narrow", "jacc"}

    if align_columns != "greedy":
        # Fast path: best score per column of one table, averaged per pair.
        column_level = 2 if align_columns == "max1" else 3
        best_per_column = aggsim.groupby(level=[0, 1, column_level]).max()
        return best_per_column.groupby(level=[0, 1]).mean()

    def aggregate_groups(groups, func):
        """Aggregate ``groups`` with ``func``, with a progress bar if possible."""
        try:
            # Only show progress when logging at INFO or more verbose.
            if log.getLogger().level <= log.INFO:
                import warnings
                import tqdm

                with warnings.catch_warnings():
                    warnings.filterwarnings("ignore", category=FutureWarning)
                    tqdm.tqdm.pandas(desc="Aggregating column scores")
                return groups.progress_aggregate(func)
        except Exception as e:
            # Progress display is best-effort; fall back to a plain agg.
            log.debug(f"When trying to show aggregation progress, {e}")
        return groups.agg(func)

    # Compute the soft column-alignment total per table pair.
    aggsim.sort_values(ascending=False, inplace=True)
    total = aggregate_groups(aggsim.groupby(level=[0, 1]), greedy_align)

    if align_use_total_width:
        # Widths are the full column counts of each table.
        table_numcols = pd.Series(
            {table_id: len(col_ids) for table_id, col_ids in tableid_colids.items()}
        )
        j = pd.DataFrame({"total": total})
        j = j.join(table_numcols.rename("n1"), on="ti1")
        j = j.join(table_numcols.rename("n2"), on="ti2")
    else:
        # Widths come from the scored column pairs only.
        pairs_per_col1 = aggsim.groupby(level=[0, 1, 2]).count()
        pairs_per_col2 = aggsim.groupby(level=[0, 1, 3]).count()
        n1 = pairs_per_col1.groupby(level=[0, 1]).first()
        n2 = pairs_per_col2.groupby(level=[0, 1]).first()
        j = pd.DataFrame({"total": total, "n1": n1, "n2": n2})

    if align_width_norm == "jacc":
        return j["total"] / (j["n1"] + j["n2"] - j["total"])
    if align_width_norm == "wide":
        return j["total"] / j[["n1", "n2"]].max(axis=1)
    if align_width_norm == "narrow":
        return j["total"] / j[["n1", "n2"]].min(axis=1)
'hits': [], '0': [], '1': [], '2': [], '3': [], '4': [], '5': [], '6': [], '7': [], '8': [], '9': [] } for line in fread.readlines(): line = line.strip().split("\t") d_df['species'].append(d[str(line[0])]) d_df['hits'].append(int(line[1])) id_list = [str(k) for k in range(10)] flag = 0 for i in range(2, len(line), 2): id = str(line[i]) num = int(line[i + 1]) id_list.remove(id) d_df[str(id)].append(num) flag += 1 for j in id_list: d_df[j].append(0) flag += 1 df = DataFrame(d_df) sorted = df.sort_values(by="hits", ascending=False) print(sorted.head(20))
def show_model_options(model):
    """Run the interactive text menu for a single trained model.

    The menu is printed and a choice is read from standard input in a loop
    until the user enters ``6`` (Back). Choices:

    0. show performance, 1. show residual plot, 2. retrain and persist,
    3. brute-force feature search, 4. display feature importance,
    5. predict with custom features, 6. leave the menu.

    Any other input simply redisplays the menu.

    Args:
        model: Wrapper object exposing ``estimator_name``, ``estimator``,
            ``predict``, ``train_and_evaluate`` and ``save``. It is also
            passed to the module-level helpers used by the menu entries.

    The function relies on module-level ``training_features``,
    ``training_target``, ``testing_features``, ``testing_target`` and
    ``dataset``.
    """
    separator = '--------------------------------------------------------------------------'
    menu_entries = (
        '0. Show performance',
        '1. Show residual plot',
        '2. Train this model again',
        '3. Brute force combination of features',
        '4. Display feature importance',
        '5. Predict housing price using custom features',
        '6. Back',
    )
    # Estimators whose importance is read from ``coef_``.
    coefficient_models = ('Elastic Net', 'LARS', 'Lasso', 'Ridge', 'Linear')
    # Estimators whose importance is read from ``feature_importances_``.
    tree_models = ('Gradient Boosting', 'Random Forest', 'Extra Trees')

    while True:
        print(separator)
        print(model.estimator_name)
        print(separator)
        for entry in menu_entries:
            print(entry)
        print(separator)

        # Surrounding whitespace is ignored so that e.g. "1 " still matches.
        user_input = input('Enter your choice: ').strip()

        if user_input == '6':
            break

        # The original conditions for '0' and '2' also tested
        # ``user_input.isdigit`` without calling it. A bound method is always
        # truthy, so that check never did anything and has been dropped.
        if user_input == '0':
            # Show performance.
            show_model_performance(model)

        elif user_input == '1':
            # Predict prices with the data used for training.
            predicted_training_target = model.predict(training_features)
            # Predict prices with new and unseen data.
            predicted_testing_target = model.predict(testing_features)
            # Generate and show the residual plot.
            plot_residual(predicted_training_target=predicted_training_target,
                          actual_training_target=training_target,
                          predicted_testing_target=predicted_testing_target,
                          actual_testing_target=testing_target,
                          model_name=model.estimator_name)

        elif user_input == '2':
            # Train again and persist the result.
            print('Training {} ...'.format(model.estimator_name))
            model.train_and_evaluate(features=training_features,
                                     target=training_target,
                                     kfold=True)
            print('Training completed!')
            model.save()
            print('Model persisted.')

        elif user_input == '3':
            print('searching for best combination of features...')
            best_mse, best_feature_lists = find_best_features(model)
            print('Best MSE:', best_mse, 'using features:', best_feature_lists)

        elif user_input == '4':
            df = DataFrame()
            # Every dataset column except the last one is listed as a feature.
            df['FEATURE'] = dataset.column_names[0:-1]

            importance = None
            if model.estimator_name in coefficient_models:
                importance = model.estimator.coef_
            elif model.estimator_name in tree_models:
                importance = model.estimator.feature_importances_
            elif model.estimator_name in ('SVM RBF',):
                # No per-feature importance here; show the dual coefficients.
                print(model.estimator.dual_coef_)

            if importance is not None:
                df['IMPORTANCE'] = importance
                print(df.sort_values(by='IMPORTANCE', ascending=False))

        elif user_input == '5':
            predict_custom(model)
def prep_data(data: pd.DataFrame) -> pd.DataFrame:
    """Add an absolute-bias column and return the report columns by track.

    ``abs_bias`` is computed as ``sqrt(bias ** 2)`` and is written onto the
    passed-in frame, so the caller's ``data`` gains that column as a side
    effect.

    Args:
        data: Frame with at least ``track``, ``bias`` and ``review_length``.

    Returns:
        A frame sorted by ``track`` holding only ``track``, ``bias``,
        ``abs_bias`` and ``review_length``, in that order.
    """
    squared_bias = data['bias'] ** 2
    data['abs_bias'] = np.sqrt(squared_bias)

    report_columns = ['track', 'bias', 'abs_bias', 'review_length']
    ordered = data.sort_values(by='track')
    return ordered[report_columns]
def scoring_trend_analysis(self, flag):
    """Chart how many reviews gave each recommendation level on each date.

    The review records for ``self.filmname`` are read from the film's JSON
    file. For every review date the number of reviews per recommendation
    level is counted, with 0 filled in for levels that did not occur on a
    date. Depending on ``flag`` one chart is rendered to an HTML file, an
    identical un-rendered chart object is stored on ``self``,
    ``self.saveflag`` is set and the matching ``show_*`` method is called:

    - ``'1'``: stacked horizontal bar chart -> ``self.comment_columnar_pic``,
      ``saveflag = '4'``
    - ``'2'``: stacked line chart -> ``self.comment_polyline_pic``,
      ``saveflag = '5'``
    - ``'3'``: theme river -> ``self.comment_river_pic``, ``saveflag = '6'``

    Any other ``flag`` loads and tallies the data but draws nothing.

    Changes from the previous implementation:

    - The tally no longer goes through a DataFrame. ``DataFrame.append``
      was removed in pandas 2.0 and ``int(Series)`` is deprecated.
    - Counting uses ``collections.Counter`` instead of one ``list.count``
      call per distinct key.
    - An empty review file produces empty charts instead of raising.
    - Each chart is described once and built by a local helper.
    - The bar-chart completion message no longer starts with "开始生成",
      matching the other two chart types.

    Args:
        flag: ``'1'``, ``'2'`` or ``'3'`` selecting the chart type.
    """
    from collections import Counter

    ratings = ['力荐', '推荐', '还行', '较差', '很差']
    choose = flag

    with open(r'./用户影评相关数据/' + self.filmname + '用户影评相关信息.json',
              'r', encoding='UTF-8') as f:
        reviews = json.load(f, strict=False)

    # Announce which chart is about to be generated.
    for code, chart_label in (('1', "柱状图"), ('2', "折线图"), ('3', "河状图")):
        if choose == code:
            self.textBrowser.append("开始生成" + self.filmname +
                                    "的评论推荐度与日期分析" + chart_label + "......")
            QApplication.processEvents()

    # Number of reviews per (recommendation level, review date).
    counts = Counter(
        (each['用户推荐度'], each['用户评论时间']) for each in reviews)

    # Distinct dates in ascending order: the category axis of the charts.
    attr = sorted({date for _, date in counts})

    # One row per observed (level, date) pair, plus a zero row for each of
    # the five levels missing on a date. The stable sort keeps the rows of
    # one date together in a deterministic order.
    rows = [[score, date, votes] for (score, date), votes in counts.items()]
    rows.extend([score, date, 0]
                for date in attr
                for score in ratings
                if (score, date) not in counts)
    rows.sort(key=lambda row: row[1])
    command_date_list = [[date, votes, score] for score, date, votes in rows]

    # Votes per date for each level, aligned with ``attr``. A Counter
    # returns 0 for pairs that never occurred.
    v1, v2, v3, v4, v5 = (
        [counts[(score, date)] for date in attr] for score in ratings)

    output_dir = "./爬虫数据关联可视化/" + self.filmname + "影评可视化数据/"

    def global_options(title, **tooltip_extra):
        """Return the global options shared by all three chart types."""
        return dict(
            tooltip_opts=opts.TooltipOpts(is_show=True, **tooltip_extra),
            toolbox_opts=opts.ToolboxOpts(is_show=True, pos_right="30%"),
            title_opts=opts.TitleOpts(title=title),
            datazoom_opts=opts.DataZoomOpts(type_="inside",
                                            range_start=0,
                                            range_end=100),
        )

    def stacked_chart(chart_cls):
        """Return a ``chart_cls`` with the dates and five stacked series."""
        chart = chart_cls(
            init_opts=opts.InitOpts(width="665px", height="500px"))
        chart = chart.add_xaxis(attr)
        for series_name, votes in zip(ratings, (v1, v2, v3, v4, v5)):
            chart = chart.add_yaxis(series_name, votes, stack="stack1")
        return chart

    def build_bar():
        """Return the stacked, axis-reversed bar chart."""
        return (stacked_chart(Bar)
                .reversal_axis()
                .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
                .set_global_opts(**global_options("用户评论推荐度柱状图")))

    def build_line():
        """Return the stacked line chart."""
        return stacked_chart(Line).set_global_opts(
            **global_options("用户评论推荐度折线图"))

    def build_river():
        """Return the theme-river chart over the (date, votes, level) rows."""
        return (ThemeRiver(
            init_opts=opts.InitOpts(width="665px", height="500px"))
                .add(series_name=list(ratings),
                     data=command_date_list,
                     singleaxis_opts=opts.SingleAxisOpts(pos_top="50",
                                                         pos_bottom="50",
                                                         type_="time"))
                .set_global_opts(**global_options("推荐度河流图",
                                                  trigger="axis",
                                                  axis_pointer_type="line")))

    # A fresh chart object is built for rendering and another one for
    # storing on self, exactly as before.

    # Bar chart
    if choose == '1':
        build_bar().render(output_dir + "bar_reversal_axis.html")
        QApplication.processEvents()
        self.comment_columnar_pic = build_bar()
        self.saveflag = '4'
        self.textBrowser.append(self.filmname + "的评论推荐度与日期分析柱状图完成!")
        QApplication.processEvents()
        self.show_scoring_trend_analysis_columnar()
        QApplication.processEvents()

    # Line chart
    if choose == '2':
        build_line().render(output_dir + "line_markpoint.html")
        QApplication.processEvents()
        self.comment_polyline_pic = build_line()
        self.saveflag = '5'
        self.textBrowser.append(self.filmname + "的评论推荐度与日期分析折线图完成!")
        QApplication.processEvents()
        self.show_scoring_trend_analysis_polyline()
        QApplication.processEvents()

    # Theme river chart
    if choose == '3':
        build_river().render(output_dir + "theme_river.html")
        QApplication.processEvents()
        self.comment_river_pic = build_river()
        self.saveflag = '6'
        self.textBrowser.append(self.filmname + "的评论推荐度与日期分析河状图完成!")
        QApplication.processEvents()
        self.show_scoring_trend_analysis_river()
        QApplication.processEvents()
def plus_period(self, dfp: pd.DataFrame, s_date, e_date, pr_cloc: str, sign: str):
    """Merge rows that agree on every column except the three period columns.

    Rows are grouped by all columns other than ``s_date``, ``e_date`` and
    ``pr_cloc``. Each group of two or more rows collapses to a single row:

    - ``pr_cloc`` becomes the group's values joined with ``sign``, in the
      order the rows have after the stable sort on the grouping columns
      (i.e. their original relative order);
    - ``s_date`` becomes the earliest start date in the group;
    - ``e_date`` becomes the end date of the row with the latest start date
      (not necessarily the maximum end date).

    Unlike the previous implementation, the caller's ``dfp`` is no longer
    sorted in place; all work happens on a sorted copy. The sorts are
    stable, so the outcome does not depend on how ties happen to be ordered.

    Args:
        dfp: Input frame; left unmodified.
        s_date: Name of the period-start column.
        e_date: Name of the period-end column.
        pr_cloc: Name of the string column whose values are concatenated.
        sign: Separator placed between the concatenated ``pr_cloc`` values.

    Returns:
        A new frame sorted by the grouping columns, one row per group, with
        a fresh 0..n-1 index.

    Raises:
        ValueError: If one of the three named columns is not in ``dfp``.
        TypeError: If a merged group holds non-string ``pr_cloc`` values.
    """
    merged_cols = [s_date, e_date, pr_cloc]

    # Every remaining column takes part in the grouping key.
    key_cols = list(dfp)
    for col in merged_cols:
        key_cols.remove(col)

    # Sort a copy so equal keys are adjacent; positions then equal labels.
    ordered = dfp.sort_values(key_cols, kind="mergesort").reset_index(drop=True)

    # A row that is not a duplicate of an earlier key starts a new group.
    is_repeat = ordered.drop(merged_cols, axis=1).duplicated()
    bounds = is_repeat.index[~is_repeat].tolist()
    bounds.append(len(ordered))  # sentinel closing the last group

    redundant = []
    for lo, hi in zip(bounds, bounds[1:]):
        if hi - lo <= 1:
            continue  # single-row group: nothing to merge

        group = ordered.iloc[lo:hi]
        joined = sign.join(group[pr_cloc])
        by_start = group.sort_values(s_date, kind="mergesort")
        first_start = by_start[s_date].iloc[0]
        last_end = by_start[e_date].iloc[-1]

        # The first row of the group carries the merged values.
        ordered.loc[lo, pr_cloc] = joined
        ordered.loc[lo, s_date] = first_start
        ordered.loc[lo, e_date] = last_end
        redundant.extend(range(lo + 1, hi))

    return ordered.drop(index=redundant).reset_index(drop=True)
columns=list('dabc')) print(frame.sort_index()) # print(frame.sort_index(axis=0)) # equivalent as above print(frame.sort_index(axis=1)) print(frame.sort_index(axis=1, ascending=False)) print() print("## Sort by value of Series:") obj = Series([4, 7, -3, 2]) print(obj.sort_values()) print() print("## Sort by columns of DataFrame:") frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]}) print(frame) print(frame.sort_values(by='b')) print(frame.sort_values(by=['a', 'b'])) print() print("## rank(), compute numerical data ranks, start from 1:") obj = Series([7, -5, 7, 4, 2, 0, 4]) print("obj.rank(axis='index'):") print(obj.rank(axis='index')) # default: axis=0, axis='index' print("""obj.rank(method='first')), ranks assigned in order they appear in the array:""") print(obj.rank(method='first')) print("obj.rank(method='min'):") print(obj.rank(method='min')) print("obj.rank(ascending=False, method='max'):") print(obj.rank(ascending=False, method='max')) print()
def run(date, date1):
    """Fit factor coefficients at ``date`` and rank stocks at ``date1``.

    Steps, as performed by the code:

    1. Select the stock pool for ``date1``.
    2. Load the factor table at ``date``, drop stocks with missing values,
       neutralize the factors, fetch ``alpha`` via ``get_alpha`` and fit
       coefficients with ``calac_beta``.
    3. Load the factor table at ``date1`` for the surviving stocks, drop
       missing values again, neutralize, and predict ``alpha`` with
       ``forcast_alpha`` using the fitted coefficients.

    Returns:
        The 30 rows with the highest predicted ``alpha``, carrying the
        factor columns plus ``Codes`` and ``alpha``.
    """

    def load_factors(day, codes):
        """Return (factor frame, surviving codes, factor names) for ``day``."""
        frame = DataFrame()
        frame['cash_net_oper_act'] = FactorsZoo.CashNetOperAct(
            day, codes, label='cash_net_oper_act').get_data()
        frame['deductedprofit'] = FactorsZoo.DeductedProfit(
            day, codes, 'deductedprofit').get_data()
        frame['dividend'] = FactorsZoo.Dividend(day, codes,
                                                'dividend').get_data()
        frame['industry'] = FactorsZoo.Industry(day, codes,
                                                'industry').get_data()
        frame['net_inc'] = FactorsZoo.NetInc(day, codes, 'net_inc').get_data()
        frame['pct'] = FactorsZoo.Pct(day, codes, 'pct', -6).get_data()
        frame['pe'] = FactorsZoo.Pe(day, codes, 'pe').get_data()
        frame['size'] = FactorsZoo.Size(day, codes, 'size').get_data()
        frame['turn_per'] = FactorsZoo.TurnPer(day, codes, 'turn_per',
                                               -6).get_data()
        frame['volitality'] = FactorsZoo.Volitality(day, codes, 'volitality',
                                                    -6).get_data()
        frame['vol_per'] = FactorsZoo.VolPer(day, codes, 'vol_per',
                                             -6).get_data()
        frame['yoyprofit'] = FactorsZoo.Yoyprofit(day, codes,
                                                  'yoyprofit').get_data()
        frame['yoytr'] = FactorsZoo.YoyTr(day, codes, 'yoytr').get_data()

        # Carry the codes through dropna() so they stay aligned with the
        # rows that survive, then strip them off the factor frame again.
        names = frame.columns.tolist()
        frame['Codes'] = codes
        frame = frame.dropna()
        kept_codes = frame['Codes'].values.tolist()
        return frame[names], kept_codes, names

    stock = StockPool(date1).select_stock()

    # Fitting period: factor neutralization, then fit the coefficients.
    df, stock, factors = load_factors(date, stock)
    fp = FactorProcess()
    df = fp.neutralize_factor(df, factors)
    df['alpha'] = fp.get_alpha(stock, date, -6)
    coef_ = fp.calac_beta(df['alpha'], df[factors], factors)

    # Forecast period: same pipeline on the surviving stocks.
    df1, stock, factors = load_factors(date1, stock)
    df1 = fp.neutralize_factor(df1, factors)
    alpha = fp.forcast_alpha(coef_, df1, factors)
    df1['Codes'] = stock
    df1['alpha'] = alpha

    ranked = df1.sort_values(['alpha'], ascending=False)
    return ranked.head(30)