def test_frame_values_with_tz(self): tz = "US/Central" df = DataFrame({"A": date_range('2000', periods=4, tz=tz)}) result = df.values expected = np.array([ [pd.Timestamp('2000-01-01', tz=tz)], [pd.Timestamp('2000-01-02', tz=tz)], [pd.Timestamp('2000-01-03', tz=tz)], [pd.Timestamp('2000-01-04', tz=tz)], ]) tm.assert_numpy_array_equal(result, expected) # two columns, homogenous df = df.assign(B=df.A) result = df.values expected = np.concatenate([expected, expected], axis=1) tm.assert_numpy_array_equal(result, expected) # three columns, heterogenous est = "US/Eastern" df = df.assign(C=df.A.dt.tz_convert(est)) new = np.array([ [pd.Timestamp('2000-01-01T01:00:00', tz=est)], [pd.Timestamp('2000-01-02T01:00:00', tz=est)], [pd.Timestamp('2000-01-03T01:00:00', tz=est)], [pd.Timestamp('2000-01-04T01:00:00', tz=est)], ]) expected = np.concatenate([expected, new], axis=1) result = df.values tm.assert_numpy_array_equal(result, expected)
def test_assign_bad(self):
    """assign rejects positional callables and unknown column lookups."""
    df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
    # positional (non-keyword) argument is a TypeError
    with pytest.raises(TypeError):
        df.assign(lambda frame: frame.A)
    # df.C does not exist when the keyword values are evaluated
    with pytest.raises(AttributeError):
        df.assign(C=df.A, D=df.A + df.C)
def test_assign_alphabetical(self):
    # GH 9818: keyword arguments are inserted in alphabetical order
    # (pre-Python-3.6 dict semantics), so kwarg order must not matter.
    df = DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
    expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]],
                         columns=list('ABCD'))
    assert_frame_equal(df.assign(D=df.A + df.B, C=df.A - df.B), expected)
    assert_frame_equal(df.assign(C=df.A - df.B, D=df.A + df.B), expected)
def test_assign_dependent_old_python(self):
    """Dependent kwargs are unsupported before Python 3.6: a later key
    cannot see a column created by an earlier one."""
    df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
    # 'C' does not exist yet when 'D' is evaluated
    with pytest.raises(KeyError):
        df.assign(C=lambda frame: frame.A,
                  D=lambda frame: frame['A'] + frame['C'])
    with pytest.raises(KeyError):
        df.assign(C=df.A, D=lambda x: x['A'] + x['C'])
def test_assign_dependent(self): df = DataFrame({'A': [1, 2], 'B': [3, 4]}) result = df.assign(C=df.A, D=lambda x: x['A'] + x['C']) expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], columns=list('ABCD')) assert_frame_equal(result, expected) result = df.assign(C=lambda df: df.A, D=lambda df: df['A'] + df['C']) expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], columns=list('ABCD')) assert_frame_equal(result, expected)
def subtitle_cat(train_df: pd.DataFrame, clue_word: list):
    # Build a categorical feature from the subtitle (heading) text:
    #   0 -> heading carries the explicit NO_SUBTITLE marker
    #   1 -> heading contains one of the clue words (wins over 0 if both)
    #   2 -> anything else
    df = train_df.assign(heading_cat=np.nan)
    df.loc[df.heading.str.contains(r'NO_SUBTITLE'), 'heading_cat'] = 0
    clue_mask = df.heading.str.contains(util.contains_patt(clue_word))
    df.loc[clue_mask, 'heading_cat'] = 1
    df.loc[df.heading_cat.isna(), 'heading_cat'] = 2
    return df.heading_cat.astype('category')
def test_assign_order(self):
    # GH 9818: on Python 3.6+ kwargs keep insertion order, earlier
    # interpreters sort them alphabetically.
    df = DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
    result = df.assign(D=df.A + df.B, C=df.A - df.B)
    if PY36:
        expected = DataFrame([[1, 2, 3, -1], [3, 4, 7, -1]],
                             columns=list('ABDC'))
    else:
        expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]],
                             columns=list('ABCD'))
    assert_frame_equal(result, expected)

    # alphabetical kwargs produce the same frame on every version
    expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]],
                         columns=list('ABCD'))
    assert_frame_equal(df.assign(C=df.A - df.B, D=df.A + df.B), expected)
def test_assign_bad(self):
    """Every way assign can fail here: positional argument, missing
    column attribute, and dependent kwargs (unsupported in this API)."""
    df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
    # non-keyword argument
    with tm.assertRaises(TypeError):
        df.assign(lambda frame: frame.A)
    # df.C missing when the kwargs are evaluated
    with tm.assertRaises(AttributeError):
        df.assign(C=df.A, D=df.A + df.C)
    # later kwargs cannot see columns created by earlier ones
    with tm.assertRaises(KeyError):
        df.assign(C=lambda frame: frame.A,
                  D=lambda frame: frame['A'] + frame['C'])
    with tm.assertRaises(KeyError):
        df.assign(C=df.A, D=lambda x: x['A'] + x['C'])
def labeling(sentence_df: pd.DataFrame, train_dict: dict) -> pd.DataFrame:
    """Add a boolean ``label`` column marking sentences that match training phrases.

    For each article id in ``train_dict``, rows of ``sentence_df`` whose
    ``_id`` equals that id (as a string) are set True when their
    ``sentence`` text contains any of the article's training phrases
    (via ``contains_patt``). Articles with no phrases are skipped.

    Parameters
    ----------
    sentence_df : frame with at least ``_id`` and ``sentence`` columns.
    train_dict : mapping of article id -> sequence of training phrases.

    Returns
    -------
    A copy of ``sentence_df`` with the new ``label`` column; the input
    frame is not mutated.
    """
    labeled_df = sentence_df.assign(label=False)
    for _id, train_values in train_dict.items():
        # BUG FIX: the original tested ``len(train_values) is 0`` —
        # identity comparison on an int only works by accident in
        # CPython; use truthiness to skip empty phrase lists.
        if not train_values:
            continue
        row_mask = labeled_df._id == str(_id)
        labeled_df.loc[row_mask, 'label'] = (
            labeled_df.loc[row_mask]
            .sentence.str.contains(contains_patt(train_values)))
    return labeled_df
class MergeCategoricals(object):
    """asv benchmark: merge speed with object vs categorical payloads."""

    def setup(self):
        n = 10000
        self.left_object = DataFrame({
            'X': np.random.choice(range(0, 10), size=(n,)),
            'Y': np.random.choice(['one', 'two', 'three'], size=(n,)),
        })
        self.right_object = DataFrame({
            'X': np.random.choice(range(0, 10), size=(n,)),
            'Z': np.random.choice(['jjj', 'kkk', 'sss'], size=(n,)),
        })
        # same frames, but the string payload stored as category dtype
        self.left_cat = self.left_object.assign(
            Y=self.left_object['Y'].astype('category'))
        self.right_cat = self.right_object.assign(
            Z=self.right_object['Z'].astype('category'))

    def time_merge_object(self):
        merge(self.left_object, self.right_object, on='X')

    def time_merge_cat(self):
        merge(self.left_cat, self.right_cat, on='X')
def test_assign(self): df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) original = df.copy() result = df.assign(C=df.B / df.A) expected = df.copy() expected['C'] = [4, 2.5, 2] assert_frame_equal(result, expected) # lambda syntax result = df.assign(C=lambda x: x.B / x.A) assert_frame_equal(result, expected) # original is unmodified assert_frame_equal(df, original) # Non-Series array-like result = df.assign(C=[4, 2.5, 2]) assert_frame_equal(result, expected) # original is unmodified assert_frame_equal(df, original) result = df.assign(B=df.B / df.A) expected = expected.drop('B', axis=1).rename(columns={'C': 'B'}) assert_frame_equal(result, expected) # overwrite result = df.assign(A=df.A + df.B) expected = df.copy() expected['A'] = [5, 7, 9] assert_frame_equal(result, expected) # lambda result = df.assign(A=lambda x: x.A + x.B) assert_frame_equal(result, expected)
def fix_tickets(
        self, ticket_frame: pd.DataFrame, path_fixes) -> pd.DataFrame:
    """Normalise a raw ticket frame: rename the changed-lines column,
    drop implausibly large commits, rewrite file-path prefixes, and
    return the result sorted by commit date.

    NOTE(review): the rename happens in place and therefore mutates the
    caller's frame — preserved for compatibility.
    """
    ticket_frame.rename(
        columns={'Total changed lines': 'ChangedLines'}, inplace=True)
    # commits touching >= 100k lines are treated as noise
    small_enough = ticket_frame.ChangedLines < 100000
    ticket_frame = ticket_frame[small_enough]
    fix_paths = partial(self.fix_path_prefixes, path_fixes)
    ticket_frame = ticket_frame.assign(
        ChangedFiles=ticket_frame['Changed files'].apply(fix_paths))
    fixed_frame = (
        ticket_frame
        .drop('Changed files', axis=1)
        .sort_values(by='CommitDate')
        .reset_index(drop=True))
    # tickets with no 'Found' value get an empty string instead of NaN
    fixed_frame.fillna(value={'Found': ''}, axis=0, inplace=True)
    return fixed_frame
def get_subtitle(sentence_df: pd.DataFrame, wiki_dump_data: list) -> pd.DataFrame:
    """Attach a ``heading`` (section subtitle) to every sentence row.

    For each article id, the matching raw wiki dump entry is parsed and
    each section's subtitle is propagated onto that article's sentences
    via the ``_get_subtitle_of_sentence`` / ``_complement_subtitle``
    helpers.

    FIX: the original accumulated results with ``DataFrame.append``,
    which was removed in pandas 2.0 and was quadratic in the number of
    articles; frames are now collected in a list and concatenated once.

    Parameters
    ----------
    sentence_df : frame with at least an ``_id`` column.
    wiki_dump_data : list of dump entries shaped like
        ``{'index': {'_id': ...}, 'source_text': ...}``.
    """
    df = sentence_df.assign(heading='')
    labeled_articles = []
    for _id in df._id.unique():
        article_df = df.loc[df._id == _id]
        # raw dump entry for this article id (assumes exactly one match)
        row_article = [entry for entry in wiki_dump_data
                       if entry['index']['_id'] == _id][0]
        parsed = wtp.parse(row_article['source_text'])
        # skip the lead section; propagate each section's subtitle
        for source in parsed.sections[1:]:
            heading = _search_subtitle(source.string)
            section_text = _clean_source_text(source)
            article_df = _get_subtitle_of_sentence(
                article_df, section_text, heading)
        article_df = _complement_subtitle(article_df)
        labeled_articles.append(article_df)
    return pd.concat(labeled_articles) if labeled_articles else pd.DataFrame()
def pipe_age_metadata(self, df: pd.DataFrame) -> pd.DataFrame:
    """Normalise dates and stamp the location on the age-breakdown frame."""
    cleaned_dates = clean_date_series(df.Date)
    return df.assign(date=cleaned_dates, location=self.location)
def add_totals(df: pd.DataFrame) -> pd.DataFrame:
    """Derive ``total_vaccinations`` as the sum of the two people counters."""
    totals = df.people_fully_vaccinated + df.people_vaccinated
    return df.assign(total_vaccinations=totals)
def enrich_location(df: pd.DataFrame) -> pd.DataFrame:
    """Attach the constant ``location`` column for Estonia."""
    enriched = df.copy()
    enriched["location"] = "Estonia"
    return enriched
# m.to_csv(open('Data/X.csv', 'w')) # dataset = dataset1 import traceback dataset1 = dataset.select_dtypes(include=[np.number]) use_field = list(dataset1.columns.values) imp = Imputer(missing_values='NaN', strategy='mean', axis=0) imp.fit(dataset1) dataset1 = imp.transform(dataset1) dataset1 = DataFrame(dataset1, columns=use_field) dataset1 = concat_free_money(dataset1) dataset1 = throw_outliers(dataset1) dataset1 = dataset1.assign(SUBS_ID=dataset.SUBS_ID) dataset1 = pd.merge(dataset1, dataset2, on='SUBS_ID', how='left') dataset1 = pd.merge(dataset1, read_csv('Data/X2.csv'), on='SUBS_ID', how='left') # print(use_field) gr = dataset1.groupby('SUBS_ID') dataset = gr.mean() dataset1 = dataset.copy() dataset1 = dataset1.drop(['AGE_GROUP1', 'AGE_GROUP2'], axis=1) print(len(dataset1.columns.values)) # dataset1 = dataset1['SUBS_ID'] # (dataset.columns.values[0])) # dataset1 = preprocessing.scale(dataset1) TRAIN_PART = 6/5 train = dataset1[:int(len(dataset1)/TRAIN_PART)]
def enrich_source(df: pd.DataFrame) -> pd.DataFrame:
    """Attach the constant ``source_url`` column for the Estonian open-data portal."""
    enriched = df.copy()
    enriched["source_url"] = "https://opendata.digilugu.ee"
    return enriched
def pipe_source(self, df: pd.DataFrame) -> pd.DataFrame:
    """Stamp every row with this scraper's reference source URL."""
    url = self.source_url_ref
    return df.assign(source_url=url)
def pipe_date(self, df: pd.DataFrame) -> pd.DataFrame:
    """Normalise the raw ``Date`` strings (input format '%d %b %y')."""
    cleaned = clean_date_series(df["Date"], "%d %b %y")
    return df.assign(Date=cleaned)
def pipe_date(self, df: pd.DataFrame) -> pd.DataFrame:
    """Shift reported dates back one day and normalise to ISO format."""
    shifted = df.date.apply(clean_date, fmt="%Y-%m-%d", minus_days=1)
    return df.assign(date=shifted)
def regression(
    col1: StContainer,
    col2: StContainer,
    learning_data: pd.DataFrame,
    target: str,
    test_size: float,
    stratify: str,
) -> None:
    """Regression section of the dashboard: lets the user pick a model,
    hyperparameters and a metric, fits on a train split, reports the
    metric on the test split and plots actual vs predicted values."""
    # split into training and test data
    y = learning_data[target]
    X = learning_data.drop(columns=[target])
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=test_size,
                                                        stratify=stratify)

    with col1.beta_expander("Výběr modelu"):
        model = st.selectbox("Regresní model", list(REGRESSION_MODELS))
        # collect hyperparameter values into {hyperparameter name: value}
        hyperparams = {
            hyperparam: widget()
            for hyperparam, widget in REGRESSION_MODELS[model]
            ["hyperparams"].items()
        }
        metric = st.selectbox("Metrika", list(METRICS))

    # REGRESSION_MODELS[model]["class"] yields the regressor class,
    # e.g. LinearRegression; hyperparams holds the user-chosen values,
    # so the regressor can be constructed directly:
    regressor = REGRESSION_MODELS[model]["class"](**hyperparams)

    # try to fit the model
    try:
        regressor.fit(X_train, y_train)
    except Exception as prediction_error:
        # on failure, show the user what happened
        st.error(f"Chyba při fitování modelu: {prediction_error}")
        # and do not render anything else
        return

    # predict with the trained model and score it
    y_predicted = regressor.predict(X_test)
    prediction_error = METRICS[metric](y_predicted, y_test)

    col2.header(f"Výsledek modelu {model}")
    col2.write(f"{metric}: {prediction_error:.3g}")

    # helper dataframe with an extra column holding the prediction
    predicted_target_column = f"{target} - predicted"
    complete_data = learning_data.assign(
        **{predicted_target_column: regressor.predict(X)})

    # scatter of actual vs predicted points
    fig = px.scatter(complete_data, x=target, y=predicted_target_column)
    # diagonal line marking the ideal prediction
    fig.add_trace(
        go.Scatter(
            x=[complete_data[target].min(), complete_data[target].max()],
            y=[complete_data[target].min(), complete_data[target].max()],
            mode="lines",
            line=dict(width=2,
                      color="DarkSlateGrey"),
            name="ideal prediction",
        ))
    col2.write(fig)
def format_date(df: pd.DataFrame) -> pd.DataFrame:
    """Convert ``date`` from DD-MM-YYYY strings to ISO YYYY-MM-DD strings."""
    parsed = pd.to_datetime(df.date, format="%d-%m-%Y")
    return df.assign(date=parsed.astype(str))
def pipe_date(self, df: pd.DataFrame) -> pd.DataFrame:
    """Attach the scrape date reported by ``_parse_date``."""
    report_date = self._parse_date()
    return df.assign(date=report_date)
def pipe_metadata(self, df: pd.DataFrame) -> pd.DataFrame:
    """Add the constant location and source-url metadata columns."""
    out = df.copy()
    out["location"] = self.location
    out["source_url"] = self.source_url_ref
    return out
def pipe_one_dose_correction(self, df: pd.DataFrame) -> pd.DataFrame:
    """Fold single-dose recipients back into ``people_vaccinated``.

    The count of fully vaccinated people in excess of the two-dose
    counter is added to ``people_vaccinated``.
    (NOTE(review): presumably these are one-dose-vaccine recipients —
    inferred from the arithmetic; confirm against the data source.)
    """
    single_shot = df.people_fully_vaccinated - df.VacAd2Dose
    corrected = df.people_vaccinated + single_shot
    return df.assign(people_vaccinated=corrected)
def enrich_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Attach the constant location and source-url columns for Portugal."""
    enriched = df.copy()
    enriched["location"] = "Portugal"
    enriched["source_url"] = "https://github.com/dssg-pt/covid19pt-data"
    return enriched
#len(rate_with_go_rmdup_rmna.index) #10314
# NOTE(review): the triple-quoted blocks below look like commented-out
# alternative implementations; their pairing is ambiguous in this copy —
# verify the quotes balance before running.
# One-sided Mann-Whitney U test per GO term: is the Snake-island viper
# rate distribution greater than the Black-brow viper one?
for go_term in rate_with_go_rmdup_rmna['GO Term Accession'].drop_duplicates(keep='first'):
    list1 = rate_with_go_rmdup_rmna[rate_with_go_rmdup_rmna['GO Term Accession'] == go_term]['Snake_island_viper_6']
    list2 = rate_with_go_rmdup_rmna[rate_with_go_rmdup_rmna['GO Term Accession'] == go_term]['Black_brow_viper_4']
    #paired u, pvalue = scipy.stats.wilcoxon(list1,list2)
    u, pvalue = scipy.stats.mannwhitneyu(list1,list2,alternative='greater')
    go_u_pvalue.append([go_term,u,pvalue]) #4620
    if pvalue < 0.05:
        go_u_pvalue_sign.append([go_term,u,pvalue]) #7
"""
#go_u_pvalue_sign_df['lable'] = Series(['greater']*len(go_u_pvalue_sign_df), index=go_u_pvalue_sign_df.index)
go_u_pvalue_sign_df.columns = ['GO Term Accession','Mann-Whitney U statistic','p-value']
go_u_pvalue_sign_df_label = go_u_pvalue_sign_df.assign(label = Series(['greater']*len(go_u_pvalue_sign_df)))
go_u_pvalue_sign_less_df.columns = ['GO Term Accession','Mann-Whitney U statistic','p-value']
go_u_pvalue_sign_less_df_label = go_u_pvalue_sign_less_df.assign(label = Series(['less']*len(go_u_pvalue_sign_less_df)))
go_u_pvalue_sign_gl_df = pd.concat([go_u_pvalue_sign_df_label,go_u_pvalue_sign_less_df_label],axis=0,ignore_index=True)
#go_u_pvalue_sign_gl_df_sort = go_u_pvalue_sign_gl_df.sort_values(by='GO Term Accession',ascending=True)
"""
# NOTE(review): DataFrame.append was removed in pandas 2.0, and the result
# of the next line is immediately overwritten by the pd.concat below.
go_u_pvalue_sign_gl_df = go_u_pvalue_sign_df.append(go_u_pvalue_sign_less_df,ignore_index=True)
go_u_pvalue_sign_gl_df = pd.concat([go_u_pvalue_sign_df,go_u_pvalue_sign_less_df],axis=0,ignore_index=True)
go_u_pvalue_sign_gl_df.columns = ['GO Term Accession','Mann-Whitney U statistic','p-value']
# NOTE(review): assign() returns a new frame; this result is discarded, so
# no 'lable' (sic) column is actually added — likely a bug.
go_u_pvalue_sign_gl_df.assign(lable = Series(['greater']*len(go_u_pvalue_sign_df)+['less']*len(go_u_pvalue_sign_less_df)))
# NOTE(review): .ix was removed in pandas 1.0 — .loc is the modern spelling.
"""
go_rate_with_sign_p = pd.merge(GO_mean_rate.reset_index().ix[:,["GO Term Accession",species_focus,species_backgroud]],go_u_pvalue_sign_gl_df,on="GO Term Accession",how='inner',sort=True)
mart_go_uniq_ann = pd.read_table('human_mart_export_GOuniq_sort.txt',header = 'infer',skiprows=[0],skip_blank_lines=True)
go_rate_with_sign_p_ann = pd.merge(mart_go_uniq_ann,go_rate_with_sign_p,on="GO Term Accession",how='inner')
def pipe_vaccine(self, df: pd.DataFrame) -> pd.DataFrame:
    """Map each approved-vaccine value through ``_map_vaccines``."""
    mapped = df.VaccApprov.apply(self._map_vaccines)
    return df.assign(vaccine=mapped)
def test_assign_multiple(self): df = DataFrame([[1, 4], [2, 5], [3, 6]], columns=['A', 'B']) result = df.assign(C=[7, 8, 9], D=df.A, E=lambda x: x.B) expected = DataFrame([[1, 4, 7, 1, 4], [2, 5, 8, 2, 5], [3, 6, 9, 3, 6]], columns=list('ABCDE')) assert_frame_equal(result, expected)
def format_date(input: pd.DataFrame) -> pd.DataFrame:
    """Parse DD/MM/YYYY ``date`` strings into ``datetime.date`` objects."""
    # NOTE(review): the parameter name shadows the builtin ``input``;
    # kept unchanged so existing keyword callers keep working.
    parsed = pd.to_datetime(input.date, format="%d/%m/%Y")
    return input.assign(date=parsed.dt.date)
def enrich_columns(input: pd.DataFrame) -> pd.DataFrame:
    """Add the constant metadata columns for the Ecuador pipeline."""
    metadata = {
        "location": "Ecuador",
        "source_url": "https://github.com/andrab/ecuacovid",
        "vaccine": "Pfizer/BioNTech",
    }
    return input.assign(**metadata)
def pipe_total_vaccinations(self, df: pd.DataFrame) -> pd.DataFrame:
    """Total vaccinations = sum of the three per-dose counters."""
    total = df.dose_1 + df.dose_2 + df.dose_3
    return df.assign(total_vaccinations=total)