Example No. 1
    def test_frame_values_with_tz(self):
        tz = "US/Central"
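        # For tz-aware datetime columns, .values falls back to an object-dtype
        # ndarray holding one Timestamp per cell, which is what we build below.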
        df = DataFrame({"A": date_range('2000', periods=4, tz=tz)})
        result = df.values
        expected = np.array([
            [pd.Timestamp('2000-01-01', tz=tz)],
            [pd.Timestamp('2000-01-02', tz=tz)],
            [pd.Timestamp('2000-01-03', tz=tz)],
            [pd.Timestamp('2000-01-04', tz=tz)],
        ])
        tm.assert_numpy_array_equal(result, expected)

        # two columns, homogeneous

        df = df.assign(B=df.A)
        result = df.values
        expected = np.concatenate([expected, expected], axis=1)
        tm.assert_numpy_array_equal(result, expected)

        # three columns, heterogeneous
        est = "US/Eastern"
        df = df.assign(C=df.A.dt.tz_convert(est))

        new = np.array([
            [pd.Timestamp('2000-01-01T01:00:00', tz=est)],
            [pd.Timestamp('2000-01-02T01:00:00', tz=est)],
            [pd.Timestamp('2000-01-03T01:00:00', tz=est)],
            [pd.Timestamp('2000-01-04T01:00:00', tz=est)],
        ])
        expected = np.concatenate([expected, new], axis=1)
        result = df.values
        tm.assert_numpy_array_equal(result, expected)
Example No. 2
    def test_assign_bad(self):
        df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

        # non-keyword argument
        with pytest.raises(TypeError):
            df.assign(lambda x: x.A)
        with pytest.raises(AttributeError):
            df.assign(C=df.A, D=df.A + df.C)
Example No. 3
 def test_assign_alphabetical(self):
     # GH 9818
     df = DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
     result = df.assign(D=df.A + df.B, C=df.A - df.B)
     expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]],
                          columns=list('ABCD'))
     assert_frame_equal(result, expected)
     result = df.assign(C=df.A - df.B, D=df.A + df.B)
     assert_frame_equal(result, expected)
Example No. 4
    def test_assign_dependent_old_python(self):
        df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

        # Key C does not exist at definition time of df
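        # On Python < 3.6 **kwargs order is not guaranteed, so assign cannot
        # let D depend on the C created in the same call; hence the KeyError.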
        with pytest.raises(KeyError):
            df.assign(C=lambda df: df.A,
                      D=lambda df: df['A'] + df['C'])
        with pytest.raises(KeyError):
            df.assign(C=df.A, D=lambda x: x['A'] + x['C'])
Example No. 5
    def test_assign_dependent(self):
        df = DataFrame({'A': [1, 2], 'B': [3, 4]})
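        # On Python >= 3.6 keyword order is preserved, so D can refer to the
        # C column created earlier in the same assign call.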

        result = df.assign(C=df.A, D=lambda x: x['A'] + x['C'])
        expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]],
                             columns=list('ABCD'))
        assert_frame_equal(result, expected)

        result = df.assign(C=lambda df: df.A,
                           D=lambda df: df['A'] + df['C'])
        expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]],
                             columns=list('ABCD'))
        assert_frame_equal(result, expected)
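A minimal standalone sketch of the same dependent-kwargs behaviour (assuming pandas >= 0.23 on Python 3.6+; the frame and values here are illustrative):

import pandas as pd

df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
# Each callable receives the frame as built so far, so D can use the new C column.
out = df.assign(C=lambda d: d.A * 10, D=lambda d: d.C + d.B)
print(out)
#    A  B   C   D
# 0  1  3  10  13
# 1  2  4  20  24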
Example No. 6
def subtitle_cat(train_df: pd.DataFrame, clue_word: list):
    # build a categorical variable from the subtitle (heading) text
    df = train_df.assign(heading_cat=np.nan)
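    # NOTE (assumption): util.contains_patt presumably builds a regex
    # alternation from clue_word; rows matching neither pattern fall through
    # to category 2 below.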
    df.loc[df.heading.str.contains(r'NO_SUBTITLE'), 'heading_cat'] = 0
    df.loc[df.heading.str.contains(util.contains_patt(clue_word)), 'heading_cat'] = 1
    df.loc[df.heading_cat.isna(), 'heading_cat'] = 2

    return df.heading_cat.astype('category')
Example No. 7
    def test_assign_order(self):
        # GH 9818
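        # With PEP 468 (Python 3.6+) **kwargs keep insertion order, so the new
        # columns appear in call order; older interpreters sorted the kwargs
        # alphabetically, which is why the expected frames differ below.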
        df = DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
        result = df.assign(D=df.A + df.B, C=df.A - df.B)

        if PY36:
            expected = DataFrame([[1, 2, 3, -1], [3, 4, 7, -1]],
                                 columns=list('ABDC'))
        else:
            expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]],
                                 columns=list('ABCD'))
        assert_frame_equal(result, expected)
        result = df.assign(C=df.A - df.B, D=df.A + df.B)

        expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]],
                             columns=list('ABCD'))

        assert_frame_equal(result, expected)
Example No. 8
 def test_assign_bad(self):
     df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
     # non-keyword argument
     with tm.assertRaises(TypeError):
         df.assign(lambda x: x.A)
     with tm.assertRaises(AttributeError):
         df.assign(C=df.A, D=df.A + df.C)
     with tm.assertRaises(KeyError):
         df.assign(C=lambda df: df.A, D=lambda df: df['A'] + df['C'])
     with tm.assertRaises(KeyError):
         df.assign(C=df.A, D=lambda x: x['A'] + x['C'])
Example No. 9
def labeling(sentence_df: pd.DataFrame, train_dict: dict):
    _sentence_df = sentence_df.assign(label=False)
    for _id, train_values in train_dict.items():
        if len(train_values) == 0:
            continue

        _sentence_df.loc[_sentence_df._id == str(_id), 'label'] = \
            _sentence_df.loc[_sentence_df._id == str(_id)].sentence.str.contains(contains_patt(train_values))

    return _sentence_df
Example No. 10
class MergeCategoricals(object):
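    # Benchmark-style class (setup/time_* methods, as used by asv): compares
    # merge() speed when the non-key column is object dtype vs. Categorical.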

    def setup(self):
        self.left_object = DataFrame(
            {'X': np.random.choice(range(0, 10), size=(10000,)),
             'Y': np.random.choice(['one', 'two', 'three'], size=(10000,))})

        self.right_object = DataFrame(
            {'X': np.random.choice(range(0, 10), size=(10000,)),
             'Z': np.random.choice(['jjj', 'kkk', 'sss'], size=(10000,))})

        self.left_cat = self.left_object.assign(
            Y=self.left_object['Y'].astype('category'))
        self.right_cat = self.right_object.assign(
            Z=self.right_object['Z'].astype('category'))

    def time_merge_object(self):
        merge(self.left_object, self.right_object, on='X')

    def time_merge_cat(self):
        merge(self.left_cat, self.right_cat, on='X')
Example No. 11
    def test_assign(self):
        df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
        original = df.copy()
        result = df.assign(C=df.B / df.A)
        expected = df.copy()
        expected['C'] = [4, 2.5, 2]
        assert_frame_equal(result, expected)

        # lambda syntax
        result = df.assign(C=lambda x: x.B / x.A)
        assert_frame_equal(result, expected)

        # original is unmodified
        assert_frame_equal(df, original)

        # Non-Series array-like
        result = df.assign(C=[4, 2.5, 2])
        assert_frame_equal(result, expected)
        # original is unmodified
        assert_frame_equal(df, original)

        result = df.assign(B=df.B / df.A)
        expected = expected.drop('B', axis=1).rename(columns={'C': 'B'})
        assert_frame_equal(result, expected)

        # overwrite
        result = df.assign(A=df.A + df.B)
        expected = df.copy()
        expected['A'] = [5, 7, 9]
        assert_frame_equal(result, expected)

        # lambda
        result = df.assign(A=lambda x: x.A + x.B)
        assert_frame_equal(result, expected)
Example No. 12
 def fix_tickets(
         self, ticket_frame: pd.DataFrame, path_fixes) -> pd.DataFrame:
     ticket_frame.rename(
         columns={'Total changed lines': 'ChangedLines'}, inplace=True)
     ticket_frame = ticket_frame[
         ticket_frame.ChangedLines < 100000]
     ticket_frame = ticket_frame.assign(
         ChangedFiles=ticket_frame['Changed files'].apply(
         partial(self.fix_path_prefixes, path_fixes)))
     fixed_frame = ticket_frame.drop(
         'Changed files', axis=1).sort_values(
         by='CommitDate').reset_index(drop=True)
     fixed_frame.fillna(value={'Found': ''}, axis=0, inplace=True)
     return fixed_frame
Example No. 13
def get_subtitle(sentence_df: pd.DataFrame, wiki_dump_data: list):
    df = sentence_df.assign(heading='')
    new_train_df = pd.DataFrame()
    for _id in df._id.unique():
        article_df = df.loc[df._id == _id]
        
        row_article = [entry for entry in wiki_dump_data if entry['index']['_id'] == _id][0]
        parsed = wtp.parse(row_article['source_text'])
        for source in parsed.sections[1:]:
            heading = _search_subtitle(source.string)
            section_text = _clean_source_text(source)
            article_df = _get_subtitle_of_sentence(article_df, section_text, heading)
        
        article_df = _complement_subtitle(article_df)
        new_train_df = new_train_df.append(article_df)

    return new_train_df
Example No. 14
 def pipe_age_metadata(self, df: pd.DataFrame) -> pd.DataFrame:
     return df.assign(
         date=clean_date_series(df.Date),
         location=self.location,
     )
Example No. 15
def add_totals(df: pd.DataFrame) -> pd.DataFrame:
    return df.assign(total_vaccinations=df.people_fully_vaccinated +
                     df.people_vaccinated)
Example No. 16
def enrich_location(df: pd.DataFrame) -> pd.DataFrame:
    return df.assign(location="Estonia")
Example No. 17
# m.to_csv(open('Data/X.csv', 'w'))
# dataset = dataset1
import traceback

import numpy as np
import pandas as pd
from pandas import DataFrame, read_csv
from sklearn.preprocessing import Imputer  # sklearn < 0.22 (SimpleImputer replaces it later)

# NOTE (assumption): `dataset`, `dataset2`, `concat_free_money` and
# `throw_outliers` are defined earlier in the original script and are
# expected to be in scope here.

dataset1 = dataset.select_dtypes(include=[np.number])
use_field = list(dataset1.columns.values)
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(dataset1)
dataset1 = imp.transform(dataset1)
dataset1 = DataFrame(dataset1, columns=use_field)
dataset1 = concat_free_money(dataset1)
dataset1 = throw_outliers(dataset1)
dataset1 = dataset1.assign(SUBS_ID=dataset.SUBS_ID)
dataset1 = pd.merge(dataset1, dataset2, on='SUBS_ID', how='left')
dataset1 = pd.merge(dataset1, read_csv('Data/X2.csv'), on='SUBS_ID', how='left')
# print(use_field)
gr = dataset1.groupby('SUBS_ID')
dataset = gr.mean()
dataset1 = dataset.copy()

dataset1 = dataset1.drop(['AGE_GROUP1', 'AGE_GROUP2'], axis=1)
print(len(dataset1.columns.values))
# dataset1 = dataset1['SUBS_ID']
# dataset1 = preprocessing.scale(dataset1)

TRAIN_PART = 6/5
train = dataset1[:int(len(dataset1)/TRAIN_PART)]
Example No. 18
def enrich_source(df: pd.DataFrame) -> pd.DataFrame:
    return df.assign(source_url="https://opendata.digilugu.ee")
Example No. 19
 def pipe_source(self, df: pd.DataFrame) -> pd.DataFrame:
     return df.assign(source_url=self.source_url_ref)
Example No. 20
 def pipe_date(self, df: pd.DataFrame) -> pd.DataFrame:
     return df.assign(Date=clean_date_series(df["Date"], "%d %b %y"))
Example No. 21
 def pipe_date(self, df: pd.DataFrame) -> pd.DataFrame:
     df = df.assign(
         date=df.date.apply(clean_date, fmt="%Y-%m-%d", minus_days=1))
     return df
Example No. 22
def regression(
    col1: StContainer,
    col2: StContainer,
    learning_data: pd.DataFrame,
    target: str,
    test_size: float,
    stratify: str,
) -> None:
    """Regrese v dashboardu"""

    # split the data into training and test sets
    y = learning_data[target]
    X = learning_data.drop(columns=[target])
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=test_size,
                                                        stratify=stratify)

    with col1.beta_expander("Výběr modelu"):
        model = st.selectbox("Regresní model", list(REGRESSION_MODELS))
        # store the hyperparameter values in a dict of the form {hyperparameter name: value}
        hyperparams = {
            hyperparam: widget()
            for hyperparam, widget in REGRESSION_MODELS[model]
            ["hyperparams"].items()
        }
        metric = st.selectbox("Metrika", list(METRICS))

    # REGRESSION_MODELS[model]["class"] returns the regressor class, e.g. LinearRegression
    # the hyperparams dict holds the hyperparameter values chosen by the user
    # so we can construct the corresponding regressor like this
    regressor = REGRESSION_MODELS[model]["class"](**hyperparams)
    # try to fit the model
    try:
        regressor.fit(X_train, y_train)
    except Exception as prediction_error:
        # on error, show the user what happened
        st.error(f"Chyba při fitování modelu: {prediction_error}")
        # and do not render anything else
        return

    # predict with the fitted model
    y_predicted = regressor.predict(X_test)
    prediction_error = METRICS[metric](y_predicted, y_test)

    col2.header(f"Výsledek modelu {model}")
    col2.write(f"{metric}: {prediction_error:.3g}")

    # build a helper dataframe with a column holding the predictions
    predicted_target_column = f"{target} - predicted"
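    # assign(**{name: values}) lets the new column name come from a variable;
    # it contains spaces, so it could not be passed as a plain keyword.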
    complete_data = learning_data.assign(
        **{predicted_target_column: regressor.predict(X)})
    # plot the actual vs. predicted points
    fig = px.scatter(complete_data, x=target, y=predicted_target_column)
    # add a line showing the ideal prediction
    fig.add_trace(
        go.Scatter(
            x=[complete_data[target].min(), complete_data[target].max()],
            y=[complete_data[target].min(), complete_data[target].max()],
            mode="lines",
            line=dict(width=2, color="DarkSlateGrey"),
            name="ideal prediction",
        ))
    col2.write(fig)
Example No. 23
def format_date(df: pd.DataFrame) -> pd.DataFrame:
    return df.assign(
        date=pd.to_datetime(df.date, format="%d-%m-%Y").astype(str))
Example No. 24
 def pipe_date(self, df: pd.DataFrame) -> pd.DataFrame:
     return df.assign(date=self._parse_date())
Example No. 25
 def pipe_metadata(self, df: pd.DataFrame) -> pd.DataFrame:
     return df.assign(location=self.location,
                      source_url=self.source_url_ref)
Example No. 26
 def pipe_one_dose_correction(self, df: pd.DataFrame) -> pd.DataFrame:
     single_shot = df.people_fully_vaccinated - df.VacAd2Dose
     return df.assign(people_vaccinated=df.people_vaccinated + single_shot)
Example No. 27
def enrich_columns(df: pd.DataFrame) -> pd.DataFrame:
    return df.assign(location="Portugal",
                     source_url="https://github.com/dssg-pt/covid19pt-data")
Example No. 28
#len(rate_with_go_rmdup_rmna.index)
#10314
for go_term in rate_with_go_rmdup_rmna['GO Term Accession'].drop_duplicates(keep='first'):
    list1 = rate_with_go_rmdup_rmna[rate_with_go_rmdup_rmna['GO Term Accession'] == go_term]['Snake_island_viper_6']
    list2 = rate_with_go_rmdup_rmna[rate_with_go_rmdup_rmna['GO Term Accession'] == go_term]['Black_brow_viper_4']
#paired    u, pvalue = scipy.stats.wilcoxon(list1,list2)
    u, pvalue = scipy.stats.mannwhitneyu(list1,list2,alternative='greater')
    go_u_pvalue.append([go_term,u,pvalue])
#4620
    if pvalue < 0.05:
        go_u_pvalue_sign.append([go_term,u,pvalue])
#7
"""
#go_u_pvalue_sign_df['lable'] = Series(['greater']*len(go_u_pvalue_sign_df), index=go_u_pvalue_sign_df.index)
go_u_pvalue_sign_df.columns = ['GO Term Accession','Mann-Whitney U statistic','p-value']
go_u_pvalue_sign_df_label = go_u_pvalue_sign_df.assign(label = Series(['greater']*len(go_u_pvalue_sign_df)))
go_u_pvalue_sign_less_df.columns = ['GO Term Accession','Mann-Whitney U statistic','p-value']
go_u_pvalue_sign_less_df_label = go_u_pvalue_sign_less_df.assign(label = Series(['less']*len(go_u_pvalue_sign_less_df)))
go_u_pvalue_sign_gl_df = pd.concat([go_u_pvalue_sign_df_label,go_u_pvalue_sign_less_df_label],axis=0,ignore_index=True)
#go_u_pvalue_sign_gl_df_sort = go_u_pvalue_sign_gl_df.sort_values(by='GO Term Accession',ascending=True)

"""
go_u_pvalue_sign_gl_df = go_u_pvalue_sign_df.append(go_u_pvalue_sign_less_df,ignore_index=True)
go_u_pvalue_sign_gl_df = pd.concat([go_u_pvalue_sign_df,go_u_pvalue_sign_less_df],axis=0,ignore_index=True)
go_u_pvalue_sign_gl_df.columns = ['GO Term Accession','Mann-Whitney U statistic','p-value']
go_u_pvalue_sign_gl_df = go_u_pvalue_sign_gl_df.assign(lable=Series(['greater']*len(go_u_pvalue_sign_df)+['less']*len(go_u_pvalue_sign_less_df)))
"""

go_rate_with_sign_p = pd.merge(GO_mean_rate.reset_index().loc[:, ["GO Term Accession", species_focus, species_backgroud]], go_u_pvalue_sign_gl_df, on="GO Term Accession", how='inner', sort=True)
mart_go_uniq_ann = pd.read_table('human_mart_export_GOuniq_sort.txt',header = 'infer',skiprows=[0],skip_blank_lines=True)
go_rate_with_sign_p_ann = pd.merge(mart_go_uniq_ann,go_rate_with_sign_p,on="GO Term Accession",how='inner')
Example No. 29
 def pipe_vaccine(self, df: pd.DataFrame) -> pd.DataFrame:
     return df.assign(vaccine=df.VaccApprov.apply(self._map_vaccines))
Example No. 30
 def test_assign_multiple(self):
     df = DataFrame([[1, 4], [2, 5], [3, 6]], columns=['A', 'B'])
     result = df.assign(C=[7, 8, 9], D=df.A, E=lambda x: x.B)
     expected = DataFrame([[1, 4, 7, 1, 4], [2, 5, 8, 2, 5],
                           [3, 6, 9, 3, 6]], columns=list('ABCDE'))
     assert_frame_equal(result, expected)
Example No. 31
def format_date(input: pd.DataFrame) -> pd.DataFrame:
    return input.assign(
        date=pd.to_datetime(input.date, format="%d/%m/%Y").dt.date)
Example No. 32
def enrich_columns(input: pd.DataFrame) -> pd.DataFrame:
    return input.assign(
        location="Ecuador",
        source_url="https://github.com/andrab/ecuacovid",
        vaccine="Pfizer/BioNTech",
    )
Example No. 33
 def pipe_total_vaccinations(self, df: pd.DataFrame) -> pd.DataFrame:
     return df.assign(total_vaccinations=df.dose_1 + df.dose_2 + df.dose_3)
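The many pipe_* helpers above are single pipeline steps. Below is a minimal sketch of how such steps are typically chained with DataFrame.pipe; the class, attribute values, and column names are illustrative and not taken from any snippet above.

import pandas as pd

class Pipeline:
    location = "Nowhere"
    source_url_ref = "https://example.org/data.csv"

    def pipe_metadata(self, df: pd.DataFrame) -> pd.DataFrame:
        return df.assign(location=self.location, source_url=self.source_url_ref)

    def pipe_total_vaccinations(self, df: pd.DataFrame) -> pd.DataFrame:
        return df.assign(total_vaccinations=df.dose_1 + df.dose_2)

    def pipeline(self, df: pd.DataFrame) -> pd.DataFrame:
        # df.pipe(f) is simply f(df); each step returns a new frame, so steps chain.
        return df.pipe(self.pipe_metadata).pipe(self.pipe_total_vaccinations)

raw = pd.DataFrame({"dose_1": [10, 20], "dose_2": [5, 8]})
print(Pipeline().pipeline(raw))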