Пример #1
0
    def test_converts_nan_to_empty_strings(self):
        # A NaN cell in a text column is expected to come back as "".
        frame = pd.DataFrame({"A": ["A", "B  ", "  C", np.nan]})
        wanted = pd.DataFrame({"A": ["A", "B", "C", ""]})

        # The return value is not used: the frame itself is compared
        # afterwards, so the helper must modify it in place.
        remove_whitespace(frame)

        assert_frame_equal(frame, wanted)
Пример #2
0
    def test_handles_empty_strings(self):
        # A whitespace-only cell (" ") is expected to collapse to "".
        frame = pd.DataFrame({"A": ["A", "B  ", "  C", " "]})
        wanted = pd.DataFrame({"A": ["A", "B", "C", ""]})

        # Called for its effect on `frame`; the result is checked below.
        remove_whitespace(frame)

        assert_frame_equal(frame, wanted)
Пример #3
0
    def test_remove_leading_and_trailing_spaces_from_dataframe(self):
        # Every cell has some mix of leading/trailing blanks; interior
        # spaces (e.g. "Ed Ed") are expected to survive untouched.
        frame = pd.DataFrame(
            {
                "A": ["A", "B ", "  C", "D  ", "  Ed  ", " 1 "],
                "B": ["Aa", "Bb ", "  Cc", "Dd  ", "  Ed Ed  ", " 11 "],
            }
        )
        wanted = pd.DataFrame(
            {
                "A": ["A", "B", "C", "D", "Ed", "1"],
                "B": ["Aa", "Bb", "Cc", "Dd", "Ed Ed", "11"],
            }
        )

        # Called for its effect on `frame`; the result is checked below.
        remove_whitespace(frame)

        assert_frame_equal(frame, wanted)
Пример #4
0
    def test_ignores_numeric_columns(self):
        # Only the text column "A" should change; the integer column "B"
        # and the float column "C" must compare equal before and after.
        frame = pd.DataFrame(
            {
                "A": ["A", "B  ", "  C"],
                "B": [1, 2, 3],
                "C": [1.1, 2.2, 3.3],
            }
        )
        wanted = pd.DataFrame(
            {
                "A": ["A", "B", "C"],
                "B": [1, 2, 3],
                "C": [1.1, 2.2, 3.3],
            }
        )

        # Called for its effect on `frame`; the result is checked below.
        remove_whitespace(frame)

        assert_frame_equal(frame, wanted)
Пример #5
0
def create_taxa_list_df(df):
    """Build a de-duplicated taxa table from ``df``.

    Steps, in order:

    1. Select the columns listed in
       ``taxa_rank_fields + taxa_fields + pdbd_fields``.
    2. Run ``remove_whitespace`` and ``add_normalized_name_column`` on the
       selection.
    3. Discard rows whose ``normalized_name`` equals the empty string.
    4. Keep only the first row for each
       ``(normalized_name, taxon_group)`` pair.

    The field list and the frame shape after each stage are printed to
    stdout.

    Args:
        df: Source data accepted by ``pd.DataFrame(df, columns=...)``.

    Returns:
        The filtered, de-duplicated ``pandas.DataFrame``.
    """
    fields = taxa_rank_fields + taxa_fields + pdbd_fields
    print("fields:", fields)

    filtered_taxa = pd.DataFrame(df, columns=fields)
    remove_whitespace(filtered_taxa)
    print("initial df: ", filtered_taxa.shape)

    # NOTE(review): presumably adds the "normalized_name" column in place,
    # since the column is read right below -- confirm against the helper.
    add_normalized_name_column(filtered_taxa)

    # Filter with a boolean mask instead of ``drop(<index labels>)``:
    # dropping by label would also remove valid rows that merely share an
    # index label with a blank-name row when the index is not unique.
    # ``.copy()`` gives the in-place ``drop_duplicates`` below an
    # independent frame to work on.
    has_blank_name = filtered_taxa["normalized_name"] == ""
    filtered_taxa = filtered_taxa[~has_blank_name].copy()
    print("remove nontaxa df: ", filtered_taxa.shape)

    filtered_taxa.drop_duplicates(keep="first",
                                  inplace=True,
                                  subset=["normalized_name", "taxon_group"])
    print("drop duplicates df: ", filtered_taxa.shape)

    return filtered_taxa