Example #1
def prune_characters(char_occ_dict, threshold=0.1):
    import numpy as np
    from dirty_cat import SimilarityEncoder
    from sklearn.preprocessing import minmax_scale
    from scipy.cluster.hierarchy import linkage
    from scipy.spatial.distance import squareform
    # Pairwise Jaro-Winkler similarity between the sorted character names.
    simenc = SimilarityEncoder(similarity='jaro-winkler')
    transf = simenc.fit_transform(np.array(sorted(char_occ_dict.keys())).reshape(-1, 1))
    # Turn similarities into distances in [0, 1] and condense the square
    # matrix for scipy (checks=False tolerates slight asymmetry).
    corr_dist = minmax_scale(-transf)
    dense_distance = squareform(corr_dist, checks=False)
    # Cluster the names hierarchically so near-duplicate spellings group together.
    Z = linkage(dense_distance, 'average', optimal_ordering=True)
    return get_merged_characters(Z, char_occ_dict, threshold=threshold)
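
# `get_merged_characters` is not defined in this snippet; below is a
# minimal sketch of a helper with the signature the call above assumes:
# cut the dendrogram at `threshold` and pool the occurrence counts of
# character names that fall in the same cluster.
def get_merged_characters(Z, char_occ_dict, threshold=0.1):
    from scipy.cluster.hierarchy import fcluster
    names = sorted(char_occ_dict.keys())  # same order as the encoding above
    labels = fcluster(Z, t=threshold, criterion='distance')
    merged = {}
    for name, label in zip(names, labels):
        entry = merged.setdefault(label, {'name': name, 'count': 0})
        entry['count'] += char_occ_dict[name]
    # keep the first spelling seen in each cluster as its representative
    return {v['name']: v['count'] for v in merged.values()}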
Example #2
        "LastPerformanceReview_Date",
        "EmpStatusID",
        "TermReason",
    ],
    axis=1,
)

X.info()
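
# `encode_dates` is not defined in this snippet; a minimal sketch of a
# helper with the behaviour the loop below assumes (expand a datetime
# column into numeric parts, then drop the original):
def encode_dates(df, col):
    df = df.copy()
    df[f"{col}_year"] = df[col].dt.year
    df[f"{col}_month"] = df[col].dt.month
    df[f"{col}_day"] = df[col].dt.day
    return df.drop(col, axis=1)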
date_cols = X.select_dtypes("datetime")
for col in date_cols:
    X = encode_dates(X, col)

# Similarity-encode the high-cardinality string columns: with
# categories="k-means" and n_prototypes=4, each string becomes four
# similarity features against k-means-chosen prototype strings.
encode_columns = ["Employee_Name", "Position", "ManagerName"]
enc = SimilarityEncoder(similarity="ngram", categories="k-means", n_prototypes=4)
for col in encode_columns:
    transformed_values = enc.fit_transform(X[col].values.reshape(-1, 1))
    transformed_values = pd.DataFrame(transformed_values, index=X.index)
    transformed_values.columns = [f"{col}_{num}" for num in range(transformed_values.shape[1])]
    X = pd.concat([X, transformed_values], axis=1)
    X = X.drop(col, axis=1)

obj_cols = X.select_dtypes("object").columns
X[obj_cols] = X[obj_cols].astype("category")


SEED = 0
SAMPLE_SIZE = 5000

Xt, Xv, yt, yv = train_test_split(
    X, y, random_state=SEED
)  # split into train and validation set
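
# SAMPLE_SIZE is defined above but never used in this fragment; a
# minimal sketch of a plausible next step, assuming the training set is
# subsampled to SAMPLE_SIZE rows and the target is a class label. The
# estimator is illustrative; LightGBM is picked here because it consumes
# pandas `category` columns natively.
from lightgbm import LGBMClassifier

Xt_small = Xt.sample(min(SAMPLE_SIZE, len(Xt)), random_state=SEED)
yt_small = yt.loc[Xt_small.index]

model = LGBMClassifier(random_state=SEED)
model.fit(Xt_small, yt_small)
print("validation accuracy:", model.score(Xv, yv))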
Example #3
# data
values = data[['Employee Position Title', 'Gender', 'Current Annual Salary']]

#########################################################################
# String similarity between entries
# -------------------------------------------------
#
# This is where our encoders come into play. In order to robustly
# embed dirty semantic data, the SimilarityEncoder creates a similarity
# matrix based on the 3-gram structure of the data.
sorted_values = values['Employee Position Title'].sort_values().unique()

from dirty_cat import SimilarityEncoder

similarity_encoder = SimilarityEncoder(similarity='ngram')
transformed_values = similarity_encoder.fit_transform(
    sorted_values.reshape(-1, 1))
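
# Before reducing dimensions, a quick way to eyeball the encoding (a
# sketch, not in the original snippet): plot a corner of the pairwise
# similarity matrix as a heatmap, with the titles as tick labels.
import matplotlib.pyplot as plt

n = min(25, len(sorted_values))
fig, ax = plt.subplots(figsize=(8, 8))
ax.matshow(transformed_values[:n, :n])
ax.set_xticks(range(n))
ax.set_yticks(range(n))
ax.set_xticklabels(sorted_values[:n], rotation=90)
ax.set_yticklabels(sorted_values[:n])
plt.show()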

#########################################################################
# Plotting the new representation using multi-dimensional scaling
# ................................................................
#
# Let's now plot a few points at random, using a low-dimensional
# representation, to get an intuition of what the similarity encoder is doing:
from sklearn.manifold import MDS

mds = MDS(dissimilarity='precomputed', n_init=10, random_state=42)
# The encoded similarities lie in the [0, 1] range, so 1 - similarity
# yields a non-negative dissimilarity matrix suitable for MDS.
two_dim_data = mds.fit_transform(1 - transformed_values)
print(two_dim_data.shape)
print(sorted_values.shape)
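
# A sketch of the plot described above (not in the original snippet):
# scatter a random subset of the embedded points and label each one with
# its job title. Assumes there are at least 15 distinct titles.
import numpy as np
import matplotlib.pyplot as plt

rng = np.random.RandomState(42)
indices = rng.choice(len(sorted_values), size=15, replace=False)

plt.figure(figsize=(8, 8))
plt.scatter(two_dim_data[indices, 0], two_dim_data[indices, 1])
for i in indices:
    plt.annotate(sorted_values[i], (two_dim_data[i, 0], two_dim_data[i, 1]))
plt.show()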
Example #4
string_columns = metadata_df.select_dtypes(include=['object']).columns
# [(col, metadata_df[col].map(type).unique()) for col in string_columns]
string_columns = string_columns.tolist()
string_columns.remove('data')
# concatenate the remaining string columns into a single text field
metadata_df['all'] = metadata_df[string_columns].apply(
    lambda row: row.str.cat(sep=' '), axis=1)

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

corpus = metadata_df['all'].tolist()
bag_of_words = vectorizer.fit_transform(corpus)
# rank the vocabulary by total count across the corpus
sum_words = bag_of_words.sum(axis=0)
words_freq = [(word, sum_words[0, idx])
              for word, idx in vectorizer.vocabulary_.items()]
words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
frequent_words = [w[0] for w in words_freq]

print(' '.join(frequent_words[:100]))

from dirty_cat import SimilarityEncoder

se = SimilarityEncoder(similarity='ngram', handle_unknown='ignore')
# fit_transform expects a 2-D input, hence the double brackets
y = se.fit_transform(metadata_df[['name']])  # XXX: need more features than 1 ...
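
# The XXX note above flags that a single column is a thin input; a
# minimal sketch of one way forward, assuming the concatenated 'all'
# column built earlier is a sensible richer text to encode instead:
X_text = se.fit_transform(metadata_df[['all']])
print(X_text.shape)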

# dump the metadata to disk for inspection
metadata_df.to_csv('metadata.csv', encoding='utf-8')