def prune_characters(char_occ_dict, threshold=0.1):
    import numpy as np
    from dirty_cat import SimilarityEncoder
    from sklearn.preprocessing import minmax_scale
    from scipy.cluster.hierarchy import linkage
    from scipy.spatial.distance import squareform

    # Pairwise Jaro-Winkler similarities between the sorted character names.
    simenc = SimilarityEncoder(similarity='jaro-winkler')
    transf = simenc.fit_transform(
        np.array(sorted(char_occ_dict.keys())).reshape(-1, 1))

    # Turn similarities into distances scaled to the [0, 1] range.
    corr_dist = minmax_scale(-transf)

    # Condense the square distance matrix and cluster hierarchically.
    dense_distance = squareform(corr_dist, checks=False)
    Z = linkage(dense_distance, 'average', optimal_ordering=True)

    return get_merged_characters(Z, char_occ_dict, threshold=threshold)
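# A minimal usage sketch (the data here is made up, and it assumes
# get_merged_characters is defined elsewhere in this module): character
# names that are near-duplicates under Jaro-Winkler similarity should end
# up merged into the dominant spelling.
char_occ = {'Daenerys': 120, 'Daenarys': 3, 'Tyrion': 98, 'Tirion': 2}
merged = prune_characters(char_occ, threshold=0.1)
print(merged)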
"LastPerformanceReview_Date", "EmpStatusID", "TermReason", ], axis=1, ) X.info() date_cols = X.select_dtypes("datetime") for col in date_cols: X = encode_dates(X, col) encode_columns = ["Employee_Name", "Position", "ManagerName"] enc = SimilarityEncoder(similarity="ngram", categories="k-means", n_prototypes=4) for col in encode_columns: transformed_values = enc.fit_transform(X[col].values.reshape(-1, 1)) transformed_values = pd.DataFrame(transformed_values, index=X.index) transformed_values.columns = [f"{col}_" + str(num) for num in transformed_values] X = pd.concat([X, transformed_values], axis=1) X = X.drop(col, axis=1) obj_cols = X.select_dtypes("object").columns X[obj_cols] = X[obj_cols].astype("category") SEED = 0 SAMPLE_SIZE = 5000 Xt, Xv, yt, yv = train_test_split( X, y, random_state=SEED ) # split into train and validation set
# data
values = data[['Employee Position Title', 'Gender', 'Current Annual Salary']]

#########################################################################
# String similarity between entries
# ---------------------------------
#
# That's where our encoders come into play. In order to robustly
# embed dirty semantic data, the SimilarityEncoder creates a similarity
# matrix based on the 3-gram structure of the data.
sorted_values = values['Employee Position Title'].sort_values().unique()

from dirty_cat import SimilarityEncoder

similarity_encoder = SimilarityEncoder(similarity='ngram')
transformed_values = similarity_encoder.fit_transform(
    sorted_values.reshape(-1, 1))

#########################################################################
# Plotting the new representation using multi-dimensional scaling
# ................................................................
#
# Let's now plot a couple of points at random, using a low-dimensional
# representation to get an intuition of what the SimilarityEncoder
# is doing:
from sklearn.manifold import MDS

mds = MDS(dissimilarity='precomputed', n_init=10, random_state=42)
# The transformed values lie in the [0, 1] range, so 1 - transformed_values
# yields a positive dissimilarity matrix.
two_dim_data = mds.fit_transform(1 - transformed_values)
print(two_dim_data.shape)
print(sorted_values.shape)
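#########################################################################
# A sketch of the plot itself (assuming matplotlib is available): scatter
# all job titles in the 2D MDS embedding and annotate a few of them at
# random, so that similar titles landing close together becomes visible.
import numpy as np
import matplotlib.pyplot as plt

rng = np.random.RandomState(42)
random_indices = rng.choice(len(sorted_values), 5, replace=False)

plt.figure()
plt.scatter(two_dim_data[:, 0], two_dim_data[:, 1], s=10, alpha=0.3)
for idx in random_indices:
    plt.annotate(sorted_values[idx], two_dim_data[idx])
plt.show()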
string_columns = metadata_df.select_dtypes(include=['object']).columns
# [(col, metadata_df[col].map(type).unique()) for col in string_columns]
string_columns = string_columns.tolist()
string_columns.remove('data')

# Concatenate all remaining string columns into one text field per row.
metadata_df['all'] = metadata_df[string_columns].apply(
    lambda row: row.str.cat(sep=' '), axis=1)

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
corpus = [metadata_df.iloc[ii]['all'] for ii in range(metadata_df.shape[0])]
bag_of_words = vectorizer.fit_transform(corpus)

# Rank the vocabulary by total frequency across the corpus.
sum_words = bag_of_words.sum(axis=0)
words_freq = [(word, sum_words[0, idx])
              for word, idx in vectorizer.vocabulary_.items()]
words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
frequent_words = [w[0] for w in words_freq]
print(' '.join(frequent_words[:100]))

from dirty_cat import SimilarityEncoder

se = SimilarityEncoder(similarity='ngram', handle_unknown='ignore')
y = se.fit_transform(
    metadata_df['name'].values.reshape(-1, 1))  # XXX: need more features than 1 ...

# Look at the metadata.
if True:
    metadata_df.to_csv('metadata.csv', encoding='utf-8')
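# The XXX note above flags that a single similarity-encoded column is a
# thin feature set. One possible way to widen it (a sketch, not part of
# the original script) is to stack the bag-of-words counts next to the
# similarity features:
from scipy.sparse import csr_matrix, hstack

features = hstack([bag_of_words, csr_matrix(y)])
print(features.shape)  # (n_rows, n_vocabulary_words + n_unique_names)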