def test_serialize_examples_from_dictionary():
    """This module has three methods of serializing per-item features.
    Set up how labels are assigned to objects: reciprocal=True causes
    labels to be the inverse of the loss metric; use it when labels
    should not be binned."""
    reciprocal = False  # Reciprocal of relevance label - use if labels are not binned
    n_bins = 5  # Number of bins for the relevance label
    label_key = 'relevance'
    # These are the per-item feature column names
    peritem_keys = ['by_size', 'n_components', 'clusterer', 'reduce', 'index']

    # Set up connection to SQL
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    # Get all records relating to one customer
    customer_id = 15
    sel = sqlalchemy.select([Clustering.id, Clustering.correct_k])\
        .where(Clustering.customer_id == customer_id)
    res = Insert.core_select_execute(sel)
    primary_keys = [x.id for x in res]
    correct_k = res[0].correct_k
    sel = sqlalchemy.select([Customers.name]).where(Customers.id == customer_id)
    customer_name = Insert.core_select_execute(sel)[0].name

    # Calculate ranking of all records
    records = get_records(primary_keys)
    best_labels = ExtractLabels.calc_labels(records, correct_k,
                                            error_scale=0.8, var_scale=0.2)

    # Build one feature dictionary per label; the loss becomes the
    # relevance label for that example
    example_features = []
    for label in best_labels:
        feature_dict = {}
        for key in peritem_keys:
            feature_dict[key] = label.hyperparameter_dict[key]
        feature_dict[label_key] = label.loss
        example_features.append(feature_dict)

    serialized_example = serialize_examples_from_dictionary(
        example_features,
        label_key,
        peritem_keys,
        reciprocal=reciprocal,
        n_bins=n_bins,
        shuffle_peritem=True)

    return serialized_example
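
# A minimal sketch (not part of the original tests) of the two labeling
# strategies that the reciprocal/n_bins arguments above choose between.
# The helper name and the numpy binning scheme are assumptions for
# illustration, not the library's actual implementation.
def _sketch_relevance_labels(losses, reciprocal=False, n_bins=5):
    """Turn loss metrics into relevance labels.
    reciprocal=True : smaller loss -> larger label (1/loss), unbinned
    reciprocal=False: losses are discretized into n_bins integer bins,
    with the smallest losses receiving the highest relevance.
    Assumes losses are positive and not all identical."""
    import numpy as np
    losses = np.asarray(losses, dtype=np.float64)
    if reciprocal:
        # Inverse of the loss metric; no binning
        return 1.0 / losses
    # Equal-width bins over the loss range; invert so that a low loss
    # maps to a high relevance label in [0, n_bins - 1]
    edges = np.linspace(losses.min(), losses.max(), n_bins + 1)
    bins = np.digitize(losses, edges[1:-1])
    return (n_bins - 1) - bins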
def test_get_database_labels():
    # Set up connection to SQL
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    # Get all records relating to one customer
    customer_id = 15
    sel = sqlalchemy.select([Clustering.id, Clustering.correct_k])\
        .where(Clustering.customer_id == customer_id)
    res = Insert.core_select_execute(sel)
    primary_keys = [x.id for x in res]
    correct_k = res[0].correct_k
    sel = sqlalchemy.select([Customers.name]).where(Customers.id == customer_id)
    customer_name = Insert.core_select_execute(sel)[0].name

    # Calculate ranking of all records
    records = get_records(primary_keys)
    best_labels = ExtractLabels.calc_labels(records, correct_k,
                                            error_scale=0.8, var_scale=0.2)

    return best_labels
ExtractLabels = Labeling.ExtractLabels()

#%% Extract labels for all datasets

# Set up connection to SQL
Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                        driver_name='SQL Server Native Client 10.0',
                        database_name='Clustering')

# Get all records relating to one customer
customer_id = 15
sel = sqlalchemy.select([Clustering.id, Clustering.correct_k])\
    .where(Clustering.customer_id == customer_id)
res = Insert.core_select_execute(sel)
primary_keys = [x.id for x in res]
correct_k = res[0].correct_k
sel = sqlalchemy.select([Customers.name]).where(Customers.id == customer_id)
customer_name = Insert.core_select_execute(sel)[0].name

# Calculate ranking of all records
records = get_records(primary_keys)
best_labels = ExtractLabels.calc_labels(records, correct_k,
                                        error_scale=0.8, var_scale=0.2)

#%% Get the best clustering hyperparameters for a dataset
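
# Added inspection snippet: calc_labels returns label objects carrying a
# hyperparameter_dict and a loss metric (the attributes used throughout
# this module); the slicing in get_hyperparameters_serving below implies
# the list is ordered best-first. Print a few to sanity-check.
for _label in best_labels[:3]:
    print("loss={:.4f} hyperparameters={}".format(
        _label.loss, _label.hyperparameter_dict))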
def save_tfrecord_sql(customer_ids, peritem_keys, label_key, reciprocal,
                      n_bins, tfrecord_writer, shuffle_peritem=True):
    """Save TFRecord EIE format to files for ranking.
    See rank_write_record.py for how Mongo database documents are converted
    to tf.train.Example objects. Here the tf.train.Example objects are
    nested into the Example-in-Example (EIE) format recommended by the
    TensorFlow Ranking library.
    EIE Examples are of the form
    {'serialized_context': tf.train.Feature(
         bytes_list=tf.train.BytesList(value=[value])),
     'serialized_examples': tf.train.Feature(
         bytes_list=tf.train.BytesList(value=value))}
    For 'serialized_context', value is a single serialized tf.train.Example.
    For 'serialized_examples', value is a list of serialized
    tf.train.Example objects that will be ranked according to their
    relevance to the context features.

    Inputs
    -------
    customer_ids : (list) of customer_ids in SQL database to save
    peritem_keys : (list) of string keys that exist in peritem_features.
        Should be ['by_size','n_components','clusterer','reduce','index']
    label_key : (str) key of the relevance label in each per-item
        feature dictionary
    reciprocal : (bool) how labels are assigned to objects. If True, labels
        are the inverse of the loss metric and are not binned
    n_bins : (int) number of bins for the relevance label if reciprocal
        is False
    tfrecord_writer : (tf.io.TFRecordWriter) used to write serialized EIE
        records
    shuffle_peritem : (bool) whether to shuffle the order of per-item
        examples before serialization
    """
    """Create a pipeline for transforming points databases"""
    assert hasattr(customer_ids, '__iter__'), "customer_ids must be iterable"
    msg = "Each ID in customer_ids must be int type, not {}"
    for _id in customer_ids:
        assert isinstance(_id, int), msg.format(type(_id))

    Transform = transform_pipeline.Transform()
    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(remove_dupe=False,
                                             replace_numbers=False,
                                             remove_virtual=True)
    # Create pipeline specifically for clustering text features
    text_pipe = Transform.text_pipeline(vocab_size='all',
                                        attributes='NAME',
                                        seperator='.',
                                        heirarchial_weight_word_pattern=True)
    full_pipeline = Pipeline([
        ('clean_pipe', clean_pipe),
        ('text_pipe', text_pipe),
        ])

    for customer_id in customer_ids:
        print("Saving TFRecord for Customer ID : {}".format(customer_id))

        """Serialize context features -> serialized_context
        This is a serialized tf.train.Example object"""
        # Get Points databases related to customer_id
        # (uses the module-level Insert connection)
        sel = sqlalchemy.select([Points]).where(
            Points.customer_id == customer_id)
        database = Insert.pandas_select_execute(sel)
        sel = sqlalchemy.select([Customers.name])\
            .where(Customers.id == customer_id)
        customer_name = Insert.core_select_execute(sel)[0].name
        if database.shape[0] == 0:
            print(database.shape)
            print(customer_name)
            # Null databases should be skipped
            continue

        # Extract database features from Points database
        try:
            database_features = ExtractLabels.get_database_features(
                database, full_pipeline, instance_name=customer_name)
        except Labeling.PipelineError:
            print("An error occurred while getting database features")
            print("Customer name : {}".format(customer_name))
            print("Customer ID : {}".format(customer_id))
            print(database)
            continue
        context_features = database_features.to_dict(orient='records')[0]
        context_features.pop('instance')
        # Create serialized TFRecord proto
        context_proto_str = serialize_context_from_dictionary(context_features)

        """Serialize per-item features,
        AKA examples or instances that will be ranked.
        This is a list of serialized tf.train.Example objects"""
        # Get a list of Clustering primary keys related to customer_id
        sel = sqlalchemy.select([Clustering.id, Clustering.correct_k])\
            .where(Clustering.customer_id == customer_id)
        res = Insert.core_select_execute(sel)
        if len(res) == 0:
            # No clustering examples were found for the database
            print("Skipped {} No results".format(customer_name))
            continue
        primary_keys = [x.id for x in res]
        correct_k = res[0].correct_k
        # From primary keys create records. Records are used to find
        # example features and labels
        records = get_records(primary_keys)
        # best_labels.hyperparameter_dict values are the peritem_features.
        # The loss metric related to each hyperparameter_dict is the label
        # for each example
        best_labels = ExtractLabels.calc_labels(records, correct_k,
                                                error_scale=0.8,
                                                var_scale=0.2)

        example_features = []
        for label in best_labels:
            feature_dict = {}
            for key in peritem_keys:
                feature_dict[key] = label.hyperparameter_dict[key]
            feature_dict[label_key] = label.loss
            example_features.append(feature_dict)

        peritem_list = serialize_examples_from_dictionary(
            example_features,
            label_key=label_key,
            peritem_keys=peritem_keys,
            reciprocal=reciprocal,
            n_bins=n_bins,
            shuffle_peritem=shuffle_peritem)

        """Prepare serialized feature spec for EIE format"""
        serialized_dict = {
            'serialized_context': _bytes_feature([context_proto_str]),
            'serialized_examples': _bytes_feature(peritem_list)
            }

        # Convert dictionary to tf.train.Example object
        serialized_proto = tf.train.Example(features=tf.train.Features(
            feature=serialized_dict))
        serialized_str = serialized_proto.SerializeToString()

        tfrecord_writer.write(serialized_str)

    tfrecord_writer.close()
    return None
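
# A hedged usage sketch for save_tfrecord_sql (added; not original code).
# The output path and customer_id list are assumptions for illustration.
# Note that save_tfrecord_sql closes the writer itself, so no `with`
# block is used. The parse spec mirrors the EIE feature dict built above.
def _example_save_and_parse_tfrecord():
    peritem_keys = ['by_size', 'n_components', 'clusterer', 'reduce', 'index']
    writer = tf.io.TFRecordWriter('./data/ranking_train.tfrecords')  # assumed path
    save_tfrecord_sql(customer_ids=[15, 16, 17],
                      peritem_keys=peritem_keys,
                      label_key='relevance',
                      reciprocal=False,
                      n_bins=5,
                      tfrecord_writer=writer)

    # Read one EIE record back with the matching feature spec
    eie_spec = {
        'serialized_context': tf.io.FixedLenFeature([1], tf.string),
        'serialized_examples': tf.io.VarLenFeature(tf.string),
        }
    dataset = tf.data.TFRecordDataset('./data/ranking_train.tfrecords')
    for record in dataset.take(1):
        parsed = tf.io.parse_single_example(record, eie_spec)
        print(parsed['serialized_context'].shape)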
def test_plt_best_n_indicies():
    records = get_records([8, 9, 10, 11])
    plt_best_n_indicies(records)
    return None


def test_plt_hyperparameters():
    records = get_records([8, 9, 10, 11])
    plt_hyperparameters(records, hyperparameter_name='clusterer')
    return None


def test_plt_distance():
    records = get_records([8, 9, 10, 11])
    plt_distance(records)
    return None


def test_plt_indicy_accuracy_scatter():
    records = get_records([8, 9, 10, 11])
    plt_indicy_accuracy_scatter(records)
    return None


def test_plt_indicy_accuracy_bar():
    records = get_records([8])
    plt_indicy_accuracy_bar(records)
    return None


def test_get_records():
    primary_keys = [1, 2, 3, 4, 5]
    records = get_records(primary_keys)
    return records
def get_hyperparameters_serving():
    """The ranking model inputs a tensor of context features and per-item
    features. The per-item features are clustering hyperparameters turned
    into indicator columns. In order to predict on a new database, you must
    input the per-item clustering hyperparameters into the model. In
    training, this was done with actual recorded hyperparameters; for
    prediction they must be generated up front. This function generates an
    array of clustering hyperparameters like:
    [['False', 'kmeans', '8', 'TSNE', 'optk_TSNE_gap*_max'],
     ['True', 'ward.D', '8', 'MDS', 'SDbw'],
     [...]]
    This can be fed to tf.feature_columns or TFRecords in order to generate
    inputs to a ranking model for prediction.
    """
    # Instantiate a class for reading SQL data
    Insert = extract.Insert(server_name, driver_name, database_name)

    """Get the most frequent hyperparameter occurrences for each customer.
    Customer IDs are used to retrieve clustering results for each customer"""
    sel = sqlalchemy.select([Customers.id])
    customer_ids = Insert.core_select_execute(sel)

    # Keep track of the best clustering hyperparameters for all datasets
    all_labels = []

    for _id in customer_ids:
        customer_id = _id.id
        """Get primary keys of clusterings related to the customer.
        Each primary key is used to create Record objects with get_records"""
        sel = sqlalchemy.select([Clustering.id, Clustering.correct_k])\
            .where(Clustering.customer_id == customer_id)
        res = Insert.core_select_execute(sel)
        primary_keys = [x.id for x in res]
        # Create records for calculating the best labels
        records = get_records(primary_keys)
        if len(records) <= 1:
            # Not enough examples to append
            continue
        sel = sqlalchemy.select([Clustering.correct_k])\
            .where(Clustering.customer_id == customer_id)\
            .limit(1)
        res = Insert.core_select_execute(sel)
        correct_k = res[0].correct_k
        """best_labels is a list of namedtuple objects. Each tuple has a
        field hyperparameter_dict which contains the hyperparameters used
        to cluster that customer's database. A unique list of clustering
        hyperparameters will be used for model serving"""
        best_labels = ClusteringLabels.calc_labels(records, correct_k,
                                                   error_scale=0.8,
                                                   var_scale=0.2)
        """Keep at most the 10 best best_labels for each customer_id.
        The idea is the ranking model should predict between some of the
        best available hyperparameters"""
        if len(best_labels) > 10:
            all_labels.extend(best_labels[:10])
        else:
            n = int(len(best_labels) * 0.5)
            all_labels.extend(best_labels[:n])

    """Each hyperparameter_dict in all_labels is not unique.
    To create a unique set of dictionary values use the frozenset object.
    The frozenset is hashable (unlike a normal set), which means it can be
    used in Counter objects"""
    hyperparams = []
    for x in all_labels:
        y = x.hyperparameter_dict  # Dictionary
        hyperparams_set = frozenset(y.values())
        hyperparams.append(hyperparams_set)

    # Counter tallies how often each unique hyperparameter set occurs
    c = Counter(hyperparams)

    """Convert back to dictionaries and save in a list, most frequent
    first. Each hyperparameter frozenset is converted back to a normal
    dictionary"""
    hyperparameters_serving = []
    for hyperparams_set, _count in c.most_common():
        hyperparameter_dict = ClusteringLabels._hyperparameter_set_2_dict(
            hyperparams_set)
        hyperparameters_serving.append(hyperparameter_dict)

    return hyperparameters_serving
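
# A self-contained illustration (added for clarity, using made-up values)
# of the frozenset/Counter trick used above: dicts are unhashable, but a
# frozenset of their values is hashable, so Counter can tally duplicates.
def _sketch_unique_hyperparameters():
    from collections import Counter
    dicts = [
        {'by_size': 'False', 'clusterer': 'kmeans', 'reduce': 'TSNE'},
        {'by_size': 'False', 'clusterer': 'kmeans', 'reduce': 'TSNE'},
        {'by_size': 'True', 'clusterer': 'ward.D', 'reduce': 'MDS'},
        ]
    counts = Counter(frozenset(d.values()) for d in dicts)
    # most_common() orders the unique hyperparameter sets by frequency;
    # here the kmeans/TSNE set appears twice and is listed first
    for value_set, count in counts.most_common():
        print(count, sorted(value_set))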