def test_serialize_examples_from_dictionary():
    """This module has (3) methods of serializing peritem """

    """Set up how I want to assign labels to objects
    Reciprocal will cause labels to be the inverse of the loss metric
    Set to True if I do not want labels to be binned"""
    reciprocal = False # Reciprocal of relevance label - use if you dont bin labels
    n_bins = 5 # number of bins for relevance label

    label_key = 'relevance'

    # These are the per-item feature column names
    peritem_keys = ['by_size','n_components','clusterer','reduce','index']


    # Set up connection to SQL
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    # Get all records relating to one customer
    customer_id = 15
    sel = sqlalchemy.select([Clustering.id, Clustering.correct_k])\
        .where(Clustering.customer_id.__eq__(customer_id))
    res = Insert.core_select_execute(sel)
    primary_keys = [x.id for x in res]
    correct_k = res[0].correct_k

    sel = sqlalchemy.select([Customers.name]).where(Customers.id.__eq__(customer_id))
    customer_name = Insert.core_select_execute(sel)[0].name

    # Calculate ranking of all records
    records = get_records(primary_keys)
    best_labels = ExtractLabels.calc_labels(records, correct_k,
                                            error_scale=0.8, var_scale=0.2)
    example_features = []
    for label in best_labels:
        feature_dict = {}
        for key in peritem_keys:
            feature_dict[key] = label.hyperparameter_dict[key]
        feature_dict[label_key] = label.loss
        example_features.append(feature_dict)

    serialized_example = serialize_examples_from_dictionary(example_features,
                                                            label_key,
                                                            peritem_keys,
                                                            reciprocal=reciprocal,
                                                            n_bins=n_bins,
                                                            shuffle_peritem=True)

    return serialized_example
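
# The test above relies on serialize_examples_from_dictionary to turn raw
# loss values into relevance labels. A minimal sketch of how that labeling
# could work (an assumption, not the project's actual implementation):
import numpy as np

def _relevance_from_loss_sketch(losses, reciprocal=False, n_bins=5):
    """Map loss values to relevance labels: either the reciprocal of the
    loss, or the loss binned into n_bins integer buckets where a higher
    bucket means a lower loss. Hypothetical helper for illustration."""
    losses = np.asarray(losses, dtype=np.float64)
    if reciprocal:
        # Inverse of the loss metric; a small loss yields a large relevance
        return 1.0 / (losses + 1e-9)
    # Bin losses into n_bins equal-width buckets, then invert the bucket
    # index so the lowest-loss examples get the highest relevance label
    edges = np.linspace(losses.min(), losses.max(), n_bins + 1)
    bucket = np.digitize(losses, edges[1:-1])  # integers in [0, n_bins-1]
    return (n_bins - 1) - bucket
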
def test_get_database_labels():

    # Set up connection to SQL
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    # Get all records relating to one customer
    customer_id = 15
    sel = sqlalchemy.select([Clustering.id, Clustering.correct_k])\
        .where(Clustering.customer_id.__eq__(customer_id))
    res = Insert.core_select_execute(sel)
    primary_keys = [x.id for x in res]
    correct_k = res[0].correct_k

    sel = sqlalchemy.select([Customers.name]).where(Customers.id.__eq__(customer_id))
    customer_name = Insert.core_select_execute(sel)[0].name

    # Calculate ranking of all records
    records = get_records(primary_keys)
    best_labels = ExtractLabels.calc_labels(records, correct_k, error_scale=0.8, var_scale=0.2)

    return best_labels
ExtractLabels = Labeling.ExtractLabels()

#%% Extract labels for all datasets

# Set up connection to SQL
Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                        driver_name='SQL Server Native Client 10.0',
                        database_name='Clustering')

# Get all records relating to one customer
customer_id = 15
sel = sqlalchemy.select([Clustering.id, Clustering.correct_k])\
    .where(Clustering.customer_id.__eq__(customer_id))
res = Insert.core_select_execute(sel)
primary_keys = [x.id for x in res]
correct_k = res[0].correct_k

sel = sqlalchemy.select([Customers.name]).where(Customers.id.__eq__(customer_id))
customer_name = Insert.core_select_execute(sel)[0].name

# Calculate ranking of all records
records = get_records(primary_keys)
best_labels = ExtractLabels.calc_labels(records,
                                        correct_k,
                                        error_scale=0.8,
                                        var_scale=0.2)

#%% Get the best clustering hyperparameter for a dataset
def save_tfrecord_sql(customer_ids, peritem_keys, label_key, reciprocal,
                      n_bins, tfrecord_writer, shuffle_peritem=True):
    """Save TFRecord EIE format to files for ranking.
    See rank_write_record.py for how Mongo database documents are
    converted to tf.train.Example objects.
    Here the tf.train.Example objects are nested into the Example-in-Example
    (EIE) format recommended by the tensorflow_ranking library.

    EIE examples are of the form
    {'serialized_context': tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])),
     'serialized_examples': tf.train.Feature(bytes_list=tf.train.BytesList(value=value))}
    For 'serialized_context' the value is a single serialized tf.train.Example.
    For 'serialized_examples' the value is a list of serialized tf.train.Example
    objects that will be ranked according to their relevance to the context
    features.

    Inputs
    -------
    customer_ids : (list) of customer_ids in the SQL database to save
    peritem_keys : (list) of string keys that exist in peritem_features.
        Should be ['by_size','n_components','clusterer','reduce','index']
    label_key : (str) key under which the relevance label is stored in
        each example's feature dictionary
    reciprocal : (bool) how relevance labels are assigned to examples.
        True makes each label the reciprocal of the loss metric and skips
        binning; False bins labels into n_bins
    n_bins : (int) number of bins for the relevance label if reciprocal is False
    tfrecord_writer : (tf.io.TFRecordWriter) used to serialize the EIE TFRecord
    shuffle_peritem : (bool) whether to shuffle the order of per-item examples
    """
    """Create a pipeline for transforming points databases"""

    assert hasattr(customer_ids, '__iter__'), "customer_ids must be iterable"
    msg = "Each ID in customer_ids must be int type, not {}"
    for _id in customer_ids:
        assert isinstance(_id, int), msg.format(type(_id))

    Transform = transform_pipeline.Transform()
    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(remove_dupe=False,
                                             replace_numbers=False,
                                             remove_virtual=True)

    # Create pipeline specifically for clustering text features
    text_pipe = Transform.text_pipeline(vocab_size='all',
                                        attributes='NAME',
                                        seperator='.',
                                        heirarchial_weight_word_pattern=True)

    full_pipeline = Pipeline([
        ('clean_pipe', clean_pipe),
        ('text_pipe', text_pipe),
    ])

    for customer_id in customer_ids:
        print("Saving TFRecord for Customer ID : {}".format(customer_id))
        """Serialize context featuers -> serialized_context
        This is a serialized tf.train.Example object"""
        # Get Points databases related to customer_id
        sel = sqlalchemy.select([Points]).where(
            Points.customer_id.__eq__(customer_id))
        database = Insert.pandas_select_execute(sel)
        sel = sqlalchemy.select([Customers.name
                                 ]).where(Customers.id.__eq__(customer_id))
        customer_name = Insert.core_select_execute(sel)[0].name
        if database.shape[0] == 0:
            print(database.shape)
            print(customer_name)
            # Null databases should be skipped
            continue
        # Extract database features from the Points database
        try:
            database_features = ExtractLabels.get_database_features(
                database, full_pipeline, instance_name=customer_name)
        except Labeling.PipelineError:
            print("An error occured while getting database features")
            print("Customer name : {}".format(customer_name))
            print("Customer ID : {}".format(customer_id))
            print(database)
            continue
        context_features = database_features.to_dict(orient='records')[0]
        context_features.pop('instance')
        # Create serialized TFRecord proto
        context_proto_str = serialize_context_from_dictionary(context_features)
        """Serialize peritem features. AKA examples or instances that will be ranked
        This is a list of serialized tf.train.Example objects"""
        # Get a list of Clustering primary keys related to customer_id
        sel = sqlalchemy.select([Clustering.id, Clustering.correct_k])\
            .where(Clustering.customer_id.__eq__(customer_id))
        res = Insert.core_select_execute(sel)
        if len(res) == 0:
            # No clustering examples were found with the database
            print("Skipped {} No results".format(customer_name))
            continue
        primary_keys = [x.id for x in res]
        correct_k = res[0].correct_k
        # From primary keys create records. Records are used to find
        # Example features and labels
        records = get_records(primary_keys)
        # best_labels.hyperparameter_dict values are the peritem_features
        # The loss metric related to each hyperparameter_dict are labels to
        # each example
        best_labels = ExtractLabels.calc_labels(records,
                                                correct_k,
                                                error_scale=0.8,
                                                var_scale=0.2)
        example_features = []
        for label in best_labels:
            feature_dict = {}
            for key in peritem_keys:
                feature_dict[key] = label.hyperparameter_dict[key]
            feature_dict[label_key] = label.loss
            example_features.append(feature_dict)

        peritem_list = serialize_examples_from_dictionary(
            example_features,
            label_key=label_key,
            peritem_keys=peritem_keys,
            reciprocal=reciprocal,
            n_bins=n_bins,
            shuffle_peritem=shuffle_peritem)
        """Prepare serialized feature spec for EIE format"""
        serialized_dict = {
            'serialized_context': _bytes_feature([context_proto_str]),
            'serialized_examples': _bytes_feature(peritem_list)
        }

        # Convert dictionary to tf.train.Example object
        serialized_proto = tf.train.Example(features=tf.train.Features(
            feature=serialized_dict))
        serialized_str = serialized_proto.SerializeToString()

        tfrecord_writer.write(serialized_str)

    tfrecord_writer.close()

    return None
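
# _bytes_feature is used in save_tfrecord_sql above but is not defined in
# this snippet. A common TensorFlow helper for it looks like the following
# (an assumption; the project's actual definition may differ):
def _bytes_feature(values):
    """Wrap a list of byte strings in a tf.train.Feature."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=values))

# Illustrative call to save_tfrecord_sql; the output path and customer IDs
# are placeholders:
# writer = tf.io.TFRecordWriter('ranking_train.tfrecord')
# save_tfrecord_sql(customer_ids=[15], peritem_keys=peritem_keys,
#                   label_key='relevance', reciprocal=False, n_bins=5,
#                   tfrecord_writer=writer, shuffle_peritem=True)
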
def test_plt_best_n_indicies():
    records = get_records([8, 9, 10, 11])
    plt_best_n_indicies(records)
    return None
def test_plt_hyperparameters():
    records = get_records([8, 9, 10, 11])
    plt_hyperparameters(records, hyperparameter_name='clusterer')
    return None
def test_plt_distance():
    records = get_records([8, 9, 10, 11])
    plt_distance(records)
    return None
def test_plt_indicy_accuracy_scatter():
    records = get_records([8, 9, 10, 11])
    plt_indicy_accuracy_scatter(records)
    return None
def test_plt_indicy_accuracy_bar():
    records = get_records([8])
    plt_indicy_accuracy_bar(records)
    return None
def test_get_records():
    primary_keys = [1, 2, 3, 4, 5]
    records = get_records(primary_keys)
    return records
def get_hyperparameters_serving():
    """The ranking model imputs a tensor of context features and per-item features
    The per-item features are clusterering hyperparameters turned to indicator
    columns.
    In order to predict on a new database, you must input the per-item
    clustering hyperparameters into the model.
    In training, I have been doing this with actual recorded hyperparameters
    For prediction I must generate the clustering hyperparameters. These must
    be known before this module will generate an array of clustering 
    hyperparameters like: 
    [['False', 'kmeans', '8', 'TSNE', 'optk_TSNE_gap*_max'],
     ['True', 'ward.D', '8', 'MDS', 'SDbw'],
     [...]]
    This can be fed to tf.feature_columns or TFRecords in order to generate
    inputs to a ranking model for prediction
    """

    # Instantiate a class for reading SQL data
    Insert = extract.Insert(server_name,
                            driver_name,
                            database_name)

    """Get most frequent hyperparameter occurences for each customer
    Customer IDs are used to retrieve clustering results for each customer"""
    sel = sqlalchemy.select([Customers.id])
    customer_ids = Insert.core_select_execute(sel)

    # Keep track of the best clustering hyperparameters for all datasets
    all_labels = []

    for _id in customer_ids:
        customer_id = _id.id

        """Get primary key of clusterings related to customer
        Each primary key is used to create Record objects with get_records"""
        sel = sqlalchemy.select([Clustering.id, Clustering.correct_k])\
            .where(Clustering.customer_id.__eq__(customer_id))
        res = Insert.core_select_execute(sel)
        primary_keys = [x.id for x in res]

        # Create records for feeding while calculating the best labels
        records = get_records(primary_keys)
        if len(records) <= 1:
            # Not enough examples to append
            continue
        sel = sqlalchemy.select([Clustering.correct_k])\
            .where(Clustering.customer_id.__eq__(customer_id))\
            .limit(1)
        res = Insert.core_select_execute(sel)
        correct_k = res[0].correct_k
        """best_labels is a list of namedtuple objects
        each tuple has a name hyperparameter_dict which contains hyperparameters
        used to cluster that customers database
        A unique list of clustering hyperparameters will be used for model serving"""
        best_labels = ClusteringLabels.calc_labels(records, correct_k, error_scale=0.8, var_scale=0.2)
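        # e.g. best_labels[0].hyperparameter_dict might look like this
        # (illustrative values, mirroring the docstring example in
        # get_hyperparameters_serving):
        # {'by_size': 'False', 'clusterer': 'kmeans', 'n_components': '8',
        #  'reduce': 'TSNE', 'index': 'optk_TSNE_gap*_max'}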

        """Keep the 10 best best_lables for each customer_id
        The idea is we should predict between some of the best available
        hyperparameters for ranking model"""
        if best_labels.__len__() > 10:
            for i in range(0,10):
                all_labels.append(best_labels[i])
        else:
            n = int(best_labels.__len__() * 0.5)
            for i in range(n):
                all_labels.append(best_labels[i])

    """Each hyperparameter_dict in all_labels is not unique
    To create a unique set of dictionary values use the frozenset object
    The frozenset is hashable (unlike normal set) which means it can be used
    in Counter objects"""
    hyperparams = []
    for x in all_labels:
        y = x.hyperparameter_dict # Dictionary
        hyperparams_set = frozenset(y.values())
        hyperparams.append(hyperparams_set)

    # Counter tallies how often each unique hyperparameter set occurs
    c = Counter(hyperparams)

    """Convert each hyperparameter frozenset back to a normal dictionary
    and save it in a list, most frequent set first"""
    hyperparameters_serving = []
    for hyperparams_set, _count in c.most_common():
        hyperparameter_dict = ClusteringLabels._hyperparameter_set_2_dict(hyperparams_set)
        hyperparameters_serving.append(hyperparameter_dict)

    return hyperparameters_serving
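
# Hedged usage sketch: flatten the served hyperparameter dicts into the
# string array shown in the get_hyperparameters_serving docstring. The
# helper name and key order are assumptions for illustration:
def hyperparameters_to_array(hyperparameter_dicts, peritem_keys):
    """Convert hyperparameter dicts into rows of strings ordered by
    peritem_keys. Hypothetical helper."""
    return [[str(d[key]) for key in peritem_keys]
            for d in hyperparameter_dicts]

# Example:
# serving_array = hyperparameters_to_array(
#     get_hyperparameters_serving(),
#     ['by_size', 'n_components', 'clusterer', 'reduce', 'index'])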