def test_serialize_examples_from_dictionary():
    """This module has three methods of serializing per-item features.
    Set up how labels are assigned to objects: reciprocal=True causes
    labels to be the inverse of the loss metric; use it when labels
    should not be binned."""
    reciprocal = False  # Reciprocal of relevance label - use if labels are not binned
    n_bins = 5  # Number of bins for the relevance label
    label_key = 'relevance'
    # These are the per-item feature column names
    peritem_keys = ['by_size', 'n_components', 'clusterer', 'reduce', 'index']

    # Set up connection to SQL
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    # Get all records relating to one customer
    customer_id = 15
    sel = sqlalchemy.select([Clustering.id, Clustering.correct_k])\
        .where(Clustering.customer_id == customer_id)
    res = Insert.core_select_execute(sel)
    primary_keys = [x.id for x in res]
    correct_k = res[0].correct_k
    sel = sqlalchemy.select([Customers.name]).where(Customers.id == customer_id)
    customer_name = Insert.core_select_execute(sel)[0].name

    # Calculate ranking of all records
    records = get_records(primary_keys)
    best_labels = ExtractLabels.calc_labels(records, correct_k,
                                            error_scale=0.8, var_scale=0.2)

    # Build one feature dictionary per label; the loss becomes the
    # relevance label for that example
    example_features = []
    for label in best_labels:
        feature_dict = {}
        for key in peritem_keys:
            feature_dict[key] = label.hyperparameter_dict[key]
        feature_dict[label_key] = label.loss
        example_features.append(feature_dict)

    serialized_example = serialize_examples_from_dictionary(
        example_features,
        label_key,
        peritem_keys,
        reciprocal=reciprocal,
        n_bins=n_bins,
        shuffle_peritem=True)

    return serialized_example
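
# A minimal sketch (not part of the original tests) of the two labeling
# strategies that the reciprocal/n_bins arguments above choose between.
# The helper name and the numpy binning scheme are assumptions for
# illustration, not the library's actual implementation.
def _sketch_relevance_labels(losses, reciprocal=False, n_bins=5):
    """Turn loss metrics into relevance labels.
    reciprocal=True : smaller loss -> larger label (1/loss), unbinned
    reciprocal=False: losses are discretized into n_bins integer bins,
    with the smallest losses receiving the highest relevance.
    Assumes losses are positive and not all identical."""
    import numpy as np
    losses = np.asarray(losses, dtype=np.float64)
    if reciprocal:
        # Inverse of the loss metric; no binning
        return 1.0 / losses
    # Equal-width bins over the loss range; invert so that a low loss
    # maps to a high relevance label in [0, n_bins - 1]
    edges = np.linspace(losses.min(), losses.max(), n_bins + 1)
    bins = np.digitize(losses, edges[1:-1])
    return (n_bins - 1) - bins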
def test_get_database_labels():
    # Set up connection to SQL
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    # Get all records relating to one customer
    customer_id = 15
    sel = sqlalchemy.select([Clustering.id, Clustering.correct_k])\
        .where(Clustering.customer_id == customer_id)
    res = Insert.core_select_execute(sel)
    primary_keys = [x.id for x in res]
    correct_k = res[0].correct_k
    sel = sqlalchemy.select([Customers.name]).where(Customers.id == customer_id)
    customer_name = Insert.core_select_execute(sel)[0].name

    # Calculate ranking of all records
    records = get_records(primary_keys)
    best_labels = ExtractLabels.calc_labels(records, correct_k,
                                            error_scale=0.8, var_scale=0.2)

    return best_labels
ExtractLabels = Labeling.ExtractLabels()

#%% Extract labels for all datasets

# Set up connection to SQL
Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                        driver_name='SQL Server Native Client 10.0',
                        database_name='Clustering')

# Get all records relating to one customer
customer_id = 15
sel = sqlalchemy.select([Clustering.id, Clustering.correct_k])\
    .where(Clustering.customer_id == customer_id)
res = Insert.core_select_execute(sel)
primary_keys = [x.id for x in res]
correct_k = res[0].correct_k
sel = sqlalchemy.select([Customers.name]).where(Customers.id == customer_id)
customer_name = Insert.core_select_execute(sel)[0].name

# Calculate ranking of all records
records = get_records(primary_keys)
best_labels = ExtractLabels.calc_labels(records, correct_k,
                                        error_scale=0.8, var_scale=0.2)

#%% Get the best clustering hyperparameters for a dataset
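
# Added inspection snippet: calc_labels returns label objects carrying a
# hyperparameter_dict and a loss metric (the attributes used throughout
# this module); the slicing in get_hyperparameters_serving below implies
# the list is ordered best-first. Print a few to sanity-check.
for _label in best_labels[:3]:
    print("loss={:.4f} hyperparameters={}".format(
        _label.loss, _label.hyperparameter_dict))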
def save_tfrecord_sql(customer_ids, peritem_keys, label_key, reciprocal,
                      n_bins, tfrecord_writer, shuffle_peritem=True):
    """Save TFRecord EIE format to files for ranking.
    See rank_write_record.py for how Mongo database documents are converted
    to tf.train.Example objects. Here the tf.train.Example objects are
    nested into the Example-in-Example (EIE) format recommended by the
    TensorFlow Ranking library.
    EIE Examples are of the form
    {'serialized_context': tf.train.Feature(
         bytes_list=tf.train.BytesList(value=[value])),
     'serialized_examples': tf.train.Feature(
         bytes_list=tf.train.BytesList(value=value))}
    For 'serialized_context', value is a single serialized tf.train.Example.
    For 'serialized_examples', value is a list of serialized
    tf.train.Example objects that will be ranked according to their
    relevance to the context features.

    Inputs
    -------
    customer_ids : (list) of customer_ids in SQL database to save
    peritem_keys : (list) of string keys that exist in peritem_features.
        Should be ['by_size','n_components','clusterer','reduce','index']
    label_key : (str) key of the relevance label in each per-item
        feature dictionary
    reciprocal : (bool) how labels are assigned to objects. If True, labels
        are the inverse of the loss metric and are not binned
    n_bins : (int) number of bins for the relevance label if reciprocal
        is False
    tfrecord_writer : (tf.io.TFRecordWriter) used to write serialized EIE
        records
    shuffle_peritem : (bool) whether to shuffle the order of per-item
        examples before serialization
    """
    """Create a pipeline for transforming points databases"""
    assert hasattr(customer_ids, '__iter__'), "customer_ids must be iterable"
    msg = "Each ID in customer_ids must be int type, not {}"
    for _id in customer_ids:
        assert isinstance(_id, int), msg.format(type(_id))

    Transform = transform_pipeline.Transform()
    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(remove_dupe=False,
                                             replace_numbers=False,
                                             remove_virtual=True)
    # Create pipeline specifically for clustering text features
    text_pipe = Transform.text_pipeline(vocab_size='all',
                                        attributes='NAME',
                                        seperator='.',
                                        heirarchial_weight_word_pattern=True)
    full_pipeline = Pipeline([
        ('clean_pipe', clean_pipe),
        ('text_pipe', text_pipe),
        ])

    for customer_id in customer_ids:
        print("Saving TFRecord for Customer ID : {}".format(customer_id))

        """Serialize context features -> serialized_context
        This is a serialized tf.train.Example object"""
        # Get Points databases related to customer_id
        # (uses the module-level Insert connection)
        sel = sqlalchemy.select([Points]).where(
            Points.customer_id == customer_id)
        database = Insert.pandas_select_execute(sel)
        sel = sqlalchemy.select([Customers.name])\
            .where(Customers.id == customer_id)
        customer_name = Insert.core_select_execute(sel)[0].name
        if database.shape[0] == 0:
            print(database.shape)
            print(customer_name)
            # Null databases should be skipped
            continue

        # Extract database features from Points database
        try:
            database_features = ExtractLabels.get_database_features(
                database, full_pipeline, instance_name=customer_name)
        except Labeling.PipelineError:
            print("An error occurred while getting database features")
            print("Customer name : {}".format(customer_name))
            print("Customer ID : {}".format(customer_id))
            print(database)
            continue
        context_features = database_features.to_dict(orient='records')[0]
        context_features.pop('instance')
        # Create serialized TFRecord proto
        context_proto_str = serialize_context_from_dictionary(context_features)

        """Serialize per-item features,
        AKA examples or instances that will be ranked.
        This is a list of serialized tf.train.Example objects"""
        # Get a list of Clustering primary keys related to customer_id
        sel = sqlalchemy.select([Clustering.id, Clustering.correct_k])\
            .where(Clustering.customer_id == customer_id)
        res = Insert.core_select_execute(sel)
        if len(res) == 0:
            # No clustering examples were found for the database
            print("Skipped {} No results".format(customer_name))
            continue
        primary_keys = [x.id for x in res]
        correct_k = res[0].correct_k
        # From primary keys create records. Records are used to find
        # example features and labels
        records = get_records(primary_keys)
        # best_labels.hyperparameter_dict values are the peritem_features.
        # The loss metric related to each hyperparameter_dict is the label
        # for each example
        best_labels = ExtractLabels.calc_labels(records, correct_k,
                                                error_scale=0.8,
                                                var_scale=0.2)

        example_features = []
        for label in best_labels:
            feature_dict = {}
            for key in peritem_keys:
                feature_dict[key] = label.hyperparameter_dict[key]
            feature_dict[label_key] = label.loss
            example_features.append(feature_dict)

        peritem_list = serialize_examples_from_dictionary(
            example_features,
            label_key=label_key,
            peritem_keys=peritem_keys,
            reciprocal=reciprocal,
            n_bins=n_bins,
            shuffle_peritem=shuffle_peritem)

        """Prepare serialized feature spec for EIE format"""
        serialized_dict = {
            'serialized_context': _bytes_feature([context_proto_str]),
            'serialized_examples': _bytes_feature(peritem_list)
            }

        # Convert dictionary to tf.train.Example object
        serialized_proto = tf.train.Example(features=tf.train.Features(
            feature=serialized_dict))
        serialized_str = serialized_proto.SerializeToString()

        tfrecord_writer.write(serialized_str)

    tfrecord_writer.close()
    return None
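
# A hedged usage sketch for save_tfrecord_sql (added; not original code).
# The output path and customer_id list are assumptions for illustration.
# Note that save_tfrecord_sql closes the writer itself, so no `with`
# block is used. The parse spec mirrors the EIE feature dict built above.
def _example_save_and_parse_tfrecord():
    peritem_keys = ['by_size', 'n_components', 'clusterer', 'reduce', 'index']
    writer = tf.io.TFRecordWriter('./data/ranking_train.tfrecords')  # assumed path
    save_tfrecord_sql(customer_ids=[15, 16, 17],
                      peritem_keys=peritem_keys,
                      label_key='relevance',
                      reciprocal=False,
                      n_bins=5,
                      tfrecord_writer=writer)

    # Read one EIE record back with the matching feature spec
    eie_spec = {
        'serialized_context': tf.io.FixedLenFeature([1], tf.string),
        'serialized_examples': tf.io.VarLenFeature(tf.string),
        }
    dataset = tf.data.TFRecordDataset('./data/ranking_train.tfrecords')
    for record in dataset.take(1):
        parsed = tf.io.parse_single_example(record, eie_spec)
        print(parsed['serialized_context'].shape)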
def test_plt_best_n_indicies():
    records = get_records([8, 9, 10, 11])
    plt_best_n_indicies(records)
    return None


def test_plt_hyperparameters():
    records = get_records([8, 9, 10, 11])
    plt_hyperparameters(records, hyperparameter_name='clusterer')
    return None


def test_plt_distance():
    records = get_records([8, 9, 10, 11])
    plt_distance(records)
    return None


def test_plt_indicy_accuracy_scatter():
    records = get_records([8, 9, 10, 11])
    plt_indicy_accuracy_scatter(records)
    return None


def test_plt_indicy_accuracy_bar():
    records = get_records([8])
    plt_indicy_accuracy_bar(records)
    return None


def test_get_records():
    primary_keys = [1, 2, 3, 4, 5]
    records = get_records(primary_keys)
    return records
def get_hyperparameters_serving():
    """The ranking model inputs a tensor of context features and per-item
    features. The per-item features are clustering hyperparameters turned
    into indicator columns. In order to predict on a new database, you must
    input the per-item clustering hyperparameters into the model. In
    training, this was done with actual recorded hyperparameters; for
    prediction they must be generated up front. This function generates an
    array of clustering hyperparameters like:
    [['False', 'kmeans', '8', 'TSNE', 'optk_TSNE_gap*_max'],
     ['True', 'ward.D', '8', 'MDS', 'SDbw'],
     [...]]
    This can be fed to tf.feature_columns or TFRecords in order to generate
    inputs to a ranking model for prediction.
    """
    # Instantiate a class for reading SQL data
    Insert = extract.Insert(server_name, driver_name, database_name)

    """Get the most frequent hyperparameter occurrences for each customer.
    Customer IDs are used to retrieve clustering results for each customer"""
    sel = sqlalchemy.select([Customers.id])
    customer_ids = Insert.core_select_execute(sel)

    # Keep track of the best clustering hyperparameters for all datasets
    all_labels = []

    for _id in customer_ids:
        customer_id = _id.id
        """Get primary keys of clusterings related to the customer.
        Each primary key is used to create Record objects with get_records"""
        sel = sqlalchemy.select([Clustering.id, Clustering.correct_k])\
            .where(Clustering.customer_id == customer_id)
        res = Insert.core_select_execute(sel)
        primary_keys = [x.id for x in res]
        # Create records for calculating the best labels
        records = get_records(primary_keys)
        if len(records) <= 1:
            # Not enough examples to append
            continue
        sel = sqlalchemy.select([Clustering.correct_k])\
            .where(Clustering.customer_id == customer_id)\
            .limit(1)
        res = Insert.core_select_execute(sel)
        correct_k = res[0].correct_k
        """best_labels is a list of namedtuple objects. Each tuple has a
        field hyperparameter_dict which contains the hyperparameters used
        to cluster that customer's database. A unique list of clustering
        hyperparameters will be used for model serving"""
        best_labels = ClusteringLabels.calc_labels(records, correct_k,
                                                   error_scale=0.8,
                                                   var_scale=0.2)
        """Keep at most the 10 best best_labels for each customer_id.
        The idea is the ranking model should predict between some of the
        best available hyperparameters"""
        if len(best_labels) > 10:
            all_labels.extend(best_labels[:10])
        else:
            n = int(len(best_labels) * 0.5)
            all_labels.extend(best_labels[:n])

    """Each hyperparameter_dict in all_labels is not unique.
    To create a unique set of dictionary values use the frozenset object.
    The frozenset is hashable (unlike a normal set), which means it can be
    used in Counter objects"""
    hyperparams = []
    for x in all_labels:
        y = x.hyperparameter_dict  # Dictionary
        hyperparams_set = frozenset(y.values())
        hyperparams.append(hyperparams_set)

    # Counter tallies how often each unique hyperparameter set occurs
    c = Counter(hyperparams)

    """Convert back to dictionaries and save in a list, most frequent
    first. Each hyperparameter frozenset is converted back to a normal
    dictionary"""
    hyperparameters_serving = []
    for hyperparams_set, _count in c.most_common():
        hyperparameter_dict = ClusteringLabels._hyperparameter_set_2_dict(
            hyperparams_set)
        hyperparameters_serving.append(hyperparameter_dict)

    return hyperparameters_serving
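
# A self-contained illustration (added for clarity, using made-up values)
# of the frozenset/Counter trick used above: dicts are unhashable, but a
# frozenset of their values is hashable, so Counter can tally duplicates.
def _sketch_unique_hyperparameters():
    from collections import Counter
    dicts = [
        {'by_size': 'False', 'clusterer': 'kmeans', 'reduce': 'TSNE'},
        {'by_size': 'False', 'clusterer': 'kmeans', 'reduce': 'TSNE'},
        {'by_size': 'True', 'clusterer': 'ward.D', 'reduce': 'MDS'},
        ]
    counts = Counter(frozenset(d.values()) for d in dicts)
    # most_common() orders the unique hyperparameter sets by frequency;
    # here the kmeans/TSNE set appears twice and is listed first
    for value_set, count in counts.most_common():
        print(count, sorted(value_set))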