Example #1
def test_get_text_vocabulary():
    """Generate data to find Vocabulary"""
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    sel = sqlalchemy.select([Points])
    dataset_raw = Insert.pandas_select_execute(sel)

    # Transform pipeline
    Transform = transform_pipeline.Transform()
    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(drop_attributes=None,
                                             nan_replace_dict=None,
                                             dtype_dict=None,
                                             unit_dict=None,
                                             remove_dupe=True,
                                             replace_numbers=True,
                                             remove_virtual=True)

    df_clean = clean_pipe.fit_transform(dataset_raw)

    # Get vocabulary for DESCRIPTOR feature - a text feature
    VocabularyText = transform_pipeline.VocabularyText()
    vocabulary = VocabularyText\
        .get_text_vocabulary(X=df_clean,
                             col_name='DESCRIPTOR',
                             remove_suffix=False,
                             max_features=80)

    # Save vocabulary
    file_name = r'../data/vocab_descriptor.txt'
    transform_pipeline.VocabularyText.save_vocabulary(vocabulary, file_name)

    return None
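
# The project's VocabularyText.get_text_vocabulary implementation is not shown
# above, so the following is only a minimal, hypothetical sketch of fixed-size
# vocabulary extraction from a text column using scikit-learn's CountVectorizer;
# the toy descriptor strings and the output file name are illustrative only.
from sklearn.feature_extraction.text import CountVectorizer

descriptors = ['supply air temp', 'return air temp', 'chilled water flow']
vectorizer = CountVectorizer(max_features=80)  # keep at most 80 terms
vectorizer.fit(descriptors)
vocab_terms = sorted(vectorizer.vocabulary_.keys())

# One plausible plain-text layout for a vocabulary file such as
# vocab_descriptor.txt above (the actual save_vocabulary format is not shown)
with open('vocab_descriptor_demo.txt', 'w') as f:
    f.write('\n'.join(vocab_terms))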
Example #2
def test_categorical_pipe():

    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    customer_id = 15
    sel = sqlalchemy.select([Points
                             ]).where(Points.customer_id.__eq__(customer_id))
    dataset_raw = Insert.pandas_select_execute(sel)

    # Transform pipeline
    Transform = transform_pipeline.Transform()
    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(drop_attributes=None,
                                             nan_replace_dict=None,
                                             dtype_dict=None,
                                             unit_dict=None,
                                             remove_dupe=True,
                                             replace_numbers=False,
                                             remove_virtual=True)

    categorical_pipe = Transform.categorical_pipeline(
        categorical_attributes=None,
        categories_file=r'../data/categorical_categories.dat')

    df_clean = clean_pipe.fit_transform(dataset_raw)
    ohe_array = categorical_pipe.fit_transform(df_clean).toarray()

    # Inspect the fitted categorical encoder
    ohe = categorical_pipe.named_steps['catEncoder']
    ohe.categories  # ohe.categories_ when categories='auto'

    return ohe_array
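
# Note on the scikit-learn OneHotEncoder behaviour referenced above: `categories`
# is the constructor argument, while the categories actually learned during fit
# are exposed on the fitted encoder as `categories_`. A minimal sketch with
# hypothetical toy values:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

X_demo = np.array([['AI'], ['AO'], ['BI'], ['AI']])
enc = OneHotEncoder(categories='auto', handle_unknown='ignore')
ohe_demo = enc.fit_transform(X_demo).toarray()   # shape (4, 3)
print(enc.categories_)                           # [array(['AI', 'AO', 'BI'], dtype='<U2')]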
Example #3
def test_cleaning_pipe():

    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    customer_id = 15
    sel = sqlalchemy.select([Points
                             ]).where(Points.customer_id.__eq__(customer_id))
    dataset_raw = Insert.pandas_select_execute(sel)

    # Transform pipeline
    Transform = transform_pipeline.Transform()
    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(drop_attributes=None,
                                             nan_replace_dict=None,
                                             dtype_dict=None,
                                             unit_dict=None,
                                             remove_dupe=True,
                                             replace_numbers=False,
                                             remove_virtual=True)

    df = clean_pipe.fit_transform(dataset_raw)

    return df
Example #4
def test_get_database_features():

    # Instantiate local classes
    Transform = transform_pipeline.Transform()
    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(remove_dupe=False,
                                             replace_numbers=False,
                                             remove_virtual=True)

    # Create pipeline specifically for clustering text features
    text_pipe = Transform.text_pipeline(vocab_size='all',
                                        attributes='NAME',
                                        seperator='.',
                                        heirarchial_weight_word_pattern=True)

    full_pipeline = Pipeline([('clean_pipe', clean_pipe),
                              ('text_pipe',text_pipe),
                              ])
    # Set up connection to SQL
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    # Get a points dataframe
    customer_id = 15
    sel = sqlalchemy.select([Points]).where(Points.customer_id.__eq__(customer_id))
    database = Insert.pandas_select_execute(sel)
    sel = sqlalchemy.select([Customers.name]).where(Customers.id.__eq__(customer_id))
    customer_name = Insert.core_select_execute(sel)[0].name

    database_features = ExtractLabels.get_database_features(database,
                                                            full_pipeline,
                                                            instance_name=customer_name)
    return database_features
Example #5
def test_time():

    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    customer_id = 15
    sel = sqlalchemy.select([Points
                             ]).where(Points.customer_id.__eq__(customer_id))
    dataset_raw = Insert.pandas_select_execute(sel)

    # Transform pipeline
    Transform = transform_pipeline.Transform()

    RemoveAttribute = transform_pipeline.RemoveAttribute(
        Transform.drop_attributes)
    RemoveNan = transform_pipeline.RemoveNan(Transform.nan_replace_dict)
    SetDtypes = transform_pipeline.SetDtypes(Transform.type_dict)
    TextCleaner = transform_pipeline.TextCleaner(Transform._text_clean_attrs,
                                                 replace_numbers=True)
    UnitCleaner = transform_pipeline.UnitCleaner(Transform.unit_dict)
    DuplicateRemover = transform_pipeline.DuplicateRemover(Transform.dupe_cols,
                                                           remove_dupe=True)
    VirtualRemover = transform_pipeline.VirtualRemover(remove_virtual=True)

    t0 = time.time()
    df0 = RemoveAttribute.fit_transform(dataset_raw)

    t1 = time.time()
    df1 = RemoveNan.fit_transform(df0)

    t2 = time.time()
    df2 = SetDtypes.fit_transform(df1)

    t3 = time.time()
    df3 = TextCleaner.fit_transform(df2)

    t4 = time.time()
    df4 = UnitCleaner.fit_transform(df3)

    t5 = time.time()
    indicies = DuplicateRemover.get_duplicate_indicies(df4, 'NAME')
    print('Duplicate names')
    print(df4['NAME'].iloc[indicies[:50]])
    df5 = DuplicateRemover.fit_transform(df4)

    t6 = time.time()
    df6 = VirtualRemover.fit_transform(df5)
    t7 = time.time()

    print('RemoveAttribute : {}'.format(t1 - t0))
    print('RemoveNan : {}'.format(t2 - t1))
    print('SetDtypes : {}'.format(t3 - t2))
    print('TextCleaner : {}'.format(t4 - t3))
    print('UnitCleaner : {}'.format(t5 - t4))
    print('DuplicateRemover : {}'.format(t6 - t5))
    print('VirtualRemover : {}'.format(t7 - t6))

    return None
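
# The stage-by-stage timing above can be written more compactly by looping over
# (name, transformer) pairs. A minimal sketch of that pattern, assuming each
# transformer follows the scikit-learn fit_transform interface:
import time

def time_transform_steps(steps, data):
    """Apply each transformer in sequence, printing how long each one took."""
    for name, transformer in steps:
        t0 = time.time()
        data = transformer.fit_transform(data)
        print('{} : {:.3f} s'.format(name, time.time() - t0))
    return data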
def test_cluster_with_hyperparameters():
    """Test clustering with hyperparameters"""

    # Instantiate local classes
    Transform = transform_pipeline.Transform()
    UnsupervisedCluster = unsupervised_cluster.UnsupervisedClusterPoints()
    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(remove_dupe=False,
                                             replace_numbers=False,
                                             remove_virtual=True)

    # Create pipeline specifically for clustering text features
    text_pipe = Transform.text_pipeline(vocab_size='all',
                                        attributes='NAME',
                                        seperator='.',
                                        heirarchial_weight_word_pattern=True)

    # Set up connection to SQL
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    # Get a points dataframe
    customer_id = 13
    sel = sqlalchemy.select([Points
                             ]).where(Points.customer_id.__eq__(customer_id))
    with Insert.engine.begin() as connection:
        # res = connection.execute(sel).fetchone()
        database = pd.read_sql(sel, connection)

    df_clean = clean_pipe.fit_transform(database)
    X = text_pipe.fit_transform(df_clean).toarray()
    #_word_vocab = text_pipe.named_steps['WordDictToSparseTransformer'].vocabulary
    #df_text = pd.DataFrame(X, columns=_word_vocab)

    hyperparameters = {
        'by_size': False,
        'distance': 'euclidean',
        'clusterer': 'ward.D',
        'n_components': 8,
        'reduce': 'MDS',
        'index': 'Ratkowsky'
    }

    result = UnsupervisedCluster.cluster_with_hyperparameters(
        hyperparameters, X)

    best_nc_df = result.best_nc_dataframe

    sel = sqlalchemy.select([Customers])\
        .where(Customers.id.__eq__(customer_id))
    with Insert.engine.begin() as connection:
        res = connection.execute(sel).fetchone()
        correct_k = res.correct_k

    return result
def test_unsupervised_cluster():

    # Instantiate local classes
    Transform = transform_pipeline.Transform()
    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(remove_dupe=False,
                                             replace_numbers=False,
                                             remove_virtual=True)

    # Create pipeline specifically for clustering text features
    text_pipe = Transform.text_pipeline(vocab_size='all',
                                        attributes='NAME',
                                        seperator='.',
                                        heirarchial_weight_word_pattern=True)

    # Set up connection to SQL
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    # Get a points dataframe
    customer_id = 15
    sel = sqlalchemy.select([Points
                             ]).where(Points.customer_id.__eq__(customer_id))
    with Insert.engine.begin() as connection:
        # res = connection.execute(sel).fetchone()
        database = pd.read_sql(sel, connection)

    df_clean = clean_pipe.fit_transform(database)
    X = text_pipe.fit_transform(df_clean).toarray()
    _word_vocab = text_pipe.named_steps[
        'WordDictToSparseTransformer'].vocabulary
    df_text = pd.DataFrame(X, columns=_word_vocab)

    # Get number of clusters
    sel = sqlalchemy.select([Customers])\
        .where(Customers.id.__eq__(customer_id))
    with Insert.engine.begin() as connection:
        res = connection.execute(sel).fetchone()
        correct_k = res.correct_k

    if X.shape[0] <= 3 or correct_k == 1:
        # Don't cluster - just assign a single cluster
        prediction_agglo = np.ones((X.shape[0]))

    else:
        # Cluster
        agglomerative = AgglomerativeClustering(n_clusters=correct_k,
                                                affinity='euclidean',
                                                linkage='ward')
        prediction_agglo = agglomerative.fit_predict(X)

    return df_clean, prediction_agglo
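
# A self-contained sketch of the same guard-then-cluster logic on toy data.
# Ward-linkage agglomerative clustering needs at least as many samples as
# clusters, which is presumably why tiny databases are assigned a single
# cluster above. The toy array and cluster count below are hypothetical.
import numpy as np
from sklearn.cluster import AgglomerativeClustering

X_toy = np.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0], [5.1, 5.0]])
correct_k_toy = 2
if X_toy.shape[0] <= 3 or correct_k_toy == 1:
    prediction_toy = np.ones(X_toy.shape[0])
else:
    prediction_toy = AgglomerativeClustering(n_clusters=correct_k_toy,
                                             linkage='ward').fit_predict(X_toy)
print(prediction_toy)  # two clusters, e.g. [0 0 1 1] or [1 1 0 0]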
Example #8
def test_calc_categories_dict():
    """Generate data to find categories"""
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    sel = sqlalchemy.select([Points])
    dataset_raw = Insert.pandas_select_execute(sel)

    # Transform pipeline
    Transform = transform_pipeline.Transform()
    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(drop_attributes=None,
                                             nan_replace_dict=None,
                                             dtype_dict=None,
                                             unit_dict=None,
                                             remove_dupe=True,
                                             replace_numbers=False,
                                             remove_virtual=True)
    string_pipe = transform_pipeline.SetDtypes(
        type_dict={
            'TYPE': str,
            'ALARMTYPE': str,
            'FUNCTION': str,
            'VIRTUAL': str,
            'CS': str,
            'SENSORTYPE': str,
            'DEVUNITS': str
        })

    categories_clean_pipe = Pipeline([('clean_pipe', clean_pipe),
                                      ('string_pipe', string_pipe)])

    df_clean = categories_clean_pipe.fit_transform(dataset_raw)
    """Calculate and save categories to be used later"""
    Encoding = transform_pipeline.EncodingCategories()
    columns = [
        'TYPE', 'ALARMTYPE', 'FUNCTION', 'VIRTUAL', 'CS', 'SENSORTYPE',
        'DEVUNITS'
    ]
    categories_dict = Encoding.calc_categories_dict(df_clean, columns)
    save_path = r'../data/categorical_categories.dat'

    Encoding.save_categories_to_disc(categories_dict, save_path)
    categories_dict1 = Encoding.read_categories_from_disc(save_path)
    for key in set((*categories_dict.keys(), *categories_dict1.keys())):
        assert (np.array_equal(categories_dict[key], categories_dict1[key]))

    return None
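
# The internals of EncodingCategories.calc_categories_dict are not shown above;
# assuming it collects the unique values of each categorical column, a rough
# pandas approximation looks like this (the toy frame is hypothetical):
import numpy as np
import pandas as pd

df_demo = pd.DataFrame({'TYPE': ['AI', 'AO', 'AI'], 'VIRTUAL': ['0', '1', '0']})
categories_dict_demo = {col: np.sort(df_demo[col].dropna().unique())
                        for col in ['TYPE', 'VIRTUAL']}
print(categories_dict_demo)  # {'TYPE': array(['AI', 'AO'], ...), 'VIRTUAL': array(['0', '1'], ...)}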
Example #9
def legacy_numeric_transform_pipeline_MIL():
    
    # Transform pipeline
    TransformLegacy = transform_pipeline.Transform()
    # Legacy categorization dictionary...
    
    # Cleaning pipeline
    clean_pipe = TransformLegacy.cleaning_pipeline(drop_attributes=None,
                                                   nan_replace_dict=None,
                                                   dtype_dict=None,
                                                   unit_dict=None,
                                                   remove_dupe=True,
                                                   replace_numbers=True,
                                                   remove_virtual=True)
    # Text feature encoders
    name_vocabulary = transform_pipeline.VocabularyText.read_vocabulary_disc(
        POINTNAME_VOCABULARY_FILENAME)
    name_text_pipe = TransformLegacy.text_pipeline_label(
        attributes=['NAME'], vocabulary=name_vocabulary)
    descriptor_vocabulary = transform_pipeline.VocabularyText.read_vocabulary_disc(
        DESCRIPTOR_VOCABULARY_FILENAME)
    descriptor_text_pipe = TransformLegacy.text_pipeline_label(
        attributes=['DESCRIPTOR'], vocabulary=descriptor_vocabulary)

    # Categorical Features
    categorical_pipe = TransformLegacy.categorical_pipeline(
        categorical_attributes=None,
        handle_unknown='ignore',
        categories_file=r'../data/categorical_categories_old.dat')
    # Numeric features
    numeric_pipe = TransformLegacy.numeric_pipeline(numeric_attributes=None)
    # Union
    combined_features = FeatureUnion(transformer_list=[
        ('CategoricalPipe', categorical_pipe),
        ('NameTextPipe', name_text_pipe),
        ('DescriptorTextPipe', descriptor_text_pipe),
        ('NumericPipe', numeric_pipe),
    ])
    full_pipeline = Pipeline([
        ('CleaningPipe', clean_pipe),
        ('CombinedCategorical', combined_features),
    ])
    
    return full_pipeline
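
# For reference, FeatureUnion runs its transformers in parallel on the same
# input and concatenates their outputs column-wise, while Pipeline chains steps
# sequentially. A minimal scikit-learn sketch with hypothetical numeric data:
import numpy as np
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import MinMaxScaler, StandardScaler

X_small = np.array([[1.0], [2.0], [3.0]])
union = FeatureUnion(transformer_list=[('scaled', StandardScaler()),
                                       ('minmax', MinMaxScaler())])
pipe = Pipeline([('union', union)])
print(pipe.fit_transform(X_small))  # shape (3, 2): one column per transformer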
Example #10
def test_read_categories():

    # Initialize
    Transform = transform_pipeline.Transform()
    categories_file = r'../data/categorical_categories.dat'
    categories = Transform._read_categories(Transform.cat_attributes,
                                            categories_file)
    categorical_attributes = Transform.cat_attributes

    ReplaceNone = transform_pipeline.ReplaceNone(categorical_attributes)
    DataFrameSelector = transform_pipeline.DataFrameSelector(
        categorical_attributes)
    OneHotEncoder = transform_pipeline.OneHotEncoder(categories=categories)

    # Get raw database
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')
    customer_id = 15
    sel = sqlalchemy.select([Points
                             ]).where(Points.customer_id.__eq__(customer_id))
    dataset_raw = Insert.pandas_select_execute(sel)
    clean_pipe = Transform.cleaning_pipeline(drop_attributes=None,
                                             nan_replace_dict=None,
                                             dtype_dict=None,
                                             unit_dict=None,
                                             remove_dupe=True,
                                             replace_numbers=False,
                                             remove_virtual=True)
    df_clean1 = clean_pipe.fit_transform(dataset_raw)

    # Transform
    df0 = ReplaceNone.fit_transform(df_clean1)
    df1_array = DataFrameSelector.fit_transform(df0)
    ohearray = OneHotEncoder.fit_transform(df1_array).toarray()

    # Examine the transformers
    print(df0[categorical_attributes].iloc[:5])
    print(df1_array[:5])
    OneHotEncoder.categories

    return None
Example #11
def test_text_pipe():

    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    customer_id = 15
    sel = sqlalchemy.select([Points
                             ]).where(Points.customer_id.__eq__(customer_id))
    dataset_raw = Insert.pandas_select_execute(sel)

    # Transform pipeline
    Transform = transform_pipeline.Transform()
    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(drop_attributes=None,
                                             nan_replace_dict=None,
                                             dtype_dict=None,
                                             unit_dict=None,
                                             remove_dupe=True,
                                             replace_numbers=True,
                                             remove_virtual=True)

    # Create pipeline specifically for clustering text features
    text_pipe = Transform.text_pipeline(vocab_size='all',
                                        attributes='NAME',
                                        seperator='.',
                                        heirarchial_weight_word_pattern=True)

    full_pipeline = Pipeline([
        ('clean_pipe', clean_pipe),
        ('text_pipe', text_pipe),
    ])

    dataset = full_pipeline.fit_transform(dataset_raw)

    return dataset
Example #12
def test_full_pipeline():

    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')
    # group_id = 4
    group_id = 15
    sel = sqlalchemy.select([Points]).where(Points.group_id.__eq__(group_id))
    dfraw = Insert.pandas_select_execute(sel)

    # Transform pipeline
    Transform = transform_pipeline.Transform()

    # Cleaning pipeline
    clean_pipe = Transform.cleaning_pipeline(drop_attributes=None,
                                             nan_replace_dict=None,
                                             dtype_dict=None,
                                             unit_dict=None,
                                             remove_dupe=True,
                                             replace_numbers=True,
                                             remove_virtual=True)

    # Text feature encoders
    name_file = r'../data/vocab_name.txt'
    name_vocabulary = transform_pipeline.VocabularyText.read_vocabulary_disc(
        name_file)
    name_text_pipe = Transform.text_pipeline_label(attributes=['NAME'],
                                                   vocabulary=name_vocabulary)
    descriptor_file = r'../data/vocab_descriptor.txt'
    descriptor_vocabulary = transform_pipeline.VocabularyText.read_vocabulary_disc(
        descriptor_file)
    descriptor_text_pipe = Transform.text_pipeline_label(
        attributes=['DESCRIPTOR'], vocabulary=descriptor_vocabulary)

    # Categorical Features
    categorical_pipe = Transform.categorical_pipeline(
        categorical_attributes=None,
        handle_unknown='ignore',
        categories_file=r'../data/categorical_categories.dat')

    # Numeric features
    numeric_pipe = Transform.numeric_pipeline(numeric_attributes=None)

    # Union
    combined_features = FeatureUnion(transformer_list=[
        ('CategoricalPipe', categorical_pipe),
        ('NameTextPipe', name_text_pipe),
        ('DescriptorTextPipe', descriptor_text_pipe),
        ('NumericPipe', numeric_pipe),
    ])
    full_pipeline = Pipeline([
        ('CleaningPipe', clean_pipe),
        ('CombinedFeatures', combined_features),
    ])

    combined_csr = full_pipeline.fit_transform(dfraw)
    combined_csr.shape

    CleaningPipe = full_pipeline.steps[0][1]  # CleaningPipe
    RemoveAttribute = full_pipeline.steps[0][1][0]  # RemoveAttribute
    RemoveNan = full_pipeline.steps[0][1][1]
    SetDtypes = full_pipeline.steps[0][1][2]
    TextCleaner = full_pipeline.steps[0][1][3]
    UnitCleaner = full_pipeline.steps[0][1][4]
    DuplicateRemover = full_pipeline.steps[0][1][5]
    VirtualRemover = full_pipeline.steps[0][1][6]

    df0 = RemoveAttribute.fit_transform(copy.deepcopy(dfraw))
    df1 = RemoveNan.fit_transform(copy.deepcopy(df0))
    df2 = SetDtypes.fit_transform(copy.deepcopy(df1))
    df3 = TextCleaner.fit_transform(copy.deepcopy(df2))
    df4 = UnitCleaner.fit_transform(copy.deepcopy(df3))
    df5 = DuplicateRemover.fit_transform(copy.deepcopy(df4))
    df6 = VirtualRemover.fit_transform(copy.deepcopy(df5))

    return None
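
# The step lookups above use two scikit-learn access patterns: Pipeline.steps is
# a list of (name, transformer) tuples, and a Pipeline also supports integer
# indexing (returning the step's estimator), so full_pipeline.steps[0][1][0] is
# "the first step's transformer, then that transformer's first sub-step".
# A minimal illustration on a toy nested pipeline:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler

inner = Pipeline([('scale', StandardScaler()), ('minmax', MinMaxScaler())])
outer = Pipeline([('inner', inner)])

assert outer.steps[0][1] is inner                           # the inner Pipeline
assert outer.steps[0][1][0] is inner.named_steps['scale']   # its first sub-step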
Example #13
def main():

    # Hyperparameters
    hyperparams = {
        'by_size': False,
        'n_components': 8,
        'reduce': 'MDS',
        'clusterer': 'ward.D',
        'distance': 'euclidean',
        'index': 'all'}

    # Instantiate local classes
    Transform = transform_pipeline.Transform()
    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(remove_dupe=False,
                                             replace_numbers=False,
                                             remove_virtual=True)

    # Create pipeline specifically for clustering text features
    text_pipe = Transform.text_pipeline(vocab_size='all',
                                        attributes='NAME',
                                        seperator='.',
                                        heirarchial_weight_word_pattern=True)

    # Set up connection to SQL
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    # Clustering class
    UnsupervisedCluster = unsupervised_cluster.UnsupervisedClusterPoints()

    # Save hyperparameters to SQL
    # See if it's already inserted
    sel = sqlalchemy.select([ClusteringHyperparameter]).where(
        sqlalchemy.sql.and_(ClusteringHyperparameter.by_size == hyperparams['by_size'],
                            ClusteringHyperparameter.clusterer == hyperparams['clusterer'],
                            ClusteringHyperparameter.distance == hyperparams['distance'],
                            ClusteringHyperparameter.reduce == hyperparams['reduce'],
                            ClusteringHyperparameter.n_components == hyperparams['n_components']))
    with Insert.engine.connect() as connection:
        res = connection.execute(sel).fetchall()

    if len(res):
        # Get hyperparameters id of existing hyperparameter set
        hyperparameter_id = res[0].id
    else:
        # Insert new object
        res = Insert.core_insert_instance(ClusteringHyperparameter, hyperparams)
        hyperparameter_id = res.inserted_primary_key[0]

    # Get customer list from SQL
    sel = sqlalchemy.select([Customers])
    customers = Insert.core_select_execute(sel)

    # Iterate through customers and cluster
    for customer in customers:

        # Get points from SQL
        sel = sqlalchemy.select([Points]).where(Points.customer_id.__eq__(customer.id))
        database = Insert.pandas_select_execute(sel)
        if database.shape[0] == 0:
            print('Customer ID {} Skipped, points shape {}'.format(customer.id, database.shape[0]))
            continue
        else:
            df_clean = clean_pipe.fit_transform(database)
            X = text_pipe.fit_transform(df_clean).toarray()
            #_word_vocab = text_pipe.named_steps['WordDictToSparseTransformer'].vocabulary
            #df_text = pd.DataFrame(X, columns=_word_vocab)

        # NbClust clustering
        print('Customer ID {}\nDB Size : {}'.format(customer.id, X.shape))
        try:
            print('Starting NbClust')
            # Perform clustering with NbClust package
            result = UnsupervisedCluster.cluster_with_hyperparameters(hyperparams, X)
            best_nc_df = result.best_nc_dataframe
        except RRuntimeError as e:
            if 'computationally singular' in str(e):
                # The eigenvalue matrix is singular; retry with half as many dimensions
                _hyperparams = dict(hyperparams)
                _hyperparams['n_components'] = int(_hyperparams['n_components'] / 2)
                result = UnsupervisedCluster.cluster_with_hyperparameters(_hyperparams, X)
                best_nc_df = result.best_nc_dataframe
            else:
                print(e)
                continue

        # Build dictionary for SQL
        sel = sqlalchemy.select([Customers]).where(Customers.id.__eq__(customer.id))
        with Insert.engine.connect() as connection:
            res = connection.execute(sel).fetchone()
            correct_k = res.correct_k
        values = best_nc_df.loc['Number_clusters'].to_dict()
        values['correct_k'] = correct_k
        values['customer_id'] = customer.id
        values['hyperparameter_id'] = hyperparameter_id
        n_lens = Clustering.get_n_len_features(X)
        for key, val in n_lens.items():
            values[key] = int(val)

        # Save results to SQL
        res = Insert.core_insert_instance(Clustering, values)
        print("Inserted {}".format(res.inserted_primary_key))

    return None
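
# The hyperparameter bookkeeping at the top of main() is a "get or create"
# pattern: look the row up first, reuse its primary key if it exists, otherwise
# insert it. A condensed sketch of that flow, reusing the Insert wrapper and the
# ClusteringHyperparameter model from the example above (sqlalchemy and those
# names are assumed to be imported by the surrounding module); this helper is
# illustrative and not part of the original code.
def get_or_create_hyperparameter_id(Insert, hyperparams):
    sel = sqlalchemy.select([ClusteringHyperparameter]).where(
        sqlalchemy.sql.and_(
            ClusteringHyperparameter.by_size == hyperparams['by_size'],
            ClusteringHyperparameter.clusterer == hyperparams['clusterer'],
            ClusteringHyperparameter.distance == hyperparams['distance'],
            ClusteringHyperparameter.reduce == hyperparams['reduce'],
            ClusteringHyperparameter.n_components == hyperparams['n_components']))
    with Insert.engine.connect() as connection:
        rows = connection.execute(sel).fetchall()
    if rows:
        return rows[0].id
    res = Insert.core_insert_instance(ClusteringHyperparameter, hyperparams)
    return res.inserted_primary_key[0]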
Example #14
    if correct_n_clusters == 1 or X.shape[0] <= 3:
        # Don't cluster if there is only one system - just assign a single cluster
        prediction_agglo = np.ones((X.shape[0]))
    else:
        # Cluster
        agglomerative = AgglomerativeClustering(n_clusters=correct_n_clusters,
                                                affinity='euclidean',
                                                linkage='ward')
        prediction_agglo = agglomerative.fit_predict(X)

    return prediction_agglo


# Instantiate local classes
Transform = transform_pipeline.Transform()
# Create 'clean' data processing pipeline
clean_pipe = Transform.cleaning_pipeline(remove_dupe=False,
                                         replace_numbers=False,
                                         remove_virtual=True)

# Create pipeline specifically for clustering text features
text_pipe = Transform.text_pipeline(vocab_size='all',
                                    attributes='NAME',
                                    seperator='.',
                                    heirarchial_weight_word_pattern=True)

# Set up connection to SQL
Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                        driver_name='SQL Server Native Client 10.0',
                        database_name='Clustering')
def test_cluster_with_hyperparameters2():

    # Instantiate local classes
    Transform = transform_pipeline.Transform()
    UnsupervisedCluster = unsupervised_cluster.UnsupervisedClusterPoints()
    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(remove_dupe=False,
                                             replace_numbers=False,
                                             remove_virtual=True)

    # Create pipeline specifically for clustering text features
    text_pipe = Transform.text_pipeline(vocab_size='all',
                                        attributes='NAME',
                                        seperator='.',
                                        heirarchial_weight_word_pattern=True)

    # Set up connection to SQL
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    # Get a points dataframe
    customer_id = 13
    sel = sqlalchemy.select([Points
                             ]).where(Points.customer_id.__eq__(customer_id))
    with Insert.engine.begin() as connection:
        # res = connection.execute(sel).fetchone()
        database = pd.read_sql(sel, connection)

    df_clean = clean_pipe.fit_transform(database)
    X = text_pipe.fit_transform(df_clean).toarray()
    #_word_vocab = text_pipe.named_steps['WordDictToSparseTransformer'].vocabulary

    hyperparameters = {
        'by_size': False,
        'distance': 'euclidean',
        'clusterer': 'ward.D',
        'n_components': 8,
        'reduce': 'MDS',
        'index': 'Ratkowsky'
    }
    # Clean hyperparameters
    hyperparams = UnsupervisedCluster._parse_hyperparameter_dictionary(
        hyperparameters)

    # Perform dimensionality reduction on data
    X_dim_reduced = UnsupervisedCluster._dimensionality_reduction(
        X,
        method=hyperparams['reduce'],
        n_components=hyperparams['n_components'])

    # Conditionally call nbclust package or optimalk package
    # based on input clustering hyperparameters
    if hyperparams['index'] in UnsupervisedCluster.nbclust_indicies:
        # Cluster with nbclust and clustering algorithm
        min_nc = 3  # Static
        max_nc = UnsupervisedCluster._get_max_nc(X)  # Based on actual data

        best_nc_df = UnsupervisedCluster._nbclust_calc(
            X_dim_reduced,
            index=hyperparams['index'],
            clusterer=hyperparams['clusterer'],
            distance=hyperparams['distance'],
            min_nc=min_nc,
            max_nc=max_nc)
    # Get number of clusters
    sel = sqlalchemy.select([Customers])\
        .where(Customers.id.__eq__(customer_id))
    with Insert.engine.begin() as connection:
        res = connection.execute(sel).fetchone()
        correct_k = res.correct_k
    print(correct_k)

    pass
def save_tfrecord_sql(customer_ids, peritem_keys, label_key, reciprocal,
                      n_bins, tfrecord_writer, shuffle_peritem):
    """Save TFRecord EIE format to files for ranking
    See rank_write_record.py for how Mongo database documents are
    converted to tf.train.Example objects
    Here the tf.train.Example objects are nested into the Example-in-example format
    recommended by tensorflow ranking library

    EIE Examples are of the form
     {'serialized_context':tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])),
      'serialized_examples': tf.train.Feature(bytes_list=tf.train.BytesList(value=value))}
     for 'serialized_context' value is a serialized tf.train.Example
     for 'serialized_examples' value is a list of serialized tf.train.Example
     objects that will be ranked according to their relevance to the context
     features

     Inputs
     -------
     customer_ids : (list) of customer_ids in SQL database to save
     peritem_keys : (list) of string keys that exist in peritem_features.
         Should be ['by_size','n_components','clusterer','reduce','index']
    reciprocal : (bool) Set up how I want to assign labels to objects
        Reciprocal will cause labels to be the inverse of the loss metric
        Set to True if I do not want labels to be binned
    n_bins : (int) number of bins for relevance label if reciprocal is False
    tfrecord_writer : (tf.io.TFRecordWriter) To serialized EIE TFRecord
     """
    """Create a pipeline for transforming points databases"""

    assert hasattr(customer_ids, '__iter__'), "customer_ids must be iterable"
    msg = "Each ID in customer_ids must be int type, not {}"
    for _id in customer_ids:
        assert isinstance(_id, int), msg.format(type(_id))

    Transform = transform_pipeline.Transform()
    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(remove_dupe=False,
                                             replace_numbers=False,
                                             remove_virtual=True)

    # Create pipeline specifically for clustering text features
    text_pipe = Transform.text_pipeline(vocab_size='all',
                                        attributes='NAME',
                                        seperator='.',
                                        heirarchial_weight_word_pattern=True)

    full_pipeline = Pipeline([
        ('clean_pipe', clean_pipe),
        ('text_pipe', text_pipe),
    ])

    for customer_id in customer_ids:
        print("Saving TFRecord for Customer ID : {}".format(customer_id))
        """Serialize context featuers -> serialized_context
        This is a serialized tf.train.Example object"""
        # Get Points databases related to customer_id
        sel = sqlalchemy.select([Points]).where(
            Points.customer_id.__eq__(customer_id))
        database = Insert.pandas_select_execute(sel)
        sel = sqlalchemy.select([Customers.name
                                 ]).where(Customers.id.__eq__(customer_id))
        customer_name = Insert.core_select_execute(sel)[0].name
        if database.shape[0] == 0:
            print(database.shape)
            print(customer_name)
            # Null databases should be skipped
            continue
        # Extract database features from Points database
        try:
            database_features = ExtractLabels.get_database_features(
                database, full_pipeline, instance_name=customer_name)
        except Labeling.PipelineError:
            print("An error occured while getting database features")
            print("Customer name : {}".format(customer_name))
            print("Customer ID : {}".format(customer_id))
            print(database)
            continue
        context_features = database_features.to_dict(orient='records')[0]
        context_features.pop('instance')
        # Create serialized TFRecord proto
        context_proto_str = serialize_context_from_dictionary(context_features)
        """Serialize peritem features. AKA examples or instances that will be ranked
        This is a list of serialized tf.train.Example objects"""
        # Get a list of Clustering primary keys related to customer_id
        sel = sqlalchemy.select([Clustering.id, Clustering.correct_k])\
            .where(Clustering.customer_id.__eq__(customer_id))
        res = Insert.core_select_execute(sel)
        if len(res) == 0:
            # No clustering examples were found with the database
            print("Skipped {} No results".format(customer_name))
            continue
        primary_keys = [x.id for x in res]
        correct_k = res[0].correct_k
        # From primary keys create records. Records are used to find
        # Example features and labels
        records = get_records(primary_keys)
        # best_labels.hyperparameter_dict values are the peritem_features
        # The loss metric related to each hyperparameter_dict are labels to
        # each example
        best_labels = ExtractLabels.calc_labels(records,
                                                correct_k,
                                                error_scale=0.8,
                                                var_scale=0.2)
        example_features = []
        for label in best_labels:
            feature_dict = {}
            for key in peritem_keys:
                feature_dict[key] = label.hyperparameter_dict[key]
            feature_dict[label_key] = label.loss
            example_features.append(feature_dict)

        peritem_list = serialize_examples_from_dictionary(
            example_features,
            label_key=label_key,
            peritem_keys=peritem_keys,
            reciprocal=reciprocal,
            n_bins=n_bins,
            shuffle_peritem=shuffle_peritem)
        """Prepare serialized feature spec for EIE format"""
        serialized_dict = {
            'serialized_context': _bytes_feature([context_proto_str]),
            'serialized_examples': _bytes_feature(peritem_list)
        }

        # Convert dictionary to tf.train.Example object
        serialized_proto = tf.train.Example(features=tf.train.Features(
            feature=serialized_dict))
        serialized_str = serialized_proto.SerializeToString()

        tfrecord_writer.write(serialized_str)

    tfrecord_writer.close()

    return None
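
# Reading the Example-in-example records back follows the format described in
# the docstring above: the outer tf.train.Example holds one serialized context
# string plus a variable-length list of serialized per-item example strings.
# A minimal parsing sketch, assuming TensorFlow 2.x; 'ranking.tfrecord' is a
# hypothetical path for a file written by save_tfrecord_sql.
import tensorflow as tf

eie_spec = {
    'serialized_context': tf.io.FixedLenFeature([], tf.string),
    'serialized_examples': tf.io.VarLenFeature(tf.string),
}

def parse_eie(record):
    parsed = tf.io.parse_single_example(record, eie_spec)
    context = parsed['serialized_context']                        # one serialized Example
    examples = tf.sparse.to_dense(parsed['serialized_examples'])  # items to rank
    return context, examples

dataset = tf.data.TFRecordDataset('ranking.tfrecord').map(parse_eie)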