def test_numeric_pipe(self):
    Insert = extract.Insert(server_name, driver_name, database_name)
    customer_id = 15
    sel = sqlalchemy.select([Points]).where(
        Points.customer_id.__eq__(customer_id))
    dataset_raw = Insert.pandas_select_execute(sel)

    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(
        drop_attributes=DROP_ATTRIBUTES,
        nan_replace_dict=NAN_REPLACE_DICT,
        dtype_dict=TYPE_DICT,
        unit_dict=UNIT_DICT,
        dupe_cols=DUPE_COLS,
        remove_dupe=REMOVE_DUPE,
        replace_numbers=REPLACE_NUMBERS,
        remove_virtual=REMOVE_VIRTUAL,
        text_clean_attributes=TEXT_CLEAN_ATTRS)

    numeric_pipe = Transform.numeric_pipeline(
        numeric_attributes=NUM_ATTRIBUTES)

    df_clean = clean_pipe.fit_transform(dataset_raw)
    df_numeric = numeric_pipe.fit_transform(df_clean)

    return None
def test_categorical_pipe(self):
    Insert = extract.Insert(server_name, driver_name, database_name)
    customer_id = 15
    sel = sqlalchemy.select([Points]).where(
        Points.customer_id.__eq__(customer_id))
    dataset_raw = Insert.pandas_select_execute(sel)

    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(
        drop_attributes=DROP_ATTRIBUTES,
        nan_replace_dict=NAN_REPLACE_DICT,
        dtype_dict=TYPE_DICT,
        unit_dict=UNIT_DICT,
        dupe_cols=DUPE_COLS,
        remove_dupe=REMOVE_DUPE,
        replace_numbers=REPLACE_NUMBERS,
        remove_virtual=REMOVE_VIRTUAL,
        text_clean_attributes=TEXT_CLEAN_ATTRS)

    categorical_pipe = Transform.categorical_pipeline(
        categorical_attributes=CATEGORICAL_ATTRIBUTES,
        handle_unknown='ignore',
        categories_file=CATEGORIES_FILE)

    df_clean = clean_pipe.fit_transform(dataset_raw)
    ohe_array = categorical_pipe.fit_transform(df_clean).toarray()
    print("Example OneHotEncoded Array: ", ohe_array[0])

    # Find out more about the categorical pipe
    ohe = categorical_pipe.named_steps['OneHotEncoder']
    print("Categories used for OneHotEncoder", ohe.categories)

    return None
def get_train_test_id_sql(train_pct=0.8):
    """Return primary keys of all unique customers, split into training and
    testing sets
    inputs
    -------
    train_pct : (float) fraction of customer ids to use for training
    outputs
    -------
    (train_ids, test_ids) : (list) of training and testing ids"""

    # Set up connection to SQL
    Insert = extract.Insert(server_name=server_name,
                            driver_name=driver_name,
                            database_name=database_name)

    # Query SQL for all customer primary keys
    sel = sqlalchemy.select([Customers.id])
    customer_ids = Insert.core_select_execute(sel)
    customer_ids = [x.id for x in customer_ids]

    # Permute all primary keys into training and testing sets
    index = np.arange(len(customer_ids))
    np.random.shuffle(index)
    n_train = int(len(customer_ids) * train_pct)
    # The first n_train shuffled positions are for training, the rest for testing
    train_index = index[:n_train]
    test_index = index[n_train:]
    train_ids = [customer_ids[idx] for idx in train_index]
    test_ids = [customer_ids[idx] for idx in test_index]

    return train_ids, test_ids
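# Illustrative only: a minimal sketch (not part of the original module) showing the
# intended split semantics of get_train_test_id_sql -- shuffle index positions, take
# the first n_train positions for training and the remainder for testing. The
# `example_ids` list is a hypothetical stand-in for customer primary keys from SQL.
def _example_train_test_split():
    example_ids = list(range(100, 110))  # hypothetical customer ids
    index = np.arange(len(example_ids))
    np.random.shuffle(index)
    n_train = int(len(example_ids) * 0.8)
    train_ids = [example_ids[i] for i in index[:n_train]]  # 8 ids for training
    test_ids = [example_ids[i] for i in index[n_train:]]   # 2 ids for testing
    return train_ids, test_ids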
def test_text_pipe(self):
    Insert = extract.Insert(server_name, driver_name, database_name)
    customer_id = 15
    sel = sqlalchemy.select([Points]).where(
        Points.customer_id.__eq__(customer_id))
    dataset_raw = Insert.pandas_select_execute(sel)

    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(
        drop_attributes=DROP_ATTRIBUTES,
        nan_replace_dict=NAN_REPLACE_DICT,
        dtype_dict=TYPE_DICT,
        unit_dict=UNIT_DICT,
        dupe_cols=DUPE_COLS,
        remove_dupe=REMOVE_DUPE,
        replace_numbers=REPLACE_NUMBERS,
        remove_virtual=REMOVE_VIRTUAL,
        text_clean_attributes=TEXT_CLEAN_ATTRS)

    # Create pipeline specifically for clustering text features
    name_vocabulary = VocabularyText.read_vocabulary_disc(
        POINTNAME_VOCABULARY_FILENAME)
    name_text_pipe = Transform.text_pipeline_label(
        attributes=['NAME'],
        vocabulary=name_vocabulary)

    full_pipeline = Pipeline([
        ('clean_pipe', clean_pipe),
        ('text_pipe', name_text_pipe),
    ])
    dataset = full_pipeline.fit_transform(dataset_raw)

    return None
def test_get_building_suffix():
    """Test whether a set of points is a building suffix word"""
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')
    sel = sqlalchemy.select([Points]).where(Points.customer_id.__eq__(18))
    dataset_raw = Insert.pandas_select_execute(sel)

    # Split name variable
    token_pattern = r'\.'
    tokenizer = re.compile(token_pattern)

    # Keep track of words
    words = []

    # Split each name into tokens
    for idx, word in dataset_raw['NAME'].iteritems():
        parts = tokenizer.split(word)
        words.append(parts)

    # Get vocabulary
    VocabularyText = transform_pipeline.VocabularyText()
    suffix = VocabularyText.get_building_suffix(words)
    print("Suffix found : ", suffix)

    return None
def test_get_text_vocabulary():
    """Generate data to find vocabulary"""
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')
    sel = sqlalchemy.select([Points])
    dataset_raw = Insert.pandas_select_execute(sel)

    # Transform pipeline
    Transform = transform_pipeline.Transform()

    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(drop_attributes=None,
                                             nan_replace_dict=None,
                                             dtype_dict=None,
                                             unit_dict=None,
                                             remove_dupe=True,
                                             replace_numbers=True,
                                             remove_virtual=True)
    df_clean = clean_pipe.fit_transform(dataset_raw)

    # Get vocabulary for the DESCRIPTOR feature - a text feature
    VocabularyText = transform_pipeline.VocabularyText()
    vocabulary = VocabularyText\
        .get_text_vocabulary(X=df_clean,
                             col_name='DESCRIPTOR',
                             remove_suffix=False,
                             max_features=80)

    # Save vocabulary
    file_name = r'../data/vocab_descriptor.txt'
    transform_pipeline.VocabularyText.save_vocabulary(vocabulary, file_name)

    return None
def test_cleaning_pipe():
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')
    customer_id = 15
    sel = sqlalchemy.select([Points]).where(Points.customer_id.__eq__(customer_id))
    dataset_raw = Insert.pandas_select_execute(sel)

    # Transform pipeline
    Transform = transform_pipeline.Transform()

    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(drop_attributes=None,
                                             nan_replace_dict=None,
                                             dtype_dict=None,
                                             unit_dict=None,
                                             remove_dupe=True,
                                             replace_numbers=False,
                                             remove_virtual=True)
    df = clean_pipe.fit_transform(dataset_raw)

    return df
def test_get_database_features():
    # Instantiate local classes
    Transform = transform_pipeline.Transform()

    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(remove_dupe=False,
                                             replace_numbers=False,
                                             remove_virtual=True)

    # Create pipeline specifically for clustering text features
    text_pipe = Transform.text_pipeline(vocab_size='all',
                                        attributes='NAME',
                                        seperator='.',
                                        heirarchial_weight_word_pattern=True)

    full_pipeline = Pipeline([('clean_pipe', clean_pipe),
                              ('text_pipe', text_pipe),
                              ])

    # Set up connection to SQL
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    # Get a points dataframe
    customer_id = 15
    sel = sqlalchemy.select([Points]).where(Points.customer_id.__eq__(customer_id))
    database = Insert.pandas_select_execute(sel)

    sel = sqlalchemy.select([Customers.name]).where(Customers.id.__eq__(customer_id))
    customer_name = Insert.core_select_execute(sel)[0].name

    database_features = ExtractLabels.get_database_features(
        database, full_pipeline, instance_name=customer_name)

    return database_features
def test_read_categories(self):
    # Initialize
    categories = Transform._read_categories(CATEGORICAL_ATTRIBUTES,
                                            CATEGORIES_FILE)
    replaceNone = ReplaceNone(CATEGORICAL_ATTRIBUTES)
    dataFrameSelector = DataFrameSelector(CATEGORICAL_ATTRIBUTES)
    oneHotEncoder = OneHotEncoder(categories=categories,
                                  handle_unknown='ignore')

    # Get raw database
    Insert = extract.Insert(server_name, driver_name, database_name)
    customer_id = 15
    sel = sqlalchemy.select([Points]).where(
        Points.customer_id.__eq__(customer_id))
    dataset_raw = Insert.pandas_select_execute(sel)

    clean_pipe = Transform.cleaning_pipeline(
        drop_attributes=DROP_ATTRIBUTES,
        nan_replace_dict=NAN_REPLACE_DICT,
        dtype_dict=TYPE_DICT,
        unit_dict=UNIT_DICT,
        dupe_cols=DUPE_COLS,
        remove_dupe=REMOVE_DUPE,
        replace_numbers=REPLACE_NUMBERS,
        remove_virtual=REMOVE_VIRTUAL,
        text_clean_attributes=TEXT_CLEAN_ATTRS)
    df_clean1 = clean_pipe.fit_transform(dataset_raw)

    # Transform
    df0 = replaceNone.fit_transform(df_clean1)
    df1_array = dataFrameSelector.fit_transform(df0)
    ohearray = oneHotEncoder.fit_transform(df1_array).toarray()

    return None
def test_categorical_pipe():
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')
    customer_id = 15
    sel = sqlalchemy.select([Points]).where(Points.customer_id.__eq__(customer_id))
    dataset_raw = Insert.pandas_select_execute(sel)

    # Transform pipeline
    Transform = transform_pipeline.Transform()

    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(drop_attributes=None,
                                             nan_replace_dict=None,
                                             dtype_dict=None,
                                             unit_dict=None,
                                             remove_dupe=True,
                                             replace_numbers=False,
                                             remove_virtual=True)

    categorical_pipe = Transform.categorical_pipeline(
        categorical_attributes=None,
        categories_file=r'../data/categorical_categories.dat')

    df_clean = clean_pipe.fit_transform(dataset_raw)
    ohe_array = categorical_pipe.fit_transform(df_clean).toarray()

    # Find more about categorical pipe
    ohe = categorical_pipe.named_steps['catEncoder']
    ohe.categories  # ohe.categories_ when categories='auto'

    return ohe_array
def test_time():
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')
    customer_id = 15
    sel = sqlalchemy.select([Points]).where(Points.customer_id.__eq__(customer_id))
    dataset_raw = Insert.pandas_select_execute(sel)

    # Transform pipeline
    Transform = transform_pipeline.Transform()
    RemoveAttribute = transform_pipeline.RemoveAttribute(
        Transform.drop_attributes)
    RemoveNan = transform_pipeline.RemoveNan(Transform.nan_replace_dict)
    SetDtypes = transform_pipeline.SetDtypes(Transform.type_dict)
    TextCleaner = transform_pipeline.TextCleaner(Transform._text_clean_attrs,
                                                 replace_numbers=True)
    UnitCleaner = transform_pipeline.UnitCleaner(Transform.unit_dict)
    DuplicateRemover = transform_pipeline.DuplicateRemover(Transform.dupe_cols,
                                                           remove_dupe=True)
    VirtualRemover = transform_pipeline.VirtualRemover(remove_virtual=True)

    t0 = time.time()
    df0 = RemoveAttribute.fit_transform(dataset_raw)
    t1 = time.time()
    df1 = RemoveNan.fit_transform(df0)
    t2 = time.time()
    df2 = SetDtypes.fit_transform(df1)
    t3 = time.time()
    df3 = TextCleaner.fit_transform(df2)
    t4 = time.time()
    df4 = UnitCleaner.fit_transform(df3)
    t5 = time.time()
    indicies = DuplicateRemover.get_duplicate_indicies(df4, 'NAME')
    print('Duplicate names')
    print(df4['NAME'].iloc[indicies[:50]])
    df5 = DuplicateRemover.fit_transform(df4)
    t6 = time.time()
    df6 = VirtualRemover.fit_transform(df5)
    t7 = time.time()

    print('RemoveAttribute : {}'.format(t1 - t0))
    print('RemoveNan : {}'.format(t2 - t1))
    print('SetDtypes : {}'.format(t3 - t2))
    print('TextCleaner : {}'.format(t4 - t3))
    print('UnitCleaner : {}'.format(t5 - t4))
    print('DuplicateRemover : {}'.format(t6 - t5))
    print('VirtualRemover : {}'.format(t7 - t6))

    return None
def test_calc_categories_dict(self):
    # Generate data to find categories
    Insert = extract.Insert(server_name, driver_name, database_name)
    sel = sqlalchemy.select([Points])
    dataset_raw = Insert.pandas_select_execute(sel)

    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(
        drop_attributes=DROP_ATTRIBUTES,
        nan_replace_dict=NAN_REPLACE_DICT,
        dtype_dict=TYPE_DICT,
        unit_dict=UNIT_DICT,
        dupe_cols=DUPE_COLS,
        remove_dupe=REMOVE_DUPE,
        replace_numbers=REPLACE_NUMBERS,
        remove_virtual=REMOVE_VIRTUAL,
        text_clean_attributes=TEXT_CLEAN_ATTRS)
    string_pipe = SetDtypes(
        type_dict={
            'TYPE': str,
            'ALARMTYPE': str,
            'FUNCTION': str,
            'VIRTUAL': str,
            'CS': str,
            'SENSORTYPE': str,
            'DEVUNITS': str
        })
    categories_clean_pipe = Pipeline([('clean_pipe', clean_pipe),
                                      ('string_pipe', string_pipe)])
    df_clean = categories_clean_pipe.fit_transform(dataset_raw)

    # Calculate categories to be used later
    Encoding = EncodingCategories()
    columns = [
        'TYPE', 'ALARMTYPE', 'FUNCTION', 'VIRTUAL', 'CS', 'SENSORTYPE',
        'DEVUNITS'
    ]
    categories_dict_calc = Encoding.calc_categories_dict(df_clean, columns)

    if not os.path.exists(CATEGORIES_FILE):
        raise OSError("Categories file not found: {}".format(CATEGORIES_FILE))

    # Compare calculated categories to those saved on disc
    categories_dict_read = Encoding.read_categories_from_disc(CATEGORIES_FILE)
    for key in set((*categories_dict_calc.keys(),
                    *categories_dict_read.keys())):
        self.assertEqual(set(categories_dict_calc[key]),
                         set(categories_dict_read[key]))

    return None
def test_cluster_with_hyperparameters():
    """Test clustering with hyperparameters"""
    # Instantiate local classes
    Transform = transform_pipeline.Transform()
    UnsupervisedCluster = unsupervised_cluster.UnsupervisedClusterPoints()

    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(remove_dupe=False,
                                             replace_numbers=False,
                                             remove_virtual=True)

    # Create pipeline specifically for clustering text features
    text_pipe = Transform.text_pipeline(vocab_size='all',
                                        attributes='NAME',
                                        seperator='.',
                                        heirarchial_weight_word_pattern=True)

    # Set up connection to SQL
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    # Get a points dataframe
    customer_id = 13
    sel = sqlalchemy.select([Points]).where(Points.customer_id.__eq__(customer_id))
    with Insert.engine.begin() as connection:
        # res = connection.execute(sel).fetchone()
        database = pd.read_sql(sel, connection)

    df_clean = clean_pipe.fit_transform(database)
    X = text_pipe.fit_transform(df_clean).toarray()
    # _word_vocab = text_pipe.named_steps['WordDictToSparseTransformer'].vocabulary
    # df_text = pd.DataFrame(X, columns=_word_vocab)

    hyperparameters = {
        'by_size': False,
        'distance': 'euclidean',
        'clusterer': 'ward.D',
        'n_components': 8,
        'reduce': 'MDS',
        'index': 'Ratkowsky'
    }
    result = UnsupervisedCluster.cluster_with_hyperparameters(
        hyperparameters, X)
    best_nc_df = result.best_nc_dataframe

    sel = sqlalchemy.select([Customers])\
        .where(Customers.id.__eq__(customer_id))
    with Insert.engine.begin() as connection:
        res = connection.execute(sel).fetchone()
        correct_k = res.correct_k

    return result
def test_serialize_example_in_example(self):
    # Requires MSSQL server, data in the 'Clustering' database,
    # a configuration file, and other supporting artifacts
    config = configparser.ConfigParser()
    config.read(r'../extract/sql_config.ini')
    server_name = config['sql_server']['DEFAULT_SQL_SERVER_NAME']
    driver_name = config['sql_server']['DEFAULT_SQL_DRIVER_NAME']
    database_name = config['sql_server']['DEFAULT_DATABASE_NAME']
    Insert = extract.Insert(server_name=server_name,
                            driver_name=driver_name,
                            database_name=database_name)

    # Load an example from the SQL database
    customer_id = 15
    sel = sqlalchemy.select([Points]).where(
        Points.customer_id.__eq__(customer_id))
    database = Insert.pandas_select_execute(sel)
    sel = sqlalchemy.select([Customers.name]).where(
        Customers.id.__eq__(customer_id))
    customer_name = Insert.core_select_execute(sel)[0].name

    # Transformation pipeline
    full_pipeline = Transform.get_ranking_pipeline()

    # Dictionary with keys ['n_instance', 'n_features', 'len_var', 'uniq_ratio',
    # 'n_len1', 'n_len2', 'n_len3', 'n_len4', 'n_len5', 'n_len6', 'n_len7']
    database_features = get_database_features(database,
                                              full_pipeline,
                                              instance_name=customer_name)
    database_features.pop('instance')

    # 1. Context features (bytes object)
    serialized_context = serialize_context_from_dictionary(database_features)

    # 2. Per-item features (bytes object)
    serialized_peritem = serialize_examples_model4(
        HYPERPARAMETER_LIST, list_size=_LIST_SIZE_MODEL4)

    # Prepare serialized feature spec for the EIE (example-in-example) format
    serialized_dict = {
        'serialized_context': _bytes_feature([serialized_context]),
        'serialized_examples': _bytes_feature(serialized_peritem)
    }

    # Convert to a tf.train.Example object
    serialized_proto = tf.train.Example(features=tf.train.Features(
        feature=serialized_dict))
    serialized_example_in_example = serialized_proto.SerializeToString()

    return serialized_example_in_example
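# Illustrative only: a minimal sketch (not part of the original tests) of how the
# example-in-example record built above could be parsed back. The feature keys match
# the serialized_dict used in the test; the exact spec shapes (fixed vs. variable
# length) are an assumption about how the record is consumed downstream.
def _example_parse_eie(serialized_example_in_example):
    feature_spec = {
        'serialized_context': tf.io.FixedLenFeature([1], tf.string),
        'serialized_examples': tf.io.VarLenFeature(tf.string),
    }
    parsed = tf.io.parse_single_example(serialized_example_in_example, feature_spec)
    # parsed['serialized_context'] holds the context proto bytes;
    # parsed['serialized_examples'] is a sparse tensor of per-item proto bytes
    return parsed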
def test_unsupervised_cluster():
    # Instantiate local classes
    Transform = transform_pipeline.Transform()

    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(remove_dupe=False,
                                             replace_numbers=False,
                                             remove_virtual=True)

    # Create pipeline specifically for clustering text features
    text_pipe = Transform.text_pipeline(vocab_size='all',
                                        attributes='NAME',
                                        seperator='.',
                                        heirarchial_weight_word_pattern=True)

    # Set up connection to SQL
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    # Get a points dataframe
    customer_id = 15
    sel = sqlalchemy.select([Points]).where(Points.customer_id.__eq__(customer_id))
    with Insert.engine.begin() as connection:
        # res = connection.execute(sel).fetchone()
        database = pd.read_sql(sel, connection)

    df_clean = clean_pipe.fit_transform(database)
    X = text_pipe.fit_transform(df_clean).toarray()
    _word_vocab = text_pipe.named_steps[
        'WordDictToSparseTransformer'].vocabulary
    df_text = pd.DataFrame(X, columns=_word_vocab)

    # Get number of clusters
    sel = sqlalchemy.select([Customers])\
        .where(Customers.id.__eq__(customer_id))
    with Insert.engine.begin() as connection:
        res = connection.execute(sel).fetchone()
        correct_k = res.correct_k

    if X.shape[0] <= 3 or correct_k == 1:
        # Don't cluster - just pass 1 cluster total
        prediction_agglo = np.ones((X.shape[0]))
    else:
        # Cluster
        agglomerative = AgglomerativeClustering(n_clusters=correct_k,
                                                affinity='euclidean',
                                                linkage='ward')
        prediction_agglo = agglomerative.fit_predict(X)

    return df_clean, prediction_agglo
def test_timeself(self):
    Insert = extract.Insert(server_name, driver_name, database_name)
    customer_id = 15
    sel = sqlalchemy.select([Points]).where(
        Points.customer_id.__eq__(customer_id))
    dataset_raw = Insert.pandas_select_execute(sel)

    removeAttribute = RemoveAttribute(DROP_ATTRIBUTES)
    removeNan = RemoveNan(NAN_REPLACE_DICT)
    setDtypes = SetDtypes(TYPE_DICT)
    textCleaner = TextCleaner(TEXT_CLEAN_ATTRS, replace_numbers=True)
    unitCleaner = UnitCleaner(UNIT_DICT)
    duplicateRemover = DuplicateRemover(DUPE_COLS, remove_dupe=True)
    virtualRemover = VirtualRemover(remove_virtual=True)

    t0 = time.time()
    df0 = removeAttribute.fit_transform(dataset_raw)
    t1 = time.time()
    df1 = removeNan.fit_transform(df0)
    t2 = time.time()
    df2 = setDtypes.fit_transform(df1)
    t3 = time.time()
    df3 = textCleaner.fit_transform(df2)
    t4 = time.time()
    df4 = unitCleaner.fit_transform(df3)
    t5 = time.time()
    indicies = duplicateRemover.get_duplicate_indicies(df4, 'NAME')
    print('Duplicate names')
    print(df4['NAME'].iloc[indicies[:50]])
    df5 = duplicateRemover.fit_transform(df4)
    t6 = time.time()
    virtualRemover.fit_transform(df5)
    t7 = time.time()

    print('RemoveAttribute : {}'.format(t1 - t0))
    print('RemoveNan : {}'.format(t2 - t1))
    print('SetDtypes : {}'.format(t3 - t2))
    print('TextCleaner : {}'.format(t4 - t3))
    print('UnitCleaner : {}'.format(t5 - t4))
    print('DuplicateRemover : {}'.format(t6 - t5))
    print('VirtualRemover : {}'.format(t7 - t6))

    return None
def test_serialize_examples_from_dictionary():
    """Test one of the (3) methods this module has for serializing per-item
    features.
    Label assignment: `reciprocal` makes the relevance label the inverse of
    the loss metric; set it to True when labels should not be binned.
    Otherwise the loss is binned into `n_bins` relevance grades."""
    reciprocal = False  # Reciprocal of relevance label - use if you don't bin labels
    n_bins = 5  # Number of bins for the relevance label
    label_key = 'relevance'
    # These are the per-item feature column names
    peritem_keys = ['by_size', 'n_components', 'clusterer', 'reduce', 'index']

    # Set up connection to SQL
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    # Get all records relating to one customer
    customer_id = 15
    sel = sqlalchemy.select([Clustering.id, Clustering.correct_k])\
        .where(Clustering.customer_id.__eq__(customer_id))
    res = Insert.core_select_execute(sel)
    primary_keys = [x.id for x in res]
    correct_k = res[0].correct_k
    sel = sqlalchemy.select([Customers.name]).where(Customers.id.__eq__(customer_id))
    customer_name = Insert.core_select_execute(sel)[0].name

    # Calculate ranking of all records
    records = get_records(primary_keys)
    best_labels = ExtractLabels.calc_labels(records, correct_k,
                                            error_scale=0.8, var_scale=0.2)

    example_features = []
    for label in best_labels:
        feature_dict = {}
        for key in peritem_keys:
            feature_dict[key] = label.hyperparameter_dict[key]
        feature_dict[label_key] = label.loss
        example_features.append(feature_dict)

    serialized_example = serialize_examples_from_dictionary(
        example_features,
        label_key,
        peritem_keys,
        reciprocal=reciprocal,
        n_bins=n_bins,
        shuffle_peritem=True)

    return serialized_example
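# Illustrative only: a minimal sketch (an assumption, not the project's serialization
# code) of the two labeling strategies described in the docstring above. With
# reciprocal=True the relevance label is the inverse of the loss metric; otherwise
# losses are binned into n_bins ordinal relevance grades (larger grade = lower loss).
def _example_relevance_labels(losses, reciprocal=False, n_bins=5):
    losses = np.asarray(losses, dtype=float)
    if reciprocal:
        # Inverse of the loss metric as a continuous relevance label
        return 1.0 / losses
    # Bin the losses, then invert so the smallest loss gets the highest grade
    edges = np.linspace(losses.min(), losses.max(), n_bins + 1)[1:-1]
    bins = np.digitize(losses, edges)   # 0 .. n_bins-1, increasing with loss
    return (n_bins - 1) - bins          # n_bins-1 .. 0, decreasing with loss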
def test_calc_categories_dict():
    """Generate data to find categories"""
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')
    sel = sqlalchemy.select([Points])
    dataset_raw = Insert.pandas_select_execute(sel)

    # Transform pipeline
    Transform = transform_pipeline.Transform()

    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(drop_attributes=None,
                                             nan_replace_dict=None,
                                             dtype_dict=None,
                                             unit_dict=None,
                                             remove_dupe=True,
                                             replace_numbers=False,
                                             remove_virtual=True)
    string_pipe = transform_pipeline.SetDtypes(
        type_dict={
            'TYPE': str,
            'ALARMTYPE': str,
            'FUNCTION': str,
            'VIRTUAL': str,
            'CS': str,
            'SENSORTYPE': str,
            'DEVUNITS': str
        })
    categories_clean_pipe = Pipeline([('clean_pipe', clean_pipe),
                                      ('string_pipe', string_pipe)])
    df_clean = categories_clean_pipe.fit_transform(dataset_raw)

    # Calculate and save categories to be used later
    Encoding = transform_pipeline.EncodingCategories()
    columns = [
        'TYPE', 'ALARMTYPE', 'FUNCTION', 'VIRTUAL', 'CS', 'SENSORTYPE',
        'DEVUNITS'
    ]
    categories_dict = Encoding.calc_categories_dict(df_clean, columns)
    save_path = r'../data/categorical_categories.dat'
    Encoding.save_categories_to_disc(categories_dict, save_path)

    # Verify the saved categories round-trip from disc
    categories_dict1 = Encoding.read_categories_from_disc(save_path)
    for key in set((*categories_dict.keys(), *categories_dict1.keys())):
        assert np.array_equal(categories_dict[key], categories_dict1[key])

    return None
def test_legacy_numeric_transform_pipeline_MIL(self):
    # Get some raw data
    Insert = extract.Insert(server_name, driver_name, database_name)
    group_id = 15
    sel = sqlalchemy.select([Points]).where(Points.group_id.__eq__(group_id))
    dfraw = Insert.pandas_select_execute(sel)

    # Get the legacy pipeline
    full_pipeline = legacy_numeric_transform_pipeline_MIL()

    # Transform data
    bag = full_pipeline.fit_transform(dfraw)

    # Observe output number of attributes; should be 3236 for compatibility
    self.assertEqual(bag.shape[1], 3236)

    return None
def test_read_categories():
    # Initialize
    Transform = transform_pipeline.Transform()
    categories_file = r'../data/categorical_categories.dat'
    categories = Transform._read_categories(Transform.cat_attributes,
                                            categories_file)
    categorical_attributes = Transform.cat_attributes
    ReplaceNone = transform_pipeline.ReplaceNone(categorical_attributes)
    DataFrameSelector = transform_pipeline.DataFrameSelector(
        categorical_attributes)
    OneHotEncoder = transform_pipeline.OneHotEncoder(categories=categories)

    # Get raw database
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')
    customer_id = 15
    sel = sqlalchemy.select([Points]).where(Points.customer_id.__eq__(customer_id))
    dataset_raw = Insert.pandas_select_execute(sel)

    clean_pipe = Transform.cleaning_pipeline(drop_attributes=None,
                                             nan_replace_dict=None,
                                             dtype_dict=None,
                                             unit_dict=None,
                                             remove_dupe=True,
                                             replace_numbers=False,
                                             remove_virtual=True)
    df_clean1 = clean_pipe.fit_transform(dataset_raw)

    # Transform
    df0 = ReplaceNone.fit_transform(df_clean1)
    df1_array = DataFrameSelector.fit_transform(df0)
    ohearray = OneHotEncoder.fit_transform(df1_array).toarray()

    # Examine the transformers
    print(df0[categorical_attributes].iloc[:5])
    print(df1_array[:5])
    OneHotEncoder.categories

    return None
def test_get_database_labels():
    # Set up connection to SQL
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    # Get all records relating to one customer
    customer_id = 15
    sel = sqlalchemy.select([Clustering.id, Clustering.correct_k])\
        .where(Clustering.customer_id.__eq__(customer_id))
    res = Insert.core_select_execute(sel)
    primary_keys = [x.id for x in res]
    correct_k = res[0].correct_k
    sel = sqlalchemy.select([Customers.name]).where(Customers.id.__eq__(customer_id))
    customer_name = Insert.core_select_execute(sel)[0].name

    # Calculate ranking of all records
    records = get_records(primary_keys)
    best_labels = ExtractLabels.calc_labels(records, correct_k,
                                            error_scale=0.8, var_scale=0.2)

    return best_labels
def test_text_pipe():
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')
    customer_id = 15
    sel = sqlalchemy.select([Points]).where(Points.customer_id.__eq__(customer_id))
    dataset_raw = Insert.pandas_select_execute(sel)

    # Transform pipeline
    Transform = transform_pipeline.Transform()

    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(drop_attributes=None,
                                             nan_replace_dict=None,
                                             dtype_dict=None,
                                             unit_dict=None,
                                             remove_dupe=True,
                                             replace_numbers=True,
                                             remove_virtual=True)

    # Create pipeline specifically for clustering text features
    text_pipe = Transform.text_pipeline(vocab_size='all',
                                        attributes='NAME',
                                        seperator='.',
                                        heirarchial_weight_word_pattern=True)

    full_pipeline = Pipeline([
        ('clean_pipe', clean_pipe),
        ('text_pipe', text_pipe),
    ])
    dataset = full_pipeline.fit_transform(dataset_raw)

    return dataset
def test_full_pipeline():
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')
    # group_id = 4
    group_id = 15
    sel = sqlalchemy.select([Points]).where(Points.group_id.__eq__(group_id))
    dfraw = Insert.pandas_select_execute(sel)

    # Transform pipeline
    Transform = transform_pipeline.Transform()

    # Cleaning pipeline
    clean_pipe = Transform.cleaning_pipeline(drop_attributes=None,
                                             nan_replace_dict=None,
                                             dtype_dict=None,
                                             unit_dict=None,
                                             remove_dupe=True,
                                             replace_numbers=True,
                                             remove_virtual=True)

    # Text feature encoders
    name_file = r'../data/vocab_name.txt'
    name_vocabulary = transform_pipeline.VocabularyText.read_vocabulary_disc(
        name_file)
    name_text_pipe = Transform.text_pipeline_label(attributes=['NAME'],
                                                   vocabulary=name_vocabulary)
    descriptor_file = r'../data/vocab_descriptor.txt'
    descriptor_vocabulary = transform_pipeline.VocabularyText.read_vocabulary_disc(
        descriptor_file)
    descriptor_text_pipe = Transform.text_pipeline_label(
        attributes=['DESCRIPTOR'], vocabulary=descriptor_vocabulary)

    # Categorical features
    categorical_pipe = Transform.categorical_pipeline(
        categorical_attributes=None,
        handle_unknown='ignore',
        categories_file=r'../data/categorical_categories.dat')

    # Numeric features
    numeric_pipe = Transform.numeric_pipeline(numeric_attributes=None)

    # Union
    combined_features = FeatureUnion(transformer_list=[
        ('CategoricalPipe', categorical_pipe),
        ('NameTextPipe', name_text_pipe),
        ('DescriptorTextPipe', descriptor_text_pipe),
        ('NumericPipe', numeric_pipe),
    ])
    full_pipeline = Pipeline([
        ('CleaningPipe', clean_pipe),
        ('CombinedFeatures', combined_features),
    ])
    combined_csr = full_pipeline.fit_transform(dfraw)
    combined_csr.shape

    # Inspect the individual cleaning transformers step by step
    CleaningPipe = full_pipeline.steps[0][1]  # CleaningPipe
    RemoveAttribute = full_pipeline.steps[0][1][0]  # RemoveAttribute
    RemoveNan = full_pipeline.steps[0][1][1]
    SetDtypes = full_pipeline.steps[0][1][2]
    TextCleaner = full_pipeline.steps[0][1][3]
    UnitCleaner = full_pipeline.steps[0][1][4]
    DuplicateRemover = full_pipeline.steps[0][1][5]
    VirtualRemover = full_pipeline.steps[0][1][6]

    df0 = RemoveAttribute.fit_transform(copy.deepcopy(dfraw))
    df1 = RemoveNan.fit_transform(copy.deepcopy(df0))
    df2 = SetDtypes.fit_transform(copy.deepcopy(df1))
    df3 = TextCleaner.fit_transform(copy.deepcopy(df2))
    df4 = UnitCleaner.fit_transform(copy.deepcopy(df3))
    df5 = DuplicateRemover.fit_transform(copy.deepcopy(df4))
    df6 = VirtualRemover.fit_transform(copy.deepcopy(df5))

    return None
def calc_save_categories_vocabulary():
    # Read raw data from database
    Insert = extract.Insert(server_name, driver_name, database_name)
    sel = sqlalchemy.select([Points])
    dataset_raw = Insert.pandas_select_execute(sel)

    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(
        drop_attributes=DROP_ATTRIBUTES,
        nan_replace_dict=NAN_REPLACE_DICT,
        dtype_dict=TYPE_DICT,
        unit_dict=UNIT_DICT,
        dupe_cols=DUPE_COLS,
        remove_dupe=REMOVE_DUPE,
        replace_numbers=REPLACE_NUMBERS,
        remove_virtual=REMOVE_VIRTUAL,
        text_clean_attributes=TEXT_CLEAN_ATTRS)
    string_pipe = SetDtypes(
        type_dict={
            'TYPE': str,
            'ALARMTYPE': str,
            'FUNCTION': str,
            'VIRTUAL': str,
            'CS': str,
            'SENSORTYPE': str,
            'DEVUNITS': str
        })
    categories_clean_pipe = Pipeline([('clean_pipe', clean_pipe),
                                      ('string_pipe', string_pipe)])

    # Process raw data with pipeline
    df_clean = categories_clean_pipe.fit_transform(dataset_raw)

    # Calculate categories to be used later
    Encoding = EncodingCategories()
    columns = [
        'TYPE', 'ALARMTYPE', 'FUNCTION', 'VIRTUAL', 'CS', 'SENSORTYPE',
        'DEVUNITS'
    ]
    categories_dict_calc = Encoding.calc_categories_dict(df_clean, columns)

    # Save categories in a numpy array to be used later
    Encoding.save_categories_to_disc(categories_dict_calc, CATEGORIES_FILE)

    # Save vocabularies to text files
    VOCAB_ALARMTYPE_PATH = '../data/vocab_alarmtype.txt'
    save_numpy_string_array_to_text(categories_dict_calc['ALARMTYPE'],
                                    VOCAB_ALARMTYPE_PATH)
    VOCAB_CS_PATH = '../data/vocab_cs.txt'
    save_numpy_string_array_to_text(categories_dict_calc['CS'],
                                    VOCAB_CS_PATH)
    VOCAB_DEVUNITS_PATH = '../data/vocab_devunits.txt'
    save_numpy_string_array_to_text(categories_dict_calc['DEVUNITS'],
                                    VOCAB_DEVUNITS_PATH)
    VOCAB_FUNCTION_PATH = '../data/vocab_function.txt'
    save_numpy_string_array_to_text(categories_dict_calc['FUNCTION'],
                                    VOCAB_FUNCTION_PATH)
    VOCAB_SENSORTYPE_PATH = '../data/vocab_sensortype.txt'
    save_numpy_string_array_to_text(categories_dict_calc['SENSORTYPE'],
                                    VOCAB_SENSORTYPE_PATH)
    VOCAB_TYPE_PATH = '../data/vocab_type.txt'
    save_numpy_string_array_to_text(categories_dict_calc['TYPE'],
                                    VOCAB_TYPE_PATH)
    VOCAB_VIRTUAL_PATH = '../data/vocab_virtual.txt'
    save_numpy_string_array_to_text(categories_dict_calc['VIRTUAL'],
                                    VOCAB_VIRTUAL_PATH)

    return None
def main():
    # Hyperparameters
    hyperparams = {
        'by_size': False,
        'n_components': 8,
        'reduce': 'MDS',
        'clusterer': 'ward.D',
        'distance': 'euclidean',
        'index': 'all'}

    # Instantiate local classes
    Transform = transform_pipeline.Transform()

    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(remove_dupe=False,
                                             replace_numbers=False,
                                             remove_virtual=True)

    # Create pipeline specifically for clustering text features
    text_pipe = Transform.text_pipeline(vocab_size='all',
                                        attributes='NAME',
                                        seperator='.',
                                        heirarchial_weight_word_pattern=True)

    # Set up connection to SQL
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    # Clustering class
    UnsupervisedCluster = unsupervised_cluster.UnsupervisedClusterPoints()

    # Save hyperparameters to SQL
    # See if this hyperparameter set is already inserted
    sel = sqlalchemy.select([ClusteringHyperparameter]).where(
        sqlalchemy.sql.and_(
            ClusteringHyperparameter.by_size == hyperparams['by_size'],
            ClusteringHyperparameter.clusterer == hyperparams['clusterer'],
            ClusteringHyperparameter.distance == hyperparams['distance'],
            ClusteringHyperparameter.reduce == hyperparams['reduce'],
            ClusteringHyperparameter.n_components == hyperparams['n_components']))
    with Insert.engine.connect() as connection:
        res = connection.execute(sel).fetchall()

    if res.__len__():
        # Get the id of the existing hyperparameter set
        hyperparameter_id = res[0].id
    else:
        # Insert a new object
        res = Insert.core_insert_instance(ClusteringHyperparameter, hyperparams)
        hyperparameter_id = res.inserted_primary_key[0]

    # Get customer list from SQL
    sel = sqlalchemy.select([Customers])
    customers = Insert.core_select_execute(sel)

    # Iterate through customers and cluster
    for customer in customers:
        # Get points from SQL
        sel = sqlalchemy.select([Points]).where(Points.customer_id.__eq__(customer.id))
        database = Insert.pandas_select_execute(sel)
        if database.shape[0] == 0:
            print('Customer ID {} Skipped, points shape {}'.format(
                customer.id, database.shape[0]))
            continue
        else:
            df_clean = clean_pipe.fit_transform(database)
            X = text_pipe.fit_transform(df_clean).toarray()
            # _word_vocab = text_pipe.named_steps['WordDictToSparseTransformer'].vocabulary
            # df_text = pd.DataFrame(X, columns=_word_vocab)

        # NbClust clustering
        print('Customer ID {}\nDB Size : {}'.format(customer.id, X.shape))
        try:
            print('Starting NbClust')
            # Perform clustering with the NbClust package
            result = UnsupervisedCluster.cluster_with_hyperparameters(hyperparams, X)
            best_nc_df = result.best_nc_dataframe
        except RRuntimeError as e:
            if str(e).__contains__('computationally singular'):
                # The eigenvalue matrix is singular.
                # Reduce the number of dimensions and retry
                _hyperparams = dict(hyperparams)
                _hyperparams['n_components'] = int(_hyperparams['n_components'] / 2)
                result = UnsupervisedCluster.cluster_with_hyperparameters(_hyperparams, X)
                best_nc_df = result.best_nc_dataframe
            else:
                print(e)
                continue

        # Build dictionary for SQL
        sel = sqlalchemy.select([Customers]).where(Customers.id.__eq__(customer.id))
        with Insert.engine.connect() as connection:
            res = connection.execute(sel).fetchone()
            correct_k = res.correct_k

        values = best_nc_df.loc['Number_clusters'].to_dict()
        values['correct_k'] = correct_k
        values['customer_id'] = customer.id
        values['hyperparameter_id'] = hyperparameter_id
        n_lens = Clustering.get_n_len_features(X)
        for key, val in n_lens.items():
            values[key] = int(val)

        # Save results to SQL
        res = Insert.core_insert_instance(Clustering, values)
        print("Inserted {}".format(res.inserted_primary_key))

    return None
from extract import extract
from extract.SQLAlchemyDataDefinition import (Customers, Points, Netdev,
                                              ClusteringHyperparameter,
                                              Clustering, Labeling)

# Local declarations
config = configparser.ConfigParser()
config.read(r'../extract/sql_config.ini')
server_name = config['sql_server']['DEFAULT_SQL_SERVER_NAME']
driver_name = config['sql_server']['DEFAULT_SQL_DRIVER_NAME']
database_name = config['sql_server']['DEFAULT_DATABASE_NAME']
Extract = extract.Extract()
Insert = extract.Insert(server_name, driver_name, database_name)

#%%

def main_copy_to_local():
    """Save databases from the server to the local machine"""
    search_directory = r"R:\JOBS"
    save_directory = r"D:\Z - Saved SQL Databases"
    Extract.search_and_save(search_directory, save_directory)
    return None
def __init__(self, server_name, driver_name, database_name):
    self.Insert = extract.Insert(server_name=server_name,
                                 driver_name=driver_name,
                                 database_name=database_name)
    return None
def test_cluster_with_hyperparameters2():
    # Instantiate local classes
    Transform = transform_pipeline.Transform()
    UnsupervisedCluster = unsupervised_cluster.UnsupervisedClusterPoints()

    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(remove_dupe=False,
                                             replace_numbers=False,
                                             remove_virtual=True)

    # Create pipeline specifically for clustering text features
    text_pipe = Transform.text_pipeline(vocab_size='all',
                                        attributes='NAME',
                                        seperator='.',
                                        heirarchial_weight_word_pattern=True)

    # Set up connection to SQL
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    # Get a points dataframe
    customer_id = 13
    sel = sqlalchemy.select([Points]).where(Points.customer_id.__eq__(customer_id))
    with Insert.engine.begin() as connection:
        # res = connection.execute(sel).fetchone()
        database = pd.read_sql(sel, connection)

    df_clean = clean_pipe.fit_transform(database)
    X = text_pipe.fit_transform(df_clean).toarray()
    # _word_vocab = text_pipe.named_steps['WordDictToSparseTransformer'].vocabulary

    hyperparameters = {
        'by_size': False,
        'distance': 'euclidean',
        'clusterer': 'ward.D',
        'n_components': 8,
        'reduce': 'MDS',
        'index': 'Ratkowsky'
    }

    # Clean hyperparameters
    hyperparams = UnsupervisedCluster._parse_hyperparameter_dictionary(
        hyperparameters)

    # Perform dimensionality reduction on data
    X_dim_reduced = UnsupervisedCluster._dimensionality_reduction(
        X,
        method=hyperparams['reduce'],
        n_components=hyperparams['n_components'])

    # Conditionally call the nbclust package or the optimalk package
    # based on the input clustering hyperparameters
    if hyperparams['index'] in UnsupervisedCluster.nbclust_indicies:
        # Cluster with nbclust and the chosen clustering algorithm
        min_nc = 3  # Static
        max_nc = UnsupervisedCluster._get_max_nc(X)  # Based on actual data
        best_nc_df = UnsupervisedCluster._nbclust_calc(
            X_dim_reduced,
            index=hyperparams['index'],
            clusterer=hyperparams['clusterer'],
            distance=hyperparams['distance'],
            min_nc=min_nc,
            max_nc=max_nc)

    # Get number of clusters
    sel = sqlalchemy.select([Customers])\
        .where(Customers.id.__eq__(customer_id))
    with Insert.engine.begin() as connection:
        res = connection.execute(sel).fetchone()
        correct_k = res.correct_k
    print(correct_k)

    pass
# Remove the drive letter on windows
_CWD = os.path.splitdrive(os.getcwd())[1]
_PARTS = _CWD.split(os.sep)
# Project dir is one level above cwd
_PROJECT_DIR = os.path.join(os.sep, *_PARTS[:-1])
if _PROJECT_DIR not in sys.path:
    sys.path.insert(0, _PROJECT_DIR)

from extract import extract
from extract.SQLAlchemyDataDefinition import (Clustering, Points, Netdev,
                                              Customers,
                                              ClusteringHyperparameter,
                                              Labeling)

Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                        driver_name='SQL Server Native Client 10.0',
                        database_name='Clustering')

#%%

class Record():
    """Keep track of individual dataframes and their related information

    parameters
    -------
    dataframe : a dataframe containing error metric information.
        See import_error_dfs()
    parent_file : original csv file
    hyper_dict : a dictionary of the predicted set's hyperparameters,
        for example {'hyper1': value1, [...]}"""

    def __init__(self, indicies_dictionary, hyperparameter_dictionary):
def get_hyperparameters_serving():
    """The ranking model inputs a tensor of context features and per-item
    features. The per-item features are clustering hyperparameters turned
    into indicator columns.
    In order to predict on a new database, you must input the per-item
    clustering hyperparameters into the model. In training this has been done
    with actual recorded hyperparameters; for prediction the clustering
    hyperparameters must be generated up front. This module generates an
    array of clustering hyperparameters like:
        [['False', 'kmeans', '8', 'TSNE', 'optk_TSNE_gap*_max'],
         ['True', 'ward.D', '8', 'MDS', 'SDbw'],
         [...]]
    This can be fed to tf.feature_columns or TFRecords in order to generate
    inputs to a ranking model for prediction"""

    # Instantiate a class for reading SQL data
    Insert = extract.Insert(server_name, driver_name, database_name)

    # Get the most frequent hyperparameter occurrences for each customer.
    # Customer IDs are used to retrieve clustering results for each customer
    sel = sqlalchemy.select([Customers.id])
    customer_ids = Insert.core_select_execute(sel)

    # Keep track of the best clustering hyperparameters for all datasets
    all_labels = []

    for _id in customer_ids:
        customer_id = _id.id

        # Get primary keys of clusterings related to this customer.
        # Each primary key is used to create Record objects with get_records
        sel = sqlalchemy.select([Clustering.id, Clustering.correct_k])\
            .where(Clustering.customer_id.__eq__(customer_id))
        res = Insert.core_select_execute(sel)
        primary_keys = [x.id for x in res]

        # Create records for feeding while calculating the best labels
        records = get_records(primary_keys)
        if records.__len__() <= 1:
            # Not enough examples to append
            continue

        sel = sqlalchemy.select([Clustering.correct_k])\
            .where(Clustering.customer_id.__eq__(customer_id))\
            .limit(1)
        res = Insert.core_select_execute(sel)
        correct_k = res[0].correct_k

        # best_labels is a list of namedtuple objects; each tuple has a member
        # hyperparameter_dict containing the hyperparameters used to cluster
        # that customer's database. A unique list of clustering
        # hyperparameters will be used for model serving
        best_labels = ClusteringLabels.calc_labels(records, correct_k,
                                                   error_scale=0.8,
                                                   var_scale=0.2)

        # Keep the 10 best best_labels for each customer_id. The idea is we
        # should predict between some of the best available hyperparameters
        # for the ranking model
        if best_labels.__len__() > 10:
            for i in range(0, 10):
                all_labels.append(best_labels[i])
        else:
            n = int(best_labels.__len__() * 0.5)
            for i in range(n):
                all_labels.append(best_labels[i])

    # Each hyperparameter_dict in all_labels is not unique. To create a unique
    # set of dictionary values, use a frozenset; frozenset is hashable (unlike
    # a normal set) which means it can be used in Counter objects
    hyperparams = []
    for x in all_labels:
        y = x.hyperparameter_dict  # Dictionary
        hyperparams_set = frozenset(y.values())
        hyperparams.append(hyperparams_set)

    # Counter objects create a set from hyperparams
    c = Counter(hyperparams)
    c.most_common()

    # Convert each hyperparameter frozenset back to a normal dictionary and
    # save the results in a list
    hyperparameters_serving = []
    for x in c.keys():
        hyperparameter_dict = ClusteringLabels._hyperparameter_set_2_dict(x)
        hyperparameters_serving.append(hyperparameter_dict)

    return hyperparameters_serving
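# Illustrative only: get_hyperparameters_serving derives its candidate set from
# historical clustering results. A simpler, hypothetical alternative sketched here
# builds the candidate list as a Cartesian product of hyperparameter values; the
# specific value lists below are placeholders, not the project's actual search space.
def _example_hyperparameter_grid():
    from itertools import product
    grid = {
        'by_size': ['True', 'False'],
        'clusterer': ['kmeans', 'ward.D'],
        'n_components': ['8'],
        'reduce': ['MDS', 'TSNE'],
        'index': ['Ratkowsky', 'SDbw'],
    }
    keys = list(grid.keys())
    # One dictionary per combination, mirroring the shape of hyperparameters_serving
    return [dict(zip(keys, values)) for values in product(*(grid[k] for k in keys))]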