def visualize_features(file_name):
    """Load pre-computed features from data/<file_name>.csv and plot them
    with both a TSNE and a UMAP projection."""
    window_size = 100
    # Hard-coded anomaly indices used by the processor for labelling plots.
    anomalies = [500, 1000, 1500, 2000, 2500, 2800]
    processor = dataProcessor.DataProcessor(window_size, anomalies)
    features = pd.read_csv('data/{}.csv'.format(file_name)).values
    for method in ('TSNE', 'UMAP'):
        processor.visualize_features(features, file_name, method)
def generate_features(timeseries, window_size, out_folder, name='training'):
    """Generate window features for *timeseries* (labelled all-normal),
    write them to a CSV under *out_folder*, plot TSNE/UMAP projections,
    and return the feature data.

    NOTE(review): relies on module-level `encoding_method` and `order`
    being defined at file scope — confirm against the full file.
    """
    feature_path = '{}/features-{}_{}.csv'.format(out_folder, name, encoding_method)
    processor = dataProcessor.DataProcessor()
    # No ground-truth anomalies for this set: every window is labelled 0.
    labels = np.zeros(len(timeseries.values), dtype=int)
    data = processor.generate_features(
        timeseries.values, labels, window_size, feature_path,
        encoding_method, order)
    plot_prefix = '{}/{}-{}'.format(out_folder, name, encoding_method)
    for projection in ('TSNE', 'UMAP'):
        processor.visualize_features(data, plot_prefix, method=projection)
    return data
def generate_features(timeseries, window_size, name='training'):
    """Generate ARMA window features from a labelled timeseries
    (columns `value` and `is_anomaly`), save them under the module-level
    `folder`, plot TSNE/UMAP projections, and return the feature data.

    NOTE(review): `folder` is assumed to be a module-level path — confirm.
    """
    encoding_method = 'ARMA'
    feature_path = '{}/features-{}_{}.csv'.format(folder, name, encoding_method)
    processor = dataProcessor.DataProcessor()
    data = processor.generate_features(timeseries.value.values,
                                       timeseries.is_anomaly.values,
                                       window_size, feature_path,
                                       encoding_method)
    plot_prefix = '{}/{}-{}'.format(folder, name, encoding_method)
    for projection in ('TSNE', 'UMAP'):
        processor.visualize_features(data, plot_prefix, method=projection)
    return data
def generate_features(timeseries, window_size):
    """Generate ARMA window features from a labelled timeseries
    (columns `value` and `is_anomaly`), save them under
    results/generated/, plot TSNE/UMAP projections, and return the data."""
    encoding_method = 'ARMA'
    feature_path = 'results/generated/features_{}.csv'.format(
        encoding_method)
    processor = dataProcessor.DataProcessor()
    data = processor.generate_features(timeseries.value.values,
                                       timeseries.is_anomaly.values,
                                       window_size, feature_path,
                                       encoding_method)
    plot_prefix = 'results/generated/{}'.format(encoding_method)
    for projection in ('TSNE', 'UMAP'):
        processor.visualize_features(data, plot_prefix, method=projection)
    return data
def clean_data():
    """Load the raw tweet CSV (module-level `data_path`), optionally add
    engineered text-statistic columns (when module-level `mode` is 'clean'),
    then run the text through the processor and write `cleaned.csv`.

    Relies on module-level globals: `data_path`, `mode`, `filename`,
    `handle_emojies` — confirm they are defined at file scope.

    Returns:
        None. Side effects: writes '<filename>.csv', 'term_frequency.csv'
        and 'cleaned.csv'; prints the elapsed time.
    """
    start_time = time.time()
    processer = dataProcessor.DataProcessor()
    data = pd.read_csv(data_path, header=0)
    if mode.lower() == 'clean':
        # Engineered per-tweet statistics, one column per utils helper.
        data['word_count'] = utils.count_word(data.text)
        data['count_number'] = utils.count_numbers(data.text)
        data['emojies'] = utils.view_emojie(data.text)
        data['emoticons'] = utils.view_emoticon(data.text)
        data['len_tweet'] = utils.len_tweet(data.text)
        data['avg_words_len'] = utils.avg_word_len(data.text)
        data['count_stopwords'] = utils.count_stopwords(data.text)
        data['count_tagging'] = utils.count_tagging(data.text)
        data['flagged'] = utils.repeated_char(data.text)
        data.to_csv(filename + '.csv', index=False)
    tf = utils.term_freq(data.text)
    tf.to_csv('term_frequency.csv', index=False)
    data_pro, _ = processer.proccess_data(data.text,
                                          handle_emojies=handle_emojies)
    data_pro = pd.DataFrame(data_pro, columns=['text'])
    # BUG FIX: the original called `data_pro.append(data['label'])` and
    # discarded the result (DataFrame.append is not in-place and was removed
    # in pandas 2.0), so the label column never reached cleaned.csv.
    # Attach the labels as a column instead; `.values` avoids index-alignment
    # surprises (both frames are positionally aligned row-for-row).
    data_pro['label'] = data['label'].values
    data_pro.to_csv('cleaned.csv', index=False)
    elapsed_time = time.time() - start_time
    print(f'Finished in {elapsed_time}')
    return None
# End-to-end pipeline over the Yahoo dataset: load, visualize, build window
# features, then run anomaly detection on them.
file_count = 1  # <= 67
# Load the combined data from the 67 csv files
data = load_files(path, file_count)
# plot_data_insights(data)  # Print some insights on the data
fn = '{}/yahoo_data_{}_files.png'.format(folder, file_count)
visualize(data, file_count, fn)  # Visualize / plot

# Get anomaly labels from data
window_size = 100
anomalies = data.index[data['is_anomaly'] == 1].tolist()
# NOTE(review): result is unused below in this chunk — confirm intent.
anomaly_windows = utils.anomalies_index_to_window_index(anomalies,
                                                        window_size)

# Generate features
encoding_method = 'ARMA'
fn = '{}/yahoo-features-{}_{}.csv'.format(folder, file_count,
                                          encoding_method)
processor = dataProcessor.DataProcessor()
features = processor.generate_features(data.value.values,
                                       data.is_anomaly.values,
                                       window_size, fn, encoding_method)
# features = pd.read_csv(fn)
fn = '{}/features_{}_TSNE.png'.format(folder, encoding_method)
processor.visualize_features(features, fn, method='TSNE')

# Detect anomalies
outliers_fraction = features.is_anomaly.mean()
fn = '{}/anomalies.png'.format(folder)
detector = anomalyDetector.AnomalyDetector(outliers_fraction,
                                           window_size, fn)
features = features.drop(['is_anomaly', 'window_label'], axis=1).values
detector.detect_anomalies(features)
def visualize_labelled_features(file_name, show=True):
    """Read a labelled feature CSV and plot TSNE and UMAP projections.

    Args:
        file_name: Path to the feature CSV; also used as the plot name.
        show: Forwarded to the processor's visualizer (display toggle).
    """
    features = pd.read_csv(file_name)
    processor = dataProcessor.DataProcessor()
    for projection in ('TSNE', 'UMAP'):
        processor.visualize_features(features, file_name, projection, show)