Пример #1
0
def visualize_features(file_name):
    """Plot the feature table stored in ``data/<file_name>.csv``.

    The CSV is loaded as a plain array and handed to
    ``DataProcessor.visualize_features`` twice, once with ``'TSNE'`` and once
    with ``'UMAP'``.

    Args:
        file_name: Base name (no directory, no ``.csv`` suffix) of the CSV
            under ``data/``; it is also forwarded unchanged to the processor.
    """
    # Hard-coded configuration for the processor: a window of 100 and a
    # fixed list of indices (presumably known anomaly positions -- confirm
    # against DataProcessor's constructor).
    marked_indices = [500, 1000, 1500, 2000, 2500, 2800]
    plotter = dataProcessor.DataProcessor(100, marked_indices)

    csv_path = 'data/{}.csv'.format(file_name)
    feature_matrix = pd.read_csv(csv_path).values

    for projection in ('TSNE', 'UMAP'):
        plotter.visualize_features(feature_matrix, file_name, projection)
Пример #2
0
def generate_features(timeseries, window_size, out_folder, name='training'):
    """Generate features for a series using all-zero labels, then plot them.

    ``encoding_method`` and ``order`` are not parameters of this function;
    both are looked up in the enclosing scope when the function runs.

    Args:
        timeseries: Object exposing ``.values`` (e.g. a pandas Series).
        window_size: Forwarded to ``DataProcessor.generate_features``.
        out_folder: Directory prefix used for the CSV and plot paths.
        name: Tag embedded in the generated paths.

    Returns:
        The object returned by ``DataProcessor.generate_features``.
    """
    features_csv = '{}/features-{}_{}.csv'.format(
        out_folder, name, encoding_method)
    encoder = dataProcessor.DataProcessor()

    samples = timeseries.values
    # One integer zero per sample is passed in the label position.
    zero_labels = np.zeros(len(samples), dtype=int)
    features = encoder.generate_features(
        samples, zero_labels, window_size, features_csv,
        encoding_method, order)

    plot_prefix = '{}/{}-{}'.format(out_folder, name, encoding_method)
    for projection in ('TSNE', 'UMAP'):
        encoder.visualize_features(features, plot_prefix, method=projection)
    return features
Пример #3
0
def generate_features(timeseries, window_size, name='training'):
    """Generate ARMA-encoded features for a labelled series and plot them.

    Output paths are built from ``folder``, which is not a parameter and is
    resolved from the enclosing scope.

    Args:
        timeseries: Object exposing ``value`` and ``is_anomaly`` columns
            (each with a ``.values`` attribute).
        window_size: Forwarded to ``DataProcessor.generate_features``.
        name: Tag embedded in the generated paths.

    Returns:
        The object returned by ``DataProcessor.generate_features``.
    """
    method_name = 'ARMA'
    features_csv = '{}/features-{}_{}.csv'.format(folder, name, method_name)
    encoder = dataProcessor.DataProcessor()

    features = encoder.generate_features(
        timeseries.value.values,
        timeseries.is_anomaly.values,
        window_size,
        features_csv,
        method_name,
    )

    # NOTE(review): a disabled line here used to reload the features with
    # pd.read_csv(features_csv) instead of using the returned object.
    plot_prefix = '{}/{}-{}'.format(folder, name, method_name)
    for projection in ('TSNE', 'UMAP'):
        encoder.visualize_features(features, plot_prefix, method=projection)
    return features
Пример #4
0
def generate_features(timeseries, window_size):
    """Generate ARMA-encoded features under ``results/generated`` and plot them.

    Args:
        timeseries: Object exposing ``value`` and ``is_anomaly`` columns
            (each with a ``.values`` attribute).
        window_size: Forwarded to ``DataProcessor.generate_features``.

    Returns:
        The object returned by ``DataProcessor.generate_features``.
    """
    method_name = 'ARMA'
    features_csv = 'results/generated/features_{}.csv'.format(method_name)
    encoder = dataProcessor.DataProcessor()

    series_values = timeseries.value.values
    series_labels = timeseries.is_anomaly.values
    features = encoder.generate_features(
        series_values, series_labels, window_size, features_csv, method_name)

    # NOTE(review): a disabled line here used to reload the features with
    # pd.read_csv(features_csv) instead of using the returned object.
    plot_prefix = 'results/generated/{}'.format(method_name)
    for projection in ('TSNE', 'UMAP'):
        encoder.visualize_features(features, plot_prefix, method=projection)
    return features
Пример #5
0
def clean_data():
    """Compute per-row text statistics and write the cleaned dataset.

    Reads the CSV at ``data_path``. When ``mode`` (case-insensitive) equals
    ``'clean'`` it:

    1. adds one derived column per ``utils`` text helper and writes the
       augmented table to ``filename + '.csv'``;
    2. writes the result of ``utils.term_freq`` to ``term_frequency.csv``;
    3. runs ``DataProcessor.proccess_data`` on the text column and writes the
       processed text together with the ``label`` column to ``cleaned.csv``;
    4. prints the elapsed wall-clock time.

    For any other ``mode`` nothing is written.

    ``data_path``, ``mode``, ``filename`` and ``handle_emojies`` are not
    parameters; they are resolved from the enclosing scope.

    Returns:
        None.
    """
    start_time = time.time()

    processer = dataProcessor.DataProcessor()

    data = pd.read_csv(data_path, header=0)

    if mode.lower() == 'clean':
        # (output column, helper) pairs, applied in this order so the column
        # order of the written CSV is stable.
        derived_columns = (
            ('word_count', utils.count_word),
            ('count_number', utils.count_numbers),
            ('emojies', utils.view_emojie),
            ('emoticons', utils.view_emoticon),
            ('len_tweet', utils.len_tweet),
            ('avg_words_len', utils.avg_word_len),
            ('count_stopwords', utils.count_stopwords),
            ('count_tagging', utils.count_tagging),
            ('flagged', utils.repeated_char),
        )
        for column, helper in derived_columns:
            data[column] = helper(data.text)

        data.to_csv(filename + '.csv', index=False)

        tf = utils.term_freq(data.text)
        tf.to_csv('term_frequency.csv', index=False)

        data_pro, _ = processer.proccess_data(data.text,
                                              handle_emojies=handle_emojies)

        data_pro = pd.DataFrame(data_pro, columns=['text'])
        # The previous ``data_pro.append(data['label'])`` discarded its result
        # (DataFrame.append was never in-place) and the method no longer
        # exists in pandas >= 2.0, so the labels never reached cleaned.csv.
        # Attach them as a column instead; assignment aligns on the row index.
        # NOTE(review): assumes proccess_data yields one entry per input row,
        # in the same order -- confirm.
        data_pro['label'] = data['label']
        data_pro.to_csv('cleaned.csv', index=False)

        elapsed_time = time.time() - start_time
        print(f'Finished in {elapsed_time}')

    return None
Пример #6
0
# End-to-end script: load the data, plot it, generate ARMA-encoded features,
# plot the features with TSNE, then run the anomaly detector on them.
# `path` and `folder` are expected to be defined before this point.
file_count = 1  # number of files to load; must be <= 67
data = load_files(path,
                  file_count)  # Load the combined data from `file_count` csv files
# plot_data_insights(data) # Print some insights on the data
fn = '{}/yahoo_data_{}_files.png'.format(folder, file_count)
visualize(data, file_count, fn)  # Visualize / plot

# Get anomaly labels from data: index values of the rows with is_anomaly == 1.
window_size = 100
anomalies = data.index[data['is_anomaly'] == 1].tolist()
# NOTE(review): anomaly_windows is not read again in the visible part of
# this script.
anomaly_windows = utils.anomalies_index_to_window_index(anomalies, window_size)

# Generate features
encoding_method = 'ARMA'
fn = '{}/yahoo-features-{}_{}.csv'.format(folder, file_count, encoding_method)
processor = dataProcessor.DataProcessor()
features = processor.generate_features(data.value.values,
                                       data.is_anomaly.values, window_size, fn,
                                       encoding_method)
# features = pd.read_csv(fn)
fn = '{}/features_{}_TSNE.png'.format(folder, encoding_method)
processor.visualize_features(features, fn, method='TSNE')

# Detect anomalies
# The detector is given the mean of the features' is_anomaly column as the
# expected outlier fraction.
outliers_fraction = features.is_anomaly.mean()
fn = '{}/anomalies.png'.format(folder)

detector = anomalyDetector.AnomalyDetector(outliers_fraction, window_size, fn)

# Drop the two label columns so only the remaining feature columns are passed
# on, as a plain array.
features = features.drop(['is_anomaly', 'window_label'], axis=1).values
detector.detect_anomalies(features)
Пример #7
0
def visualize_labelled_features(file_name, show=True):
    """Load a feature CSV and plot it with both TSNE and UMAP.

    Args:
        file_name: Path of the CSV to read; it is also forwarded unchanged
            to ``DataProcessor.visualize_features``.
        show: Forwarded as the fourth positional argument of
            ``DataProcessor.visualize_features``.
    """
    feature_table = pd.read_csv(file_name)
    plotter = dataProcessor.DataProcessor()
    for projection in ('TSNE', 'UMAP'):
        plotter.visualize_features(feature_table, file_name, projection, show)