def main():
    if not os.path.isdir(DATASETS):
        raise FileNotFoundError('Folder given to save datasets does not exist.')
    datasets = sys.argv[1:]  # use datasets given by user
    for month in set(datasets):
        filename = download_data(month)
        out_path = os.path.join(DATASETS, filename)
        preprocess_data(filename, out_path)
    print(f'Successfully downloaded and preprocessed {len(set(datasets))} file(s).')
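# A minimal, assumed entry point for the snippet above (not part of the original
# example); the dataset names would be passed on the command line, e.g.
# `python download_datasets.py 2020-01 2020-02` (the script name is illustrative).
if __name__ == '__main__':
    main()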
Example #2
def generate_clean_data():
    train_data, test_data = get_raw_data.classify_data(ORIGINAL_FILE_PATH)
    print("The data is being processed right now. Please wait....")
    train_data = pre.preprocess_data(train_data)
    train_data.to_csv("./data/train.csv",
                      sep=",",
                      encoding="utf-8",
                      index=False)
    test_data = pre.preprocess_data(test_data)
    test_data.to_csv("./data/test.csv", sep=",", encoding="utf-8", index=False)
    use_classifier()
def run_mr(use_word_to_vec=True):
    data, labels = data_importer.load_data_and_labels_mr()
    word_to_index_mapping, vectors = w2v.load_mr_word_vectors()
    train_data, train_labels, vocabulary, test_data, test_labels, embedding_vectors = \
        pre.preprocess_data(data, labels, word_to_index_mapping, vectors)
    if not use_word_to_vec:
        embedding_vectors = None
    t.train(train_data, train_labels, vocabulary, test_data, test_labels, embedding_vectors)
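# Illustrative call (not from the original example): with use_word_to_vec=False the
# pretrained vectors are dropped, so training presumably falls back to embeddings
# learned from scratch rather than the loaded word2vec vectors.
# run_mr(use_word_to_vec=False)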
Example #4
 def test_compressing_without_output(self):
     """Test wheather compressed data would be the same after decompression if `output` parameter is not
     passed"""
     new_path = os.path.join(os.path.dirname(path),
                             '_temp_' + os.path.basename(path))
     shutil.copyfile(
         path, new_path)  # copy file in order not to modify original file
     file_path = preprocess_data(new_path)
     self._test_compressing_helper(file_path)
Example #5
def quick_explore(folder='folder_name'):

    # Imports
    import pandas as pd
    import matplotlib.pyplot as plt
    import os
    from preprocessor import preprocess_data

    directory = '../../data/' + folder + '/'

    for filename in os.listdir(directory):

        filepath = os.path.join(directory, filename)

        # Read CSV files
        if filename.endswith('.csv'):
            raw_df = pd.read_csv(filepath, infer_datetime_format=True)
            neg_index = 4  # length of the '.csv' suffix, stripped from the name below

        # Read Excel files
        elif filename.endswith('.xlsx'):
            raw_df = pd.read_excel(filepath, infer_datetime_format=True)
            neg_index = 5  # length of the '.xlsx' suffix

        # Skip other file types
        else:
            continue

        print()
        print('--------------' * 8)
        print('--------------' * 8)
        print(filename[:-neg_index].upper())
        print()

        df, redund_dict = preprocess_data(raw_df)
    return
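# Illustrative call (not from the original example): assumes CSV/XLSX files are
# stored under ../../data/<folder_name>/.
# quick_explore(folder='my_dataset')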
Example #6
 def test_compressing_with_output(self):
     """Test wheather compressed data would be the same after decompression if `output` parameter is passed
     """
     file_path = preprocess_data(path, '_temp', delete_original=False)
     self._test_compressing_helper(file_path)
Example #7
import os
os.chdir(os.path.dirname(os.path.abspath(__file__)))
#os.chdir('local\\directory')

import scraper
import preprocessor
import loader

data = scraper.scrape_data()
data = preprocessor.preprocess_data(data)
loader.gsheets_upload(data)

### saving local copy of data
#data.to_excel('./data/processed_data_extract.xlsx')
Example #8
import json

with open('./data/Time Dataset.json') as f:
    dataset = json.load(f)
with open('./data/Time Vocabs.json') as f:
    human_vocab, machine_vocab = json.load(f)

human_vocab_size = len(human_vocab)
machine_vocab_size = len(machine_vocab)

# number of training examples
m = len(dataset)

# tokenize the data using vocabularies
Tx = 41  # Max x sequence length
Ty = 5  # y sequence length
X, Y, Xoh, Yoh = preprocess_data(dataset, human_vocab, machine_vocab, Tx, Ty)

# Split data 80-20 between training and test
train_size = int(0.8 * m)
Xoh_train = Xoh[:train_size]
Yoh_train = Yoh[:train_size]
Xoh_test = Xoh[train_size:]
Yoh_test = Yoh[train_size:]
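
# Quick sanity check on the split (an illustrative addition, assuming Xoh and Yoh
# hold one example per row along the first axis): the two slices should cover all
# m examples.
assert len(Xoh_train) + len(Xoh_test) == m
assert len(Yoh_train) + len(Yoh_test) == m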

# Check the code works:
# i = 4
# print('Input data point',str(i),'.\n')
# print('The data input is :',str(dataset[i][0]))
# print('The data output is :',str(dataset[i][1]))
# print()
# print('The tokenized input is :',str(X[i]))