import os
import sys

def main():
    # DATASETS, download_data and preprocess_data are assumed to be defined
    # elsewhere in this module
    if not os.path.isdir(DATASETS):
        raise FileNotFoundError('Folder given to save datasets does not exist.')
    datasets = sys.argv[1:]  # use datasets given by user
    for month in set(datasets):
        filename = download_data(month)
        out_path = os.path.join(DATASETS, filename)
        preprocess_data(filename, out_path)
    print(f'Successfully downloaded and preprocessed {len(set(datasets))} file(s).')
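# A minimal usage sketch; the __main__ guard and the script name below are
# assumptions, since the snippet only defines main():
if __name__ == '__main__':
    main()  # e.g. python fetch_datasets.py 2021-01 2021-02 (hypothetical name)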
def generate_clean_data():
    train_data, test_data = get_raw_data.classify_data(ORIGINAL_FILE_PATH)
    print("The data is being processed right now. Please wait....")
    train_data = pre.preprocess_data(train_data)
    train_data.to_csv("./data/train.csv", sep=",", encoding="utf-8", index=False)
    test_data = pre.preprocess_data(test_data)
    test_data.to_csv("./data/test.csv", sep=",", encoding="utf-8", index=False)
    use_classifier()
def run_mr(use_word_to_vec=True):
    data, labels = data_importer.load_data_and_labels_mr()
    word_to_index_mapping, vectors = w2v.load_mr_word_vectors()
    train_data, train_labels, vocabulary, test_data, test_labels, embedding_vectors = \
        pre.preprocess_data(data, labels, word_to_index_mapping, vectors)
    if not use_word_to_vec:
        embedding_vectors = None  # train without pretrained word2vec embeddings
    t.train(train_data, train_labels, vocabulary, test_data, test_labels, embedding_vectors)
def test_compressing_without_output(self):
    """Test whether compressed data is the same after decompression
    when the `output` parameter is not passed."""
    new_path = os.path.join(os.path.dirname(path), '_temp_' + os.path.basename(path))
    shutil.copyfile(path, new_path)  # copy the file so the original is not modified
    file_path = preprocess_data(new_path)
    self._test_compressing_helper(file_path)
def quick_explore(folder='folder_name'):
    # Imports
    import pandas as pd
    import matplotlib.pyplot as plt
    import os
    from preprocessor import preprocess_data

    directory = '../../data/' + folder + '/'
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        # Read CSV files
        if filename.endswith('.csv'):
            raw_df = pd.read_csv(filepath, infer_datetime_format=True)
            neg_index = 4  # length of '.csv', stripped from the name below
        # Read Excel files
        elif filename.endswith('.xlsx'):
            raw_df = pd.read_excel(filepath, infer_datetime_format=True)
            neg_index = 5  # length of '.xlsx'
        # Skip other file types
        else:
            continue
        print()
        print('--------------' * 8)
        print('--------------' * 8)
        print(filename[:-neg_index].upper())
        print()
        df, redund_dict = preprocess_data(raw_df)
    return
def test_compressing_with_output(self):
    """Test whether compressed data is the same after decompression
    when the `output` parameter is passed."""
    file_path = preprocess_data(path, '_temp', delete_original=False)
    self._test_compressing_helper(file_path)
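# Both tests delegate to _test_compressing_helper, whose body is not shown in
# these snippets. A plausible round-trip check, assuming hypothetical
# compress()/decompress() helpers in the module under test:
def _test_compressing_helper(self, file_path):
    with open(file_path, 'rb') as f:
        original = f.read()
    # the data must survive a compress/decompress round trip unchanged
    self.assertEqual(decompress(compress(original)), original)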
import os

os.chdir(os.path.dirname(os.path.abspath(__file__)))
# os.chdir('local\\directory')

import scraper
import preprocessor
import loader

data = scraper.scrape_data()
data = preprocessor.preprocess_data(data)
loader.gsheets_upload(data)

### saving local copy of data
# data.to_excel('./data/processed_data_extract.xlsx')
import json

with open('./data/Time Dataset.json') as f:
    dataset = json.load(f)
with open('./data/Time Vocabs.json') as f:
    human_vocab, machine_vocab = json.load(f)

human_vocab_size = len(human_vocab)
machine_vocab_size = len(machine_vocab)

# number of training examples
m = len(dataset)

# tokenize the data using the vocabularies
Tx = 41  # max x sequence length
Ty = 5   # y sequence length
X, Y, Xoh, Yoh = preprocess_data(dataset, human_vocab, machine_vocab, Tx, Ty)

# Split the data 80-20 between training and test
train_size = int(0.8 * m)
Xoh_train = Xoh[:train_size]
Yoh_train = Yoh[:train_size]
Xoh_test = Xoh[train_size:]
Yoh_test = Yoh[train_size:]

# Check the code works:
# i = 4
# print('Input data point', str(i), '.\n')
# print('The data input is :', str(dataset[i][0]))
# print('The data output is :', str(dataset[i][1]))
# print()
# print('The tokenized input is :', str(X[i]))
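# A quick shape check on what preprocess_data is assumed to return here
# (integer-tokenized sequences plus their one-hot encodings); the shapes below
# follow the usual tokenize-then-one-hot convention and are an assumption, not
# taken from this snippet:
# X:   (m, Tx)                      padded integer tokens of the inputs
# Y:   (m, Ty)                      integer tokens of the targets
# Xoh: (m, Tx, human_vocab_size)    one-hot inputs
# Yoh: (m, Ty, machine_vocab_size)  one-hot targets
print(Xoh.shape, Yoh.shape)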