def main(args):
    print('Settings:')
    print(str(args)[10:-1])
    transform_size = 1000  # Batch size to be used during transformation.
    num_features = args.features
    print('Loading data')
    train_x, train_y, dev_x, dev_y, dev_pid, test_x, test_y, test_pid = load_data(
        args.data, bytes=args.byte, preprocess=args.preprocessed)
    if args.no_recalc:
        # Fit vectorizer and transform source codes to n-gram bag vectors.
        print('Calculating vectors.')
        vec = train_vectorizer(train_x, args.mode, args.ngram, num_features)
        dump(vec, 'vectorizer.joblib')  # Save the vectorizer to be used in n-gram error analysis.
        batch_transform(vec, train_x, 'train', num_features, transform_size)
        batch_transform(vec, dev_x, 'dev', num_features, transform_size)
        batch_transform(vec, test_x, 'test', num_features, transform_size)
        if args.mode == 'c':
            print("Rescaling count values.")
            rescale(num_features)
    else:
        print('Vector calculation skipped, loading from pre-calculated files.')
    predictions_dev, predictions_test, history = run_model(
        args.batch, num_features, train_y, dev_y, args.skip, args.fullpredict)
    if not args.skip:
        plot_history(history)  # Plot training and validation accuracy and loss per epoch.
    if args.results:
        write_predictions(predictions_dev, dev_pid, 'dev_predictions')
        write_predictions(predictions_test, test_pid, 'test_predictions')
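# A hypothetical argparse wiring for main() above, shown only as a sketch. The
# option names are assumptions inferred from the attributes the function reads
# (args.data, args.byte, args.preprocessed, args.mode, args.ngram, args.features,
# args.batch, args.skip, args.fullpredict, args.results, args.no_recalc); the
# original parser is not part of this excerpt and may use different flags.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='n-gram authorship pipeline (sketch)')
    parser.add_argument('--data', default='../data_dir/', help='Path to the dataset directory.')
    parser.add_argument('--byte', action='store_true', help='Use byte-level input.')
    parser.add_argument('--preprocessed', action='store_true', help='Use preprocessed source codes.')
    parser.add_argument('--mode', default='c', help="Vectorizer mode, e.g. 'c' for counts.")
    parser.add_argument('--ngram', type=int, default=3, help='n-gram size.')
    parser.add_argument('--features', type=int, default=20000, help='Number of features to keep.')
    parser.add_argument('--batch', type=int, default=128, help='Training batch size.')
    parser.add_argument('--skip', action='store_true', help='Skip training and plotting.')
    parser.add_argument('--fullpredict', action='store_true', help='Predict on the full sets.')
    parser.add_argument('--results', action='store_true', help='Write prediction files.')
    parser.add_argument('--no_recalc', action='store_true',
                        help='Recalculate vectors (attribute name taken verbatim from the code above).')
    main(parser.parse_args())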
def main(args):
    print('Settings:')
    print(str(args)[10:-1])
    train_x, train_y, dev_x, dev_y, _, _, _, _ = load_data(args.data, bytes=args.byte, preprocess=args.preprocessed)
    dev_orig = dev_x.copy()  # To run on a shorter dev set, use dev_x.copy()[:2500]
    print("Dataset loaded.")
    ngram_sizes = [int(n) for n in args.ngrams.split()]
    profile_lens = [int(n) for n in args.features.split()]
    print(ngram_sizes)
    print(profile_lens)
    # ngram_sizes = [2, 3]
    # profile_lens = [-1, 200]
    for ngram_size in ngram_sizes:
        author_profiles = {}
        dev_x = dev_orig.copy()
        for i in range(len(train_x)):
            single_profile = generate_profile(train_x[i], ngram_size)
            if train_y[i] in author_profiles:
                author_profiles[train_y[i]] = append_profile(author_profiles[train_y[i]], single_profile)
            else:
                author_profiles[train_y[i]] = single_profile
        author_profiles_backup = author_profiles.copy()
        # for author in author_profiles:
        #     author_profiles[author] = dictionary_to_list(author_profiles[author])
        dev_x = [dictionary_to_list(generate_profile(x, ngram_size)) for x in dev_x]
        #############################
        for profile_len in profile_lens:
            # Reload all the author n-grams and cut them to the requested length.
            author_profiles = author_profiles_backup.copy()
            for author in author_profiles:
                if profile_len >= 0:
                    author_profiles[author] = set(dictionary_to_list(author_profiles[author])[:profile_len])
                    # Each author profile is now a set of the top profile_len features.
                elif profile_len == -1:
                    # print("HYPER")
                    # Work on a copy of the count dictionary so the backup keeps its
                    # singleton counts for the remaining profile lengths.
                    auth_dict = dict(author_profiles[author])  # count dictionary for author
                    for key in list(auth_dict):  # if an n-gram only appears once, remove it
                        if auth_dict[key] == 1:
                            del auth_dict[key]
                    author_profiles[author] = set(dictionary_to_list(auth_dict))
            print("Running {}@{}".format(ngram_size, profile_len))
            count_total = 0
            count_success = 0
            for i in range(len(dev_x)):
                actual = dev_y[i]
                result = compare_to_profiles(dev_x[i], author_profiles)
                count_total += 1
                if actual == result:
                    count_success += 1
            print('Total Guesses: {}'.format(count_total))
            print('Correct Guesses: {}'.format(count_success))
            print('Guess accuracy: {}'.format(count_success / count_total))
            print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
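# generate_profile, append_profile, dictionary_to_list and compare_to_profiles
# are defined elsewhere in the repository. A minimal sketch of their assumed
# behaviour, modelled on SCAP-style author profiles: a profile is a mapping
# from n-grams to counts, dictionary_to_list orders n-grams by frequency, and
# classification picks the author whose profile shares the most n-grams with
# the document. The real implementations may differ in detail.
from collections import Counter


def generate_profile_sketch(text, ngram_size):
    # Count the character n-grams of the given size in one source file.
    return Counter(text[i:i + ngram_size] for i in range(len(text) - ngram_size + 1))


def append_profile_sketch(profile, single_profile):
    # Merge the counts of a new document into an existing author profile.
    profile.update(single_profile)
    return profile


def dictionary_to_list_sketch(profile):
    # n-grams ordered from most to least frequent.
    return [ngram for ngram, _ in sorted(profile.items(), key=lambda kv: kv[1], reverse=True)]


def compare_to_profiles_sketch(doc_ngrams, author_profiles):
    # Return the author whose profile set overlaps most with the document's n-grams.
    doc_set = set(doc_ngrams)
    return max(author_profiles, key=lambda author: len(doc_set & author_profiles[author]))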
def main(args):
    train_x, train_y, dev_x, dev_y, _, test_x, _, _ = load_data('../data_dir')
    features = 60770  # Number of unique words in the training set.
    if args.no_recalc:
        vectorizer = CountVectorizer(binary=True)
        # vectorizer = TfidfVectorizer()
        # Convert train, dev and test to 60770-D vectors.
        train_x = vectorizer.fit_transform(train_x).astype('float32').toarray()  # Convert train set to vectors.
        features = train_x.shape[1]
        t = np.memmap('vectors/train.mm', dtype='float32', mode='w+', shape=(50000, features))
        t[:] = train_x[:]
        del t, train_x
        dev_x = vectorizer.transform(dev_x).astype('float32').toarray()
        d = np.memmap('vectors/dev.mm', dtype='float32', mode='w+', shape=(25000, features))
        d[:] = dev_x[:]
        del d, dev_x
        test_x = vectorizer.transform(test_x).astype('float32').toarray()
        te = np.memmap('vectors/test.mm', dtype='float32', mode='w+', shape=(25000, features))
        te[:] = test_x[:]
        del te, test_x
    t = np.memmap('vectors/train.mm', dtype='float32', mode='r', shape=(50000, features))
    d = np.memmap('vectors/dev.mm', dtype='float32', mode='r', shape=(25000, features))
    te = np.memmap('vectors/test.mm', dtype='float32', mode='r', shape=(25000, features))

    # Set up generators.
    train = Generator(t, train_y, 128)
    dev = Generator(d, dev_y, 128)
    test = GeneratorX(te, 128)

    # Model.
    callback_list = [EarlyStopping(monitor='val_acc', patience=5),
                     ModelCheckpoint(filepath='word_model.h5', monitor='val_acc', save_best_only=True),
                     ReduceLROnPlateau(monitor='val_acc', factor=0.1, patience=3)]
    opt = RMSprop(learning_rate=0.001)
    model = Sequential()
    model.add(Dense(500, activation='relu', input_shape=(features,)))
    model.add(Dropout(0.5))
    model.add(Dense(1000, activation='softmax'))
    model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['acc'])
    model.summary()
    model.fit(train, epochs=1000, validation_data=dev, callbacks=callback_list)
    model.load_weights('word_model.h5')
    model.evaluate(dev)

    # Write dev and test predictions to file.
    predict_vec = np.memmap('vectors/dev_word.mm', dtype='float32', mode='w+', shape=(25000, 1000))
    predict_vec[:] = model.predict(dev)[:]
    del predict_vec
    predict_vec2 = np.memmap('vectors/test_word.mm', dtype='float32', mode='w+', shape=(25000, 1000))
    predict_vec2[:] = model.predict(test)[:]
    del predict_vec2
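# Generator and GeneratorX are defined elsewhere; they are assumed to be
# tensorflow.keras.utils.Sequence subclasses that serve batches straight from
# the memmapped arrays, so the full 50000 x features matrix never has to sit
# in RAM at once. A minimal sketch of that assumption:
import math
import numpy as np
from tensorflow.keras.utils import Sequence


class GeneratorSketch(Sequence):
    # Yields (batch_x, batch_y) pairs for training and evaluation.
    def __init__(self, x, y, batch_size):
        self.x, self.y, self.batch_size = x, np.array(y), batch_size

    def __len__(self):
        return math.ceil(len(self.x) / self.batch_size)

    def __getitem__(self, idx):
        sl = slice(idx * self.batch_size, (idx + 1) * self.batch_size)
        return np.array(self.x[sl]), self.y[sl]


class GeneratorXSketch(Sequence):
    # Same idea, but without labels, for prediction only.
    def __init__(self, x, batch_size):
        self.x, self.batch_size = x, batch_size

    def __len__(self):
        return math.ceil(len(self.x) / self.batch_size)

    def __getitem__(self, idx):
        sl = slice(idx * self.batch_size, (idx + 1) * self.batch_size)
        return np.array(self.x[sl])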
def __data_generation(self, filenames):
    # Load a batch of data.
    X, y = datatools.load_data(
        path_to_dataset=self.path_to_dataset,
        data_list=filenames,
        input_shape=self.dim,
        standardization_mode=self.standardization_mode,
        border=self.border)

    # Debugging
    # if self.val == False:
    #     f = open('traingen.log', 'a+')
    #     f.write('-------------------->New Epoch\n')
    #     for i in range(len(filenames)):
    #         f.write(filenames[i] + '\n')
    #     f.close()
    # else:
    #     f = open('valgen.log', 'a+')
    #     f.write('-------------------->New Epoch\n')
    #     for i in range(len(filenames)):
    #         f.write(filenames[i] + '\n')
    #     f.close()

    # if self.standardization_mode != None:
    #     standardize = True
    #     # print('Datagen performing standardization...')
    # else:
    #     standardize = False
    #     # print('Datagen without standardization...')
    # X, y = datatools.load_data2(path_to_dataset=self.path_to_dataset,
    #                             data_list=filenames, input_shape=self.dim,
    #                             standardize=standardize,
    #                             border=self.border)

    # Scale the data.
    y = y * self.linear_output_scaling_factor

    # Expand the dimension for channels.
    X = X[:, :, :, :, np.newaxis]
    y = y[:, :, :, :, np.newaxis]

    return X, y
def main():
    train_x, _, dev_x, _, _, test_x, _, _ = load_data(r'../data_dir/', bytes=False, preprocess=True)
    del _
    print("Tokenizing.")
    length = 100
    start = time.time()
    processes = [
        Process(target=tokenize, args=(train_x, 'train', length)),
        Process(target=tokenize, args=(dev_x, 'dev', length)),
        Process(target=tokenize, args=(test_x, 'test', length))
    ]
    for p in processes:
        p.start()
    for p in processes:
        p.join()
    print("Finalising writing tokens to file.")
    print(time.time() - start)
import sys

sys.path.append('..')

from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
import numpy as np
from tensorflow.keras.layers import Embedding, Dense, LSTM, Conv1D, MaxPool1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import plot_model

from tools.datatools import load_data

train_x, train_y, dev_x, dev_y, _, test_x, _, _ = load_data('../data_dir')

# Word CNN model.
tokenizer = Tokenizer(num_words=2048)  # Tokenize source codes using the top 2048 words.
tokenizer.fit_on_texts(train_x)
train_y = np.array(train_y)
dev_y = np.array(dev_y)
train_x = tokenizer.texts_to_sequences(train_x)
dev_x = tokenizer.texts_to_sequences(dev_x)
train_x = pad_sequences(train_x, maxlen=512)  # Pad sequences to a uniform length.
dev_x = pad_sequences(dev_x, maxlen=512)
callback_list = [
    EarlyStopping(monitor='val_acc', patience=5),
    ModelCheckpoint(filepath='word_model2.h5', monitor='val_acc', save_best_only=True),
def main(args):
    start = time.time()
    print('Settings:')
    print(str(args)[10:-1])
    ngram_size = args.ngram
    profile_len = args.features
    author_profiles = {}  # author n-gram profiles 0-999
    # _, train_x, train_y = prep_inputs('train', bytes=True)
    # _, dev_x, dev_y = prep_inputs('dev', bytes=True)
    train_x, train_y, dev_x, dev_y, _, _, _, _ = load_data(
        '../data_dir/', bytes=args.byte, preprocess=args.preprocessed)
    print("Dataset loaded.")
    for i in range(len(train_x)):
        single_profile = generate_profile(train_x[i], ngram_size)  # Create a profile for each code file.
        if train_y[i] in author_profiles:
            # Append to the existing author profile, or create a new one if it doesn't already exist.
            author_profiles[train_y[i]] = append_profile(author_profiles[train_y[i]], single_profile)
        else:
            author_profiles[train_y[i]] = single_profile
    for author in author_profiles:
        if profile_len >= 0:
            author_profiles[author] = set(dictionary_to_list(author_profiles[author])[:profile_len])
            # Each author profile is now a set of the top profile_len features.
        elif profile_len == -1:
            print("HYPER")
            auth_dict = author_profiles[author]  # count dictionary for author
            keys = list(auth_dict)  # list of n-grams
            for key in keys:  # if a key only appears once, remove it
                if auth_dict[key] == 1:
                    del auth_dict[key]
            author_profiles[author] = set(dictionary_to_list(author_profiles[author]))
    print("Author profiles ready.")
    count_total = 0
    count_success = 0
    dev_x = [dictionary_to_list(generate_profile(x, ngram_size)) for x in dev_x]
    print('Dev ready for comparisons.')
    start_time = time.time()
    for i in range(len(dev_x)):
        actual = dev_y[i]
        result = compare_to_profiles(dev_x[i], author_profiles)
        count_total += 1
        if actual == result:
            count_success += 1
        if ((i + 1) % 250) == 0:
            percent = int((i + 1) / 250)
            print("Progress: {}%".format(percent))
            print("Accuracy so far: {}".format(count_success / count_total))
            time_secs = ((time.time() - start_time) / percent) * (100 - percent)
            time_mins = int(time_secs // 60)
            time_secs = str(int(time_secs % 60)).zfill(2)
            print("Time remaining: {}:{}".format(time_mins, time_secs))
    print('Total Guesses: {}'.format(count_total))
    print('Correct Guesses: {}'.format(count_success))
    print('Guess accuracy: {}'.format(count_success / count_total))
    print('n-grams: {}'.format(ngram_size))
    print('Profile length: {}'.format(profile_len))
    print(time.time() - start)
linear_output_scaling_factor = 409600000000
path_to_dataset = os.path.join('..', '..', '..', 'Daten', 'dataset_size32_stride16_split')
data_list = datatools.get_balanced_dataset(path_to_dataset=path_to_dataset, clip=5000)

# Shuffle the dataset.
np.random.shuffle(data_list)
train_list = data_list[0:10000]
val_list = data_list[10000:15000]
test_list = data_list[15000:200000]

X_train, y_train = datatools.load_data(path_to_dataset=path_to_dataset,
                                       data_list=train_list,
                                       input_shape=(32, 32, 32),
                                       standardization_mode='per_sample',
                                       border=None)
X_val, y_val = datatools.load_data(path_to_dataset=path_to_dataset,
                                   data_list=val_list,
                                   input_shape=(32, 32, 32),
                                   standardization_mode='per_sample',
                                   border=None)
X_test, y_test = datatools.load_data(path_to_dataset=path_to_dataset,
                                     data_list=test_list,
                                     input_shape=(32, 32, 32),
                                     standardization_mode=None,
                                     border=None)

# Expand the dimensions for channels.
X_train = X_train[:, :, :, :, np.newaxis]
def main(args):
    start = time.time()
    print('Settings:')
    print(str(args)[10:-1])
    ngram_size = args.ngram
    profile_len = args.features
    author_profiles = {}  # author n-gram profiles 0-999
    # _, train_x, train_y = prep_inputs('train', bytes=True)
    # _, dev_x, dev_y = prep_inputs('dev', bytes=True)
    train_x, train_y, dev_x, dev_y, _, _, _, _ = load_data(
        '../data_dir/', bytes=args.byte, preprocess=args.preprocessed)
    print("Dataset loaded.")
    for i in range(len(train_x)):
        single_profile = generate_profile(train_x[i], ngram_size)  # Create a profile for each code file.
        if train_y[i] in author_profiles:
            # Append to the existing author profile, or create a new one if it doesn't already exist.
            author_profiles[train_y[i]] = append_profile(author_profiles[train_y[i]], single_profile)
        else:
            author_profiles[train_y[i]] = single_profile
    # for author in author_profiles:
    #     author_profiles[author] = set(dictionary_to_list(author_profiles[author])[:profile_len])
    for author in author_profiles:
        if profile_len >= 0:
            author_profiles[author] = set(dictionary_to_list(author_profiles[author])[:profile_len])
            # Each author profile is now a set of the top profile_len features.
        elif profile_len == -1:
            auth_dict = author_profiles[author]  # count dictionary for author
            keys = list(auth_dict)  # list of n-grams
            for key in keys:  # if a key only appears once, remove it
                if auth_dict[key] == 1:
                    del auth_dict[key]
            author_profiles[author] = set(dictionary_to_list(author_profiles[author]))
    # lowest = 999999999
    # highest = 0
    # cx = 0
    # for a in author_profiles:
    #     cx += len(author_profiles[a])
    #     lowest = min(lowest, len(author_profiles[a]))
    #     highest = max(highest, len(author_profiles[a]))
    # print(cx)
    # print(lowest)
    # print(highest)
    # exit()
    print("Author profiles ready.")
    dev_x = [dictionary_to_list(generate_profile(x, ngram_size)) for x in dev_x]
    print('Dev ready for comparisons.')
    start_time = time.time()
    size = int(25000 / cpu_count())
    smaller_chunks = [dev_x[x:x + size] for x in range(0, len(dev_x), size)]
    labels = [dev_y[x:x + size] for x in range(0, len(dev_y), size)]
    with ProcessPoolExecutor() as executor:
        results = [
            executor.submit(calculate_profiles, smaller_chunks[i], labels[i], author_profiles, i)
            for i in range(len(smaller_chunks))
        ]
        total = 0
        success = 0
        for r in as_completed(results):
            r = r.result()
            success += r[0]
            total += r[1]
    print(success)
    print(total)
    print(success / total)
    print(time.time() - start_time)
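# calculate_profiles is the worker each process runs; it is defined elsewhere
# in the repository. Based on the serial loop in the single-process script
# above, it is assumed to classify its chunk against the author profiles and
# return the number of correct guesses and the chunk size, roughly like this
# sketch (the trailing worker_id argument is optional, matching both call
# sites seen in these scripts):
def calculate_profiles_sketch(chunk, labels, author_profiles, worker_id=0):
    success = 0
    for doc_ngrams, actual in zip(chunk, labels):
        if compare_to_profiles(doc_ngrams, author_profiles) == actual:
            success += 1
    return success, len(chunk)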
                                          standardization_mode=standardization_mode,
                                          linear_output_scaling_factor=linear_output_scaling_factor,
                                          border=border)
history = cnn.fit_generator(epochs=epochs,
                            train_generator=train_generator,
                            val_generator=val_generator,
                            callbacks=callbacks)

#%%############################################################################
# Evaluate the model
###############################################################################

# Load unstandardized test data.
X_test_data, y_test_data = datatools.load_data(path_to_dataset=path_to_dataset,
                                               data_list=test_list,
                                               input_shape=data_shape,
                                               standardization_mode=None,
                                               border=border)

if evaluate == True:
    test_loss = cnn.evaluate_model(X_test=np.expand_dims(X_test_data, axis=4),
                                   y_test=np.expand_dims(y_test_data, axis=4),
                                   batch_size=batch_size)
    print(test_loss)

#%%############################################################################
# Save the model
###############################################################################
cnn.save_model_json(model_export_path, 'model_json')
cnn.save_model_weights(model_export_path, 'model_weights')
cnn.save_model_single_file(model_export_path, 'model_single')
def main(args):
    print('Settings:')
    print(str(args)[10:-1])
    train_x, train_y, dev_x, dev_y, _, _, _, _ = load_data(
        args.data, bytes=args.byte, preprocess=args.preprocessed)
    dev_orig = dev_x.copy()  # To run on a shorter dev set, use dev_x.copy()[:2500]
    print("Dataset loaded.")
    ngram_sizes = [int(n) for n in args.ngrams.split()]
    profile_lens = [int(n) for n in args.features.split()]
    print(ngram_sizes)
    print(profile_lens)
    for ngram_size in ngram_sizes:
        author_profiles = {}
        dev_x = dev_orig.copy()
        for i in range(len(train_x)):
            single_profile = generate_profile(train_x[i], ngram_size)
            if train_y[i] in author_profiles:
                author_profiles[train_y[i]] = append_profile(author_profiles[train_y[i]], single_profile)
            else:
                author_profiles[train_y[i]] = single_profile
        author_profiles_backup = author_profiles.copy()
        dev_x = [dictionary_to_list(generate_profile(x, ngram_size)) for x in dev_x]
        #############################
        for profile_len in profile_lens:
            # Reload all the author n-grams and cut them to the requested length.
            author_profiles = author_profiles_backup.copy()
            for author in author_profiles:
                if profile_len >= 0:
                    author_profiles[author] = set(dictionary_to_list(author_profiles[author])[:profile_len])
                    # Each author profile is now a set of the top profile_len features.
                elif profile_len == -1:
                    # print("HYPER")
                    # Work on a copy of the count dictionary so the backup keeps its
                    # singleton counts for the remaining profile lengths.
                    auth_dict = dict(author_profiles[author])  # count dictionary for author
                    for key in list(auth_dict):  # if an n-gram only appears once, remove it
                        if auth_dict[key] == 1:
                            del auth_dict[key]
                    author_profiles[author] = set(dictionary_to_list(auth_dict))
            print("Running {}@{}".format(ngram_size, profile_len))
            start_time = time.time()
            processes = cpu_count()
            size = int(25000 / processes)
            smaller_chunks = [dev_x[x:x + size] for x in range(0, len(dev_x), size)]
            labels = [dev_y[x:x + size] for x in range(0, len(dev_y), size)]
            with ProcessPoolExecutor() as executor:
                results = [
                    executor.submit(calculate_profiles, smaller_chunks[i], labels[i], author_profiles)
                    for i in range(len(smaller_chunks))
                ]
                total = 0
                success = 0
                for r in as_completed(results):
                    r = r.result()
                    success += r[0]
                    total += r[1]
                executor.shutdown(wait=True)
            # print(success)
            # print(total)
            # print(success / total)
            print(time.time() - start_time)
            print('Total Guesses: {}'.format(total))
            print('Correct Guesses: {}'.format(success))
            print('Guess accuracy: {}'.format(success / total))
            print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
def main(args):
    print('Settings:')
    print(str(args)[10:-1])
    length = 136
    print('Loading data...')
    if args.no_recalc:
        # Makes use of both raw and preprocessed source codes.
        train_x, _, dev_x, _, _, test_x, _, _ = load_data('../data_dir')
        train_x2, _, dev_x2, _, _, test_x2, _, _ = load_data('../data_dir', preprocess=True)
        print("Extracting stylometric features...")
        # Runs the stylometry vectorizer from vectorizer.py so characters can be grabbed simultaneously.
        vec = Vectorizer('lexical')
        train_x = vec.vectorize(train_x, train_x2)  # Vectorize all three subsets.
        dev_x = vec.vectorize(dev_x, dev_x2)
        test_x = vec.vectorize(test_x, test_x2)
        del train_x2, dev_x2, test_x2
        scaler = MinMaxScaler()  # Rescale values between 0 and 1.
        print("Rescaling...")
        train_x = scaler.fit_transform(train_x)
        dev_x = scaler.transform(dev_x)
        test_x = scaler.transform(test_x)
        length = len(train_x[0])
        print(length)
        trainmm = np.memmap('vectors/train.mm', dtype='float32', mode='w+', shape=(50000, length))
        trainmm[:] = train_x[:]
        devmm = np.memmap('vectors/dev.mm', dtype='float32', mode='w+', shape=(25000, length))
        devmm[:] = dev_x[:]
        testmm = np.memmap('vectors/test.mm', dtype='float32', mode='w+', shape=(25000, length))
        testmm[:] = test_x[:]
        del trainmm, devmm, testmm, train_x, dev_x, test_x  # Save and flush all vectors.
        print("Finished building vectors.")

    # Load data from file.
    train_y, dev_y, _ = load_all_labels('../data_dir')
    dev = np.array(np.memmap('vectors/dev.mm', dtype='float32', mode='r', shape=(25000, length)))
    test = np.array(np.memmap('vectors/test.mm', dtype='float32', mode='r', shape=(25000, length)))
    train = np.array(np.memmap('vectors/train.mm', dtype='float32', mode='r', shape=(50000, length)))

    # Model.
    callback_list = [
        EarlyStopping(monitor='val_acc', patience=10),
        ModelCheckpoint(filepath='style_model.h5', monitor='val_acc', save_best_only=True),
        ReduceLROnPlateau(monitor='val_acc', factor=0.1, patience=5)
    ]
    model = Sequential()
    model.add(Dense(500, activation='relu', input_shape=(length,)))  # length defaults to 136 stylometric features.
    model.add(Dropout(0.3))
    model.add(Dense(500, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(1000, activation='softmax'))
    opt = RMSprop(learning_rate=0.001)
    model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['acc'])
    model.summary()
    model.fit(train, train_y, epochs=1000, batch_size=250, validation_data=(dev, dev_y),
              shuffle=True, callbacks=callback_list)
    model = load_model('style_model.h5')
    print(model.evaluate(dev, dev_y))

    # Generate predictions.
    predict_vec = np.memmap('vectors/dev_style.mm', dtype='float32', mode='w+', shape=(25000, 1000))
    predict_vec[:] = model.predict(dev)[:]
    del predict_vec
    predict_vec2 = np.memmap('vectors/test_style.mm', dtype='float32', mode='w+', shape=(25000, 1000))
    predict_vec2[:] = model.predict(test)[:]
    del predict_vec2