def valid(mdl: FeedforwardNetwork, data_iter: utils.BatchIterator, LANG, args):
    with torch.no_grad():
        preds = []
        golds = []
        for i, batch in enumerate(tqdm.tqdm(data_iter, total=len(data_iter))):
            pred = run_iter(mdl, batch, None, False, None)
            preds.extend(LANG.decode(pred))
            golds.extend(LANG.decode(batch['lang']))
        acc = accuracy_score(golds, preds) * 100
        f1 = f1_score(golds, preds, average='macro') * 100
        precision = precision_score(golds, preds, average='macro') * 100
        recall = recall_score(golds, preds, average='macro') * 100
        res = {
            'acc': round(acc, 2),
            'f1': round(f1, 2),
            'precision': round(precision, 2),
            'recall': round(recall, 2)
        }
        report = classification_report(golds, preds, digits=4)
        utils.save_txt(
            report,
            os.path.join(
                args.mdir, f'report-acc{acc:.2f}-'
                f'f1{f1:.2f}-'
                f'p{precision:.2f}-'
                f'r{recall:.2f}.txt'))
        return res

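# A minimal, self-contained sketch (not part of the original project) of the
# metric computation valid() performs: scikit-learn's accuracy and
# macro-averaged scores on toy label lists. The toy labels below are
# illustrative assumptions, not real predictions.
from sklearn.metrics import (accuracy_score, classification_report, f1_score,
                             precision_score, recall_score)

golds = ['en', 'it', 'it', 'de']
preds = ['en', 'it', 'de', 'de']
print(round(accuracy_score(golds, preds) * 100, 2))                  # 75.0
print(round(f1_score(golds, preds, average='macro') * 100, 2))       # macro F1
print(round(precision_score(golds, preds, average='macro') * 100, 2))
print(round(recall_score(golds, preds, average='macro') * 100, 2))
print(classification_report(golds, preds, digits=4))
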
def create_training_labels(type_,
                           path=os.path.join(path_train, "semcor+omsti.json")):
    ###########################################################################
    # This function creates a txt file with sentences carrying a specific type
    # of label and a vocabulary with all the seen labels.
    #
    # Input:
    #     type_: label index used to choose the type of label
    #     path: path of the json file
    #
    # Output:
    #     None
    ###########################################################################
    # create a list with sentences of the considered labels for the whole training set
    dictionary = create_labels_words()
    data = utils.load_json(path)
    data = list(
        map(partial(sentence_from_dictionaries, training_sentence=False),
            data))

    sentences = []
    labels = set()
    for sentence in data:
        single_sentence = []
        for word in sentence.split():
            # insert the current word
            if type(dictionary.get(word, word)) != list:
                single_sentence.append(word)
            # insert the corresponding label for the current word
            else:
                single_sentence.append(str(dictionary.get(word, word)[type_]))
                labels.add(str(dictionary.get(word, word)[type_]))
        sentences.append(single_sentence)

    # create the vocabulary of seen labels, adding the ids for the padding,
    # the unseen labels and the unlabelled words
    vocabulary = {
        value: key
        for key, value in dict(enumerate(labels, 3)).items()
    }
    vocabulary["<PAD>"] = "0"
    vocabulary["<UNSEEN>"] = "1"
    vocabulary["<WORD>"] = "2"

    # exchange strings with ids
    sentences = list(
        map(
            lambda sentence: ' '.join(
                str(vocabulary.get(word, word)) for word in sentence),
            sentences))

    utils.save_txt(sentences, path_train + name_training_file[type_][0])
    utils.save_pickle(vocabulary,
                      "../resources/" + name_training_file[type_][1])

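# Minimal sketch (illustrative, not from the original source) of the vocabulary
# idiom used in create_training_labels(): label ids start at 3 so that 0/1/2
# stay reserved for <PAD>/<UNSEEN>/<WORD>. The label set is made up, and it is
# sorted here only to make the example deterministic (the original iterates a
# plain set).
labels = {"sport", "music", "factotum"}
vocabulary = {
    value: key
    for key, value in dict(enumerate(sorted(labels), 3)).items()
}
vocabulary["<PAD>"] = "0"
vocabulary["<UNSEEN>"] = "1"
vocabulary["<WORD>"] = "2"
print(vocabulary)  # {'factotum': 3, 'music': 4, 'sport': 5, '<PAD>': '0', ...}
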
def predict_wordnet_domains(input_path: str, output_path: str,
                            resources_path: str) -> None:
    """
    DO NOT MODIFY THE SIGNATURE!
    This is the skeleton of the prediction function.
    The predict function will build your model, load the weights from the checkpoint
    and write a new file (output_path) with your predictions in the
    "<id> <wordnetDomain>" format (e.g. "d000.s000.t000 sport").

    The resources folder should contain everything you need to make the predictions.
    It is the "resources" folder in your submission.

    N.B. DO NOT HARD CODE PATHS IN HERE. Use resource_path instead, otherwise we
    will not be able to run the code.
    If you don't know what HARD CODING means see: https://en.wikipedia.org/wiki/Hard_coding

    :param input_path: the path of the input file to predict in the same format as
        Raganato's framework (XML files you downloaded).
    :param output_path: the path of the output file (where you save your predictions)
    :param resources_path: the path of the resources folder containing your model
        and stuff you might need.
    :return: None
    """
    path_json = os.path.join(resources_path, "Test", 'test.json')
    path_model = os.path.join(resources_path, 'model', 'model_coarsegrained')

    # parse the xml file
    parsing.parsing_datasets(input_path, path_json)

    # create the candidates and the input file for the model
    input_, candidates, ids_candidates = create_batched_inputs(path_json)
    ids = take_id_istances(resources_path)

    final_prediction = []
    with tf.Session() as sess:
        # load the model
        output_wndomain, _, inputs_, input_int, keep_prob = load_model_coarsegrained(
            sess, path_model)

        for batch_num in range(len(input_[0])):
            # do the prediction
            preds_wndomain = sess.run(output_wndomain,
                                      feed_dict={
                                          inputs_: input_[0][batch_num],
                                          input_int: input_[1][batch_num],
                                          keep_prob: 1.
                                      })

            final_prediction += calculate_prediction(input_[1][batch_num],
                                                     preds_wndomain,
                                                     candidates[1][batch_num],
                                                     ids_candidates[1],
                                                     vocab_wndomain)

    combined = list(zip(ids, final_prediction))
    # save the predictions
    utils.save_txt(
        list(map(lambda word: word[0] + " " + word[1], combined)), output_path)

    tf.reset_default_graph()

def all_operations(in_dir, number_of_images, out_dir):
    def shoggoth(img_num):
        image = get_image(img_num, in_dir)

        # Detect the corners of the sheet, correct the perspective and crop the sheet.
        image_perspective_operations, sPoints, tPoints = perspective_operations(
            image)

        # Remove the grid squares from the sheet so that only the words remain visible
        image_remove_lines_operations = removeLines(
            image_perspective_operations)
        image_checkered_operations = checkered_operations(
            image_remove_lines_operations)

        # Detect words
        image_detect_words = detect_words_operations(
            image_checkered_operations)

        # Detect index numbers
        image_crop_numbers, cropped_numbers = crop_numbers(
            image_detect_words, image_perspective_operations)

        # Split into digits
        image_crop_digits = crop_digits(cropped_numbers, image)

        cropped_rectangles = crop_small_rectangles(image_detect_words)
        mask = paint_lines(cropped_rectangles)
        mask = prepare_mask(mask)
        new_img = move_to_original_coords(mask, sPoints, tPoints,
                                          image.shape[:2])
        save_image(img_num, new_img, "png", out_dir)
        return image_crop_digits

    image_range = range(1, number_of_images + 1)
    image_crop_digits = []
    for image_num in tqdm(image_range):
        image_crop_digits.append(shoggoth(image_num))

    index_array = []
    model = tf.keras.models.load_model(pretict_model_path)
    for page_number, page in tqdm(enumerate(image_crop_digits)):
        index_array.append([])
        for index_number, image_number in enumerate(page):
            index_array[page_number].append('')
            for index_digit, image_digit in enumerate(image_number):
                predicted_digit = predict_image(image_digit, model)
                index_array[page_number][index_number] += str(predicted_digit)
        index_array[page_number].reverse()

    for image_num, page in enumerate(index_array):
        save_txt((image_num + 1), page, out_dir)
        print('page: {}'.format(page))

def main():
    if EXPERIMENT_NAME not in os.listdir():
        os.mkdir(EXPERIMENT_NAME)
    for feature in FEATURES:
        try:
            data = np.load(feature + "_stats.npy", allow_pickle=True).item()
            pca = joblib.load("pca_" + feature + "_stats")
            train_data, train_data_classes = make_train_data(data, True)
            test_data, test_data_classes = make_test_data(data)
            train_data_pca = np.array(pca.transform(train_data))
            test_data_pca = np.array(pca.transform(test_data))
            for c in C_VALUES:
                if (feature, c) in SKIP_COMBINATIONS:
                    print("Skipping " + feature + " SVM-linear C = " + str(c))
                    continue
                print("Computing " + feature + " SVM-linear C = " + str(c))
                res, model = experiment_svm_linear(train_data_pca,
                                                   train_data_classes,
                                                   test_data_pca,
                                                   test_data_classes, c,
                                                   TRAIN_VERBOSE)
                if res is not None:
                    if SAVE_RESULTS:
                        filename = (EXPERIMENT_NAME + "_" + feature +
                                    " svm_lin_c_" + str(c) + "_results")
                        path = os.path.join(EXPERIMENT_NAME, filename)
                        joblib.dump(res, path)
                    if SAVE_RESULTS_TXT:
                        filename = (EXPERIMENT_NAME + "_" + feature +
                                    " svm_lin_c_" + str(c) + "_results.txt")
                        path = os.path.join(EXPERIMENT_NAME, filename)
                        save_txt(res, path)
                    if SAVE_MODEL:
                        filename = (EXPERIMENT_NAME + "_" + feature +
                                    " svm_lin_c_" + str(c) + "_model")
                        path = os.path.join(EXPERIMENT_NAME, filename)
                        joblib.dump(model, path)
        except Exception as e:
            print("Error during " + EXPERIMENT_NAME + " " + feature +
                  " SVM-linear C = " + str(c))
            print(e)

def _recons(self, sess, result_file, recons_file):
    x = self.test_x[0:self.batch_size]
    y = self.test_y[0:self.batch_size]
    feed_dict = {
        self.capsNet.x: x,
        self.capsNet.labels: y
    } if self.capsNet.recon_with_y else {
        self.capsNet.x: x
    }
    masked_v, decoded = sess.run(
        [self.capsNet.recons_input, self.capsNet.decoded],
        feed_dict=feed_dict)
    save_txt(masked_v, recons_file_name=recons_file)
    save_images(decoded, result_file_name=result_file, height_number=8)

def create_training_sentence(path=os.path.join(path_train,
                                               "semcor+omsti.json")):
    ###########################################################################
    # This function creates a txt file with all the training sentences as
    # strings, a txt file with all the training sentences as integer ids and
    # two vocabularies, one with all the seen lemmas and the other with all
    # the seen words in the training set.
    #
    # Input:
    #     path: path of the json file
    #
    # Output:
    #     None
    ###########################################################################
    data = utils.load_json(path)

    # create the dictionaries of lemmas and words
    dictionary_train = create_training_dictionary(data)
    dictionary_lemmas = create_dictionary_lemmas(data)

    data = [
        sentence for sentence in list(
            map(partial(sentence_from_dictionaries, training_sentence=True),
                data))
    ]
    print(max([len(sentence) for sentence in data]))

    # exchange the words in the training set with their ids, using 1 for OOV words
    data_w_ids = list(
        map(
            lambda sentence: " ".join(
                [dictionary_train.get(word, "1") for word in sentence]),
            data))

    utils.save_txt([" ".join(sentence) for sentence in data],
                   path_train + "semcor_omsti_string.txt")
    utils.save_txt(data_w_ids, path_train + "semcor_omsti.txt")
    utils.save_pickle(dictionary_train, "../resources/vocabulary.pickle")
    utils.save_pickle(dictionary_lemmas,
                      "../resources/vocabulary_lemmas.pickle")

def create_training_POS_labels(path=os.path.join(path_train,
                                                 "semcor+omsti.json"),
                               type_=3):
    ###########################################################################
    # This function creates a txt file with sentences of POS labels
    # and a vocabulary with all the seen POS labels.
    #
    # Input:
    #     path: path of the json file
    #     type_: label index used to select the type of the output
    #
    # Output:
    #     None
    ###########################################################################
    # create a list with sentences of POS labels for the whole training set
    data = utils.load_json(path)
    data = [sentence.split() for sentence in list(map(label_POS, data))]

    # create the vocabulary of seen POS labels, adding the ids for the padding
    # and the unseen labels
    dictionary_POS = {
        value: str(key)
        for key, value in dict(
            enumerate(
                set(itertools.chain.from_iterable(data)) - {"<UNSEEN>"},
                2)).items()
    }
    dictionary_POS["<PAD>"] = "0"
    dictionary_POS["<UNSEEN>"] = "1"

    # exchange strings with their ids
    data = list(
        map(
            lambda sentence: " ".join(
                [dictionary_POS.get(word, word) for word in sentence]),
            data))

    utils.save_txt(data, path_train + name_training_file[type_][0])
    utils.save_pickle(dictionary_POS,
                      "../resources/" + name_training_file[type_][1])

def create_labels_words(path_input, save_gt=False):
    ###########################################################################
    # This function, given the gold file, creates a dictionary of labels
    # (babelnet id, Wndomain, Lexname) for each ambiguous word and, if save_gt
    # is True, saves the ground truth.
    #
    # Input:
    #     path_input: path of the gold file
    #     save_gt: if True, the ground truth is saved, otherwise not
    #
    # Output:
    #     dict_sensekey: dictionary of labels
    ###########################################################################
    sense_keys = [sensekey.split() for sensekey in utils.load_txt(path_input)]

    dict_sensekey = {}
    for list_info in sense_keys:
        # take the synset from the sense key
        synset = wn.lemma_from_key(list_info[1]).synset()
        # take the wordnet id from the sense key
        wn_id = "wn:" + str(synset.offset()).zfill(8) + synset.pos()
        bn_id = Wordnet_match[wn_id]

        try:
            dict_sensekey[list_info[0]] = [
                bn_id, Wndomain_match[bn_id], lexname_match[bn_id]
            ]
        # assign the factotum label to all the words which don't have a wndomain label
        except KeyError:
            dict_sensekey[list_info[0]] = [
                bn_id, "factotum", lexname_match[bn_id]
            ]

    # save the ground truth
    if save_gt:
        name_words = list(dict_sensekey.keys())
        gt_words = list(dict_sensekey.values())
        combined = list(zip(name_words, gt_words))

        utils.save_txt(
            list(map(lambda word: word[0] + " " + word[1][0], combined)),
            "../resources/Test/fine_grained_gt.txt")
        utils.save_txt(
            list(map(lambda word: word[0] + " " + word[1][1], combined)),
            "../resources/Test/wndomain_gt.txt")
        utils.save_txt(
            list(map(lambda word: word[0] + " " + word[1][2], combined)),
            "../resources/Test/lexname_gt.txt")
        return None

    return dict_sensekey

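# Self-contained sketch of the sense-key -> WordNet-id step used in
# create_labels_words(). It needs the NLTK WordNet corpus; the sense key below
# is a standard WordNet key chosen for illustration, and the printed offset
# depends on the installed WordNet version.
from nltk.corpus import wordnet as wn

sense_key = 'dog%1:05:00::'
synset = wn.lemma_from_key(sense_key).synset()
wn_id = "wn:" + str(synset.offset()).zfill(8) + synset.pos()
print(wn_id)  # 'wn:02084071n' with WordNet 3.0
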
def main():
    ADS_PATH = sys.argv[1]
    TV_PATH = sys.argv[2]

    if config.TIME_FLAG:
        start_time = time.time()

    # GET DESCRIPTORS
    if not config.READ_DESCRIPTORS:
        # get the paths of all the commercials
        ADS = utils.getMPEGfiles(ADS_PATH)

        # Initialize the EHD class to compute the descriptors
        # for the commercials and the TV video
        EHD = utils.EHD_DESCRIPTOR()
        ADS_DESCRIPTORS, N_FRAMES_ADS = EHD.getDescriptors(
            ADS, title='ADS_DESCRIPTORS.dat')
        # ADS_DESCRIPTORS = VIDEOS x FRAMES x DESCRIPTOR
        TV_DESCRIPTORS, N_FRAMES_TV = EHD.getDescriptors(
            [TV_PATH], title='TV_DESCRIPTORS.dat')

        # get the video number, basename and duration
        ADS_INFO = utils.ads_information(ADS, N_FRAMES_ADS)

        # Save the descriptors
        if config.SAVE_DESCRIPTORS:
            np.save('ADS_DESCRIPTORS.npy', ADS_DESCRIPTORS)
            np.save('TV_DESCRIPTORS.npy', TV_DESCRIPTORS)
            np.save('N_FRAMES_ADS.npy', N_FRAMES_ADS)
            np.save('N_FRAMES_TV', N_FRAMES_TV)
            print("SAVED SUCCEED")
    else:
        # Read the descriptors
        ADS = utils.getMPEGfiles(ADS_PATH)
        ADS_DESCRIPTORS = np.load('ADS_DESCRIPTORS.npy')
        TV_DESCRIPTORS = np.load('TV_DESCRIPTORS.npy')
        N_FRAMES_ADS = np.load('N_FRAMES_ADS.npy')
        ADS_INFO = utils.ads_information(ADS, N_FRAMES_ADS)

    # GET NEIGHBORS
    # Get the k (3) nearest neighbors for each frame of the TV video and save the results
    if not config.READ_DESCRIPTORS:
        k_nearest_neighbors = []
        GET_NEIGHBORS = utils.K_NEIGHBORS()
        for tv_video in TV_DESCRIPTORS:
            for frame_descriptor in tv_video:
                k_nearest_neighbors.append(
                    GET_NEIGHBORS.get_k_nearest_neighbors(
                        frame_descriptor, ADS_DESCRIPTORS))
        k_nearest_neighbors = np.array(k_nearest_neighbors)
        if config.SAVE_DESCRIPTORS:
            np.save('vecinos.npy', k_nearest_neighbors)
    else:
        # Read the results
        k_nearest_neighbors = np.load('vecinos.npy')

    # Split the results into separate lists for easier manipulation
    tiempo_array = []
    videos_array = []
    frames_array = []
    for i, element in enumerate(k_nearest_neighbors):
        # print(i, element, ADS[element[0][0]])
        tiempo_array.append(i)
        videos_array.append(element[0][0])
        frames_array.append(element[0][1])

    tiempo_array = np.array(tiempo_array)
    frames_array = np.array(frames_array)
    videos_array = np.array(videos_array)
    # utils.pendiente(tiempo_array, frames_array)

    # Generate the detections and save the results
    DETECCIONES = utils.algoritmo_deteccion(tiempo_array, frames_array,
                                            videos_array, ADS_INFO)
    utils.save_txt(DETECCIONES, ADS_INFO, TV_PATH)

    if config.DRAW:
        for i in range(21):
            t = tiempo_array[videos_array == i]
            f = frames_array[videos_array == i]
            if f.shape[0] > 0:
                fig = plt.figure()
                ax = fig.add_subplot(111)
                ax.scatter(t, f, c='r', marker='o')
                ax.set_xlabel('TIEMPO')
                ax.set_ylabel('FRAMES')
                ax.set_title('COMERCIAL N: {}'.format(i + 1))
                plt.show()

    if config.TIME_FLAG:
        print("Total time: {}".format((time.time() - start_time) / 60))

def main():
    for feature, c in COMBINATIONS:
        try:
            print("Computing " + feature + " SVM-linear C = " + str(c) +
                  " cross validation")
            data = np.load(feature + "_stats.npy", allow_pickle=True).item()
            meditation = []
            music_video = []
            logic_game = []
            for key in data.keys():
                for item in data[key]['meditation']:
                    meditation.append(item)
                for item in data[key]['logic_game']:
                    logic_game.append(item)
                for item in data[key]['music_video']:
                    music_video.append(item)
            if len(meditation) != len(music_video) or len(music_video) != len(
                    logic_game):
                raise Exception("Classes have unequal number of samples")
            len1 = len(meditation)
            x = meditation + logic_game + music_video
            y = (len1 * ['meditation']) + (len1 * ['logic_game']) + (
                len1 * ['music_video'])
            if len(y) != len(x):
                raise Exception(
                    "Unequal number of input samples and output samples")
            kfold = KFold(n_splits=10, shuffle=True)
            splits = list(kfold.split(x, y))
            if EXPERIMENT_NAME not in os.listdir():
                os.mkdir(EXPERIMENT_NAME)
            if SAVE_SPLITS:
                np.save(
                    os.path.join(
                        EXPERIMENT_NAME,
                        EXPERIMENT_NAME + "_" + feature + "_splits.npy"),
                    splits)
            predicted_list = np.ndarray(0)
            test_data_classes_list = np.ndarray(0)
            for i in range(len(splits)):
                print("Computing " + numeral_str(i + 1) + " fold")
                train_data, train_data_classes, test_data, test_data_classes = make_crossvalidation_data(
                    splits[i], x, y)
                pca = PCA(n_components=0.95)
                pca.fit(train_data)
                pca_list.append(pca)
                train_data_pca = np.array(pca.transform(train_data))
                test_data_pca = np.array(pca.transform(test_data))
                res, model = experiment_svm_linear(train_data_pca,
                                                   train_data_classes,
                                                   test_data_pca,
                                                   test_data_classes, c,
                                                   TRAIN_VERBOSE)
                results_list.append(res)
                models_list.append(model)
                test_data_classes_list = np.append(test_data_classes_list,
                                                   test_data_classes)
                predicted_list = np.append(predicted_list, res['results'])
                if res is not None:
                    if SAVE_RESULTS:
                        filename = (EXPERIMENT_NAME + "_" + feature +
                                    " svm_lin_c_" +
                                    dots_to_underscores(str(c)) +
                                    "_results_" + str(i + 1))
                        path = os.path.join(EXPERIMENT_NAME, filename)
                        joblib.dump(res, path)
                    if SAVE_RESULTS_TXT:
                        filename = (EXPERIMENT_NAME + "_" + feature +
                                    " svm_lin_c_" +
                                    dots_to_underscores(str(c)) +
                                    "_results_" + str(i + 1) + ".txt")
                        path = os.path.join(EXPERIMENT_NAME, filename)
                        save_txt(res, path)
                    if SAVE_MODEL:
                        filename = (EXPERIMENT_NAME + "_" + feature +
                                    " svm_lin_c_" +
                                    dots_to_underscores(str(c)) + "_model_" +
                                    str(i + 1))
                        path = os.path.join(EXPERIMENT_NAME, filename)
                        joblib.dump(model, path)
                    if SAVE_PCA:
                        filename = (EXPERIMENT_NAME + "_" + feature +
                                    " svm_lin_c_" +
                                    dots_to_underscores(str(c)) + "_pca_" +
                                    str(i + 1))
                        path = os.path.join(EXPERIMENT_NAME, filename)
                        joblib.dump(pca, path)
            if SAVE_RESULTS or SAVE_RESULTS_TXT:
                average_report = make_report(test_data_classes_list,
                                             predicted_list)
                if SAVE_RESULTS:
                    filename = (EXPERIMENT_NAME + "_" + feature +
                                " svm_lin_c_" + dots_to_underscores(str(c)) +
                                "_results_averaged")
                    path = os.path.join(EXPERIMENT_NAME, filename)
                    joblib.dump(average_report, path)
                if SAVE_RESULTS_TXT:
                    filename = (EXPERIMENT_NAME + "_" + feature +
                                " svm_lin_c_" + dots_to_underscores(str(c)) +
                                "_results_averaged.txt")
                    path = os.path.join(EXPERIMENT_NAME, filename)
                    save_txt(average_report, path)
        except Exception as e:
            print("Error during " + EXPERIMENT_NAME + " " + feature +
                  " SVM-linear C = " + dots_to_underscores(str(c)))
            print(traceback.format_exc())
            # print(e)

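# Self-contained sketch of the per-fold pattern used in the cross-validation
# mains above: PCA is fit on each training fold only and then applied to that
# fold's held-out split, so no test information leaks into the projection.
# The random data and class sizes below are illustrative assumptions.
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold

x = np.random.rand(30, 8)
y = np.array(10 * ['meditation'] + 10 * ['logic_game'] + 10 * ['music_video'])
for train_idx, test_idx in KFold(n_splits=10, shuffle=True).split(x, y):
    pca = PCA(n_components=0.95)   # keep enough components for 95% of the variance
    pca.fit(x[train_idx])
    train_pca = pca.transform(x[train_idx])
    test_pca = pca.transform(x[test_idx])
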
def main():
    for feature in FEATURES:
        try:
            print("Computing " + feature +
                  " multiple hidden layer neural network (specification 1)"
                  " cross validation")
            data = np.load(feature + "_stats.npy", allow_pickle=True).item()
            meditation = []
            music_video = []
            logic_game = []
            for key in data.keys():
                for item in data[key]['meditation']:
                    meditation.append(item)
                for item in data[key]['logic_game']:
                    logic_game.append(item)
                for item in data[key]['music_video']:
                    music_video.append(item)
            if len(meditation) != len(music_video) or len(music_video) != len(
                    logic_game):
                raise Exception("Classes have unequal number of samples")
            len1 = len(meditation)
            x = meditation + logic_game + music_video
            y = (len1 * ['meditation']) + (len1 * ['logic_game']) + (
                len1 * ['music_video'])
            if len(y) != len(x):
                raise Exception(
                    "Unequal number of input samples and output samples")
            kfold = KFold(n_splits=10, shuffle=True)
            splits = list(kfold.split(x, y))
            encoder = LabelEncoder()
            encoder.fit(y)
            if EXPERIMENT_NAME not in os.listdir():
                os.mkdir(EXPERIMENT_NAME)
            if SAVE_SPLITS:
                np.save(
                    os.path.join(
                        EXPERIMENT_NAME,
                        EXPERIMENT_NAME + "_" + feature + "_splits.npy"),
                    splits)
            if SAVE_ENCODER:
                joblib.dump(
                    encoder,
                    os.path.join(EXPERIMENT_NAME,
                                 EXPERIMENT_NAME + "_" + feature + "_encoder"))
            predicted_list = np.ndarray(0)
            test_data_classes_list = np.ndarray(0)
            for i in range(len(splits)):
                print("Computing " + numeral_str(i + 1) + " fold")
                initial_train_data, initial_train_data_classes, test_data, test_data_classes = make_crossvalidation_data(
                    splits[i], x, y)
                # hold out roughly 11% of the training fold as a validation set
                l = len(initial_train_data)
                i1 = int(0.8889 * l)
                indices = [i for i in range(l)]
                random.shuffle(indices)
                train_val_indices = [indices[:i1], indices[i1:]]
                train_data, train_data_classes, val_data, val_data_classes = make_crossvalidation_data(
                    train_val_indices, initial_train_data,
                    initial_train_data_classes)
                if SAVE_SPLITS:
                    np.save(
                        os.path.join(
                            EXPERIMENT_NAME,
                            EXPERIMENT_NAME + "_" + feature +
                            "_test_val_splits_" + str(i) + ".npy"),
                        train_val_indices)
                pca = PCA(n_components=0.95)
                pca.fit(train_data)
                pca_list.append(pca)
                train_data_pca = np.array(pca.transform(train_data))
                val_data_pca = np.array(pca.transform(val_data))
                test_data_pca = np.array(pca.transform(test_data))
                res, model = experiment_mlp_1a(train_data_pca,
                                               train_data_classes,
                                               val_data_pca, val_data_classes,
                                               test_data_pca,
                                               test_data_classes, encoder,
                                               TRAIN_VERBOSE)
                results_list.append(res)
                models_list.append(model)
                test_data_classes_list = np.append(test_data_classes_list,
                                                   test_data_classes)
                predicted_list = np.append(predicted_list,
                                           res['results_decoded'])
                if res is not None:
                    if SAVE_RESULTS:
                        filename = (EXPERIMENT_NAME + "_" + feature +
                                    "_mlp_multi_layer_spec_1_results_" +
                                    str(i + 1))
                        path = os.path.join(EXPERIMENT_NAME, filename)
                        joblib.dump(res, path)
                    if SAVE_RESULTS_TXT:
                        filename = (EXPERIMENT_NAME + "_" + feature +
                                    "_mlp_multi_layer_spec_1_results_" +
                                    str(i + 1) + ".txt")
                        path = os.path.join(EXPERIMENT_NAME, filename)
                        save_txt(res, path)
                    if SAVE_MODEL:
                        filename = (EXPERIMENT_NAME + "_" + feature +
                                    "_mlp_multi_layer_spec_1_" + str(i + 1))
                        path = os.path.join(EXPERIMENT_NAME, filename)
                        save_mlp(model, path)
                    if SAVE_PCA:
                        filename = (EXPERIMENT_NAME + "_" + feature +
                                    "_mlp_multi_layer_spec_1_pca_" +
                                    str(i + 1))
                        path = os.path.join(EXPERIMENT_NAME, filename)
                        joblib.dump(pca, path)
            if SAVE_RESULTS or SAVE_RESULTS_TXT:
                average_report = make_report(test_data_classes_list,
                                             predicted_list)
                if SAVE_RESULTS:
                    filename = (EXPERIMENT_NAME + "_" + feature +
                                "_mlp_multi_layer_spec_1_results_averaged")
                    path = os.path.join(EXPERIMENT_NAME, filename)
                    joblib.dump(average_report, path)
                if SAVE_RESULTS_TXT:
                    filename = (
                        EXPERIMENT_NAME + "_" + feature +
                        "_mlp_multi_layer_spec_1_results_averaged.txt")
                    path = os.path.join(EXPERIMENT_NAME, filename)
                    save_txt(average_report, path)
        except Exception as e:
            print("Error during " + EXPERIMENT_NAME + " " + feature +
                  "_mlp_multi_layer_spec_1")
            print(traceback.format_exc())

def save(self, filename):
    buffer = np.concatenate(([self.p1, self.m, self.n], self.prior.alpha,
                             self.prior.a, self.prior.b))
    utils.save_txt(filename, buffer)

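# Minimal sketch of the flattening step in save() above: the scalars and the
# prior's parameter vectors are packed into a single 1-D array before writing.
# The shapes and values below are illustrative assumptions.
import numpy as np

p1, m, n = 0.5, 3, 4
alpha, a, b = np.ones(3), np.zeros(4), np.zeros(4)
buffer = np.concatenate(([p1, m, n], alpha, a, b))
print(buffer.shape)  # (14,)
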
def main():
    if EXPERIMENT_NAME not in os.listdir():
        os.mkdir(EXPERIMENT_NAME)

    # neural networks with a single hidden layer
    for feature in FEATURES:
        try:
            data = np.load(feature + "_stats.npy", allow_pickle=True).item()
            pca = joblib.load("pca_" + feature + "_stats_noval")
            train_data, train_data_classes = make_train_data(data, False)
            test_data, test_data_classes = make_test_data(data)
            val_data, val_data_classes = make_val_data(data)
            train_data_pca = np.array(pca.transform(train_data))
            test_data_pca = np.array(pca.transform(test_data))
            val_data_pca = np.array(pca.transform(val_data))
            train_data_classes = np.array(train_data_classes)
            val_data_classes = np.array(val_data_classes)
            test_data_classes = np.array(test_data_classes)
            print("Computing " + feature + " single hidden layer neural network")
            res, model = experiment_mlp_singlelayer(train_data_pca,
                                                    train_data_classes,
                                                    val_data_pca,
                                                    val_data_classes,
                                                    test_data_pca,
                                                    test_data_classes,
                                                    MLP_VERBOSE)
            if res is not None:
                if SAVE_RESULTS:
                    filename = EXPERIMENT_NAME + "_" + feature + "_mlp_single_layer_results"
                    path = os.path.join(EXPERIMENT_NAME, filename)
                    joblib.dump(res, path)
                if SAVE_RESULTS_TXT:
                    filename = EXPERIMENT_NAME + "_" + feature + "_mlp_single_layer_results.txt"
                    path = os.path.join(EXPERIMENT_NAME, filename)
                    save_txt(res, path)
                if SAVE_MODEL:
                    filename = EXPERIMENT_NAME + "_" + feature + "_mlp_single_layer"
                    path = os.path.join(EXPERIMENT_NAME, filename)
                    save_mlp(model, path)
        except Exception as e:
            print("Error during " + EXPERIMENT_NAME + " " + feature +
                  " single hidden layer neural network")
            print(e)

    # neural networks with multiple hidden layers (welch_32 only)
    data = np.load("welch_32_stats.npy", allow_pickle=True).item()
    pca = joblib.load("pca_welch_32_stats_noval")
    train_data, train_data_classes = make_train_data(data, False)
    test_data, test_data_classes = make_test_data(data)
    val_data, val_data_classes = make_val_data(data)
    train_data_pca = np.array(pca.transform(train_data))
    test_data_pca = np.array(pca.transform(test_data))
    val_data_pca = np.array(pca.transform(val_data))
    train_data_classes = np.array(train_data_classes)
    val_data_classes = np.array(val_data_classes)
    test_data_classes = np.array(test_data_classes)

    print("Computing welch_32 multiple hidden layer neural network, specification 1:")
    print("3 hidden layers, LReLU activation (a = 0.02), learning rate = 0.01")
    print("decay = 1e-6, momentum = 0.9, patience = 50, max epochs = 2000")
    try:
        res, model = experiment_mlp_1(train_data_pca, train_data_classes,
                                      val_data_pca, val_data_classes,
                                      test_data_pca, test_data_classes,
                                      MLP_VERBOSE)
        if res is not None:
            if SAVE_RESULTS:
                filename = EXPERIMENT_NAME + "_welch_32_mlp_multi_layer_spec_1_results"
                path = os.path.join(EXPERIMENT_NAME, filename)
                joblib.dump(res, path)
            if SAVE_RESULTS_TXT:
                filename = EXPERIMENT_NAME + "_welch_32_mlp_multi_layer_spec_1_results.txt"
                path = os.path.join(EXPERIMENT_NAME, filename)
                save_txt(res, path)
            if SAVE_MODEL:
                filename = EXPERIMENT_NAME + "_welch_32_mlp_multi_layer_spec_1"
                path = os.path.join(EXPERIMENT_NAME, filename)
                save_mlp(model, path)
    except Exception as e:
        print("Error during " + EXPERIMENT_NAME +
              " welch_32 multiple hidden layer neural network, specification 1")
        print(e)

    print("Computing welch_32 multiple hidden layer neural network, specification 2:")
    print("4 hidden layers, tanh + LReLU activation (a = 0.02), learning rate = 0.005")
    print("decay = 1e-6, momentum = 0.9, patience = 250, max epochs = 3000")
    try:
        res, model = experiment_mlp_2(train_data_pca, train_data_classes,
                                      val_data_pca, val_data_classes,
                                      test_data_pca, test_data_classes,
                                      MLP_VERBOSE)
        if res is not None:
            if SAVE_RESULTS:
                filename = EXPERIMENT_NAME + "_welch_32_mlp_multi_layer_spec_2_results"
                path = os.path.join(EXPERIMENT_NAME, filename)
                joblib.dump(res, path)
            if SAVE_RESULTS_TXT:
                filename = EXPERIMENT_NAME + "_welch_32_mlp_multi_layer_spec_2_results.txt"
                path = os.path.join(EXPERIMENT_NAME, filename)
                save_txt(res, path)
            if SAVE_MODEL:
                filename = EXPERIMENT_NAME + "_welch_32_mlp_multi_layer_spec_2"
                path = os.path.join(EXPERIMENT_NAME, filename)
                save_mlp(model, path)
    except Exception as e:
        print("Some problem during " + EXPERIMENT_NAME +
              " welch_32 multiple hidden layer neural network, specification 2")
        print(e)

    print("Computing welch_32 multiple hidden layer neural network, specification 3:")
    print("6 hidden layers, ReLU activation, learning rate = 0.01")
    print("decay = 1e-6, momentum = 0.9, patience = 70, max epochs = 2000")
    try:
        res, model = experiment_mlp_3(train_data_pca, train_data_classes,
                                      val_data_pca, val_data_classes,
                                      test_data_pca, test_data_classes,
                                      MLP_VERBOSE)
        if res is not None:
            if SAVE_RESULTS:
                filename = EXPERIMENT_NAME + "_welch_32_mlp_multi_layer_spec_3_results"
                path = os.path.join(EXPERIMENT_NAME, filename)
                joblib.dump(res, path)
            if SAVE_RESULTS_TXT:
                filename = EXPERIMENT_NAME + "_welch_32_mlp_multi_layer_spec_3_results.txt"
                path = os.path.join(EXPERIMENT_NAME, filename)
                save_txt(res, path)
            if SAVE_MODEL:
                filename = EXPERIMENT_NAME + "_welch_32_mlp_multi_layer_spec_3"
                path = os.path.join(EXPERIMENT_NAME, filename)
                save_mlp(model, path)
    except Exception as e:
        print("Error during " + EXPERIMENT_NAME +
              " welch_32 multiple hidden layer neural network, specification 3")
        print(e)

    print("Computing welch_32 multiple hidden layer neural network, specification 4:")
    print("3 hidden layers, tanh activation, learning rate = 0.01")
    print("decay = 1e-6, momentum = 0.9, patience = 250, max epochs = 3000")
    try:
        res, model = experiment_mlp_4(train_data_pca, train_data_classes,
                                      val_data_pca, val_data_classes,
                                      test_data_pca, test_data_classes,
                                      MLP_VERBOSE)
        if res is not None:
            if SAVE_RESULTS:
                filename = EXPERIMENT_NAME + "_welch_32_mlp_multi_layer_spec_4_results"
                path = os.path.join(EXPERIMENT_NAME, filename)
                joblib.dump(res, path)
            if SAVE_RESULTS_TXT:
                filename = EXPERIMENT_NAME + "_welch_32_mlp_multi_layer_spec_4_results.txt"
                path = os.path.join(EXPERIMENT_NAME, filename)
                save_txt(res, path)
            if SAVE_MODEL:
                filename = EXPERIMENT_NAME + "_welch_32_mlp_multi_layer_spec_4"
                path = os.path.join(EXPERIMENT_NAME, filename)
                save_mlp(model, path)
    except Exception as e:
        print("Error during " + EXPERIMENT_NAME +
              " welch_32 multiple hidden layer neural network, specification 4")
        print(e)

if __name__ == "__main__": # Check Dir BASE_DIR = './datas/' if not os.path.exists(BASE_DIR): os.mkdir(BASE_DIR) # Load Data originUrl = read_txt('origin.txt') naverUrl = read_txt('naver.txt') assert len(originUrl) == len(naverUrl) # Pre-processing newsDict : Dict[str, int] = {} for url in originUrl: ''' split('/') => ['https:', '', 'news.joins.com', 'article', 'olink', '23309016'] idx 2 => 뉴스 URI부분 ''' try: newsDict[url.split('/')[2]] += 1 except: newsDict[url.split('/')[2]] = 1 sortedNewsList = sorted(newsDict.items(), key=(lambda v : v[1]), reverse=True) # Cnt 내림차순으로 정렬 # Save for idx, (k, v) in tqdm(enumerate(sortedNewsList)): #print("{} : {}, {}".format(idx,v, k)) # 전체 뉴스가 -> 몇개씩 기사를 가지고있는지 #print("|{}|{}|{}|".format(idx, k, v)) # for README save_txt(BASE_DIR + "{}.txt".format(idx), extract_n(k))