import sys

import tensorflow as tf

# TensorFlow 1.x graph-mode API. inference, loss, train_step, prediction,
# get_sparse, Dataset, ExampleSequence, batch_size and sequence_len are assumed
# to be defined in / imported from the project's own modules.


def main():
    x = tf.placeholder(tf.float32, shape=[batch_size, sequence_len])
    seq_length = tf.placeholder(tf.int32, shape=[batch_size])
    y = tf.sparse.placeholder(tf.int32, shape=[batch_size, sequence_len])

    logits, ratio = inference(x, seq_length)
    ctc_loss = loss(logits, seq_length, y)
    opt = train_step(ctc_loss)
    error = prediction(logits, seq_length, y)

    init = tf.global_variables_initializer()
    saver = tf.train.Saver()
    summary = tf.summary.merge_all()
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    sess.run(init)  # initialize variables before training

    dataset = Dataset(sys.argv[1])
    train, test = dataset.train_test_split()
    signal_seq = ExampleSequence(dataset, train, name='train', batch_size=batch_size)
    test_seq = ExampleSequence(dataset, test, name='test', batch_size=batch_size)

    val_batch = 0
    for i in range(len(signal_seq)):
        # Draw the next training batch
        example = signal_seq[i]
        feed_dict = {x: example['the_input'],
                     seq_length: example['input_length'],
                     y: get_sparse(example)}
        loss_val, _ = sess.run([ctc_loss, opt], feed_dict=feed_dict)
        if i % 10 == 0:
            # Every 10 steps, evaluate the edit distance on a fresh test batch
            example = test_seq[val_batch]
            val_batch = val_batch + 1
            feed_dict = {x: example['the_input'],
                         seq_length: example['input_length'],
                         y: get_sparse(example)}
            error_val = sess.run(error, feed_dict=feed_dict)
            print("Epoch %d, batch number %d, loss: %5.3f edit_distance: %5.3f"
                  % (0, i, loss_val, error_val))
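# A conventional entry point (not part of the original snippet) so the script
# can be run as "python <script>.py <path_to_dataset>".
if __name__ == '__main__':
    main()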
# Dataset and Dokumen_Cluster are assumed to be imported from the project's
# TF-IDF preprocessing module.


def load_from_file(folder):
    """Rebuild a Dokumen_Cluster from the files written by Dokumen_Cluster.toFile()."""
    cluster = Dokumen_Cluster(None)

    print("loading UNIQUE TERM")
    with open("%s/unique" % folder, 'r') as f:
        unique = f.readlines()
    un = []
    for u in unique:
        a = u.strip().split(',')
        a[1] = int(a[1])  # document frequency of the term
        un.append(a)
    cluster.unique_terms = Dataset(un)

    print("loading IDF")
    with open("%s/IDF" % folder, 'r') as f:
        idf = f.readlines()
    un = []
    for u in idf:
        a = u.strip().split(',')
        a[1] = float(a[1])  # IDF weight of the term
        un.append(a)
    cluster.IDF = un

    print("loading TFIDF")
    with open("%s/TFIDF" % folder, 'r') as f:
        tfidf = f.readlines()
    un = []
    for u in tfidf:
        a = u.strip().split(',')[:-1]  # drop the empty field after the trailing comma
        a = [float(b) for b in a]
        un.append(a)
    cluster.TFIDF = un

    return cluster
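# A minimal usage sketch (the folder name "saved_cluster" is hypothetical):
# reload a previously saved cluster and inspect the stored TF-IDF matrix.
cluster = load_from_file("saved_cluster")
print(len(cluster.TFIDF), "documents,", len(cluster.TFIDF[0]), "weights per vector")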
import os

import numpy as np
from keras import backend as K
from keras import optimizers
from keras.callbacks import CSVLogger
from keras.models import Model, load_model
from keras.utils import multi_gpu_model
from Levenshtein import distance, editops

# Dataset, ExampleSequence, write_dict_to_file and batch_size are assumed to be
# defined in / imported from the project's own modules.


def main(data_path, model_path, epochs):
    # Reload the train/test split saved by the original training run
    with open(os.path.join(model_path, 'train.txt')) as train_file:
        train = [x.strip() for x in train_file.readlines()]
    with open(os.path.join(model_path, 'test.txt')) as test_file:
        test = [x.strip() for x in test_file.readlines()]

    csv_logger = CSVLogger(os.path.join(model_path, 'Log1.csv'))
    dataset = Dataset(data_path)
    signal_seq = ExampleSequence(dataset, train, name='train', batch_size=batch_size)
    test_seq = ExampleSequence(dataset, test, name='test', batch_size=batch_size)

    # The model was compiled with a lambda loss, so loading it needs the same
    # custom object
    model = load_model(os.path.join(model_path, 'model.h5'),
                       custom_objects={'<lambda>': lambda y_true, y_pred: y_pred})
    model = multi_gpu_model(model, gpus=2)

    param = {
        'lr': 0.001,
        'beta_1': 0.9,
        'beta_2': 0.999,
        'epsilon': None,
        'clipvalue': 2
    }
    adam = optimizers.Adam(**param)
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=adam)
    model.fit_generator(signal_seq, validation_data=test_seq, epochs=epochs,
                        callbacks=[csv_logger])
    model.save(os.path.join(model_path, 'model_1.h5'))

    # Extract the inner model up to the 'activation_1' layer, whose output is
    # fed to CTC decoding
    sub_model = model.get_layer('model_2')
    sub_model = sub_model.get_layer('model_1')
    im_model = Model(inputs=sub_model.get_input_at(0),
                     outputs=sub_model.get_layer('activation_1').output)

    dists = []
    ops = []
    lens = []
    pred_lens = []
    real = []
    predicted = []
    for j in range(len(test_seq)):
        batch = test_seq[j][0]
        preds = im_model.predict_on_batch(batch)
        val = K.ctc_decode(preds,
                           np.full(batch_size, batch['input_length'][0, 0]),
                           greedy=False)
        decoded = K.eval(val[0][0])
        for i in range(decoded.shape[0]):
            real_label = batch['the_labels'][i, :batch['label_length'][i, 0]]
            real_label = ''.join([str(int(x)) for x in real_label.tolist()])
            pred_label = list(filter(lambda x: x != -1, decoded[i, :].tolist()))
            pred_label = [str(x) for x in pred_label]
            pred_label = ''.join(pred_label)
            dists.append(distance(pred_label, real_label))
            ops.append(editops(pred_label, real_label))
            lens.append(len(real_label))
            pred_lens.append(len(pred_label))
            real.append(real_label)
            predicted.append(pred_label)

    # Normalize the per-operation counts by the total reference length
    op_counts = {'insert': 0, 'replace': 0, 'delete': 0}
    for op in ops:
        for x in op:
            op_counts[x[0]] += 1
    for key in op_counts.keys():
        op_counts[key] = op_counts[key] / sum(lens)

    metrics = {
        'LER': sum(dists) / sum(lens),
        'real_mean_length': np.mean(lens),
        'predicted_mean_length': np.mean(pred_lens)
    }
    metrics.update(op_counts)
    metrics_file_path = os.path.join(model_path, 'metrics_continue.json')
    write_dict_to_file(metrics_file_path, metrics)
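# Toy sanity check (made-up strings) of the Levenshtein-based metrics computed
# above: distance() counts the edit operations, editops() classifies them, and
# dividing by the reference length gives the label error rate (LER).
real_label = "013322"
pred_label = "01322"
print(distance(pred_label, real_label))                    # -> 1 (a single edit)
print(editops(pred_label, real_label))                     # -> one ('insert', ...) operation
print(distance(pred_label, real_label) / len(real_label))  # -> ~0.167, the per-label error rate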
import math
import os

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from FCN_Model import encoder, decoder
from UNet_Model import UNet

# The segmentation Dataset class (which loads image/label pairs from the two
# directories below) is assumed to be defined or imported elsewhere.

# Forward slashes avoid backslash-escape issues in the path strings.
image_dir = "Stanford background dataset/images"
label_dir = "Stanford background dataset/labels"

dataset = Dataset(image_dir, label_dir)
train_size = round(len(dataset) * 0.9)   # 90% for training
test_size = len(dataset) - train_size    # 10% for testing
train_data, test_data = torch.utils.data.random_split(
    dataset, [train_size, test_size], generator=torch.manual_seed(42))

train_loader = DataLoader(train_data, batch_size=2, shuffle=True, num_workers=0)
test_loader = DataLoader(test_data, batch_size=1, shuffle=True, num_workers=0)


def save_checkpoint(state, filename='checkpoint.pth.tar'):
    torch.save(state, filename)
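# A minimal, hypothetical training-loop sketch showing how the loaders and
# save_checkpoint above could be wired together. The UNet() constructor
# arguments, the (image, label) batches yielded by the Dataset, the loss,
# optimizer and epoch count are all assumptions, not taken from the project.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = UNet().to(device)  # constructor arguments are an assumption
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(10):  # epoch count chosen arbitrarily
    model.train()
    for images, labels in train_loader:
        images = images.to(device)
        labels = labels.to(device).long()  # class-index masks for CrossEntropyLoss
        optimizer.zero_grad()
        loss = criterion(model(images), labels)
        loss.backward()
        optimizer.step()
    save_checkpoint({'epoch': epoch,
                     'state_dict': model.state_dict(),
                     'optimizer': optimizer.state_dict()})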
import concurrent.futures
import math
import os
import re
from operator import add

import numpy as np

# Dataset and x1 (presumably a helper mapping a token to a (token, 1) pair) are
# assumed to be provided by the project's own modules.


class Dokumen_Cluster:
    # Former class-level attributes, kept for reference (they are now set per
    # instance in __init__):
    #   TFIDF = []
    #   unique_terms = Dataset([])
    #   count = 0
    #   IDF = None

    def __init__(self, data, lemma=None, stopword=None, stemmer=None, verbose=False):
        """
        Initialize a document cluster that performs Bag-of-Words text
        preprocessing. Initialization runs the full pipeline eagerly
        (TF, then IDF, then TF-IDF).
        """
        self.TFIDF = []
        self.unique_terms = Dataset([])
        self.count = 0
        self.IDF = None
        if data is None:
            return

        # TF
        data = Dataset(self.TF_func(data, lemma, stopword, stemmer, verbose))

        # Collect the unique terms and their document frequencies
        for datum in data.collect():
            datum = Dataset(datum)
            unique_term = datum.reduceByKey(lambda x, y: 1).map(lambda x: (x[0], 1))
            self.unique_terms = self.unique_terms.append(unique_term).reduceByKey(add)

        # Materialize the unique terms (also used when building queries)
        self.unique_terms = self.unique_terms.collect()
        self.count = data.count()

        # IDF
        if verbose:
            print("processing IDF")
        self.IDF = Dataset(self.unique_terms).map(
            lambda x: (x[0], math.log(self.count / x[1]))).collect()

        # TF-IDF
        data = data.collect()
        data = self.TFIDF_func(data, verbose=verbose)

        # Store the document vectors, L2-normalized in parallel threads
        self.TFIDF = data
        with concurrent.futures.ThreadPoolExecutor() as executor:
            self.TFIDF = np.array(list(executor.map(
                lambda x: x / np.linalg.norm(x), self.TFIDF)))

    def TFIDF_func(self, data, verbose=False):
        """
        Convert the TF vectors of the given documents into TF-IDF vectors,
        based on the documents stored in this cluster.
        """
        if verbose:
            print("processing TF-IDF")
        for i in range(len(data)):
            datum = data[i]
            if verbose:
                print("document %d" % i)
            data[i] = Dataset(self.IDF).leftOuterJoin(Dataset(datum)).map(
                lambda x: (x[0], x[1][0][0] * x[1][1][0] if len(x[1][1]) > 0 else 0)
            ).sortBy(lambda x: x[0]).map(lambda x: x[1]).collect()

        # Disabled alternative that aligned each TF-IDF vector against the full
        # unique-term list, kept from the original source:
        '''TFIDFs = data
        query = []
        for TFIDF in TFIDFs:
            TFIDF = Dataset(self.unique_terms).map(lambda x: (x[0], 0)).leftOuterJoin(
                Dataset(TFIDF)).map(lambda x: (x[0], x[1][1][0] if len(x[1][1]) > 0
                                               else x[1][0][0])).map(lambda x: x[1]).collect()
            query.append(TFIDF)
            if verbose:
                print(len(TFIDF))
        data = query
        '''
        return data

    def TF_func(self, data, lemma=None, stopword=None, stemmer=None, verbose=False):
        """
        Bag-of-Words text preprocessing.

        lemma    : list of dictionaries with the structure [[(key -> value), ...], ...]
        stopword : list of stopword lists with the structure [[value, ...], ...]
        stemmer  : list of stemming functions with the structure [_function, ...]
        verbose  : flag that enables logging for this class
        """
        if verbose:
            print("processing TF")

        # Prepare the lemma dictionaries
        dictionary = []
        if lemma is not None:
            for dicti in lemma:
                dictionary.append(Dataset(dicti))

        # Prepare the stopword lists
        sw = []
        if stopword is not None:
            for stop in stopword:
                sw.append(Dataset(stop).map(x1))

        # Prepare the return value
        ret = []

        # Compute TF per document
        for datum in data:
            # Cleaning and tokenizing
            datum = Dataset(datum).map(
                lambda x: re.sub('[^a-zA-Z ]+', '', x).lower()
            ).flatMap(lambda x: x.strip().split()).map(x1)

            # Lemmatization
            if lemma is not None:
                for dicti in dictionary:
                    datum = datum.leftOuterJoin(dicti).map(
                        lambda x: (x[1][1][0] if len(x[1][1]) > 0 else x[0], 1))

            # Stemming
            if stemmer is not None:
                datum = datum.map(lambda x: x[0]).collect()
                for stemmer_func in stemmer:
                    datum = stemmer_func(datum)
                datum = Dataset(datum).map(x1)

            # Stopword removal
            if stopword is not None:
                for stop in sw:
                    datum = datum.leftOuterJoin(stop).filter(
                        lambda x: len(x[1][1]) == 0).map(lambda x: (x[0], 1))

            # Sublinear TF weighting: 1 + log(term count)
            datum = datum.reduceByKey(add).map(lambda x: (x[0], 1 + math.log(x[1])))
            ret.append(datum.collect())
        return ret

    def getQuery(self, data, lemma=None, stopword=None, stemmer=None):
        """
        Compute the TF-IDF vectors of new documents ("queries") against the
        documents stored in this class.
        """
        return self.TFIDF_func(self.TF_func(data, lemma, stopword, stemmer))

    def getUniqueTerms(self):
        """Return the unique terms stored in this class."""
        return Dataset(self.unique_terms).map(lambda x: x[0]).collect()

    def toFile(self, folder):
        """Save the preprocessing results to files in the given folder."""
        try:
            os.makedirs(folder)
        except OSError:
            pass

        print("saving TFIDF")
        with open(folder + '/TFIDF', 'w') as file:
            for TF in self.TFIDF:
                for term in TF:
                    file.write(str(term) + ',')
                file.write('\n')

        print("saving IDF")
        with open(folder + '/IDF', 'w') as file:
            for I in self.IDF:
                file.write(str(I[0]) + ',' + str(I[1]))
                file.write('\n')

        print("saving UNIQUE TERM")
        with open(folder + '/unique', 'w') as file:
            for I in self.unique_terms:
                file.write("%s, %d" % (I[0], I[1]))
                file.write('\n')
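# A minimal usage sketch for Dokumen_Cluster. The sample documents and the
# folder name are made up; Dataset and x1 must already be available.
docs = [
    ["the quick brown fox jumps over the lazy dog"],
    ["a quick movement of the enemy will jeopardize six gunboats"],
]
cluster = Dokumen_Cluster(docs, verbose=True)
cluster.toFile("saved_cluster")                        # persist TF-IDF, IDF and unique terms
query = cluster.getQuery([["quick brown gunboats"]])   # TF-IDF vector(s) for a new document
print(len(cluster.getUniqueTerms()), len(query[0]))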
# Assumes the same imports as the continuation script above (os, numpy as np,
# keras Model / optimizers / backend K / CSVLogger / multi_gpu_model, and
# Levenshtein distance / editops), plus datetime and keras.utils.plot_model.
# get_default_model, Dataset, ExampleSequence, write_lines_to_file and
# write_dict_to_file are project-level helpers assumed to be imported elsewhere.


def main(data_path, epochs):
    run_start_time = str(datetime.datetime.now().isoformat())
    dataset = Dataset(data_path)
    train, test = dataset.train_test_split()
    signal_seq = ExampleSequence(dataset, train, name='train')
    test_seq = ExampleSequence(dataset, test, name='test')

    # Create a per-run log directory (assumes 'runs/' already exists) and
    # record the train/test split
    log_dir = os.path.join('runs', run_start_time)
    os.mkdir(log_dir)
    write_lines_to_file(os.path.join(log_dir, 'test.txt'), test)
    write_lines_to_file(os.path.join(log_dir, 'train.txt'), train)
    csv_logger = CSVLogger(os.path.join(log_dir, 'Log.csv'))

    model = get_default_model()
    model = multi_gpu_model(model, gpus=2)
    param = {
        'lr': 0.001,
        'beta_1': 0.9,
        'beta_2': 0.999,
        'epsilon': None,
        'decay': 0.001
    }
    param_file_path = os.path.join(log_dir, 'params.json')
    adam = optimizers.Adam(**param)
    # Record the optimizer hyperparameters together with the run settings
    param.update({'data_path': data_path, 'epochs': epochs})
    write_dict_to_file(param_file_path, param)

    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=adam)
    model.fit_generator(signal_seq, validation_data=test_seq, epochs=epochs,
                        callbacks=[csv_logger])
    model.save(os.path.join(log_dir, 'model.h5'))

    # Extract the inner model up to the 'activation_1' layer for CTC decoding
    sub_model = model.get_layer('model_1')
    im_model = Model(inputs=sub_model.get_input_at(0),
                     outputs=sub_model.get_layer('activation_1').output)

    dists = []
    ops = []
    lens = []
    pred_lens = []
    real = []
    predicted = []
    for j in range(len(test_seq)):
        batch = test_seq[j][0]
        preds = im_model.predict_on_batch(batch)
        # One decode length per example in the batch (150 is hard-coded here)
        val = K.ctc_decode(preds, np.full(150, batch['input_length'][0, 0]),
                           greedy=False)
        decoded = K.eval(val[0][0])
        for i in range(decoded.shape[0]):
            real_label = batch['the_labels'][i, :batch['label_length'][i, 0]]
            real_label = ''.join([str(int(x)) for x in real_label.tolist()])
            pred_label = list(filter(lambda x: x != -1, decoded[i, :].tolist()))
            pred_label = [str(x) for x in pred_label]
            pred_label = ''.join(pred_label)
            dists.append(distance(pred_label, real_label))
            ops.append(editops(pred_label, real_label))
            lens.append(len(real_label))
            pred_lens.append(len(pred_label))
            real.append(real_label)
            predicted.append(pred_label)

    # Normalize the per-operation counts by the total reference length
    op_counts = {'insert': 0, 'replace': 0, 'delete': 0}
    for op in ops:
        for x in op:
            op_counts[x[0]] += 1
    for key in op_counts.keys():
        op_counts[key] = op_counts[key] / sum(lens)

    metrics = {
        'LER': sum(dists) / sum(lens),
        'real_mean_length': np.mean(lens),
        'predicted_mean_length': np.mean(pred_lens)
    }
    metrics.update(op_counts)
    metrics_file_path = os.path.join(log_dir, 'metrics.json')
    write_dict_to_file(metrics_file_path, metrics)
    plot_model(im_model, to_file=os.path.join(log_dir, 'model.png'))
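# A hypothetical command-line entry point for this training script; the
# argument names and the default epoch count are assumptions.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Train the CTC model and report edit-distance metrics')
    parser.add_argument('data_path', help='dataset path accepted by Dataset()')
    parser.add_argument('--epochs', type=int, default=10)
    args = parser.parse_args()
    main(args.data_path, args.epochs)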