Example No. 1
 def TFIDF_func(self, data, verbose=False):
     """
     Fungsi ini dapat merubah TF dari suatu dokumen menjadi TF-IDF yang berdasarkan dokumen-dokumen yang tersimpan dalam
     dokumen cluster
     """
     if verbose:
         print("pemrosesan TF-IDF")
     for i in range(len(data)):
         datum = data[i]
         if verbose:
             print("dokumen ke-%d" % i)
         # Left-join IDF with the document's TF; terms absent from the document get weight 0,
         # then sort by term and keep only the weights.
         data[i] = (Dataset(self.IDF)
                    .leftOuterJoin(Dataset(datum))
                    .map(lambda x: (x[0], x[1][0][0] * x[1][1][0] if len(x[1][1]) > 0 else 0))
                    .sortBy(lambda x: x[0])
                    .map(lambda x: x[1])
                    .collect())
     '''TFIDFs = data
     query = []
     for TFIDF in TFIDFs:
         TFIDF = Dataset(self.unique_terms).map(lambda x: (x[0], 0)).leftOuterJoin(Dataset(TFIDF)).map(lambda x : (x[0], x[1][1][0] if len(x[1][1])>0 else x[1][0][0])).map(lambda x: x[1]).collect()
         query.append(TFIDF)
     if verbose :
             print(len(TFIDF))
     data = query
     '''
     return data
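A small illustration of the record shape this method appears to assume from `leftOuterJoin`. This is an assumption inferred from the lambdas above, not confirmed by the `Dataset` implementation; the values are made up.

# Hypothetical joined record: (term, ([idf_value], [tf_value, ...]))
# An empty right-hand list means the term never appears in this document.
joined_hit = ("data", ([1.386], [1.0]))
joined_miss = ("science", ([0.693], []))

def weight(record):
    term, (idf_vals, tf_vals) = record
    # Mirror of the lambda above: multiply IDF by TF when present, else 0.
    return idf_vals[0] * tf_vals[0] if len(tf_vals) > 0 else 0

print(weight(joined_hit))   # 1.386
print(weight(joined_miss))  # 0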
Example No. 2
def main():
    x = tf.placeholder(tf.float32, shape=[batch_size, sequence_len])
    seq_length = tf.placeholder(tf.int32, shape=[batch_size])
    y = tf.sparse.placeholder(tf.int32, shape=[batch_size, sequence_len])
    logits, ratio = inference(x, seq_length)
    ctc_loss = loss(logits,seq_length,y)
    opt = train_step(ctc_loss)
    error = prediction(logits,seq_length,y)
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()
    summary = tf.summary.merge_all()
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    sess.run(init)  # variables must be initialized before training
    dataset = Dataset(sys.argv[1])
    train, test = dataset.train_test_split()
    signal_seq = ExampleSequence(dataset, train, name='train', batch_size=batch_size)
    test_seq = ExampleSequence(dataset, test, name='test', batch_size=batch_size)
    val_batch = 0
    for i in range(len(signal_seq)):
        # Train on the next batch from the training sequence.
        example = signal_seq[i]
        feed_dict = {x: example['the_input'], seq_length: example['input_length'], y: get_sparse(example)}
        loss_val, _ = sess.run([ctc_loss, opt], feed_dict=feed_dict)
        if i % 10 == 0:
            # Periodically evaluate edit distance on a held-out batch.
            example = test_seq[val_batch]
            val_batch = val_batch + 1
            feed_dict = {x: example['the_input'], seq_length: example['input_length'], y: get_sparse(example)}
            error_val = sess.run(error, feed_dict=feed_dict)
            print("Epoch %d, batch number %d, loss: %5.3f edit_distance: %5.3f" % (0, i, loss_val, error_val))
Example No. 3
def load_from_file(folder):
    cluster = Dokumen_Cluster(None)
    print("loading UNIQUE TERM")
    with open("%s/unique" % folder, 'r') as f:
        unique = f.readlines()
    terms = []
    for u in unique:
        a = u.strip().split(',')
        a[1] = int(a[1])
        terms.append(a)
    cluster.unique_terms = Dataset(terms)
    print("loading IDF")
    with open("%s/IDF" % folder, 'r') as f:
        idf = f.readlines()
    idf_rows = []
    for u in idf:
        a = u.strip().split(',')
        a[1] = float(a[1])
        idf_rows.append(a)
    cluster.IDF = idf_rows
    print("loading TFIDF")
    with open("%s/TFIDF" % folder, 'r') as f:
        tfidf = f.readlines()
    vectors = []
    for u in tfidf:
        a = u.strip().split(',')[:-1]  # drop the empty field left by the trailing comma
        vectors.append([float(b) for b in a])
    cluster.TFIDF = vectors
    return cluster
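A hedged usage sketch for the loader above, assuming the folder was produced by `Dokumen_Cluster.toFile` and that each query document is a list of raw text lines (as `TF_func` seems to expect); the folder name and query text are illustrative only.

import numpy as np

cluster = load_from_file("saved_cluster")                       # hypothetical folder name
query_vec = np.array(cluster.getQuery([["a new example document"]])[0])
doc_matrix = np.array(cluster.TFIDF)

# Cosine similarity between the query and every stored document vector.
sims = doc_matrix @ query_vec / (np.linalg.norm(doc_matrix, axis=1) * np.linalg.norm(query_vec) + 1e-12)
best = int(np.argmax(sims))
print("most similar document:", best, "similarity:", sims[best])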
Example No. 4
def main(data_path, model_path, epochs):
    with open(os.path.join(model_path, 'train.txt')) as train_file:
        train = [x.strip() for x in train_file.readlines()]
    with open(os.path.join(model_path, 'test.txt')) as test_file:
        test = [x.strip() for x in test_file.readlines()]
    csv_logger = CSVLogger(os.path.join(model_path, 'Log1.csv'))
    dataset = Dataset(data_path)
    signal_seq = ExampleSequence(dataset,
                                 train,
                                 name='train',
                                 batch_size=batch_size)
    test_seq = ExampleSequence(dataset,
                               test,
                               name='test',
                               batch_size=batch_size)
    model = load_model(os.path.join(model_path, 'model.h5'),
                       custom_objects={
                           '<lambda>': lambda y_true, y_pred: y_pred
                       })
    model = multi_gpu_model(model, gpus=2)
    param = {
        'lr': 0.001,
        'beta_1': 0.9,
        'beta_2': 0.999,
        'epsilon': None,
        'clipvalue': 2
    }
    adam = optimizers.Adam(**param)
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=adam)
    model.fit_generator(signal_seq,
                        validation_data=test_seq,
                        epochs=epochs,
                        callbacks=[csv_logger])
    model.save(os.path.join(model_path, 'model_1.h5'))
    sub_model = model.get_layer('model_2')
    sub_model = sub_model.get_layer('model_1')
    im_model = Model(inputs=sub_model.get_input_at(0),
                     outputs=sub_model.get_layer('activation_1').output)
    dists = []
    ops = []
    lens = []
    pred_lens = []
    real = []
    predicted = []
    # Decode each test batch with CTC beam search and compare against the ground-truth labels.
    for j in range(len(test_seq)):
        batch = test_seq[j][0]
        preds = im_model.predict_on_batch(batch)
        val = K.ctc_decode(preds,
                           np.full(batch_size, batch['input_length'][0, 0]),
                           greedy=False)
        decoded = K.eval(val[0][0])
        for i in range(decoded.shape[0]):
            real_label = batch['the_labels'][i, :batch['label_length'][i, 0]]
            real_label = ''.join([str(int(x)) for x in real_label.tolist()])
            pred_label = list(filter(lambda x: x != -1,
                                     decoded[i, :].tolist()))
            pred_label = [str(x) for x in pred_label]
            pred_label = ''.join(pred_label)
            dists.append(distance(pred_label, real_label))
            ops.append(editops(pred_label, real_label))
            lens.append(len(real_label))
            pred_lens.append(len(pred_label))
            real.append(real_label)
            predicted.append(pred_label)
    op_counts = {'insert': 0, 'replace': 0, 'delete': 0}
    for op in ops:
        for x in op:
            op_counts[x[0]] += 1
    for key in op_counts.keys():
        op_counts[key] = op_counts[key] / sum(lens)
    metrics = {
        'LER': sum(dists) / sum(lens),
        'real_mean_length': np.mean(lens),
        'predicted_mean_length': np.mean(pred_lens)
    }
    metrics.update(op_counts)
    metrics_file_path = os.path.join(model_path, 'metrics_continue.json')
    write_dict_to_file(metrics_file_path, metrics)
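`write_dict_to_file` is not defined in this excerpt. Given the `.json` file name, one plausible, purely hypothetical implementation is:

import json

def write_dict_to_file(path, d):
    # Hypothetical stand-in: dump a dict as JSON (the real helper may differ).
    with open(path, 'w') as f:
        json.dump(d, f, indent=2, default=float)  # default=float handles numpy scalars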
Example No. 5
from FCN_Model import encoder, decoder
from UNet_Model import UNet
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim
import cv2
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

image_dir = "Stanford background dataset\images"
label_dir = "Stanford background dataset\labels"
dataset = Dataset(image_dir, label_dir)
train_size = round(len(dataset) * 0.9)  # 90 % for training
test_size = len(dataset) - train_size  # 10 % for testing
train_data, test_data = torch.utils.data.random_split(
    dataset, [train_size, test_size], generator=torch.manual_seed(42))
train_loader = DataLoader(train_data,
                          batch_size=2,
                          shuffle=True,
                          num_workers=0)
test_loader = DataLoader(test_data, batch_size=1, shuffle=True, num_workers=0)


def save_checkpoint(state, filename='checkpoint.pth.tar'):
    torch.save(state, filename)

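The excerpt stops before the training loop. Below is a minimal sketch of how these pieces might be wired together, assuming `UNet` takes a class count and returns per-pixel logits and that the loaders yield `(image, label)` tensor pairs; the class count, learning rate, and epoch count are assumptions, not values from this project.

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = UNet(num_classes=9).to(device)           # class count is an assumption
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(10):
    model.train()
    running_loss = 0.0
    for images, labels in train_loader:          # assumed (image, label) pairs
        images, labels = images.to(device), labels.to(device).long()
        optimizer.zero_grad()
        logits = model(images)                   # assumed shape: (N, num_classes, H, W)
        loss = criterion(logits, labels)         # labels assumed shape: (N, H, W)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print("epoch %d, mean loss %.4f" % (epoch, running_loss / len(train_loader)))
    save_checkpoint({'epoch': epoch,
                     'state_dict': model.state_dict(),
                     'optimizer': optimizer.state_dict()})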
Example No. 6
 def TF_func(self, data, lemma = None, stopword=None, stemmer=None, verbose = False):
     '''
     Perform Bag of Words text preprocessing.
     lemma : a list of dictionaries with structure [[(key -> value), ...], ...]
     stopword : a list of stopword lists with structure [[value, ...], ...]
     stemmer : a list of stemming functions with structure [function, ...]
     verbose : flag that enables the class's log output
     '''
     if verbose:
         print("pemrosesan TF")
     # prepare lemma dictionaries
     dictionary = []
     if lemma is not None:
         for dicti in lemma:
             dictionary.append(Dataset(dicti))
     # prepare stopword lists
     sw = []
     if stopword is not None:
         for stop in stopword:
             sw.append(Dataset(stop).map(x1))
     # prepare return value
     ret = []
     # Calculate TF
     for datum in data:
         # Cleaning and tokenizing
         datum = Dataset(datum).map(lambda x: re.sub('[^a-zA-Z ]+', '', x).lower()).flatMap(lambda x: x.strip().split()).map(x1)
         # Lemmatization
         if lemma is not None:
             for dicti in dictionary:
                 datum = datum.leftOuterJoin(dicti).map(lambda x: (x[1][1][0] if len(x[1][1]) > 0 else x[0], 1))
         # Stemming
         if stemmer is not None:
             datum = datum.map(lambda x: x[0]).collect()
             for stemmer_func in stemmer:
                 datum = stemmer_func(datum)
             datum = Dataset(datum).map(x1)
         # Stopword removal
         if stopword is not None:
             for stop in sw:
                 datum = datum.leftOuterJoin(stop).filter(lambda x: len(x[1][1]) == 0).map(lambda x: (x[0], 1))
         # Sublinear TF weighting: 1 + log(raw term count)
         datum = datum.reduceByKey(add).map(lambda x: (x[0], 1 + math.log(x[1])))
         ret.append(datum.collect())
     return ret
Example No. 7
 def __init__(self, data, lemma=None, stopword=None, stemmer=None, verbose = False):
     """
     Fungsi init adalah fungsi untuk menginisialisasi Dokumen Cluster yang dapat digunakan untuk melakukan
     preprocessing text secara Bag of Words. Fungsi init akan secara otomatis melakukan preprocessing (eager)
     """
     self.TFIDF = []
     self.unique_terms = Dataset([])
     self.count = 0
     self.IDF = None
     if data is None:
         return
     #TF
     data = Dataset(self.TF_func(data, lemma, stopword, stemmer, verbose))
     # collect unique terms per document
     for datum in data.collect():
         datum = Dataset(datum)
         unique_term = datum.reduceByKey(lambda x,y: 1).map(lambda x : (x[0], 1))
         self.unique_terms = self.unique_terms.append(unique_term).reduceByKey(add)
     #Assigning unique Terms (used in making query)
     self.unique_terms = self.unique_terms.collect()
     self.count = data.count()
     #IDF
     if verbose:
         print("pemrosesan IDF")
     self.IDF = Dataset(self.unique_terms).map(lambda x : (x[0],math.log(self.count/x[1]))).collect()
     #TF-IDF
     data = data.collect()
     data = self.TFIDF_func(data, verbose = True)
     #Adding to Dokumens
     self.TFIDF = data
     # L2-normalize each TF-IDF vector in parallel
     with concurrent.futures.ThreadPoolExecutor() as executor:
         self.TFIDF = np.array(list(executor.map(lambda x: x / np.linalg.norm(x), self.TFIDF)))
Example No. 8
 def getUniqueTerms(self):
     """
     Fungsi ini digunakan untuk mengambil Unique Term dari class
     """
     return Dataset(self.unique_terms).map(lambda x : x[0]).collect()
Example No. 9
class Dokumen_Cluster:
    '''TFIDF = []
    unique_terms = Dataset([])
    count = 0
    IDF = None'''
    
    def __init__(self, data, lemma=None, stopword=None, stemmer=None, verbose = False):
        """
        Fungsi init adalah fungsi untuk menginisialisasi Dokumen Cluster yang dapat digunakan untuk melakukan
        preprocessing text secara Bag of Words. Fungsi init akan secara otomatis melakukan preprocessing (eager)
        """
        self.TFIDF = []
        self.unique_terms = Dataset([])
        self.count = 0
        self.IDF = None
        if data is None:
            return
        #TF
        data = Dataset(self.TF_func(data, lemma, stopword, stemmer, verbose))
        # collect unique terms per document
        for datum in data.collect():
            datum = Dataset(datum)
            unique_term = datum.reduceByKey(lambda x,y: 1).map(lambda x : (x[0], 1))
            self.unique_terms = self.unique_terms.append(unique_term).reduceByKey(add)
        #Assigning unique Terms (used in making query)
        self.unique_terms = self.unique_terms.collect()
        self.count = data.count()
        #IDF
        if verbose:
            print("pemrosesan IDF")
        self.IDF = Dataset(self.unique_terms).map(lambda x : (x[0],math.log(self.count/x[1]))).collect()
        #TF-IDF
        data = data.collect()
        data = self.TFIDF_func(data, verbose = True)
        #Adding to Dokumens
        self.TFIDF = data
        # L2-normalize each TF-IDF vector in parallel
        with concurrent.futures.ThreadPoolExecutor() as executor:
            self.TFIDF = np.array(list(executor.map(lambda x: x / np.linalg.norm(x), self.TFIDF)))
        
        
    def TFIDF_func(self, data, verbose = False):
        """
        Fungsi ini dapat merubah TF dari suatu dokumen menjadi TF-IDF yang berdasarkan dokumen-dokumen yang tersimpan dalam
        dokumen cluster
        """
        if verbose:
            print("pemrosesan TF-IDF")
        for i in range(len(data)):
            datum = data[i]
            if verbose:
                print("dokumen ke-%d"%i)
            # Left-join IDF with the document's TF; terms absent from the document get weight 0,
            # then sort by term and keep only the weights.
            data[i] = (Dataset(self.IDF)
                       .leftOuterJoin(Dataset(datum))
                       .map(lambda x: (x[0], x[1][0][0] * x[1][1][0] if len(x[1][1]) > 0 else 0))
                       .sortBy(lambda x: x[0])
                       .map(lambda x: x[1])
                       .collect())
        '''TFIDFs = data
        query = []
        for TFIDF in TFIDFs:
            TFIDF = Dataset(self.unique_terms).map(lambda x: (x[0], 0)).leftOuterJoin(Dataset(TFIDF)).map(lambda x : (x[0], x[1][1][0] if len(x[1][1])>0 else x[1][0][0])).map(lambda x: x[1]).collect()
            query.append(TFIDF)
        if verbose :
                print(len(TFIDF))
        data = query
        '''
        return data
    def TF_func(self, data, lemma = None, stopword=None, stemmer=None, verbose = False):
        '''
        Perform Bag of Words text preprocessing.
        lemma : a list of dictionaries with structure [[(key -> value), ...], ...]
        stopword : a list of stopword lists with structure [[value, ...], ...]
        stemmer : a list of stemming functions with structure [function, ...]
        verbose : flag that enables the class's log output
        '''
        if verbose:
            print("pemrosesan TF")
        # prepare lemma dictionaries
        dictionary = []
        if lemma is not None:
            for dicti in lemma:
                dictionary.append(Dataset(dicti))
        # prepare stopword lists
        sw = []
        if stopword is not None:
            for stop in stopword:
                sw.append(Dataset(stop).map(x1))
        # prepare return value
        ret = []
        # Calculate TF
        for datum in data:
            # Cleaning and tokenizing
            datum = Dataset(datum).map(lambda x: re.sub('[^a-zA-Z ]+', '', x).lower()).flatMap(lambda x: x.strip().split()).map(x1)
            # Lemmatization
            if lemma is not None:
                for dicti in dictionary:
                    datum = datum.leftOuterJoin(dicti).map(lambda x: (x[1][1][0] if len(x[1][1]) > 0 else x[0], 1))
            # Stemming
            if stemmer is not None:
                datum = datum.map(lambda x: x[0]).collect()
                for stemmer_func in stemmer:
                    datum = stemmer_func(datum)
                datum = Dataset(datum).map(x1)
            # Stopword removal
            if stopword is not None:
                for stop in sw:
                    datum = datum.leftOuterJoin(stop).filter(lambda x: len(x[1][1]) == 0).map(lambda x: (x[0], 1))
            # Sublinear TF weighting: 1 + log(raw term count)
            datum = datum.reduceByKey(add).map(lambda x: (x[0], 1 + math.log(x[1])))
            ret.append(datum.collect())
        return ret
    
    def getQuery(self, data, lemma=None, stopword=None, stemmer=None):
        """
        Fungsi ini digunakan untuk memperoleh TFIDF dari dokumens berdasarkan dokumens yang tersimpan dalam class
        """
        return self.TFIDF_func(self.TF_func(data, lemma, stopword, stemmer))
    
    def getUniqueTerms(self):
        """
        Fungsi ini digunakan untuk mengambil Unique Term dari class
        """
        return Dataset(self.unique_terms).map(lambda x : x[0]).collect()
    
    def toFile(self, folder):
        """
        Fungsi ini digunakan untuk menyimpan hasil preprocessing ke dokumen
        """
        os.makedirs(folder, exist_ok=True)
        print("saving TFIDF")
        with open(folder+'/TFIDF', 'w') as file:
            for TF in self.TFIDF:
                    for term in TF :   
                        file.write(str(term)+',')
                    file.write('\n')
        print("saving IDF")
        with open(folder+'/IDF', 'w') as file:
            for I in self.IDF:
                file.write(str(I[0])+','+str(I[1]))
                file.write('\n')
        print("saving UNIQUE TERM")
        with open(folder+'/unique', 'w') as file:
            for I in self.unique_terms:
                file.write(str.format("%s, %d"%(I[0], I[1])))
                file.write('\n')
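A hedged end-to-end usage sketch for the class above. The documents and the output folder are illustrative; `x1`, the optional lemma/stopword/stemmer inputs, and the `Dataset` class come from the surrounding project and must be available for this to run.

# Each document is assumed to be a list of raw text lines; the corpus is a list of documents.
docs = [
    ["information retrieval with tf idf"],
    ["clustering documents with bag of words"],
]
cluster = Dokumen_Cluster(docs, verbose=True)   # eager preprocessing: TF -> IDF -> TF-IDF
print(cluster.getUniqueTerms())                 # vocabulary collected from the corpus
cluster.toFile("saved_cluster")                 # hypothetical output folder

# TF-IDF vector of a new query document, in the same term order as the rows of cluster.TFIDF.
query_vec = cluster.getQuery([["retrieval of documents"]])[0]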
Example No. 10
def main(data_path, epochs):
    run_start_time = str(datetime.datetime.now().isoformat())
    dataset = Dataset(data_path)
    train, test = dataset.train_test_split()
    signal_seq = ExampleSequence(dataset, train, name='train')
    test_seq = ExampleSequence(dataset, test, name='test')
    os.mkdir('runs/' + run_start_time)
    log_dir = os.path.join('runs', run_start_time)
    write_lines_to_file(os.path.join(log_dir, 'test.txt'), test)
    write_lines_to_file(os.path.join(log_dir, 'train.txt'), train)
    csv_logger = CSVLogger(os.path.join(log_dir, 'Log.csv'))
    model = get_default_model()
    model = multi_gpu_model(model, gpus=2)
    param = {
        'lr': 0.001,
        'beta_1': 0.9,
        'beta_2': 0.999,
        'epsilon': None,
        'decay': 0.001
    }
    param_file_path = os.path.join(log_dir, 'params.json')
    adam = optimizers.Adam(**param)
    param.update({'data_path': data_path, 'epochs': epochs})
    write_dict_to_file(param_file_path, param)
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=adam)
    model.fit_generator(signal_seq,
                        validation_data=test_seq,
                        epochs=epochs,
                        callbacks=[csv_logger])
    model.save(os.path.join(log_dir, 'model.h5'))
    sub_model = model.get_layer('model_1')
    im_model = Model(inputs=sub_model.get_input_at(0),
                     outputs=sub_model.get_layer('activation_1').output)
    dists = []
    ops = []
    lens = []
    pred_lens = []
    real = []
    predicted = []
    # Decode each test batch with CTC beam search and compare against the ground-truth labels.
    for j in range(len(test_seq)):
        batch = test_seq[j][0]
        preds = im_model.predict_on_batch(batch)
        val = K.ctc_decode(preds,
                           np.full(150, batch['input_length'][0, 0]),
                           greedy=False)
        decoded = K.eval(val[0][0])
        for i in range(decoded.shape[0]):
            real_label = batch['the_labels'][i, :batch['label_length'][i, 0]]
            real_label = ''.join([str(int(x)) for x in real_label.tolist()])
            pred_label = list(filter(lambda x: x != -1,
                                     decoded[i, :].tolist()))
            pred_label = [str(x) for x in pred_label]
            pred_label = ''.join(pred_label)
            dists.append(distance(pred_label, real_label))
            ops.append(editops(pred_label, real_label))
            lens.append(len(real_label))
            pred_lens.append(len(pred_label))
            real.append(real_label)
            predicted.append(pred_label)
    op_counts = {'insert': 0, 'replace': 0, 'delete': 0}
    for op in ops:
        for x in op:
            op_counts[x[0]] += 1
    for key in op_counts.keys():
        op_counts[key] = op_counts[key] / sum(lens)
    metrics = {
        'LER': sum(dists) / sum(lens),
        'real_mean_length': np.mean(lens),
        'predicted_mean_length': np.mean(pred_lens)
    }
    metrics.update(op_counts)
    metrics_file_path = os.path.join(log_dir, 'metrics.json')
    write_dict_to_file(metrics_file_path, metrics)
    plot_model(im_model, to_file=os.path.join(log_dir, 'model.png'))