Example #1
class Vocabulary:
    # Assumes the project's Preprocessing class is importable in this module.
    def __init__(self, model):
        self.preprocessing = Preprocessing(model)
        self.model = model

    def read_config_vocabulary(self, keyword):
        config = {'vocab_size': 0, 'data_count': 0}

        result = self.model.select_vocab_config(keyword)

        result = result if result else config

        return result

    def write_config_vocabulary(self, vocab_size, data_count, keyword):
        self.model.save_vocab_config(vocab_size, data_count, keyword)

    def write_vocabulary(self, list_word, list_idf, list_total, keyword):
        self.model.save_vocabulary(list_word, list_idf, list_total, keyword)

    def read_vocabulary(self, keyword):
        vocabulary = self.model.select_vocabulary(keyword)

        return vocabulary

    def create_vocabulary(self, data, keyword):
        # Extend the stored vocabulary with every unseen token from the new documents.
        vocabulary = self.read_vocabulary(keyword)

        for words in data:
            words_token = self.preprocessing.tokennizing(words)
            for word in words_token:
                if word not in vocabulary:
                    vocabulary.append(word)

        vocabulary_size = len(vocabulary)
        data_count = len(data)

        # Only the size/count config is persisted here; the word list itself is
        # written later via write_vocabulary (see TfIdf.word_idf).
        self.write_config_vocabulary(vocabulary_size, data_count, keyword)

        return vocabulary
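
The core of create_vocabulary is an order-preserving union of tokens across documents. A minimal standalone sketch of the same idea (a plain whitespace split stands in for the project's Preprocessing.tokennizing; the data is illustrative only):

def build_vocabulary(documents, vocabulary=None):
    # Order-preserving union of tokens across all documents.
    vocabulary = list(vocabulary) if vocabulary else []
    for text in documents:
        for token in text.split():
            if token not in vocabulary:
                vocabulary.append(token)
    return vocabulary

docs = ["banjir jakarta hari ini", "macet jakarta parah", "banjir lagi"]
print(build_vocabulary(docs))
# ['banjir', 'jakarta', 'hari', 'ini', 'macet', 'parah', 'lagi']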
Example #2
def backgroundprocess_preprocessing(keyword, data_preprocessing, username):
    # Assumes the project's Model, TfIdf and Preprocessing classes, the
    # background_preprocessing helper and the module-level `chache` status dict
    # are available in this module.
    model = Model()
    tfidf = TfIdf(model)
    preprocessing = Preprocessing(model)
    list_id_tweet = []
    list_tweet_stemming = []
    list_tfidf = []

    try:
        # Remove any previously clustered results for this keyword.
        model.delete_clustered(keyword)

        for data_tweet in data_preprocessing:
            id_tweet, tweet_stemming = background_preprocessing(
                data_tweet, keyword, preprocessing, model)
            list_id_tweet.append(id_tweet)
            list_tweet_stemming.append(tweet_stemming)

        tfidf.word_idf(list_tweet_stemming, keyword)
        list_idf = model.select_idf(keyword)

        for index, tweet_stemming in enumerate(list_tweet_stemming):
            id_tweet = list_id_tweet[index]
            tweet_tfidf = tfidf.tf_idf(
                preprocessing.tokennizing(tweet_stemming), keyword, list_idf)
            model.update_tfidf(id_tweet, keyword, tweet_tfidf)
            list_tfidf.append({'id': id_tweet, 'tfidf': tweet_tfidf})

        data = sorted(list_tfidf, key=lambda k: k['id'])

        model.update_vocab_config(data, keyword)

    except Exception as error:
        # Errors are only logged; the status flag below is still cleared.
        print(error)

    finally:
        # Mark preprocessing as finished for this user and release the DB connection.
        chache[username]['statuspreprocessing'] = 0
        model.close_connection()
Example #3
import copy
import math
import concurrent.futures
import numpy as np
from sklearn.decomposition import TruncatedSVD


class KMeans:

    def __init__(self, model):
        # Project-local Preprocessing and TfIdf classes are assumed importable here.
        self.preprocessing = Preprocessing(model)
        self.model = model
        self.tf_idf = TfIdf(model)

    def mm_normalize(self, data):
        # Despite the name, this reduces the TF-IDF vectors to 5 dimensions with
        # truncated SVD rather than min-max normalising them.
        # (An earlier pairwise cosine-similarity step is left disabled here.)
        data = np.array(data)

        svd = TruncatedSVD(n_components=5, n_iter=5)
        data = svd.fit_transform(data)

        return data.tolist()

    def mm_normalize_predict(self, data, list_data_train):
        # Fit the SVD on the training TF-IDF vectors, then project the new
        # vector into the same reduced space as the stored centroids.
        # (An earlier cosine-similarity computation is disabled here as well.)
        svd = TruncatedSVD(n_components=5, n_iter=5)
        svd.fit(list_data_train)
        a = svd.transform(data)

        # Return the single projected vector so predict() can measure its
        # distance to each stored centroid.
        return a.tolist()[0]

    def manhatan(self, data_1, data_2):
        # Manhattan (L1) distance between two equal-length vectors.
        return sum(abs(data_1[i] - data_2[i]) for i in range(len(data_2)))

    def euclidean(self, data_1, data_2):
        # Euclidean (L2) distance between two equal-length vectors.
        return math.sqrt(sum((data_1[i] - data_2[i]) ** 2 for i in range(len(data_2))))

    def jacard(self, data_1, data_2):
        # Jaccard distance: 1 - |intersection| / |union| of the two value sets.
        intersection = set(data_1) & set(data_2)
        union = set(data_1) | set(data_2)
        return round(1 - (float(len(intersection)) / len(union)), 4)

    def up_date(self, cluster, all_data, centroid_before, k):
        # Recompute each centroid as the mean of its assigned points; an empty
        # cluster keeps its previous centroid.
        new_centroid = []

        for centroid in range(k):
            tweet_dalam_kelas_sekarang = [
                all_data[index] for index, kelas in enumerate(cluster)
                if int(kelas) == int(centroid)
            ]

            if tweet_dalam_kelas_sekarang:
                centroid_tweet_baru = np.array(tweet_dalam_kelas_sekarang).mean(axis=0)
            else:
                centroid_tweet_baru = centroid_before[centroid]

            new_centroid.append(centroid_tweet_baru)

        return new_centroid

    def hitung_sse(self, tweet1, centroid):
        # Squared Manhattan distance of one tweet vector to a centroid.
        return math.pow(self.manhatan(tweet1, centroid), 2)

    def sse(self, keyword, cluster, k, centroid_text, all_data):
        # Total of squared distances from each tweet to its assigned centroid.
        sse_total = 0

        for centroid in range(k):
            centroid_now = centroid_text[centroid]
            tweet_dalam_kelas_sekarang = [
                all_data[index] for index, kelas in enumerate(cluster) if kelas == centroid
            ]

            with concurrent.futures.ThreadPoolExecutor(max_workers=200) as executor:
                myfuture = {
                    executor.submit(self.hitung_sse, tweet1, centroid_now): tweet1
                    for tweet1 in tweet_dalam_kelas_sekarang
                }

                for future in concurrent.futures.as_completed(myfuture):
                    sse_total = sse_total + future.result()

        self.write_sse(sse_total, keyword)

        return sse_total

    def output(self, id, keyword, cluster, k):
        # Group tweet ids by their (1-based) cluster label and persist the grouping.
        dict_final = {}

        for centroid in range(k):
            id_tweet_dalam_kelas_sekarang = [
                id[index2] for index2, kelas in enumerate(cluster) if kelas == centroid
            ]
            dict_final[centroid + 1] = id_tweet_dalam_kelas_sekarang

        self.write_cluster(dict_final, keyword)

    def read_sse(self, keyword):
        sse = self.model.select_sse_args(keyword)

        return sse

    def write_sse(self, data, keyword):

        self.model.save_sse(data, keyword)

    def read_model(self, keyword):
        model = self.model.select_centroid_cluster_args(keyword)

        return model

    def write_model(self, data, cluster, keyword):
        self.model.delete_centroid_cluster(keyword)

        for key, value in data.items():
            cluster_name = cluster[key - 1]
            self.model.save_centroid_cluster(value, keyword, key, cluster_name)

    def read_cluster(self, keyword):
        cluster_name = self.model.select_cluster_args(keyword)

        return cluster_name

    def write_cluster(self, data, keyword):
        self.model.delete_cluster(keyword)

        for key, value in data.items():
            for i in value:
                self.model.save_cluster(i, keyword, key)

    def fit(self, keyword, id, centroids, cluster_name, all_data, jumlah_tweet, k=3, iterasi=100):
        # Standard k-means loop: assign each tweet to the nearest centroid
        # (Manhattan distance), recompute the centroids, and stop early once
        # the SSE no longer improves.
        cluster_outer = []
        dict_model_outer = {}
        dict_model_inner = {}
        sse_outer = 0
        all_data = self.mm_normalize(all_data)

        # The initial centroids are the reduced vectors of the chosen tweet ids.
        centroid_text = [all_data[id.index(int(x))] for x in centroids]

        for iterat in range(int(iterasi)):
            print("iterasi ke ->   ", iterat)
            cluster = []

            for i in range(jumlah_tweet):
                jarak_antar_centroid = [
                    self.manhatan(all_data[i], centroid_text[j]) for j in range(k)
                ]
                kelas_terdekat = jarak_antar_centroid.index(min(jarak_antar_centroid))
                cluster.append(kelas_terdekat)

            print(cluster)

            centroid_text = copy.deepcopy(self.up_date(cluster, all_data, centroid_text, k))

            dict_model_inner = {index + 1: data for index, data in enumerate(centroid_text)}

            sse_inner = self.sse(keyword, cluster, k, centroid_text, all_data)

            # Stop when the SSE stops decreasing; otherwise keep this iteration's result.
            if sse_inner >= sse_outer and sse_outer != 0:
                break

            cluster_outer = copy.deepcopy(cluster)
            dict_model_outer = copy.deepcopy(dict_model_inner)
            sse_outer = sse_inner

        self.write_model(dict_model_outer, cluster_name, keyword)
        self.clustering = self.output(id, keyword, cluster_outer, k)
        self.sse(keyword, cluster_outer, k, centroid_text, all_data)

        return 0

    def predict(self, data, keyword):
        # Clean and vectorise one tweet, project it into the trained SVD space,
        # then assign it to the nearest stored centroid (Manhattan distance).
        cluster_name = None
        emoticon = None

        try:
            total_tweets, train_data = self.model.select_data_training(keyword)
            list_preprocessing = [x['tfidf'] for x in train_data]
            data_lower, data_regex, data_stopword, data_stemming = self.preprocessing.cleansing(data)
            data = self.preprocessing.tokennizing(data_stemming)
            data = [self.tf_idf.tf_idf(data, keyword)]
            data = self.mm_normalize_predict(data, list_preprocessing)
            model = self.read_model(keyword)
            model_nm = self.model.select_cluster_names(keyword)
            k = len(model)
            d = [self.manhatan(data, model.get(j + 1)) for j in range(k)]
            cluster = d.index(min(d)) + 1
            cluster_name = model_nm.get(cluster)
            emoticon = self.model.select_emoticon(cluster, keyword)
        except Exception as error:
            print(error)
        finally:
            self.model.close_connection()

        return cluster_name, emoticon
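
For reference, a small self-contained sketch of the same assign/update loop that fit() runs, on toy 2-D points with Manhattan distance and the same SSE-based early stop (toy data and names are illustrative only, with no project dependencies):

import numpy as np

def manhattan(a, b):
    # L1 distance, the same measure KMeans.manhatan uses.
    return sum(abs(a[i] - b[i]) for i in range(len(b)))

def kmeans_manhattan(points, centroids, iterations=100):
    best_sse, best_labels, best_centroids = None, None, list(centroids)

    for _ in range(iterations):
        # Assignment step: nearest centroid by Manhattan distance.
        labels = [min(range(len(centroids)),
                      key=lambda j: manhattan(p, centroids[j])) for p in points]

        # Update step: mean of each cluster's members; empty clusters keep their centroid.
        centroids = [
            np.mean([p for p, lab in zip(points, labels) if lab == j], axis=0).tolist()
            if any(lab == j for lab in labels) else centroids[j]
            for j in range(len(centroids))
        ]

        # Early stop once the sum of squared distances stops improving.
        sse = sum(manhattan(p, centroids[lab]) ** 2 for p, lab in zip(points, labels))
        if best_sse is not None and sse >= best_sse:
            break
        best_sse, best_labels, best_centroids = sse, labels, centroids

    return best_labels, best_centroids, best_sse

points = [[1, 1], [1, 2], [0, 1], [8, 8], [9, 8], [9, 9]]
print(kmeans_manhattan(points, centroids=[[0, 0], [10, 10]]))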
Example #4
import math


class TfIdf:
    def __init__(self, model):
        # Assumes the project's Vocabulary and Preprocessing classes are importable here.
        self.vocabulary = Vocabulary(model)
        self.preprocessing = Preprocessing(model)
        self.model = model

    def word_idf(self, document, keyword):
        # Compute the document frequency of every vocabulary term, keep only
        # terms that appear in 2..50 documents, and store idf = ln(N / df).
        total_document = len(document)
        vocabulary = self.vocabulary.create_vocabulary(document, keyword)
        idf = []
        total = []
        new_vocabulary = []

        for feature in vocabulary:
            total_this_feature = 0
            for data in document:
                list_word = self.preprocessing.tokennizing(data)
                if feature in list_word:
                    total_this_feature = total_this_feature + 1

            if 2 <= total_this_feature <= 50:
                idf.append(math.log(total_document / total_this_feature))
                total.append(total_this_feature)
                new_vocabulary.append(feature)

        self.write_vocabulary(new_vocabulary, idf, total, keyword)

    def tf(self, word1, data, length_sentences):
        # Term frequency: occurrences of word1 divided by the token count.
        freq_word = 0
        for word2 in data:
            if word1 == word2:
                freq_word = freq_word + 1

        return float(freq_word / length_sentences)

    def tf_idf(self, data, keyword, list_idf=False):
        # Build a TF-IDF vector for one tokenised document, aligned with the
        # stored vocabulary order; vocabulary terms not in the document get 0.
        value = []
        list_idf = self.read_idf(keyword) if not list_idf else list_idf
        list_word_text_now = []
        list_freq_word_now = []
        length_sentences = len(data)

        for word1 in data:
            if word1 not in list_word_text_now:
                tf = self.tf(word1, data, length_sentences)
                list_word_text_now.append(word1)
                list_freq_word_now.append(tf)

        for features in list_idf:
            word = features['word']
            idf = features['idf']

            if word in list_word_text_now:
                freq_word_in_sentences = list_freq_word_now[list_word_text_now.index(word)]
                value.append(float(freq_word_in_sentences) * float(idf))
            else:
                value.append(0)

        return value

    def write_vocabulary(self, vocabulary, idf, total, keyword):
        self.vocabulary.write_vocabulary(vocabulary, idf, total, keyword)

    def read_idf(self, keyword):
        idf = self.model.select_idf(keyword)

        return idf
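
Taken together, word_idf and tf_idf implement the standard formulas tf = count / tokens and idf = ln(N / df). A minimal standalone illustration on toy tokenised documents (toy data, no project dependencies; the 2..50 document-frequency filter from word_idf is omitted):

import math

docs = [["banjir", "jakarta"], ["banjir", "macet", "jakarta"], ["macet", "parah"]]
vocab = ["banjir", "jakarta", "macet", "parah"]

# idf = ln(N / df), where df is the number of documents containing the word.
idf = {w: math.log(len(docs) / sum(w in d for d in docs)) for w in vocab}

def tfidf_vector(tokens):
    # tf = occurrences / token count; one weight per vocabulary word, 0 if absent.
    return [tokens.count(w) / len(tokens) * idf[w] if w in tokens else 0 for w in vocab]

for d in docs:
    print(tfidf_vector(d))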