Example #1
def find_similarity():
    file_text = []
    raw_files = os.listdir(path)
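    # read and preprocess every file in both directories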
    for file in raw_files:
        with open(os.path.join(path, file), encoding="utf8") as fp:
            filetext = fp.readlines()
        file_text.append(text_preprocessing(" ".join(filetext)))
    raw_files = os.listdir(files_path)
    for file in raw_files:
        with open(os.path.join(files_path, file), encoding="utf8") as fp:
            filetext = fp.readlines()
        file_text.append(text_preprocessing(" ".join(filetext)))
    match_index = findSimByTfCos(file_text)
    if match_index != 1000:  # 1000 is the sentinel index returned by findSimByTfCos
        result = raw_files[match_index]
        # the matched file name encodes its folder before the second underscore
        val = -1
        for i in range(0, 2):
            val = result.find('_', val + 1)
        return render_template("cmp_files.html",
                               file_name=cmp_query_file_name,
                               folder_name=result[:val],
                               result=cmp_query_file_name + " is similar to " +
                               result[val + 1:])
    else:
        return render_template("cmp_files.html",
                               file_name=cmp_query_file_name,
                               folder_name="",
                               result="plagiarised")
Example #2
 def get_prediction(self, sentence, start_tag = "[", end_tag = "]"):
     sentence = text_preprocessing(sentence)
     seq = self.data_tokenizer.texts_to_sequences([sentence])
     seq = seq[0]
     result = []
     insert_end = False
     insert_start = True
     prev_word = None
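     # classify each NGRAM-length window of token ids; each result row is [word, next word, predicted class]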
     for idx in range(0, len(seq)-NGRAM+1):
         category = self.model.predict(np.atleast_2d([seq[idx: idx+NGRAM]]))
         cat = category.argmax()
         result.append([self.data_index[seq[idx]], self.data_index[seq[idx+1]], cat])
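     # rebuild the sentence, wrapping stretches predicted as class 1 in start/end tags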
     string_result = []
     for row in result:
         if row[2] == 1 and insert_start:
             string_result.append(start_tag)
             string_result.append(row[0])
             insert_end = True
             insert_start = False
             prev_word = row[0]
         elif row[2] == 0 and insert_end:
             string_result.append(row[0])
             string_result.append(end_tag)
             insert_end = False
             insert_start = True
         else:
             string_result.append(row[0])
     if result[-1][2] == 1:
         string_result.append(result[-1][1])
         string_result.append(end_tag)
     else:
         string_result.append(result[-1][1])
     return " ".join(string_result)
Example #3
def doc2vec_Fun():
    df = pd.read_excel('fake_new_dataset.xlsx')
    df.title = df.title.astype(str)
    df.text = df.text.astype(str)

    df['news'] = df['title'] + df['text']
    df.drop(labels=['title', 'text'], axis=1, inplace=True)
    df.drop(labels=['subcategory'], axis=1, inplace=True)
    list_label = [0, 0, 1, 1, 0]
    doc = []
    for item in df['news']:
        item = preprocessing.text_preprocessing(item)
        doc.append(item)
        if len(doc) == 5:
            break
    tokenized_doc = []
    for d in doc:
        tokenized_doc.append(word_tokenize(d.lower()))
    tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_doc)]
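    # train a small Doc2Vec model (gensim) on the tagged documents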
    model = Doc2Vec(tagged_data,
                    vector_size=100,
                    window=2,
                    min_count=1,
                    workers=4,
                    epochs=100)
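    # collect one learned document vector per tag, in order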
    list_data = []
    for index in range(0, len(model.dv)):
        list_data.append(model.dv[index])
    return list_data, list_label
Example #4
def predict(data):
    #preprocessing
    data = preprocessing.define_client_type(data)
    data = data['client'] + ' ' + data['topic'] + ' ' + data['description']
    data = preprocessing.text_preprocessing(data)

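    # classify the preprocessed text and map the encoded prediction back to its original label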
    prediction = model.predict([data])
    prediction = le.inverse_transform(prediction)

    return prediction[0]
Example #5
def test_model(file, model_svm, vect, le):
    test_input_data, test_output_data = get_test_data(file)
    test_input_data = text_preprocessing(test_input_data)
    test_input_data = vect.transform(test_input_data)
    test_output_data = le.transform(test_output_data)

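    # evaluate on the held-out data: accuracy plus confusion matrix, precision, recall and F1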
    accuracy = model_svm.score(test_input_data, test_output_data)
    predicted_value = model_svm.predict(test_input_data)
    matrix, precision, recall, f1 = evaluating_model(test_output_data, predicted_value)
    return accuracy, matrix, precision, recall, f1
Example #6
def train_model(file):
    input_data, output_data = create_input_output(file)
    input_data = text_preprocessing(input_data)
    vect, input_data = vectorize(input_data)
    le, output_data = label_encoding(output_data)

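    # fit the SVM; the accuracy reported below is measured on the training data itself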
    print("Training SVM model:\n")
    model_svm = model_support_vector_machine(input_data, output_data)
    print("SVM model trained successfully\n")
    accuracy = model_svm.score(input_data, output_data)
    return accuracy, model_svm, vect, le
Example #7
 def get_scoring(self, sentence):
     sentence = text_preprocessing(sentence)
     seq = self.data_tokenizer.texts_to_sequences([sentence])
     seq = seq[0]
     word_seq = [self.data_index[_] for _ in seq]
     word_score = {}
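     # accumulate, per word, the predicted class of every NGRAM window that starts at that word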
     for idx in range(0, len(seq) - NGRAM + 1):
         category = self.model.predict(np.atleast_2d([seq[idx: idx+NGRAM]]))
         cat = category.argmax()
         word_score[self.data_index[seq[idx]]] = word_score.get(self.data_index[seq[idx]], 0) + cat
     return word_seq, word_score
Example #8
def predict():
    my_prediction = 2  # fallback value when the request is not a POST or the message is empty
    if request.method == 'POST':
        message = request.form['message']
        if len(message) != 0:
            text = [message]
            data = preprocessing.text_preprocessing(text)
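            # vectorise with the previously fitted transformer cv and classify with clf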
            vect = cv.transform(data)
            my_prediction = clf.predict(vect)
        else:
            my_prediction = 2

    return render_template('home.html', prediction=my_prediction)
Example #9
def load_texts_for_dataset(file, dir_name):
    """
    Load a text file used for training the network.
    Arguments:
        file --------- str, name of the file to load
        dir_name ----- str, directory where the file is located
    Returns:
        doc ---------- str, the file's contents, read and lower-cased
    """

    file_name = os.path.join(dir_name, file)
    with open(file_name, 'r', encoding='utf-8') as file_read:
        doc = text_preprocessing(file_read.read())

    return doc
Example #10
def main():
    # locate and load the dataset
    dataset_loc = "Dataset/Dataset AlQuran Multilabel.xlsx"
    dataset = read_dataset(dataset_loc)[:100]

    # select the terjemahan (translation) column from the dataset
    en_verses = [row[3] for row in dataset]

    # preprocessing phase
    preprocessed_text = [text_preprocessing(verse) for verse in en_verses]

    # tfidf phase
    tfidf_matrix = tf_idf(preprocessed_text)

    # locate and write tfidf matrix
    output_loc = "Output/test_output.xlsx"
    write_data(output_loc, tfidf_matrix)
Example #11
def classify_sentiment(review):
    review = review.split('\n')
    review = text_preprocessing(review)
    review = vect.transform(review)
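    # map the encoded predictions back to the original sentiment labels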
    result = le.inverse_transform(model.predict(review))
    return result
Example #12
"Perancangan User Experience Aplikasi Pendukung Evaluasi dan Analisis Proses Pembelajaran untuk Guru Berbasis Android dengan Metode User-Centered Design dan Design Solution",
"Pengaruh Kualitas Implementasi Model Pembelajaran Tipe Student Teams Achievements Divisions (STAD) dan Model Pembelajaran Tipe Numbered Head Together (NHT) terhadap Hasil Belajar Siswa Kelas X Program Keahlian Teknik Komputer dan Informatika Mata Pelajaran",
"Pengembangan Sistem Manajemen Penjadwalan Les Privat Berbasis Web (Studi Kasus: Naoyuki Academic Center)",
"Pemodelan Arsitektur Bisnis Guna Mendukung Bisnis Berkelanjutan Menggunakan Pendekatan Enterprise Architecture (Studi Kasus: Kedai Kopi “Kopi Soe Malang”)",
"Prediksi Harga Emas Dengan Menggunakan Metode Average-Based Fuzzy Time Series",
"Evaluasi Usability dan Rekomendasi Perbaikan pada Aplikasi E-Kinerja Kabupaten Kediri menggunakan Metode Heuristic Evaluation",
"Pengembangan Sistem Manajemen Notulensi dan Dokumentasi Rapat Berbasis Web (Studi Kasus: Jurusan Teknik Informatika Fakultas Ilmu Komputer Universitas Brawijaya)",
"Pengembangan Sistem Monitoring Tingkat Stres berbasis Website",
"Temu Kembali Informasi Lintas Bahasa Dokumen Berita Bahasa Indonesia-Inggris menggunakan Metode BM25F",
"Prediksi Kecenderungan Pelanggan Telat Bayar pada Layanan Pembiayaan Adira Finance Saluran E-Commerce",
"Pengembangan Modul Digital Interaktif Berbasis Website menggunakan Kerangka Kerja Borg, Gall, And Gall pada Mata Pelajaran Administrasi Sistem Jaringan di SMK Negeri 12 Malang",
"Klasifikasi Jurusan Siswa menggunakan K-Nearest Neighbor dan Optimasi dengan Algoritme Genetika (Studi Kasus: SMAN 1 Wringinanom Gresik)",
"Analisis Pengalaman Pengguna Aplikasi Pemesanan Tiket Bioskop menggunakan User Experience Questionnaire (UEQ) dan Heuristic Evaluation (HE)",
"Evaluasi dan Perancangan User Experience menggunakan Metode Human Centered Design dan Heuristic Evaluation pada Aplikasi Dunia Games"]]

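    # preprocess the pasted document titles and weight them with TF-IDF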
    preprocessing_doc = preprocessing.text_preprocessing(document)
    document_weighting = tfidf.tfIdfCalculation([preprocessing_doc])
    # preprocessing_doc = []
    #
    # for d in document:
    #     preprocessing_doc.append(preprocessing.text_preprocessing(d))

    # document_weighting = tfidf.tfIdfCalculation(preprocessing_doc)
    print(document_weighting)


    # som = som.selfOrganizingMaps(document_weighting, 0.6, 0.5, 10)
    # print(som)

    # Visualization
    # X, target = make_blobs(n_samples=30, n_features=2, centers=3)
def main():
    # locate and load the dataset
    dataset_loc = "./Dataset AlQuran Multilabel.xlsx"
    dataset = read_dataset(dataset_loc)[:200]

    # select the terjemahan (translation) column from the dataset
    en_verses = [row[3] for row in dataset]

    # preprocessing phase
    preprocessed_text = [text_preprocessing(verse) for verse in en_verses]

    # tfidf phase
    tfidf_matrix, vocab = tf_idf(preprocessed_text[:200])

    k_fold = 5
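    # pnn: number of nearest training documents kept per class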
    pnn = 4

    tfidf_matrix_split = chunkIt(tfidf_matrix, k_fold)
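    # split the TF-IDF rows into k_fold chunks for cross-validation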
    print(tfidf_matrix_split)
    print(len(tfidf_matrix_split))

    akurasi = []
    count_all_hamming = 0

    for i in range(k_fold):

        print("Fold -", i + 1)
        selisih = 0
        count_test = 0
        for i_label in range(4, 20):
            test = []
            train = []

            target_train = []
            target_test = []
            target_actual = []

            label = [row[i_label] for row in dataset]
            label_split = chunkIt(label, k_fold)

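            # fold i is the test chunk; all remaining chunks form the training set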
            for j in range(len(tfidf_matrix_split)):
                if j == i:
                    test.extend(tfidf_matrix_split[j])
                    target_test.extend(label_split[j])
                else:
                    train.extend(tfidf_matrix_split[j])
                    target_train.extend(label_split[j])

            count_test = len(test)
            print("Label -", i_label - 3)
            target_output = []

            for a in range(len(test)):
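                # Euclidean distance from test document a to every training document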
                jarak = []
                for b in range(len(train)):

                    jarak.append(
                        math.sqrt(
                            sum(
                                np.subtract(np.array(test[a]),
                                            np.array(train[b]))**2)))

                print(len(jarak))

                jarakKlas = []
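                # for each class label (0 and 1), keep the pnn smallest distances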
                for b in range(2):  #[0,1]
                    tmp_jarakKlas = []
                    value_a = []
                    for c in range(len(jarak)):
                        if target_train[c] == b:
                            tmp_jarakKlas.append(jarak[c])
                    tmp_jarakKlas.sort()
                    value_a.sort()
                    for line in tmp_jarakKlas[:pnn]:
                        jarakKlas.append([b, line])

                b = {}
                for x in jarakKlas:
                    b.setdefault(x[0], []).append(x[1])

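                # rank-weighted sum of the kept distances per class; the class with the smallest sum is chosen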
                avgDict = {}
                x = 1
                for k, v in b.items():
                    avgDict[k] = sum((1.0 / i) * g for i, g in enumerate(v, 1))
                    x += 1

                min_index, min_value = min(
                    avgDict.items(),
                    key=lambda x: x[1])  # to get the key with the minimum value
                target_actual.append(min_index)

            for y in range(len(target_test)):
                if target_actual[y] != target_test[y]:
                    selisih += 1

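        # Hamming loss for this fold: misclassified (document, label) pairs over count_test * 16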
        print(selisih)
        print(count_test)
        nilai = count_test * 16  # 16 labels are evaluated per document
        hamming_loss = selisih / float(nilai)
        print("hamming_loss :", hamming_loss)
        count_all_hamming += hamming_loss
        print("")

    print("Overall Hamming result")
    avg_hamming = count_all_hamming / k_fold
    print(avg_hamming)