Пример #1
0
    def get_train_data(self):
        """Fit the SVM pipeline on the vnexpress training CSV and return it."""
        # Read the training set, decoding the file as UTF-8 via codecs.
        source = codecs.open('filtered_data/vnexpress.csv', 'r', 'utf-8')
        frame = pd.read_csv(source)

        # Fit the model's sklearn pipeline on article content vs. label.
        svm = SVMModel()
        fitted = svm.clf.fit(frame["content"], frame.label)
        return fitted
Пример #2
0
    def get_train_data(self):
        train_data = []
        fr_train = open('generated_files/cleanDataTrainVi.txt')
        for line in iter(fr_train.readline, ''):
            string_feature = unicode(line.rstrip(), "utf-8")

            string_target = unicode(fr_train.readline().rstrip(), "utf-8)")
            train_data.append({
                "feature": string_feature,
                "target": string_target
            })
        df_train = pd.DataFrame(train_data)

        #test data
        test_data = []
        accuracy = []

        fr_test = open('generated_files/cleanDataTestVi.txt')
        for line in iter(fr_test.readline, ''):
            string_feature = unicode(line.rstrip(), "utf-8")
            string_target = unicode(fr_test.readline().rstrip(), "utf-8)")
            accuracy.append(string_target)
            test_data.append({
                "feature": string_feature,
                "target": string_target
            })
        df_test = pd.DataFrame(test_data)

        model = SVMModel()
        clf = model.clf.fit(df_train["feature"], df_train.target)
        predicted = clf.predict(df_test["feature"])
        # Print predicted result
        h = 0
        for i in range(1, len(accuracy)):
            if predicted[i - 1] == accuracy[i - 1]:
                h += 1
        print '%f' % (h / float(i))
        print clf.predict_proba(df_test["feature"])

        while True:
            raw = raw_input("nhap gi do:")
            decoded = raw.decode("utf-8")
            test = []
            test.append({"feature": decoded, "target": u'HOTEL'})
            test_df = pd.DataFrame(test)
            print(clf.predict(test_df["feature"]))
Пример #3
0
    def predict(self, input):
        """Classify *input* with the pickled SVM model.

        Returns a tuple ``(predicted, probability)`` where ``predicted`` is
        the label array for the single-row frame and ``probability`` is the
        per-class probability vector of that row.
        """
        # Single-row frame; the "target" value is a placeholder the
        # classifier never reads at predict time.
        df_test = pd.DataFrame([{"feature": input, "target": "hoi_thoi_tiet"}])

        # Load the persisted classifier. (The original also built an unused
        # SVMModel() instance here and leaked the file handle.)
        filename = 'svm_model.sav'
        with open(filename, 'rb') as model_file:
            clf = pickle.load(model_file)

        predicted = clf.predict(df_test["feature"])
        probability = clf.predict_proba(df_test["feature"])[0]
        return predicted, probability
    def get_train_data(self):
        """Fit an SVM on a tiny hand-written intent corpus and print the
        prediction (and class probabilities) for one held-out sentence."""
        # Hand-labelled training sentences: weather questions, then greetings.
        weather_sentences = [
            u"Hôm nay trời đẹp không ?",
            u"Hôm nay thời tiết thế nào ?",
            u"Hôm nay mưa không ?",
        ]
        greeting_sentences = [
            u"Chào em gái",
            u"Chào bạn",
            u"Hello bạn",
            u"Hi kimi",
            u"Hi em",
        ]
        train_data = [{"feature": s, "target": "hoi_thoi_tiet"}
                      for s in weather_sentences]
        train_data += [{"feature": s, "target": "chao_hoi"}
                       for s in greeting_sentences]
        df_train = pd.DataFrame(train_data)

        # Single test sentence (expected intent: weather question).
        df_test = pd.DataFrame([{
            "feature": u"Nóng quá, liệu mưa không em ơi?",
            "target": "hoi_thoi_tiet"
        }])

        # Fit the SVM pipeline and classify the test sentence.
        model = SVMModel()
        clf = model.clf.fit(df_train["feature"], df_train.target)
        predicted = clf.predict(df_test["feature"])

        # Show the predicted label and the per-class probabilities.
        print(predicted)
        print(clf.predict_proba(df_test["feature"]))
Пример #5
0
    def train_data(self):
        """Build the news-category training set from per-topic JSON-lines
        files, fit the SVM pipeline, and pickle it to ``svm_model.sav``.

        Replaces eight copy-pasted read loops with one data-driven loop, and
        closes every file handle (the original leaked all nine).
        """
        # (path, label) pairs; each file holds one JSON object per line.
        sources = [
            ('data/giaitri.json', 'giai_tri'),
            ('data/giaoduc.json', 'giao_duc'),
            ('data/kinhdoanh.json', 'kinh_doanh'),
            ('data/phapluat-tintuc.json', 'phap_luat_tin_tuc'),
            ('data/thegioi.json', 'the_gioi'),
            ('data/thethao.json', 'the_thao'),
            ('data/thoisu.json', 'thoi_su'),
            ('data/tuvan.json', 'tu_van'),
        ]

        train_data = []
        for path, target in sources:
            with open(path, 'r', encoding='utf-8-sig') as source:
                for line in source:
                    data = json.loads(line)
                    # Title and sapo are repeated to weight them more heavily
                    # than the body text — preserved from the original.
                    feature = ' '.join([
                        str(data['title']).strip(),
                        str(data['sapo']).strip(),
                        str(data['title']).strip(),
                        str(data['sapo']).strip(),
                        str(data['text']).strip(),
                    ])
                    train_data.append({"feature": feature, "target": target})

        df_train = pd.DataFrame(train_data)
        model = SVMModel()
        clf = model.clf.fit(df_train["feature"], df_train.target)

        # Persist the fitted pipeline for later prediction runs.
        filename = 'svm_model.sav'
        with open(filename, 'wb') as sink:
            pickle.dump(clf, sink)
Пример #6
0
    def get_train_data(self):
        """Train four SVM classifiers over hotel room-name text — master room
        type, view, bed type and bed — print their held-out accuracies, and
        pickle each fitted model next to this module.
        """
        # NOTE(review): `common` is created but never used below.
        common = cm.Common()
        #  train data

        url = "people.csv"
        #train_data = TextClassificationPredict.connectMysql()

        train_data = TextClassificationPredict.readCSV(url)

        # Second CSV, only printed and cleaned — not used for training here.
        checkdata = TextClassificationPredict.readCSV("peoplemaster.csv")
        print(checkdata)

        df_train = pd.DataFrame(train_data)
        chectrain = pd.DataFrame(checkdata)

        # Numeric id per master_room_type; NOTE(review): not read afterwards.
        df_train['category_id'] = df_train['master_room_type'].factorize()[0]
        # Per-label counts; NOTE(review): `train_outcome` is never read.
        train_outcome = pd.crosstab(
            index=train_data["master_room_type"],  # Make a crosstab
            columns="count")  # Name the count column

        # Normalize free-text room names before vectorization.
        df_train['room_name'] = df_train["room_name"].apply(
            TextClassificationPredict.clean_text)
        chectrain['room_name'] = chectrain["room_name"].apply(
            TextClassificationPredict.clean_text)

        # Drop "Other" rows so the view / bedType / bed models train only on
        # rows carrying a concrete label for that attribute.
        dfview = df_train.drop(df_train[df_train['view'] == "Other"].index)
        dfBedType = df_train.drop(
            df_train[df_train['bedType'] == "Other"].index)
        dfBed = df_train.drop(df_train[df_train['bed'] == "Other"].index)

        # Targets for each of the four classification tasks.
        target = train_data['master_room_type']
        #target = checkdata['master_room_type']
        targetview = dfview['view']

        targetBedType = dfBedType['bedType']
        targetBed = dfBed['bed']

        # 80/20 split per task, fixed seed for reproducibility.
        traindata, testdata, labels_train, labels_test = train_test_split(
            df_train, target, test_size=0.2, random_state=10)
        traindataview, testdataview, labels_trainview, labels_testview = train_test_split(
            dfview, targetview, test_size=0.2, random_state=10)
        traindataBedType, testdataBedType, labels_trainBedType, labels_testBedType = train_test_split(
            dfBedType, targetBedType, test_size=0.2, random_state=10)
        traindataBed, testdataBed, labels_trainBed, labels_testBed = train_test_split(
            dfBed, targetBed, test_size=0.2, random_state=10)

        # One independent SVM pipeline per task.
        #model = NaiveBayesModel()
        model = SVMModel()
        modelview = SVMModel()
        modelBedType = SVMModel()
        modelBed = SVMModel()

        clf = model.clf.fit(traindata["room_name"], traindata.master_room_type)
        clfview = modelview.clf.fit(traindataview["room_name"],
                                    traindataview.view)
        clfBedType = modelBedType.clf.fit(traindataBedType["room_name"],
                                          traindataBedType.bedType)
        clfBed = modelBed.clf.fit(traindataBed["room_name"], traindataBed.bed)

        # NOTE(review): room_name was already cleaned above, so clean_text is
        # applied twice here — presumably harmless if it is idempotent; verify.
        predicted = clf.predict(testdata['room_name'].apply(
            TextClassificationPredict.clean_text))
        predictedview = clfview.predict(testdataview['room_name'].apply(
            TextClassificationPredict.clean_text))
        predictedBedType = clfBedType.predict(
            testdataBedType['room_name'].apply(
                TextClassificationPredict.clean_text))
        predictedBed = clfBed.predict(testdataBed['room_name'].apply(
            TextClassificationPredict.clean_text))

        # Report held-out accuracy for each of the four models.
        #print (predicted)
        print('accuracy %s' % accuracy_score(predicted, labels_test))
        print('accuracyView %s' %
              accuracy_score(predictedview, labels_testview))
        print('accuracyBedType %s' %
              accuracy_score(predictedBedType, labels_testBedType))
        print('accuracyBed %s' % accuracy_score(predictedBed, labels_testBed))

        # NOTE(review): `a` (class probabilities) is computed but never used.
        a = clf.predict_proba(testdata["room_name"])
        # Persist all four fitted models alongside this module.
        TextClassificationPredict.save_model(
            os.path.abspath(os.path.dirname(__file__)) + "/x_transformer.pkl",
            clf)
        TextClassificationPredict.save_model(
            os.path.abspath(os.path.dirname(__file__)) +
            "/x_transformerView.pkl", clfview)
        TextClassificationPredict.save_model(
            os.path.abspath(os.path.dirname(__file__)) +
            "/x_transformerBedType.pkl", clfBedType)
        TextClassificationPredict.save_model(
            os.path.abspath(os.path.dirname(__file__)) +
            "/x_transformerViewBed.pkl", clfBed)

        # Attach predictions to a copy of the test frame (not returned).
        dt = pd.DataFrame(testdata)
        dt["predicted"] = predicted
 # NOTE(review): byte-for-byte duplicate of Pipeline.set_classification_model
 # below, pasted at an inconsistent indent — presumably a scrape artifact;
 # confirm and remove.
 def set_classification_model(self, type=SVM):
     if type==SVM:
         self.model = SVMModel()
class Pipeline:
    """End-to-end sentence-pair classification pipeline.

    Loads train/test sentence pairs, extracts string-similarity features over
    token / stem / lemma views of each pair, and trains and evaluates a
    classification model (SVM by default).
    """

    def __init__(self):
        # Default wiring: CSV datasets, SVM model, all feature extractors,
        # then eager feature/label extraction for both splits.
        self.set_trainset_directory()
        self.set_testset_directory()
        self.set_classification_model()
        self.set_features()
        self.extract_train_features()
        self.extract_train_labels()
        self.extract_test_features()
        self.extract_test_labels()

    def set_trainset_directory(self, directory="../data/train_pairs.csv"):
        """Load the training sentence pairs from *directory*."""
        self.trainset = DataSet(directory=directory)
        self.trainset.read_data()

    def set_testset_directory(self, directory="../data/test_pairs.csv"):
        """Load the test sentence pairs from *directory*."""
        self.testset = DataSet(directory=directory)
        self.testset.read_data()

    def set_classification_model(self, type=SVM):
        """Select the classification model; only SVM is currently supported.

        Any other *type* leaves ``self.model`` unset, matching the original
        behavior.
        """
        if type == SVM:
            self.model = SVMModel()

    def set_features(self, features=(SIMPLE_MATCHING, LAVENSHTEIN_DISTANCE, ROUGE_S,
                                     CONSECUTIVE_SUBSEQUENCE_MATCHING, TRI_GRAM_CHARACTER)):
        """Build ``self.feature_extractors`` from the names in *features*.

        The default is a tuple (was a mutable list default — a shared-state
        pitfall); membership tests behave identically.
        """
        self.feature_extractors = []
        if SIMPLE_MATCHING in features:
            self.feature_extractors.append(SimpleMatchingExtractor())
        if LAVENSHTEIN_DISTANCE in features:
            self.feature_extractors.append(LavenshteinExtractor())
        if ROUGE_S in features:
            self.feature_extractors.append(RougeSExtractor())
        if CONSECUTIVE_SUBSEQUENCE_MATCHING in features:
            self.feature_extractors.append(ConsecutiveSubsequenceMatchingExtractor())
        if TRI_GRAM_CHARACTER in features:
            self.feature_extractors.append(TriGramCharacterExtractor())

    def extract_features(self, dataset: DataSet):
        """Return one feature row per sentence pair in *dataset*.

        Each extractor contributes three values per pair: one over the token
        view, one over the stem view, one over the lemma view.
        """
        X = []
        for sentence_pair in dataset.sentence_pairs:
            x = []
            token_pair = ProcessSentencePair(WordTokenizePreProcessor().transform(sentence_pair.text),
                                             WordTokenizePreProcessor().transform(sentence_pair.hypothesis))
            pos_pair = ProcessSentencePair(PosTagPreProcessor().transform(sentence_pair.text, token_pair.text),
                                           PosTagPreProcessor().transform(sentence_pair.hypothesis, token_pair.hypothesis))
            stem_pair = ProcessSentencePair(StemPreProcessor().transform(sentence_pair.text, token_pair.text),
                                            StemPreProcessor().transform(sentence_pair.hypothesis, token_pair.hypothesis))
            # NOTE(review): the hypothesis side lemmatizes from token_pair
            # while the text side uses pos_pair — looks like a copy-paste
            # slip; confirm intent before changing (kept as-is here).
            lemma_pair = ProcessSentencePair(LemmaPreProcessor().transform(sentence_pair.text, pos_pair.text),
                                             LemmaPreProcessor().transform(sentence_pair.hypothesis, token_pair.hypothesis))

            for feature_extractor in self.feature_extractors:
                x.append(feature_extractor.transform(sentence_pair, token_pair))
                x.append(feature_extractor.transform(sentence_pair, stem_pair))
                x.append(feature_extractor.transform(sentence_pair, lemma_pair))
            X.append(x)
        return X

    def extract_labels(self, dataset: DataSet):
        """Return the label of every sentence pair in *dataset*, in order."""
        return [sentence_pair.label for sentence_pair in dataset.sentence_pairs]

    def extract_train_features(self):
        """Cache feature rows for the training split."""
        self.train_features = self.extract_features(dataset=self.trainset)

    def extract_train_labels(self):
        """Cache labels for the training split."""
        self.train_labels = self.extract_labels(dataset=self.trainset)

    def extract_test_features(self):
        """Cache feature rows for the test split."""
        self.test_features = self.extract_features(dataset=self.testset)

    def extract_test_labels(self):
        """Cache labels for the test split."""
        self.test_labels = self.extract_labels(dataset=self.testset)

    def train_classification_model(self):
        """Fit the model on the cached training features/labels."""
        self.model.fit(self.train_features, self.train_labels)

    def test_classification_model(self):
        """Predict on the test split and print accuracy."""
        self.test_predicts = self.model.transform(self.test_features)
        print(accuracy_score(self.test_labels, self.test_predicts))

    def predict(self, dataset: DataSet):
        """Extract features for *dataset* and return model predictions."""
        X = self.extract_features(dataset=dataset)
        return self.model.transform(X)
Пример #9
0
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Assemble the training corpus from two news sources.
data = pd.read_csv(codecs.open('../train_data/data_dantri.csv', 'r', 'utf-8'))
extra = pd.read_csv(codecs.open('../train_data/vnexpress.csv', 'r', 'utf-8'))
# BUGFIX-compat: DataFrame.append was deprecated in pandas 1.4 and removed
# in pandas 2.0 — pd.concat is the supported equivalent.
data = pd.concat([data, extra], ignore_index=True)

# Shuffle, then keep at most 2001 rows per label (cumcount <= 2000) to
# roughly balance the classes.
data = data.loc[data.sample(frac=1).groupby('label').cumcount() <= 2000]

X, y = data.content, data.label
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

# Train an RBF-kernel SVM and report wall-clock fit time.
start = time.time()
model = SVMModel('rbf')
clf = model.clf.fit(X_train, y_train)

print(time.time() - start)

# Persist the fitted pipeline.
pkl_filename = "svm_model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(clf, file)

# Evaluate on a third, unseen news source.
# NOTE(review): this path lacks the '../' prefix used above — confirm which
# working directory is intended.
data = pd.read_csv(codecs.open('train_data/vietnamnet.csv', 'r', 'utf-8'))
X_test, y_test = data.content, data.label
y_pred = clf.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))