Example no. 1
    def run(self):
        self.get_keywords_num()
        train_data = table2df(self.train_data)
        if train_data.index.size == 0:
            raise AttributesError_new(
                "Missing data, the input dataset should have two rows and one column "
                "(the first row is the column name, the second row is the single text.)"
            )
        elif train_data.columns.size > 1:
            raise AttributesError_new("Input data should have only one column")
        elif train_data.index.size > 1:
            raise AttributesError_new(
                "The single text should be filled into one row")
        """ use 1 col and 1 row data """
        train_data = str(train_data.values[0][0])
        # use TextRank to get the keywords
        tr4w = TextRank4Keyword()
        tr4w.analyze(text=train_data, lower=True, window=5)
        list1 = []
        list2 = []
        list3 = []
        list4 = []
        list6 = []
        result = []
        for item in tr4w.get_keywords(self.keywords_num * 2, word_min_len=2):
            list1.append([item.word, item.weight])
            list6.append(item.word)
        self.keywords_set = list(set(list6))

        # use TF-IDF to get the keywords
        tfidfer = TF_IDF()
        word_dict, candi_dict = tfidfer.build_wordsdict(train_data)
        word_dict1 = tfidfer.build_wordsdict1(train_data)
        for keyword in tfidfer.extract_keywords(train_data,
                                                self.keywords_num * 2):
            list2.append(list(keyword))
        # list3: words ranked by both extractors; list4: TextRank-only words
        for i in range(len(list1)):
            for j in range(len(list2)):
                if list1[i][0] == list2[j][0]:
                    list3.append(list1[i])
        for i in list1:
            if i not in list3:
                list4.append(i)

        # prefer words found by both extractors, then top up from TextRank-only
        length = len(list3)
        if length >= self.keywords_num:
            result = list3[:self.keywords_num]
        else:
            result = list3
            if len(list4) >= self.keywords_num - length:
                result = list3 + list4[:self.keywords_num - length]
        result.sort(key=lambda k: k[1], reverse=True)
        # attach the term frequency of each keyword
        for row in result:
            if row[0] in word_dict1:
                row.append(int(word_dict1[row[0]]))

        # replace each keyword with its index in keywords_set, as required by
        # the DiscreteVariable meta below
        for row in result:
            row[0] = self.keywords_set.index(row[0])

        metas = [
            DiscreteVariable('keywords', self.keywords_set),
            ContinuousVariable('weight'),
            ContinuousVariable('word_frequency')
        ]
        domain = Domain(metas)
        final_result = Table.from_list(domain, result)
        print('final_result:', final_result)

        self.send('News', final_result)
        self.send("Metas", metas)
Example no. 2
    def run(self):
        train_data = table2df(self.train_data)
        if train_data.index.size == 0:
            raise AttributesError_new(
                "Missing data, the input dataset should have two rows and one column "
                "(the first row is the column name, the second row is the single text.)"
            )
        elif train_data.columns.size > 1:
            raise AttributesError_new("Input data should have only one column")
        elif train_data.index.size > 1:
            raise AttributesError_new(
                "The single text should be filled into one row")
        """ use 1 col and 1 row data """
        train_data = str(train_data.values[0][0])
        # use Textrank to get the keywords
        tr4s = TextRank4Sentence()
        tr4s.analyze(text=train_data, lower=True, source='all_filters')
        self.proportion_set = ['20%', '30%', '40%', '50%']
        list1 = []
        list5 = []
        list6 = []
        result = []
        key_sentences = tr4s.get_key_sentences(num=4)
        if len(key_sentences) < 4:
            raise AttributesError_new(
                "The input text should contain at least four sentences")
        for item in key_sentences:
            list1.append([item.index, item.weight, item.sentence])
        # for each proportion, take the top-n ranked sentences, restore
        # document order, and join them into one abstract
        for n, label in enumerate(self.proportion_set, start=1):
            top_n = sorted(list1[:n], key=lambda k: k[0])
            abstract = ''.join(s[2] for s in top_n)
            list6.append([label, abstract])
            list5.append(abstract)
        self.abstract_set = list5
        # encode each row as (index into proportion_set, index into abstract_set)
        for label, abstract in list6:
            result.append([self.proportion_set.index(label),
                           self.abstract_set.index(abstract)])

        metas = [
            DiscreteVariable('proportion', self.proportion_set),
            DiscreteVariable('abstract', self.abstract_set)
        ]

        domain = Domain(metas)
        final_result = Table.from_list(domain, result)
        self.send('News', final_result)
        self.send("Metas", metas)

        json_res = {}
        temp_lst = []
        fields = ['proportion', 'abstract']
        for i in result:
            temp_dir = {}
            for j, k in enumerate(i):
                if j == 0:
                    temp_dir[fields[j]] = self.proportion_set[k]
                else:
                    temp_dir[fields[j]] = self.abstract_set[k]
            temp_lst.insert(0, temp_dir)
        json_res['visualization_type'] = "summary"
        json_res['results'] = temp_lst

        json_res["chartXName"] = 'proportion'
        json_res["chartYName"] = 'abstract'
        #         json_res["tableCols"] = ['name', 'count']

        self.send('Jsondata', json_res)
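
The summary itself is built by taking the top-n ranked sentences for each proportion and restoring them to document order before concatenation. A self-contained sketch of that step (toy sentences; `build_summaries` is an illustrative name, not the widget's API):

def build_summaries(ranked, proportions):
    """ranked: list of (index, weight, sentence), sorted by weight descending.
    proportions: labels such as ['20%', '30%', ...]; the n-th label uses the
    top n sentences, restored to document order before joining."""
    summaries = []
    for n, label in enumerate(proportions, start=1):
        top = sorted(ranked[:n], key=lambda s: s[0])  # document order
        summaries.append([label, ''.join(s[2] for s in top)])
    return summaries

ranked = [(2, 0.9, 'C.'), (0, 0.8, 'A.'), (3, 0.5, 'D.'), (1, 0.4, 'B.')]
for label, text in build_summaries(ranked, ['20%', '30%', '40%', '50%']):
    print(label, text)
# 20% C.
# 30% A.C.
# 40% A.C.D.
# 50% A.B.C.D.
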
Example no. 3
    def run(self):
        self.get_keywords_num()
        train_data = table2df(self.train_data)
        
        if train_data.index.size == 0:
            raise AttributesError_new(
                "Missing data, the input dataset should have two rows and one column "
                "(the first row is the column name, the second row is the single text.)")
        elif train_data.columns.size > 1:
            raise AttributesError_new("Input data should have only one column")
        elif train_data.index.size > 1:
            raise AttributesError_new("The single text should be filled into one row")
        """ use 1 col and 1 row data """    
        train_data = str(train_data.values[0][0])
        # use Textrank to get the keywords
        tr4w = TextRank4Keyword()
        tr4w.analyze(text=train_data, lower=True, window=5)
        list1 = []
        list2 = []
        list3 = []
        list4 = []
        list6 = []
        result = []
        for item in tr4w.get_keywords(self.keywords_num * 2, word_min_len=2):
            list1.append([item.word, item.weight])
            list6.append(item.word)
        self.keywords_set = list(set(list6))

        # use TF_IDF to get the keywords
        tfidfer = TF_IDF()
        word_dict, candi_dict = tfidfer.build_wordsdict(train_data)
        """ add the build_wordsdict1 to figure out words frequence by chenjing """
        word_dict1=tfidfer.build_wordsdict1(train_data)
        for keyword in tfidfer.extract_keywords(train_data, self.keywords_num * 2):
            list2.append(list(keyword))
        # list3: words ranked by both extractors; list4: TextRank-only words
        for i in range(len(list1)):
            for j in range(len(list2)):
                if list1[i][0] == list2[j][0]:
                    list3.append(list1[i])
        for i in list1:
            if i not in list3:
                list4.append(i)

        # prefer words found by both extractors, then top up from TextRank-only
        length = len(list3)
        if length >= self.keywords_num:
            result = list3[:self.keywords_num]
        else:
            result = list3
            if len(list4) >= self.keywords_num - length:
                result = list3 + list4[:self.keywords_num - length]
        result.sort(key=lambda k: k[1], reverse=True)
        # attach the term frequency of each keyword
        for row in result:
            if row[0] in word_dict1:
                row.append(int(word_dict1[row[0]]))

        # replace each keyword with its index in keywords_set, as required by
        # the DiscreteVariable meta below
        for row in result:
            row[0] = self.keywords_set.index(row[0])

        metas = [DiscreteVariable('keywords', self.keywords_set),
                 ContinuousVariable('weight'),
                 ContinuousVariable('word_frequency')]
        domain = Domain(metas)
        final_result = Table.from_list(domain, result)
        json_res = {}
        temp_lst = []
        fields = ['name', 'weight', 'count']
        for i in result:
            temp_dir = {}
            for j, k in enumerate(i):
                if j != 0:
                    temp_dir[fields[j]] = k
                else:
                    temp_dir[fields[j]] = self.keywords_set[k]
            temp_lst.insert(0, temp_dir)
        json_res['visualization_type'] = "keywords"
        json_res['results'] = temp_lst

        json_res["chartXName"] = 'weight'
        json_res["chartYName"] = 'name'
        json_res["tableCols"] = ['name', 'count']

        self.send('Jsondata', json_res)
        self.send('News', final_result)
        self.send("Metas", metas)
Example no. 4
    def rerun(self):
        try:
            self.root_path = self.get_dataset_path()
            """--------add by wwang29---------"""
            if self.temporary_dir is not None:
                self.temporary_dir.cleanup()
            self.set_file_path()

            self.remove_files(self.file_path, 'fasttext_*.txt',
                              show=True)  # in case they were not removed earlier
            test_data = table2df(self.test_data)
            test_data = self.add_commentid(test_data)
            text_split = test_data
            test_data_raw = test_data  # test data may drop some records
            self.text_split(text_split, 'test_split')
            legal_test_data_id = list(test_data.index)

            test_set = open(os.path.join(self.file_path,
                                         'fasttext_test_split.txt'),
                            'r',
                            encoding='utf-8-sig')
            predict_label = self.classifier.predict(test_set)
            int_index = []
            for index in predict_label:
                int_index.append(int(index[0]))

            test_set1 = open(os.path.join(self.file_path,
                                          'fasttext_test_split.txt'),
                             'r',
                             encoding='utf-8-sig')
            wind_pre_prob = self.classifier.predict_proba(test_set1)
            wind_pre_prob_index = []
            for index in wind_pre_prob:
                wind_pre_prob_index.append(index[0][1])
            """-----------calculate probility-------------------------------"""

            if len(self.label_domain) == 2:
                metas = [
                    DiscreteVariable('predict', self.label_domain),
                    ContinuousVariable('negative probability'),
                    ContinuousVariable('positive probability')
                ]
            else:
                metas = [
                    DiscreteVariable('predict', self.label_domain),
                    ContinuousVariable('others'),
                    ContinuousVariable('selected class prob')
                ]

            new_two_prob_positive = []
            new_two_prob_negative = []

            for index, item in enumerate(wind_pre_prob_index):
                new_two_prob_negative.append(1 - item)
                new_two_prob_positive.append(item)

            wind_pre_prob = np.array([
                np.array(new_two_prob_negative),
                np.array(new_two_prob_positive)
            ])
            wind_pre_prob = wind_pre_prob.T
            total_pre_prob = self.merge_possibility(len(test_data_raw),
                                                    wind_pre_prob,
                                                    legal_test_data_id)
            cols = [np.array(int_index).reshape(-1, 1), wind_pre_prob]

            tbl = np.column_stack(
                (np.array(int_index).reshape(-1, 1), wind_pre_prob))

            res = Table.from_numpy(Domain(metas), tbl)
            final_result = self.merge_data(self.test_data, res)

            self.send("News", final_result)
            self.send("Metric Score", None)
            self.send("Metas", metas)
            self.send("Columns", cols)
            self.remove_files(self.file_path, 'fasttext_*.txt', show=True)
            print('rerun')
            test_set.close()
            test_set1.close()
            # self.classifier = None  # in order to pickle the model
            self.train_data = None
            self.test_data = None
        finally:
            if self.temporary_dir is not None:
                self.temporary_dir.cleanup()
            self.temporary_dir = None
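
For the binary case, the widget expands the single per-row probability returned by `predict_proba` into the two columns expected by the `negative probability` / `positive probability` metas, then stacks the integer predictions alongside them. A numpy sketch of that expansion (toy values; it assumes the returned number is the probability of the positive class):

import numpy as np

# probability of the positive class, one per test row (toy values)
positive = np.array([0.9, 0.2, 0.6])

# column 0: negative probability, column 1: positive probability
wind_pre_prob = np.column_stack((1.0 - positive, positive))

# stack the integer predictions next to the probabilities, as the widget does
int_index = np.array([1, 0, 1]).reshape(-1, 1)
tbl = np.column_stack((int_index, wind_pre_prob))
print(tbl)
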
Example no. 5
    def run(self):
        try:
            self.root_path = self.get_dataset_path()
            self.text_clomn_name = self.train_data.domain.attributes[0].name
            self.get_name()

            self.set_file_path()
            test_data = table2df(self.test_data)

            test_data = test_data.dropna(subset=[self.label_name])
            test_data, test_label_domain = self.label_str2number(test_data)
            test_data = self.add_commentid(test_data)

            if len(test_data) != len(self.test_data):
                raise Error("test data missing label")

            test_data_raw = test_data  # test data may delete some records
            y_test_visual = test_data_raw[self.label_name]
            train_data = table2df(self.train_data)
            train_data = self.add_commentid(train_data)
            if len(test_data.columns) != len(train_data.columns):
                raise Error('train data and test data must match')
            train_data = train_data.dropna(subset=[self.label_name])
            train_data, self.label_domain = self.label_str2number(train_data)
            #        print("self.label_domain",self.label_domain)
            Traina, validationa = train_test_split(train_data,
                                                   train_size=0.8,
                                                   random_state=1234)
            Traina = self.add_commentid(Traina)
            validationa = self.add_commentid(validationa)
            self.text_split(Traina, 'trainingset')
            legal_test_data_id = list(test_data.index)
            test_split = test_data.drop([self.label_name], axis=1)
            self.text_split(test_data, 'testset')
            self.text_split(test_split, 'test_split')
            self.text_split(validationa, 'validationset')
            # train the model
            # -------- added by wwang29 --------
            try:
                classifier_model = NamedTemporaryFile(suffix=".model.bin",
                                                      dir=self.file_path)
                self.classifier = fasttext.supervised(
                    os.path.join(self.file_path, 'fasttext_trainingset.txt'),
                    os.path.splitext(classifier_model.name)[0],
                    label_prefix='__label__',
                    epoch=200,
                    min_count=3,
                    word_ngrams=3,
                    ws=10,
                    thread=32,
                    lr=0.1,
                    dim=200,
                    bucket=5000000)
                self.classifier_code = classifier_model.read()
            finally:
                classifier_model.close()

            # -------- end wwang29 --------

            train_result = self.classifier.test(
                os.path.join(self.file_path, 'fasttext_trainingset.txt'))
            train_score = train_result.precision
            validation_result = self.classifier.test(
                os.path.join(self.file_path, 'fasttext_validationset.txt'))
            val_score = validation_result.precision
            test_set = open(os.path.join(self.file_path,
                                         'fasttext_test_split.txt'),
                            'r',
                            encoding='utf-8-sig')
            predict_label = self.classifier.predict(test_set)
            int_index = []
            for index in predict_label:
                int_index.append(int(index[0]))

            test_set1 = open(os.path.join(self.file_path,
                                          'fasttext_test_split.txt'),
                             'r',
                             encoding='utf-8-sig')
            wind_pre_prob = self.classifier.predict_proba(test_set1)
            wind_pre_prob_two_c = wind_pre_prob
            wind_pre_prob_index = []
            for index in wind_pre_prob:
                wind_pre_prob_index.append(index[0][1])
            """-----------calculate probility-------------------------------"""
            if len(self.label_domain) == 2:
                metas = [
                    DiscreteVariable('predict', self.label_domain),
                    ContinuousVariable('negative probability'),
                    ContinuousVariable('positive probability')
                ]
            else:
                metas = [
                    DiscreteVariable('predict', self.label_domain),
                    ContinuousVariable('others'),
                    ContinuousVariable('selected class prob')
                ]

            new_two_prob_positive = []
            new_two_prob_negative = []

            for index, item in enumerate(wind_pre_prob_index):
                new_two_prob_negative.append(1 - item)
                new_two_prob_positive.append(item)

            wind_pre_prob = np.array([
                np.array(new_two_prob_negative),
                np.array(new_two_prob_positive)
            ])
            wind_pre_prob = wind_pre_prob.T
            total_pre_prob = self.merge_possibility(len(test_data_raw),
                                                    wind_pre_prob,
                                                    legal_test_data_id)
            cols = [np.array(int_index).reshape(-1, 1), wind_pre_prob]

            tbl = np.column_stack(
                (np.array(int_index).reshape(-1, 1), wind_pre_prob))

            res = Table.from_numpy(Domain(metas), tbl)
            final_result = self.merge_data(self.test_data, res)

            N = len(test_data)  # note that self.test_data is an Orange Table
            results = Results(self.test_data[legal_test_data_id],
                              store_data=True)
            results.folds = None
            results.row_indices = np.arange(N)
            results.actual = np.array(y_test_visual[legal_test_data_id])
            results.predicted = np.array([int_index])

            results.probabilities = np.array([wind_pre_prob])
            # ----- changed by wwang29 -----
            # the sentiment classifier adds "illegal data" to the domain (hence
            # a -1 there); we do not use that here
            if len(self.label_domain) > 2:
                results.probabilities = None
            elif len(self.label_domain) == 2:
                pro_two_class = []
                for p_index in range(len(wind_pre_prob_two_c)):
                    pred = results.predicted[0][p_index]
                    prob = wind_pre_prob_two_c[p_index][0][1]
                    # place the predicted-class probability in its slot and
                    # the complement in the other slot
                    local_ = [0, 1]
                    local_[pred] = prob
                    local_[pred - 1] = 1 - prob
                    pro_two_class.append(local_)
                pro_two_class = np.array(pro_two_class)

                results.probabilities[0] = pro_two_class
            """-----------change end by wwang29-----------------"""
            print("predicted", results.predicted[0])
            print("actual", results.actual)

            results.learner_names = ['Multiple_text_Classifier']
            # self.send("Predictions", predictions)
            self.send("Evaluation Results", results)

            metric_frame = MetricFrame(
                [[val_score], [train_score]],
                index=["hyperparam_searcher", "modeler"],
                columns=[MetricType.ACCURACY.name])
            self.send("Metric Score", metric_frame)
            self.send("Columns", cols)
            self.send("Metas", metas)
            self.send("Metric", MetricType.ACCURACY)

            print("metric_frame", metric_frame)
            print("result:", results.predicted, results.predicted.shape)

            self.send('News', final_result)
            self.remove_files(self.file_path, 'fasttext_*.txt', show=True)
            test_set.close()
            test_set1.close()
            # self.classifier = None  # in order to pickle the model
            self.train_data = None
            self.test_data = None
        finally:
            if self.temporary_dir is not None:
                self.temporary_dir.cleanup()
            self.temporary_dir = None
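
The two-class branch above exists because fastText's `predict_proba` reports the probability of the *predicted* label, not a fixed-order class distribution; the `local_` loop places that value in the predicted class's slot and its complement in the other slot. A self-contained sketch of the same reordering (the helper name is illustrative):

import numpy as np

def to_class_distribution(predicted, prob_of_predicted):
    """Turn (predicted class, probability of that class) pairs into fixed-order
    [P(class 0), P(class 1)] rows, mirroring the widget's local_ loop."""
    rows = []
    for cls, p in zip(predicted, prob_of_predicted):
        local_ = [0.0, 0.0]
        local_[cls] = p
        local_[cls - 1] = 1.0 - p  # the other slot (works for cls in {0, 1})
        rows.append(local_)
    return np.array(rows)

print(to_class_distribution([1, 0, 1], [0.9, 0.8, 0.6]))
# [[0.1 0.9]
#  [0.8 0.2]
#  [0.4 0.6]]
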
Example no. 6
    def rerun(self):
        test_data = table2df(self.test_data)
        test_data = self.add_commentid(test_data)
        text_split = test_data
        test_data_raw = test_data  # test data may drop some records
        self.text_split(text_split, 'test_split')
        legal_test_data_id = list(test_data.index)
        # load the fastText model
        classifier = fasttext.load_model("fasttext_test.model.bin",
                                         label_prefix="__label__")
        test_set = open('fasttext_test_split.txt', 'r', encoding='utf-8-sig')
        predict_label = classifier.predict(test_set)
        int_index = []
        for index in predict_label:
            int_index.append(int(index[0]))

        test_set1 = open('fasttext_test_split.txt', 'r', encoding='utf-8-sig')
        wind_pre_prob = classifier.predict_proba(test_set1)
        wind_pre_prob_index = []
        for index in wind_pre_prob:
            wind_pre_prob_index.append(index[0][1])
        """-----------calculate probility-------------------------------"""

        metas = [
            DiscreteVariable('predict', self.label_domain),
            ContinuousVariable('negative probability'),
            ContinuousVariable('positive probability')
        ]

        new_two_prob_positive = []
        new_two_prob_negative = []

        for index, item in enumerate(wind_pre_prob_index):
            new_two_prob_negative.append(1 - item)
            new_two_prob_positive.append(item)

        wind_pre_prob = np.array(
            [np.array(new_two_prob_negative),
             np.array(new_two_prob_positive)])
        wind_pre_prob = wind_pre_prob.T
        total_pre_prob = self.merge_possibility(len(test_data_raw),
                                                wind_pre_prob,
                                                legal_test_data_id)
        cols = [np.array(int_index).reshape(-1, 1), wind_pre_prob]

        tbl = np.column_stack((np.array(int_index).reshape(-1, 1),
                               wind_pre_prob))

        res = Table.from_numpy(Domain(metas), tbl)
        final_result = self.merge_data(self.test_data, res)

        self.send("News", final_result)
        self.send("Metric Score", None)
        self.send("Metas", metas)
        self.send("Columns", cols)
        print('rerun')
        test_set.close()
        test_set1.close()
        self.remove_files('.', 'fasttext_*.txt', show=True)
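
`merge_possibility` (not shown in these examples) has to re-align the per-row probabilities with the original test table, since rows without labels may have been dropped during preprocessing. A plausible numpy sketch of that re-alignment, assuming dropped rows are filled with NaN (the function body is an assumption based on how it is called, not the widget's actual code):

import numpy as np

def merge_possibility(n_total, probs, legal_ids):
    """Scatter per-row probabilities back into a full-length array.
    Rows not listed in legal_ids (dropped during preprocessing) become NaN."""
    full = np.full((n_total, probs.shape[1]), np.nan)
    full[np.asarray(legal_ids)] = probs
    return full

probs = np.array([[0.1, 0.9], [0.7, 0.3]])
print(merge_possibility(4, probs, legal_ids=[0, 2]))
# rows 1 and 3 come out as NaN
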
Example no. 7
    def run(self):
        self.get_name()

        test_data = table2df(self.test_data)

        test_data = test_data.dropna(subset=[self.label_name])
        test_data, test_label_domain = self.label_str2number(test_data)
        test_data = self.add_commentid(test_data)

        if len(test_data) != len(self.test_data):
            raise Error("test data missing label")

        test_data_raw = test_data  # test data may delete some records
        y_test_visual = test_data_raw[self.label_name]
        train_data = table2df(self.train_data)
        train_data = self.add_commentid(train_data)
        if len(test_data.columns) != len(train_data.columns):
            raise Error('train data and test data must match')
        train_data = train_data.dropna(subset=[self.label_name])
        train_data, self.label_domain = self.label_str2number(train_data)
        #        print("self.label_domain",self.label_domain)
        Traina, validationa = train_test_split(train_data,
                                               train_size=0.8,
                                               random_state=1234)
        Traina = self.add_commentid(Traina)
        validationa = self.add_commentid(validationa)
        self.text_split(Traina, 'trainingset')
        legal_test_data_id = list(test_data.index)
        test_split = test_data.drop([self.label_name], axis=1)
        self.text_split(test_data, 'testset')
        self.text_split(test_split, 'test_split')
        self.text_split(validationa, 'validationset')
        # train the model
        self.classifier = fasttext.supervised('fasttext_trainingset.txt',
                                              'fasttext_test.model',
                                              label_prefix='__label__',
                                              thread=32,
                                              epoch=50,
                                              lr=0.1,
                                              dim=200,
                                              bucket=5000000)

        train_result = self.classifier.test('fasttext_trainingset.txt')
        train_score = train_result.precision
        validation_result = self.classifier.test('fasttext_validationset.txt')
        val_score = validation_result.precision
        test_set = open('fasttext_test_split.txt', 'r', encoding='utf-8-sig')
        predict_label = self.classifier.predict(test_set)
        int_index = []
        for index in predict_label:
            int_index.append(int(index[0]))

        test_set1 = open('fasttext_test_split.txt', 'r', encoding='utf-8-sig')
        wind_pre_prob = self.classifier.predict_proba(test_set1)
        wind_pre_prob_index = []
        for index in wind_pre_prob:
            wind_pre_prob_index.append(index[0][1])
        """-----------calculate probility-------------------------------"""

        metas = [
            DiscreteVariable('predict', self.label_domain),
            ContinuousVariable('negative probability'),
            ContinuousVariable('positive probability')
        ]

        new_two_prob_positive = []
        new_two_prob_negative = []

        for index, item in enumerate(wind_pre_prob_index):
            new_two_prob_negative.append(1 - item)
            new_two_prob_positive.append(item)

        wind_pre_prob = np.array(
            [np.array(new_two_prob_negative),
             np.array(new_two_prob_positive)])
        wind_pre_prob = wind_pre_prob.T
        total_pre_prob = self.merge_possibility(len(test_data_raw),
                                                wind_pre_prob,
                                                legal_test_data_id)
        cols = [np.array(int_index).reshape(-1, 1), wind_pre_prob]

        tbl = np.column_stack((np.array(int_index).reshape(-1, 1),
                               wind_pre_prob))

        res = Table.from_numpy(Domain(metas), tbl)
        final_result = self.merge_data(self.test_data, res)

        N = len(test_data)  # note that self.test_data is an Orange Table
        results = Results(self.test_data[legal_test_data_id], store_data=True)
        results.folds = None
        results.row_indices = np.arange(N)
        results.actual = np.array(y_test_visual[legal_test_data_id])
        results.predicted = np.array([int_index])

        results.probabilities = np.array([wind_pre_prob])

        if len(self.label_domain) - 1 > 2:
            results.probabilities = None

        print("predicted", results.predicted[0])
        print("actual", results.actual)

        results.learner_names = ['Multiple_text_Classifier']
        # self.send("Predictions", predictions)
        self.send("Evaluation Results", results)

        metric_frame = MetricFrame([[val_score], [train_score]],
                                   index=["hyperparam_searcher", "modeler"],
                                   columns=[MetricType.ACCURACY.name])
        self.send("Metric Score", metric_frame)
        self.send("Columns", cols)
        self.send("Metas", metas)
        self.send("Metric", MetricType.ACCURACY)

        print("metric_frame", metric_frame)
        print("result:", results.predicted, results.predicted.shape)

        self.send('News', final_result)
        test_set.close()
        test_set1.close()
        self.remove_files('.', 'fasttext_*.txt', show=True)
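
Both `run` variants rely on `label_str2number` to map string labels to the integer codes fastText is trained on, while keeping the ordered label list (`self.label_domain`) so predictions can be decoded later. A minimal pandas sketch of that idea (the function body is an assumption inferred from how `label_domain` is used, and the column names are toy data):

import pandas as pd

def label_str2number(df, label_col='label'):
    """Replace string labels with integer codes; return the coded frame and
    the ordered label list needed to decode predictions back to strings."""
    domain = sorted(df[label_col].dropna().unique())
    df = df.copy()
    df[label_col] = df[label_col].map({lbl: i for i, lbl in enumerate(domain)})
    return df, domain

df = pd.DataFrame({'text': ['good', 'bad', 'fine'],
                   'label': ['pos', 'neg', 'pos']})
coded, domain = label_str2number(df)
print(domain)                   # ['neg', 'pos']
print(coded['label'].tolist())  # [1, 0, 1]
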