예제 #1
0
    def test_json_generate_report(self):
        """Test report rendering with json config file.

        Renders the report through ``Controller`` and then checks the PDF
        output (title, creation date, page count) and the HTML output
        (header position, creation date, number of ``tab_contents`` tags).
        """
        # Timestamps are truncated to whole seconds because the creation
        # dates parsed from the reports below carry no sub-second component.
        start_time = datetime.now().replace(microsecond=0)
        controller = Controller(config=Configuration(self.json))
        controller.render()
        end_time = datetime.now().replace(microsecond=0)

        # -- PDF Report --
        output = "%s/%s.pdf" % (self.out_path, self.json_writer_pdf_name)
        print("JSON-PDF report generated %s:" % output)

        # Metadata and page count are both read while the file is still open.
        with open(output, 'rb') as f:
            pdf = PdfFileReader(f)
            info = pdf.getDocumentInfo()
            number_of_pages = pdf.getNumPages()

        print(info)
        self.assertEqual(info['/Title'], self.json_report_name)

        report_time = datetime.strptime(
            info['/CreationDate'], 'D:%Y%m%d%H%M%S')
        print("{} {} {}".format(start_time, report_time, end_time))
        self.assertTrue(time_in_range(start_time, end_time, report_time))

        print(number_of_pages)
        self.assertEqual(number_of_pages, self.json_writer_pdf_page_number)

        # -- HTML Report --
        output = "%s/%s.html" % (self.out_path, self.json_writer_html_name)
        print("JSON-HTML report generated %s:" % output)

        with open(output) as f:
            read_data = f.read()

        index = read_data.find(self.json_report_name)
        # -- the header start at index 1279 --
        self.assertEqual(index, 1279)

        index = read_data.find('created on')
        # str.find returns -1 when the marker is absent; without this check
        # the slice below would silently read read_data[10:29] and strptime
        # would fail with an unrelated ValueError.
        self.assertNotEqual(
            index, -1, "'created on' marker not found in %s" % output)
        # The timestamp starts 11 characters after the start of the marker
        # (the 10-character marker plus one separator) and is 19 characters
        # long, matching the '%Y-%m-%d %H:%M:%S' format parsed below.
        create_date = read_data[index + 11:index + 30]
        print(create_date)
        report_time = datetime.strptime(create_date, '%Y-%m-%d %H:%M:%S')
        print("{} {} {}".format(start_time, report_time, end_time))
        self.assertTrue(time_in_range(start_time, end_time, report_time))

        number_of_tags = read_data.count('class="tab_contents"')
        self.assertEqual(number_of_tags, self.json_writer_html_tag_number)
예제 #2
0
    def test_json_generate_report(self):
        """Render the report from the JSON config and verify the PDF output.

        Checks the PDF title, that its creation date passes
        ``time_in_range`` against the render window, and the page count.
        """
        # Bracket the render call with second-resolution timestamps.
        render_started = datetime.now().replace(microsecond=0)
        Controller(config=Configuration(self.json)).render()
        render_finished = datetime.now().replace(microsecond=0)

        # -- PDF Report --
        pdf_path = "%s/%s.pdf" % (self.out_path, self.json_writer_pdf_name)
        print("JSON-PDF report generated %s:" % pdf_path)

        # Pull everything needed from the reader before the file is closed.
        with open(pdf_path, 'rb') as pdf_file:
            reader = PdfFileReader(pdf_file)
            metadata = reader.getDocumentInfo()
            page_count = reader.getNumPages()

        print(metadata)
        self.assertEqual(metadata['/Title'], self.json_report_name)

        # The creation date is parsed with the 'D:'-prefixed timestamp format.
        created_raw = metadata['/CreationDate']
        created_at = datetime.strptime(created_raw, 'D:%Y%m%d%H%M%S')
        in_window = time_in_range(render_started, render_finished, created_at)
        self.assertTrue(in_window)

        print(page_count)
        self.assertEqual(page_count, self.json_writer_pdf_page_number)
예제 #3
0
    def test_json_generate_report(self):
        """Test report rendering with json config file.

        Trains a gradient boosting regressor on the housing-price sample
        data, renders the report through ``Controller`` and checks the
        generated PDF's title, creation date and page count.
        """
        # Set seed for reproducibility
        np.random.seed(123456)
        # Load the dataset and prepare training and test sets
        train_file = prepare_input_path(working_path='sample_input/housing_price/train.csv')
        data = pd.read_csv(train_file)
        data.dropna(axis=0, subset=['SalePrice'], inplace=True)
        y = data.SalePrice

        # Only non-object columns are used as predictors.
        X = data.drop(['SalePrice', 'Id'], axis=1).select_dtypes(
            exclude=['object'])
        train_X, test_X, train_y, test_y = train_test_split(X.values, y.values,
                                                            test_size=0.25)

        # Fit the imputer on the training split only, then apply it to both.
        my_imputer = SimpleImputer()
        train_X = my_imputer.fit_transform(train_X)
        test_X = my_imputer.transform(test_X)

        my_model = GradientBoostingRegressor(n_estimators=1000,
                                             max_depth=5,
                                             learning_rate=0.1,
                                             subsample=0.7,
                                             random_state=42)
        hist = my_model.fit(train_X, train_y)

        # NOTE(review): every local defined above and below is handed to
        # Configuration through locals(); presumably the JSON config looks
        # some of them up by name, so seemingly unused locals (hist,
        # train_X_df, y_train, ...) are kept as-is -- confirm before
        # renaming or removing any of them.
        train_X_df = pd.DataFrame(data=train_X, columns=X.columns.tolist())
        clf = my_model
        clf_fn = my_model.predict
        y_train = []
        feature_names = X.columns.tolist()
        target_names_list = ['SalePrice']

        # Timestamps are truncated to whole seconds because the PDF creation
        # date parsed below carries no sub-second component.
        start_time = datetime.now().replace(microsecond=0)
        controller = Controller(config=Configuration(self.json, locals()))
        controller.render()
        end_time = datetime.now().replace(microsecond=0)

        # -- PDF Report --
        output = "%s/%s.pdf" % (self.out_path, self.json_writer_pdf_name)
        print("JSON-PDF report generated %s:" % output)

        # Metadata and page count are both read while the file is still open.
        with open(output, 'rb') as f:
            pdf = PdfFileReader(f)
            info = pdf.getDocumentInfo()
            number_of_pages = pdf.getNumPages()

        print(info)
        self.assertEqual(info['/Title'], self.json_report_name)

        report_time = datetime.strptime(
            info['/CreationDate'], 'D:%Y%m%d%H%M%S')
        print("{} {} {}".format(start_time, report_time, end_time))
        self.assertTrue(time_in_range(start_time, end_time, report_time))

        print(number_of_pages)
        self.assertEqual(number_of_pages, self.json_writer_pdf_page_number)
예제 #4
0
    def test_json_generate_report(self):
        """ Test report rendering with json config file

        Fits Lasso regressors on the first 300 rows of the housing-price
        sample data, keeps the one with the best test score, renders the
        report through ``Controller`` and checks the generated PDF's title,
        creation date and page count.
        """
        # NOTE(review): no random seed is set in this method and
        # train_test_split is called without random_state, so the split is
        # not fixed by anything visible here -- confirm whether that is
        # intended.
        # Load the dataset and prepare training and test sets
        train_file = prepare_input_path(
            working_path='sample_input/housing_price_halfmil/train.csv')

        # Only the first 300 rows are read.
        df_data = pd.read_csv(train_file, header=0, nrows=300)
        # Get predictor and target; missing values are replaced with 0
        X = df_data.drop("Prices", axis=1).fillna(value=0)
        y = df_data["Prices"].fillna(value=0)

        train_X, test_X, train_y, test_y = train_test_split(X.values,
                                                            y.values,
                                                            test_size=0.25)
        # Train one Lasso regression per alpha and record its test score
        from sklearn.linear_model import Lasso
        alpha_list = [0.01, 0.1, 1, 2, 5, 10]
        model_list = []
        r2_list = []
        for alpha in alpha_list:
            lm = Lasso(alpha)
            lm.fit(train_X, train_y)
            model_list.append(lm)
            # model quality
            y_pred = lm.predict(test_X)
            r2 = lm.score(test_X, test_y)
            r2_list.append(r2)
            print('Alpha: %s. R2: %s' % (alpha, r2))

        # Keep the model with the highest recorded score.
        index = r2_list.index(max(r2_list))
        lm = model_list[index]

        # NOTE(review): all locals of this method are handed to Configuration
        # through locals() below; presumably the JSON config looks some of
        # them up by name (e.g. clf, clf_fn, feature_names, train_X) --
        # confirm before renaming or removing any local.
        feature_names = X.columns.tolist()
        clf = lm
        clf_fn = lm.predict

        print('Subsetting training data to %s to speed up. ' % self.limit_size)
        train_X = train_X[:self.limit_size]
        # Timestamps are truncated to whole seconds because the PDF creation
        # date parsed below carries no sub-second component.
        start_time = datetime.now().replace(microsecond=0)
        controller = Controller(config=Configuration(self.json, locals()))
        controller.render()
        end_time = datetime.now().replace(microsecond=0)

        # -- PDF Report --
        output = "%s/%s.pdf" % (self.out_path, self.json_writer_pdf_name)
        print("JSON-PDF report generated %s:" % output)

        # Metadata and page count are both read while the file is still open.
        with open(output, 'rb') as f:
            pdf = PdfFileReader(f)
            info = pdf.getDocumentInfo()
            number_of_pages = pdf.getNumPages()

        print(info)
        self.assertEqual(info['/Title'], self.json_report_name)

        # Parse the PDF creation date and check it against the render window.
        report_time = datetime.strptime(info['/CreationDate'],
                                        'D:%Y%m%d%H%M%S')
        print("{} {} {}".format(start_time, report_time, end_time))
        self.assertTrue(time_in_range(start_time, end_time, report_time))

        print(number_of_pages)
        self.assertEqual(number_of_pages, self.json_writer_pdf_page_number)
예제 #5
0
    def test_json_generate_report(self):
        """Test report rendering with json config file.

        Trains a multinomial naive Bayes text classifier on three
        20-newsgroups categories, builds a text explainer around it, renders
        the report through ``Controller`` and checks the generated PDF's
        title, creation date and page count.
        """
        # Set seed for reproducibility
        np.random.seed(123456)
        # -- Train on a subset of categories --
        categories = [
            'rec.sport.baseball',
            'soc.religion.christian',
            'sci.med'
        ]

        raw_train = datasets.fetch_20newsgroups(data_home=prepare_input_path(working_path='sample_input/20news'),
                                                subset='train',
                                                categories=categories)
        print('Training dataset keys:', list(raw_train.keys()))
        print('Training class name:', raw_train.target_names)
        print('Training sample target:', raw_train.target[:10])

        # NOTE(review): unlike the training fetch above, no data_home is
        # passed here -- confirm whether the test subset is meant to come
        # from the same prepared input directory.
        raw_test = datasets.fetch_20newsgroups(subset='test',
                                               categories=categories)

        X_train = raw_train.data
        vectorizer = TfidfVectorizer()
        X_train_vec = vectorizer.fit_transform(X_train)
        y_train = raw_train.target

        # The test data reuses the vocabulary fitted on the training data.
        X_test_vec = vectorizer.transform(raw_test.data)
        y_test = raw_test.target

        print('Training sample:', len(X_train))
        print('--------------------')
        print(X_train[0])
        print('--------------------')

        clf = MultinomialNB(alpha=0.1)
        clf.fit(X_train_vec, y_train)

        print('Subsetting training sample to %s to speed up.' % self.limit_size)

        # Only the raw training texts are truncated; the classifier above
        # was already fitted on the full vectorized training set.
        X_train = X_train[:self.limit_size]
        print('Classifier score:', clf.score(X_test_vec, y_test))
        print('Classifier predict func:', clf.predict_proba)

        def predict_fn(instance):
            # Vectorize raw text before asking the classifier for
            # class probabilities.
            vec = vectorizer.transform(instance)
            return clf.predict_proba(vec)

        print('Testing sample prob:', predict_fn(raw_test.data[:10]))

        # -- Instantiate the explainer --
        explainer = ExplainerFactory.get_explainer(domain=xai.DOMAIN.TEXT)
        explainer.build_explainer(predict_fn)
        print('Testing sample explanation:', explainer.explain_instance(raw_test.data[0]))

        # NOTE(review): all locals of this method are handed to Configuration
        # through locals() below; presumably the JSON config looks some of
        # them up by name (e.g. clf, clf_fn, feature_names, X_train) --
        # confirm before renaming or removing any local.
        feature_names = []
        clf_fn = predict_fn
        target_names_list = []

        # Timestamps are truncated to whole seconds because the PDF creation
        # date parsed below carries no sub-second component.
        start_time = datetime.now().replace(microsecond=0)
        controller = Controller(config=Configuration(self.json, locals()))
        controller.render()
        end_time = datetime.now().replace(microsecond=0)

        # -- PDF Report --
        output = "%s/%s.pdf" % (self.out_path, self.json_writer_pdf_name)
        print("JSON-PDF report generated %s:" % output)

        # Metadata and page count are both read while the file is still open.
        with open(output, 'rb') as f:
            pdf = PdfFileReader(f)
            info = pdf.getDocumentInfo()
            number_of_pages = pdf.getNumPages()

        print(info)
        self.assertEqual(info['/Title'], self.json_report_name)

        report_time = datetime.strptime(
            info['/CreationDate'], 'D:%Y%m%d%H%M%S')
        print("{} {} {}".format(start_time, report_time, end_time))
        self.assertTrue(time_in_range(start_time, end_time, report_time))

        print(number_of_pages)
        self.assertEqual(number_of_pages, self.json_writer_pdf_page_number)
예제 #6
0
    def test_json_generate_report(self):
        """Test report rendering with json config file.

        Trains a random forest classifier on the breast-cancer dataset,
        renders the report through ``Controller`` and then checks the PDF
        output (title, creation date, page count) and the HTML output
        (header position, creation date, number of ``tab_contents`` tags).
        """
        # Set seed for reproducibility
        np.random.seed(123456)
        # Load the dataset and prepare training and test sets
        raw_data = datasets.load_breast_cancer()
        X, y = raw_data['data'], raw_data['target']
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2)
        # NOTE(review): all locals of this method are handed to Configuration
        # through locals() below; presumably the JSON config looks some of
        # them up by name (e.g. clf, clf_fn, feature_names,
        # target_names_list) -- confirm before renaming or removing any.
        feature_names = raw_data['feature_names']
        target_names_list = list(raw_data['target_names'])

        # Instantiate a classifier, train, and evaluate on test set
        clf = RandomForestClassifier()
        clf.fit(X_train, y_train)
        clf.score(X_test, y_test)
        clf_fn = clf.predict_proba

        # Timestamps are truncated to whole seconds because the creation
        # dates parsed from the reports below carry no sub-second component.
        start_time = datetime.now().replace(microsecond=0)
        controller = Controller(config=Configuration(self.json, locals()))
        controller.render()
        end_time = datetime.now().replace(microsecond=0)

        # -- PDF Report --
        output = "%s/%s.pdf" % (self.out_path, self.json_writer_pdf_name)
        print("JSON-PDF report generated %s:" % output)

        # Metadata and page count are both read while the file is still open.
        with open(output, 'rb') as f:
            pdf = PdfFileReader(f)
            info = pdf.getDocumentInfo()
            number_of_pages = pdf.getNumPages()

        print(info)
        self.assertEqual(info['/Title'], self.json_report_name)

        report_time = datetime.strptime(info['/CreationDate'],
                                        'D:%Y%m%d%H%M%S')
        print("{} {} {}".format(start_time, report_time, end_time))
        self.assertTrue(time_in_range(start_time, end_time, report_time))

        print(number_of_pages)
        self.assertEqual(number_of_pages, self.json_writer_pdf_page_number)

        # -- HTML Report --
        output = "%s/%s.html" % (self.out_path, self.json_writer_html_name)
        print("JSON-HTML report generated %s:" % output)

        with open(output) as f:
            read_data = f.read()

        index = read_data.find(self.json_report_name)
        # -- the header start at index 1279 --
        self.assertEqual(index, 1279)

        index = read_data.find('created on')
        # str.find returns -1 when the marker is absent; without this check
        # the slice below would silently read read_data[10:29] and strptime
        # would fail with an unrelated ValueError.
        self.assertNotEqual(
            index, -1, "'created on' marker not found in %s" % output)
        # The timestamp starts 11 characters after the start of the marker
        # (the 10-character marker plus one separator) and is 19 characters
        # long, matching the '%Y-%m-%d %H:%M:%S' format parsed below.
        create_date = read_data[index + 11:index + 30]
        print(create_date)
        report_time = datetime.strptime(create_date, '%Y-%m-%d %H:%M:%S')
        print("{} {} {}".format(start_time, report_time, end_time))
        self.assertTrue(time_in_range(start_time, end_time, report_time))

        number_of_tags = read_data.count('class="tab_contents"')
        self.assertEqual(number_of_tags, self.json_writer_html_tag_number)