Example #1
 def rule_based(self, issues):
     """
     This method applies rule_based algorithms to predict labels
     Args:
         issues(list): a list of issue numbers
     Return:
         rule_based_predictions(list of lists): labels which satisfy rules
     """
     DF = DataFetcher()
     df_test = DF.fetch_issues(issues)
     rule_based_predictions = []
     for i in range(len(issues)):
         # extract every issue's title
         row = df_test.loc[i, 'title']
         # apply rule-based algorithms
         single_issue_predictions = []
         if "feature request" in row.lower():
             single_issue_predictions.append("Feature")
         if "c++" in row.lower():
             single_issue_predictions.append("C++")
         tokens = self.tokenize(row)
         for k, v in self.keywords.items():
             for keyword in v:
                 if keyword in tokens:
                     single_issue_predictions.append(k)
         rule_based_predictions.append(single_issue_predictions)
     return rule_based_predictions
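A minimal usage sketch, assuming a hypothetical Predictor class that owns rule_based together with a tokenize helper and a keywords dict mapping label names to trigger words; the keyword contents and issue numbers below are illustrative only:

# hypothetical wiring for Predictor.rule_based
bot = Predictor()
bot.keywords = {
    "Performance": ["slow", "latency", "memory"],
    "Installation": ["install", "pip", "build"],
}
# each inner list holds the labels whose rules matched that issue's title
predictions = bot.rule_based([11925, 11924])
# e.g. [['Feature'], []]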
Example #2
 def ml_predict(self, issues, threshold=0.3):
     """
     This method applies machine learning algorithms to predict labels
     Args:
         issues(list): a list of issue numbers
         threshold(float): threshold of probability
     Return:
         ml_predictions(list of lists): predictions
     """
     # step1: fetch data
     DF = DataFetcher()
     df_test = DF.fetch_issues(issues)
     # step2: data cleaning
     SP = SentenceParser()
     SP.data = df_test
     SP.clean_body('body', True, True)
     # the title is merged three times, presumably to weight it more heavily than the body
     SP.merge_column(['title', 'title', 'title', 'body'], 'train')
     test_text = SP.process_text('train', True, False, True)
     # step3: word embedding
     test_data_tfidf = self.tv.transform(test_text).toarray()
     # fit the label encoder; only its classes_ mapping is used below
     le = LabelEncoder()
     le.fit(self.labels)
     # step4: classification
     probs = self.clf.predict_proba(test_data_tfidf)
     # keep the top 2 predictions whose probability exceeds the threshold
     best_n = np.argsort(probs, axis=1)[:, -2:]
     ml_predictions = []
     for i in range(len(best_n)):
         # INFO:Predictor:issue:11919,Performance:0.47353076240017744,Question:0.2440056213336274
         logging.info("issue:{}, {}:{}, {}:{}".format(
             str(issues[i]), str(le.classes_[best_n[i][-1]]),
             str(probs[i][best_n[i][-1]]), str(le.classes_[best_n[i][-2]]),
             str(probs[i][best_n[i][-2]])))
         single_issue_predictions = [
             le.classes_[best_n[i][j]] for j in range(-1, -3, -1)
             if probs[i][best_n[i][j]] > threshold
         ]
         ml_predictions.append(single_issue_predictions)
     return ml_predictions
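ml_predict assumes the instance already holds a fitted TF-IDF vectorizer (self.tv), a fitted classifier that exposes predict_proba (self.clf), and the training label list (self.labels); LabelEncoder and np are presumably sklearn.preprocessing.LabelEncoder and NumPy. A minimal sketch of how those attributes might be prepared with scikit-learn; the toy training data, the classifier choice, and the attribute wiring are assumptions, not the bot's actual training code:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# toy training corpus; real training data would come from labeled GitHub issues
train_text = ["training is slow on gpu", "feature request: add a sparse op"]
train_labels = ["Performance", "Feature"]

# fit the TF-IDF vectorizer and a classifier that supports predict_proba
tv = TfidfVectorizer()
train_tfidf = tv.fit_transform(train_text).toarray()
clf = LogisticRegression()
clf.fit(train_tfidf, train_labels)

# a Predictor instance would then expose these as self.tv, self.clf and
# self.labels before ml_predict is called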
class TestLabelBot(unittest.TestCase):
    def setUp(self):
        self.df = DataFetcher()
        self.df.repo = "apache/incubator-mxnet"
        self.df.github_user = "******"
        self.df.github_oauth_token = "123"

    def tearDown(self):
        pass

    def test_cleanstr(self):
        new_string = self.df.cleanstr("a_b", "")
        self.assertEqual(new_string, "ab")

    def test_count_pages(self):
        with patch('DataFetcher.requests.get') as mocked_get:
            mocked_get.return_value.status_code = 200
            mocked_get.return_value.json.return_value = [{
                "body": "issue's body",
                "created_at": "2018-07-28T18:27:17Z",
                "comments": "0",
                "number": 11925,
                "labels": [{'name': 'Doc'}],
                "state": "open",
                "title": "issue's title",
                "html_url": "https://github.com/apache/incubator-mxnet/issues/11925",
            }, {
                "body": "issue's body",
                "created_at": "2018-07-28T18:27:17Z",
                "comments": "0",
                "number": 11924,
                "labels": [],
                "state": "closed",
                "title": "issue's title",
                "html_url": "https://github.com/apache/incubator-mxnet/issues/11925",
            }]
            page = self.df.count_pages('all')
            self.assertEqual(page, 1)

    def test_fetch_issues(self):
        with patch('DataFetcher.requests.get') as mocked_get:
            mocked_get.return_value.status_code = 200
            mocked_get.return_value.json.return_value = {
                "body": "issue's body",
                "created_at": "2018-07-28T18:27:17Z",
                "comments": "0",
                "number": 11925,
                "labels": [{'name': 'Feature'}],
                "state": "open",
                "title": "issue's title",
                "html_url": "https://github.com/apache/incubator-mxnet/issues/11925",
            }
            data = self.df.fetch_issues([11925])
            expected_data = [{
                'id': "11925",
                'title': "issue's title",
                'body': "issue's body"
            }]
            assert_frame_equal(data, pd.DataFrame(expected_data))

    def test_data2json(self):
        with patch('DataFetcher.requests.get') as mocked_get:
            mocked_get.return_value.status_code = 200
            mocked_get.return_value.json.return_value = [{
                "body": "issue's body",
                "created_at": "2018-07-28T18:27:17Z",
                "comments": "0",
                "number": 11925,
                "labels": [{'name': 'Feature'}],
                "state": "open",
                "title": "issue's title",
                "html_url": "https://github.com/apache/incubator-mxnet/issues/11925",
            }, {
                "body": "issue's body",
                "created_at": "2018-07-28T18:27:17Z",
                "comments": "0",
                "number": 11924,
                "labels": [],
                "state": "closed",
                "title": "issue's title",
                "html_url": "https://github.com/apache/incubator-mxnet/issues/11925",
            }]
            self.df.data2json('all', labels=["Feature"], other_labels=False)
            expected_data = [{
                'id': 11925,
                'title': "issue's title",
                'body': "issue's body",
                'labels': 'Feature'
            }]
            self.assertEqual(expected_data, self.df.json_data)
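
The suite can be run with the standard unittest runner; a minimal sketch, assuming the tests live in a module that is executed directly:

if __name__ == '__main__':
    unittest.main()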