def extract_features(self, x):
        """
        Prepare all features and return them as a list of arrays.

        :param x: table-like object (indexed as a pandas DataFrame) that
            provides the columns 'comment', 'functional_type' and 'code'
        :return: list of five arrays, in this order: number of
            whitespace-separated tokens per comment, stop-word count per
            comment, COSINE_TFIDF similarity between the preprocessed comment
            and the preprocessed code, the 'functional_type' column, and the
            comments after stop-word removal

        NOTE: the 'comment' column of ``x`` is overwritten in place with the
        stop-word-free text, so the caller's object is modified.
        """
        # Length and stop-word count are taken from the raw comments, i.e.
        # before the stop words are stripped from the column below.
        length_data = x['comment'].apply(lambda c: len(c.split())).to_numpy()
        stopwords_num = x['comment'].apply(
            FeatureHelper.get_stop_words_num).to_numpy()
        functional_types = x['functional_type'].to_numpy()
        x['comment'] = x['comment'].apply(DataProcesser.remove_stopwords)
        # Only the cosine similarity is part of the returned features; the
        # former per-row 'JACC' score was computed but never used, so that
        # extra pass over the rows has been dropped.
        code_comment_similarity_cosine = x.apply(
            lambda row: TextSimilarity.get_similarity_score(
                s1=DataProcesser.preprocess(row['comment']),
                s2=DataProcesser.preprocess(row['code']),
                type='COSINE_TFIDF'),
            axis=1).to_numpy()

        comment = x['comment'].to_numpy()
        features = [
            length_data, stopwords_num, code_comment_similarity_cosine,
            functional_types, comment
        ]
        return features
 def test_extract_snake_case(self):
     # Checks that extract_snake_case returns the underscore-separated
     # parts of an identifier joined by single spaces.
     processor = DataProcesser()
     # (input, expected) pairs: one underscore first, then several
     cases = [
         ('snake_case', 'snake case'),
         ('snake_case_oh_yea', 'snake case oh yea'),
     ]
     for raw, wanted in cases:
         actual = processor.extract_snake_case(raw)
         self.assertEqual(wanted, actual)
 def test_extract_camel_case(self):
     # Checks that extract_camel_case separates the capitalised words of an
     # identifier with spaces and leaves already separated text untouched.
     processor = DataProcesser()
     # (input, expected) pairs: single camel case, repeated camel case,
     # and text that is not camel case at all
     cases = [
         ('CamelCase', 'Camel Case'),
         ('CamelCaseCamelCase', 'Camel Case Camel Case'),
         ('Camel Case', 'Camel Case'),
     ]
     for raw, wanted in cases:
         actual = processor.extract_camel_case(raw)
         self.assertEqual(wanted, actual)
 def test_remove_java_tags(self):
     # Checks remove_java_tags on three inputs; leading whitespace left in
     # the result is stripped before comparing.
     processor = DataProcesser()
     # (input, expected) pairs: a plain tag, a brace-wrapped tag matched
     # by regexp, and text without any tag
     cases = [
         ('@author this', 'this'),
         ('{@link szdsdzsdz} this', 'this'),
         ('as this', 'as this'),
     ]
     for raw, wanted in cases:
         actual = processor.remove_java_tags(raw).lstrip()
         self.assertEqual(wanted, actual)
def is_line_java_keyword(code_line: str) -> bool:
    """
    Tell whether the line consists of nothing but a single Java keyword.

    The line is first passed through DataProcesser.remove_special_characters;
    the result must be equal to one of the entries of JAVA_KEYWORDS.

    :param code_line: the line to inspect
    :return: True if the cleaned line equals a keyword, False otherwise
    """
    stripped = DataProcesser.remove_special_characters(code_line)
    return any(stripped == keyword for keyword in JAVA_KEYWORDS)
def is_invalid_code(code_line: str) -> bool:
    """
    Decide whether a code line should be treated as invalid.

    A line is invalid when is_line_comment or is_line_java_tag flags the raw
    line, or when the line with special characters and spaces removed is
    whitespace-only, is accepted by is_line_java_keyword, or is accepted by
    is_only_special_char.

    :param code_line: the line to inspect
    :return: truthy if any of the checks above matches
    """
    # The comment and tag checks are evaluated on the raw line, before the
    # special characters and spaces are removed.
    comment_line = is_line_comment(code_line)
    tag_line = is_line_java_tag(code_line)
    stripped = DataProcesser.remove_special_characters(code_line).replace(" ", "")
    return (
        stripped.isspace()
        or comment_line
        or tag_line
        or is_line_java_keyword(stripped)
        or is_only_special_char(stripped)
    )
def is_only_special_char(code_line: str) -> bool:
    """
    Tell whether nothing is left of the line once special characters are removed.

    :param code_line: the line to inspect
    :return: True if the cleaned line is the empty string, False otherwise
    """
    # Spaces are turned into commas before the clean-up step.
    # NOTE(review): presumably so that remove_special_characters drops them
    # together with the other special characters - confirm against that helper.
    code_line = code_line.replace(" ", ",")
    code_line = DataProcesser.remove_special_characters(code_line)
    return code_line == ''
 def test_preprocess(self):
     # Checks that preprocess returns the empty string for the input
     # '@Override this'.
     processor = DataProcesser()
     cleaned = processor.preprocess('@Override this')
     self.assertEqual('', cleaned)