def process_one_log(self, input_log, repo_info_topics):
    """Score a single commit log message along several quality axes.

    Parameters
    ----------
    input_log : str
        Raw commit log message to evaluate.
    repo_info_topics :
        Currently unused (see TODO below); presumably repository topic
        metadata for a future relevance check — confirm with callers.

    Returns
    -------
    tuple
        (length, structural_integrity_score, topic_relevance_score,
         positivity_score, spelling_integrity_score)
    """
    input_log = PreprocessManager.remove_non_ascii(input_log)
    # TODO: Do we need repo info?
    # repo_info_topics = PreprocessManager.remove_non_ascii(repo_info_topics)

    # Token count of the message.
    # TODO: All scores that depend on the length are biased unless normalized.
    length = len(PreprocessManager.get_raw_tokenized_text(input_log))

    # Structural integrity: similarity between the message and its
    # grammar-corrected form, weighted by the number of error-free tokens.
    self.grammar_tool.enable_spellchecking()
    problematic_matches = self.grammar_tool.check(input_log)
    corrected_text = gc.correct(input_log, problematic_matches)
    degree_of_match = fuzz.ratio(input_log, corrected_text)
    structural_integrity_score = degree_of_match * (length - len(problematic_matches))

    # Topic relevance: topic 0 is seeded with commit-ish vocabulary via
    # `associations`, so summing the topic-0 scores approximates how much
    # the text looks like a commit message.
    # NOTE(review): still experimental; impact on final results unproven.
    sframe_data_for_topics = gl.SArray([PreprocessManager.get_word_counts(input_log)])
    # TODO: Make the associations proper (richer seed vocabulary).
    associations = gl.SFrame(
        {'word': ['fix', 'issue', 'implement', 'modify', 'changed', 'bug', 'error'],
         'topic': [0, 0, 0, 0, 0, 0, 0]})
    topic_model = gl.topic_model.create(sframe_data_for_topics,
                                        associations=associations)
    # TODO: Add the match with the description here. Useful? Maybe future work.
    # pred = topic_model.predict(sframe_data_for_topics, output_type='probability')
    topics = topic_model.get_topics()
    # Sum scores for topic 0 only (the seeded "commit message" topic).
    topic_relevance_score = sum(row['score'] for row in topics
                                if row['topic'] == 0)
    topic_relevance_score *= 100
    # print topics, topic_relevance_score

    # Positivity / sentiment of the message, scaled to 0-100.
    log_dict = {'text': input_log}
    positivity = self.senti_checker.predict_row(log_dict)
    positivity_score = 100 * positivity
    # print positivity_score

    # Spelling goodness: total tokens minus misspelled words.
    self.spell_master.set_text(input_log)
    error_words = [err.word for err in self.spell_master]
    spelling_integrity_score = length - len(error_words)

    return (length, structural_integrity_score, topic_relevance_score,
            positivity_score, spelling_integrity_score)