def _create_tf_idf_stack_vectors(self, only_questions=False):
    """ Create the tfidf vectors for the Stackexchange data.

    Fills self.question_vectors (and, unless only_questions is True,
    self.answer_vectors) keyed by post id, and returns the word -> id
    mapping used to index the vectors.
    """
    # Load question and answer corpus
    logging.info("Loading stack corpus and dictionary ...")
    q_corpus = self.stack_importer.get_question_corpus()
    a_corpus = self.stack_importer.get_answer_corpus()
    corpus = q_corpus + a_corpus
    dictionary = self.stack_importer.get_dictionary_from_corpora(
        [q_corpus, a_corpus])
    dict_size = len(dictionary)

    # Map each token to a dense id local to this run ("stack dictionary").
    stack_dict = dict((unicode(token), idx)
                      for idx, token in enumerate(dictionary.token2id))

    self.idf_values = zeros(dict_size)

    def vectorize(body):
        # Build the tf-idf vector for one document body; tokens that are
        # not in the stack dictionary are skipped.
        vec = zeros(dict_size)
        for token in body:
            idx = stack_dict.get(unicode(token), -1)
            if idx != -1:
                vec[idx] = self.tf_idf(token, idx, body, corpus)
        return vec

    logging.info("Determining question vectors ...")
    for question in StackCorpus(self.stack_importer.connection, "question"):
        self.question_vectors[question.id] = vectorize(question.body)
    logging.info("\n\tDone.")

    if only_questions:
        # Skip the answers
        return stack_dict

    logging.info("Determining answer vectors ...")
    for answer in StackCorpus(self.stack_importer.connection, "answer"):
        self.answer_vectors[answer.id] = vectorize(answer.body)
    logging.info("\n\tDone.")
    return stack_dict
def calculate_similarities(self): """ Applies the ESA algorithm to the global stack data """ # Open database connections self.stack_importer.open_stack_db() self.esa_importer.open_esa_db() # Clean tables logging.info("Cleaning similarity tables ...") self.esa_importer.create_clean_concept_doc_relation() self.esa_importer.create_clean_similarities_table() logging.info("Loading the inverted index ...") self.esa_importer.get_pruned_inverted_index(self.inverted_index) #print "Has beer " + str(self.inverted_index.get(unicode("beer"), None)) logging.info("Calculating stack tfidf vectors ...") stack_dictionary = self._create_tf_idf_stack_vectors() # For each question calculate similarity with each answer logging.info("\nCalculating questions-answers similarities ...") question_corpus = StackCorpus(self.stack_importer.connection, "question") for question in question_corpus: q_vector = self.get_esa_vector(question.id, question.body, self.question_vectors[question.id], stack_dictionary, 1) q_vector_norm = norm(q_vector) similarities = [] answer_corpus = StackCorpus(self.stack_importer.connection, "answer") for answer in answer_corpus: a_vector = self.get_esa_vector(answer.id, answer.body, self.answer_vectors[answer.id], stack_dictionary, 2) sim = self.similarity(q_vector, q_vector_norm, a_vector) similarities.append((question.id, answer.id, sim)) # Save similarities to databse logging.info("\nSaving similarities to database ...") self.esa_importer.save_similarities(similarities) self.esa_importer.close_esa_db() self.stack_importer.close_stack_db() logging.info("\nDone")
def calculate_similarities(self):
    """ Applies the LDA model to the global stack data.

    Learns the model, then for every question compares its topic
    distribution with every answer's and stores the similarities.
    """
    # Open database connections
    self.lda_importer.open_lda_db()
    self.stack_importer.open_stack_db()

    # Clean similarity table
    self.lda_importer.create_clean_similarities_table()

    self._learn_model()
    logging.info("Loading dictionary ...")
    self._load_dictionary()

    logging.info("Calculating questions/answers similarities ...")
    question_corpus = StackCorpus(self.stack_importer.connection,
                                  "question")
    for question in question_corpus:
        # Use logging like the rest of the module instead of print.
        logging.info("Question %s", str(question.id))
        similarities = []
        answer_corpus = StackCorpus(self.stack_importer.connection,
                                    "answer")

        # Get topics in the question
        bow = self.dictionary.doc2bow(question.body)
        question_topics = self.model[bow]

        for answer in answer_corpus:
            # Get topics in the answer
            bow = self.dictionary.doc2bow(answer.body)
            answer_topics = self.model[bow]
            # Similarities
            similarities.append(
                (question.id, answer.id,
                 self._compare_documents(question_topics, answer_topics)))

        # Save this question's batch of similarities to database
        logging.info("\nSaving similarities to database ...")
        self.lda_importer.save_similarities(similarities)

    # Close database connections
    self.stack_importer.close_stack_db()
    self.lda_importer.close_lda_db()
def run_experiment_2_avg(self, experiment_type='2_avg',
                         algorithm='lda_local_2'):
    """ Computes averaged precision/recall over all answer-count limits
    and writes the resulting PR curve to a .dat file.

    experiment_type -- label used for storing/naming the results.
    algorithm -- suffix used in the output file name.
    """
    self.experiments.open_experiment_db()
    self.lda_importer.open_lda_db()
    self.stack_importer.open_stack_db()

    total_answers = self.stack_importer.get_number_of_answers()
    # Get number of answers for each question
    number_of_answers = self.stack_importer.get_number_of_original_answers()

    # Load similarities for each question
    logging.info("Loading similarities ...")
    question_corpus = StackCorpus(self.stack_importer.connection,
                                  "question")
    similar_answers = {}
    original_answers = {}
    for question in question_corpus:
        original_answers[question.id] = \
            self.stack_importer.get_question_original_answers(question.id)
        similar_answers[question.id] = \
            self.lda_importer.load_similarities_for_question(
                question.id, -1, False)
    self.stack_importer.close_stack_db()
    self.lda_importer.close_lda_db()

    # Calculate avg precision and recall for each case
    precision = {}
    recall = {}
    for limit in xrange(1, total_answers + 1):
        # Use logging like the rest of the module instead of print.
        logging.info("Calculating with limit %s", str(limit))
        # NOTE(review): reuses the experiment-1 aggregation routine —
        # looks intentional (same metric, different similarity source);
        # confirm.
        avg_precision, avg_recall = self.experiments.run_experiment_1_avg(
            number_of_answers, original_answers, similar_answers,
            experiment_type, limit)
        precision[limit] = avg_precision
        recall[limit] = avg_recall

    # Save into the database
    self.experiments.save_experiment_results(experiment_type, precision,
                                             recall)

    # Write them in a file
    folder = self.setting[
        "experiments_folder"] + experiment_type + '_' + algorithm + '.dat'
    self.experiments.write_pr_curve(experiment_type, folder)
    self.experiments.close_experiment_db()
    logging.info("\nDone")
def run_experiment_3_avg(self, algorithm='esa', experiment_type='3_avg'):
    """ Similar to experiment_1, but checking users instead of answers """
    self.experiments.open_experiment_db()
    self.esa_importer.open_esa_db()
    self.stack_importer.open_stack_db()

    # Upper bound for the recommendation-list limit.
    active_users = len(self.stack_importer.get_active_users())
    # question id -> users who actually answered that question.
    asked_users = self.stack_importer.get_original_users()

    # Load similarities for each question that has known answerers.
    logging.info("Loading similarities ...")
    similar_users = {}
    original_users = {}
    for question in StackCorpus(self.stack_importer.connection, "question"):
        answered_by = asked_users.get(question.id, None)
        if answered_by is not None:
            original_users[question.id] = answered_by
            similar_users[question.id] = \
                self.esa_importer.load_similarities_for_question(
                    question.id, -1, False)
    self.stack_importer.close_stack_db()
    self.esa_importer.close_esa_db()

    # Average precision/recall for every limit value.
    precision = {}
    recall = {}
    for limit in xrange(1, active_users + 1):
        logging.info("Calculating with limit %s", str(limit))
        avg_p, avg_r = self.experiments.run_experiment_3_avg(
            asked_users, original_users, similar_users, experiment_type,
            limit)
        precision[limit] = avg_p
        recall[limit] = avg_r

    # Persist results, then dump the PR curve to a .dat file.
    self.experiments.save_experiment_results(experiment_type, precision,
                                             recall)
    out_path = (self.setting["experiments_folder"] + experiment_type +
                '_' + algorithm + '.dat')
    self.experiments.write_pr_curve(experiment_type, out_path)
    self.experiments.close_experiment_db()
    logging.info("\nDone")
def calculate_tf_idf_similarities(self):
    """Applies the TF-IDF algorithm to the global stack data"""
    # Open database connections
    self.stack_importer.open_stack_db()
    self.esa_importer.open_esa_db()

    # Clean tables
    logging.info("Cleaning similarity tables ...")
    self.esa_importer.create_clean_similarities_table()

    # Fills self.question_vectors / self.answer_vectors.
    logging.info("Calculating stack tfidf vectors ...")
    self._create_tf_idf_stack_vectors()

    # Compare every question's tf-idf vector against every answer's.
    logging.info("\nCalculating questions-answers similarities ...")
    for question in StackCorpus(self.stack_importer.connection, "question"):
        query_vec = self.question_vectors[question.id]
        query_norm = norm(query_vec)
        sims = []
        for answer in StackCorpus(self.stack_importer.connection, "answer"):
            sims.append((question.id, answer.id,
                         self.similarity(query_vec, query_norm,
                                         self.answer_vectors[answer.id])))

        # Persist this question's batch of similarities.
        logging.info("\nSaving similarities to database ...")
        self.esa_importer.save_similarities(sims)

    self.esa_importer.close_esa_db()
    self.stack_importer.close_stack_db()
    logging.info("\nDone")
def calculate_esa_similarities_to_users(self):
    """ Applies the ESA algorithm between questions and active users.

    For every question, compares its ESA vector against a per-user ESA
    vector built from each active user's data, and stores the
    (question, user, similarity) triples.
    """
    # Open database connections
    self.stack_importer.open_stack_db()
    self.esa_importer.open_esa_db()

    # Clean tables
    logging.info("Cleaning similarity tables ...")
    self.esa_importer.create_clean_similarities_table()

    logging.info("Loading the inverted index ...")
    self.esa_importer.get_pruned_inverted_index(self.inverted_index)

    # Only question vectors are needed here.
    logging.info("Calculating questions tfidf vectors ...")
    stack_dictionary = self._create_tf_idf_stack_vectors(
        only_questions=True)

    # For each question determine which other users would have been asked
    logging.info("Calculating questions/users similarities ...")
    question_corpus = StackCorpus(self.stack_importer.connection,
                                  "question")
    users = self.stack_importer.get_active_users()
    for question in question_corpus:
        # Use logging like the rest of the module instead of print.
        logging.info("Question %s", str(question.id))
        q_vector = self.get_esa_vector(question.id, question.body,
                                       self.question_vectors[question.id],
                                       stack_dictionary, 1)
        q_vector_norm = norm(q_vector)
        similarities = []
        for user_id in users:
            # Builds the user's tf-idf vector (stored in
            # self.user_vectors) and returns the user's token body.
            user_body = self._create_user_tf_idf_stack_vector(
                user_id, stack_dictionary)
            u_vector = self.get_esa_vector(user_id, user_body,
                                           self.user_vectors[user_id],
                                           stack_dictionary, 2)
            sim = self.similarity(q_vector, q_vector_norm, u_vector)
            similarities.append((question.id, user_id, sim))

        # Save this question's batch of similarities to database
        logging.info("\nSaving similarities to database ...")
        self.esa_importer.save_similarities(similarities)

    self.esa_importer.close_esa_db()
    self.stack_importer.close_stack_db()
    logging.info("\nDone")
def calculate_local_esa_similarities(self):
    """ Applies the ESA algorithm to the local stack data. This local
    data is measured per user.

    Users with 5 or fewer answers are skipped and collected instead.
    Returns the list of filtered (skipped) users.
    """
    # Keep filtered users
    filtered_users = []

    # Open database connections
    self.stack_importer.open_stack_db()
    self.esa_importer.open_esa_db()

    # Clean tables
    logging.info("Cleaning similarity tables ...")
    self.esa_importer.create_clean_similarities_table()

    logging.info("Loading the inverted index ...")
    self.esa_importer.get_pruned_inverted_index(self.inverted_index)

    # For each question calculate its similarity with all the answers
    # given by the users who answered the given question
    logging.info("Calculating questions/answers similarities ...")
    question_corpus = StackCorpus(self.stack_importer.connection,
                                  "question")
    for question in question_corpus:
        # Use logging like the rest of the module instead of print.
        logging.info("Question %s", str(question.id))
        similarities = []

        # Get the users that gave an answer to the question
        users = self.stack_importer.get_users_from_question(question.id)
        logging.info("Users that replied: %s", str(len(users)))

        # Calculate the similarities of question with all
        # answers from the given users (related or not to question)
        for user_id in users:
            user_answers = self.stack_importer.get_user_answers_to_questions(
                user_id)
            # Only consider users with more than 5 answers
            if len(user_answers) > 5:
                logging.info("User %s", str(user_id))
                # Recompute tf-idf vectors scoped to this user only; the
                # shared vector caches must be cleared first.
                self.question_vectors.clear()
                self.answer_vectors.clear()
                stack_dictionary = self._create_local_tf_idf_stack_vectors(
                    user_id)
                q_vector = self.get_esa_vector(
                    question.id, question.body,
                    self.question_vectors[question.id], stack_dictionary, 1)
                q_vector_norm = norm(q_vector)
                for answer in user_answers:
                    a_vector = self.get_esa_vector(
                        answer.id, answer.body,
                        self.answer_vectors[answer.id], stack_dictionary, 2)
                    sim = self.similarity(q_vector, q_vector_norm, a_vector)
                    similarities.append((question.id, answer.id, sim))
            else:
                filtered_users.append(user_id)

        # Save this question's batch of similarities to database
        logging.info("\nSaving similarities to database ...")
        self.esa_importer.save_similarities(similarities)

    self.esa_importer.close_esa_db()
    self.stack_importer.close_stack_db()
    logging.info("\nDone")
    return filtered_users
def calculate_local_similarities(self):
    """ Calculates similarities between local questions/answers.

    A per-user LDA model is learned for every qualifying user; users
    with 5 or fewer answers are skipped and collected instead.
    Returns the list of filtered (skipped) users.
    """
    # Keep filtered users
    filtered_users = []

    # Open database connections
    self.lda_importer.open_lda_db()
    self.stack_importer.open_stack_db()

    # Clean similarity table
    self.lda_importer.create_clean_similarities_table()

    # For each question calculate its similarity with all the answers
    # given by the users who answered the given question
    logging.info("Calculating questions/answers similarities ...")
    question_corpus = StackCorpus(self.stack_importer.connection,
                                  "question")
    for question in question_corpus:
        # Use logging like the rest of the module instead of print.
        logging.info("Question %s", str(question.id))
        similarities = []

        # Get the users that gave an answer to the question
        users = self.stack_importer.get_users_from_question(question.id)
        logging.info("Users that replied: %s", str(len(users)))

        # Calculate the similarities of question with all
        # answers from the given users (related or not to question)
        for user_id in users:
            user_answers = self.stack_importer.get_user_answers_to_questions(
                user_id)
            # Only consider users with more than 5 answers
            # (the original comment said "more than 1", which did not
            # match the code).
            if len(user_answers) > 5:
                logging.info("User %s", str(user_id))
                self._learn_local_model(user_id)

                # Get topics in the question
                bow = self.dictionary.doc2bow(question.body)
                question_topics = self.model[bow]

                # Get topics in the answers and calculate similarities
                # with the current question
                for answer in user_answers:
                    bow = self.dictionary.doc2bow(answer.body)
                    answer_topics = self.model[bow]
                    # Similarities
                    similarities.append(
                        (question.id, answer.id,
                         self._compare_documents(question_topics,
                                                 answer_topics)))
            else:
                filtered_users.append(user_id)

        # Save this question's batch of similarities to database
        logging.info("\nSaving similarities to database ...")
        self.lda_importer.save_similarities(similarities)

    # Close database connections
    self.stack_importer.close_stack_db()
    self.lda_importer.close_lda_db()
    return filtered_users