def calculate_similarity(self, query_file, data_file, filename, k=None):
    """Score every query in *query_file* against *data_file* via tf.idf.

    The scored results are written to the file named *filename*.

    NOTE(review): the *k* parameter is accepted but never used — confirm
    whether it was intended to limit the number of results per query.
    """
    queries = doc.DocumentSet(query_file)
    corpus = doc.DocumentSet(data_file)
    scored = []
    for query in queries.documents:
        scored += self._tfidf(query, corpus)
    output.write_output_file(filename, scored)
def main():
    """Compute the basic word-overlap similarity between qrys.txt and docs.txt.

    The per-query overlap results are written out to the file 'overlap.top'.
    """
    queries = doc.DocumentSet('data/qrys.txt')
    corpus = doc.DocumentSet('data/docs.txt')
    overlaps = []
    for query in queries.documents:
        overlaps += _calculate_overlap(query, corpus.inverted_index)
    # Persist the collected overlap scores.
    output.write_output_file('overlap.top', overlaps)
def start_program():
    """Interactive entry point: resolve a VK account from user input,
    rank related users, attach photos to the top matches, and write the
    report to both the database and an output file.
    """
    account = input('Введите id аккаунта или ник ')
    try:
        # Numeric input is treated as a raw user id.
        vk.params['user_id'] = int(account)
    except ValueError:
        # Non-numeric input is treated as a screen name (nickname).
        print("введен ник")
        vk.params['screen_name'] = account
    user_info = vk.get_user_info()
    user_id = user_info['response'][0]['id']
    print(user_id)
    candidates = vk.search_users(user_info)
    comparison = vk.compare_friends_groups(candidates)
    top10 = s.find_top10(comparison)
    top10_with_photos = vk.find_top3_photos(top10)
    report = o.create_output_file(top10_with_photos)
    db.write_db_output(user_id, report)
    o.write_output_file(report)
    print("Программа завершена")
def calculate_similarity(self, query_file, data_file, filename):
    """Score queries against documents using blind relevance feedback.

    For each query: run an initial tf.idf pass, merge the query with the
    top ``n_d`` documents into a single 'mega document', select the ``n_w``
    highest-scoring words from it as an expanded query, then score that
    expanded query. All results are written to the file named *filename*.
    """
    queries_set = doc.DocumentSet(query_file)
    documents_set = doc.DocumentSet(data_file)
    results = []
    for query in queries_set.documents:
        # First-pass tf.idf scores for the raw query.
        first_pass = self.tf_idf._tfidf(query, documents_set)
        # Negate scores so an ascending sort yields highest-score-first,
        # then keep only the top n_d documents.
        ranked = sorted((-score, document) for (_, document, score) in first_pass)
        top_docs = [document for (_, document) in ranked[:self.n_d]]
        # Merge the query's words with the selected documents' words
        # into one combined 'mega document'.
        combined = counter.Counter(query.words_counter)
        for document in top_docs:
            combined += document.words_counter
        mega_document = doc.document_from_dict(None, dict(combined))
        # Rank the mega document's words by tf.idf (same negation trick;
        # ties broken alphabetically by word) and keep the best n_w.
        scored_words = []
        for word in sorted(mega_document.words_counter):
            score = self.tf_idf._document_tfidf(word, mega_document, documents_set)
            scored_words.append((-score, word))
        best_words = [(word, -neg) for (neg, word) in sorted(scored_words)[:self.n_w]]
        # Re-run tf.idf with the expanded query under the original query id.
        expanded_query = doc.document_from_dict(query.id, dict(best_words))
        results.extend(self.tf_idf._tfidf(expanded_query, documents_set))
    output.write_output_file(filename, results)