def test_get_grouped_articles(self):
    """Check that groups written to the database can be read back."""
    # Before anything is written the reader is expected to return no groups.
    stored_before = database_reader.get_grouped_articles()
    self.assertEqual(stored_before, [])

    # Group the fixture articles, then persist both the articles and groups.
    articles = test_utils.SIMILAR_ARTICLES
    expected_groups = classifier.group_articles(articles)
    database_writer.write_articles(articles)
    database_writer.write_groups(expected_groups)

    # The first group read back must equal the first group that was written.
    stored_after = database_reader.get_grouped_articles()
    self.assertEqual(stored_after[0], expected_groups[0])
def group_articles(article_list=None, debug=False):
    """Group articles together with the groupings already in the database.

    Every article that has keywords is compared against each current grouping.
    It is added to the most similar grouping when that similarity exceeds
    ``constants.MIN_GROUPING_PERCENTAGE``; otherwise it starts a new grouping.
    While scanning, if the best similarity found so far already exceeds
    ``constants.MIN_COMBINE_GROUP_PERCENTAGE`` and an even more similar
    grouping turns up, the two groupings are merged into one.

    Articles without keywords are collected into a single separate grouping
    that is appended to the end of the result.

    Args:
        article_list: Articles to group. Plain strings are wrapped as
            ``models.Article(url=...)``. When ``None``, the ungrouped articles
            are read from the database.
        debug: When true, print a progress line for every article.

    Returns:
        The list of groupings: those read from the database (possibly merged
        and extended) followed by any newly created ones.
    """
    if article_list is None:
        article_list = database_reader.get_ungrouped_articles()
    else:
        article_list = [
            models.Article(url=a) if isinstance(a, (str, unicode)) else a
            for a in article_list
        ]

    groupings = database_reader.get_grouped_articles()
    no_keyword_grouping = None

    for index, article in enumerate(article_list):
        if debug:
            # A single-argument call prints the same text whether ``print``
            # is the Python 2 statement or the print function.
            print("Grouping %d out of %d" % (index, len(article_list)))

        if not article.get_keywords():
            if no_keyword_grouping is None:
                # in_database is set to True here because we do not want a
                # no-keyword grouping in the database.
                no_keyword_grouping = models.Grouping(
                    article, in_database=True)
            else:
                no_keyword_grouping.add_article(article)
            continue  # Articles without keywords cannot be compared.

        best_grouping, best_grouping_similarity = None, 0
        # Iterate over a shallow copy because merging two groupings removes
        # one of them from ``groupings`` during the loop.
        for grouping in groupings[:]:
            similarity = grouping.best_similarity(article)
            if similarity > best_grouping_similarity:
                # The article is highly similar to two separate groupings, so
                # combine them and keep the survivor as the best grouping.
                if (best_grouping is not None and best_grouping_similarity >
                        constants.MIN_COMBINE_GROUP_PERCENTAGE):
                    if best_grouping.in_database():
                        if grouping.in_database():
                            database_writer.remove_grouping_from_database(
                                grouping)
                        best_grouping.combine_group(grouping)
                        groupings.remove(grouping)
                        # ``best_grouping`` survives the merge; it must stay
                        # the target so the article is not added to the
                        # grouping that was just removed from the list.
                    else:
                        grouping.combine_group(best_grouping)
                        groupings.remove(best_grouping)
                        best_grouping = grouping
                else:
                    best_grouping = grouping
                best_grouping_similarity = similarity

        if (best_grouping is not None and best_grouping_similarity >
                constants.MIN_GROUPING_PERCENTAGE):
            best_grouping.add_article(article)
        else:
            groupings.append(models.Grouping(article))

    if no_keyword_grouping:
        groupings.append(no_keyword_grouping)
    return groupings
def write_overall_fits(grouping_list=None):
    """Store the overall fit coordinates of every grouped article.

    The articles of all groupings are flattened into one list and passed to
    ``models.calculate_fit`` (with ``max_iter=500``). Each resulting
    ``(article, fit)`` pair updates the ``fit_x``/``fit_y`` columns of the
    article row whose ``link`` equals the article's URL.

    Args:
        grouping_list: Groupings to process. When ``None``, the grouped
            articles are read from the database.
    """
    if grouping_list is None:
        grouping_list = database_reader.get_grouped_articles()

    update_sql = "UPDATE article SET fit_x = ?, fit_y = ? WHERE link = ?"
    with database_utils.DatabaseConnection() as (_connection, cursor):
        # Flatten the groupings into one list of articles.
        all_articles = []
        for grouping in grouping_list:
            all_articles.extend(grouping.get_articles())

        fits = models.calculate_fit(all_articles, max_iter=500)
        # Progress is reported with a 1-based position.
        for position, (article, fit) in enumerate(fits, 1):
            _print_status("fits", position, len(fits))
            cursor.execute(update_sql, (fit[0], fit[1], article.get_url()))
def write_group_fits(grouping_list=None):
    """Store the per-group fit coordinates of articles.

    For every grouping, ``grouping.calculate_fit()`` provides
    ``(article, fit)`` pairs. Each pair updates the
    ``group_fit_x``/``group_fit_y`` columns of the article row whose ``link``
    equals the article's URL.

    Args:
        grouping_list: Groupings to process. When ``None``, only the stored
            groupings whose UUID is reported by
            ``database_reader.get_groups_with_unfit_articles()`` are used.
    """
    if grouping_list is None:
        # A set keeps the membership test below constant-time, and the loop
        # variable no longer shadows the ``id`` builtin.
        unfit_group_ids = set(
            str(group_id)
            for group_id in database_reader.get_groups_with_unfit_articles()
        )
        grouping_list = [
            group for group in database_reader.get_grouped_articles()
            if group.get_uuid() in unfit_group_ids
        ]

    update_sql = (
        "UPDATE article SET group_fit_x = ?, group_fit_y = ? WHERE link = ?")
    with database_utils.DatabaseConnection() as (_connection, cursor):
        for i, grouping in enumerate(grouping_list):
            # NOTE(review): this index is 0-based, whereas write_overall_fits
            # passes a 1-based counter -- confirm which one _print_status
            # expects.
            _print_status("group fits", i, len(grouping_list))
            for article, fit in grouping.calculate_fit():
                cursor.execute(
                    update_sql, (fit[0], fit[1], article.get_url()))