예제 #1
0
 def test_get_grouped_articles(self):
     """Check that groups written to the database are read back."""
     # The reader is expected to return an empty list before this test
     # writes anything.
     self.assertEqual(database_reader.get_grouped_articles(), [])

     similar_articles = test_utils.SIMILAR_ARTICLES
     expected_groups = classifier.group_articles(similar_articles)
     database_writer.write_articles(similar_articles)
     database_writer.write_groups(expected_groups)

     # Only the first stored group is compared with the first computed one.
     stored_groups = database_reader.get_grouped_articles()
     self.assertEqual(stored_groups[0], expected_groups[0])
예제 #2
0
def group_articles(article_list=None, debug=False):
    """Group good articles in the database.

    Every article that has keywords is added to the grouping it is most
    similar to, or starts a new grouping when no grouping is similar enough
    (constants.MIN_GROUPING_PERCENTAGE). While scanning, if an article is
    highly similar (constants.MIN_COMBINE_GROUP_PERCENTAGE) to one grouping
    and even more similar to a later one, the two groupings are merged.
    Articles without keywords are collected into a single extra grouping.

    Args:
        article_list: Articles to group. Items may be article objects or URL
            strings; strings are wrapped in models.Article(url=...). When
            None, the articles come from
            database_reader.get_ungrouped_articles().
        debug: When true, print a progress line for every article.

    Returns:
        The list obtained from database_reader.get_grouped_articles(),
        modified in place: merged groupings removed, new groupings appended,
        and the no-keyword grouping (if any) appended last.

    Side effects:
        Calls database_writer.remove_grouping_from_database() for a
        database-backed grouping that is merged into another database-backed
        grouping.
    """
    if article_list is None:
        article_list = database_reader.get_ungrouped_articles()
    else:
        try:
            string_types = (str, unicode)
        except NameError:  # Interpreter without a separate unicode type.
            string_types = (str,)
        article_list = [
            models.Article(url=a) if isinstance(a, string_types) else a
            for a in article_list
        ]
    groupings = database_reader.get_grouped_articles()
    no_keyword_grouping = None
    for index, article in enumerate(article_list):
        if debug:
            # Single-argument form prints the same text with the print
            # statement and the print function.
            print("Grouping %s out of %s" % (index, len(article_list)))
        if not article.get_keywords():
            if no_keyword_grouping is None:
                # in_database is set to True here because we do not want a
                # no keyword grouping in the database.
                no_keyword_grouping = models.Grouping(article,
                                                      in_database=True)
            else:
                no_keyword_grouping.add_article(article)
            continue  # Skip the article if the keywords cannot be gotten from it.
        best_grouping, best_grouping_similarity = None, 0

        # Iterate over a shallow copy because merging removes items from
        # the real list.
        for grouping in groupings[:]:
            similarity = grouping.best_similarity(article)
            if similarity <= best_grouping_similarity:
                continue
            # If this article has a high similarity with two separate
            # groups, then combine the groups.
            if best_grouping_similarity > constants.MIN_COMBINE_GROUP_PERCENTAGE:
                if best_grouping.in_database():
                    if grouping.in_database():
                        database_writer.remove_grouping_from_database(
                            grouping)
                    best_grouping.combine_group(grouping)
                    groupings.remove(grouping)
                    # best_grouping absorbed `grouping` and is the one still
                    # in the list, so it must stay the best candidate.
                    # Rebinding to `grouping` here would add the article to
                    # a grouping that is no longer returned.
                    best_grouping_similarity = similarity
                    continue
                grouping.combine_group(best_grouping)
                groupings.remove(best_grouping)
            best_grouping = grouping
            best_grouping_similarity = similarity
        if best_grouping is not None and best_grouping_similarity > constants.MIN_GROUPING_PERCENTAGE:
            best_grouping.add_article(article)
        else:
            groupings.append(models.Grouping(article))
    # Explicit None check: do not depend on the truthiness of a Grouping.
    if no_keyword_grouping is not None:
        groupings.append(no_keyword_grouping)
    return groupings
예제 #3
0
def write_overall_fits(grouping_list=None):
    """Compute one fit across all grouped articles and store it.

    Args:
        grouping_list: Groupings whose articles are fitted together. When
            None, they are loaded with database_reader.get_grouped_articles().

    The x/y fit of every article is written to the fit_x / fit_y columns of
    the article row whose link equals the article URL.
    """
    if grouping_list is None:
        grouping_list = database_reader.get_grouped_articles()
    with database_utils.DatabaseConnection() as (connection, cursor):
        # Flatten all groupings into one list of articles.
        all_articles = []
        for grouping in grouping_list:
            all_articles.extend(grouping.get_articles())
        fits = models.calculate_fit(all_articles, max_iter=500)
        # Progress is reported with a 1-based position.
        for position, (article, fit) in enumerate(fits, 1):
            _print_status("fits", position, len(fits))
            cursor.execute(
                "UPDATE article SET fit_x = ?, fit_y = ? WHERE link = ?",
                (fit[0], fit[1], article.get_url()))
예제 #4
0
def write_group_fits(grouping_list=None):
    """Write the group fits into the database.

    Args:
        grouping_list: Groupings to fit. When None, the groupings returned by
            database_reader.get_grouped_articles() are filtered down to those
            whose uuid appears among
            database_reader.get_groups_with_unfit_articles().

    For each grouping, the x/y values from grouping.calculate_fit() are
    written to the group_fit_x / group_fit_y columns of the article row whose
    link equals the article URL.
    """
    if grouping_list is None:
        # Ids are compared as strings; a set keeps each membership test
        # constant-time. Presumably get_uuid() returns a string -- verify.
        unfit_group_ids = set(
            str(group_id)
            for group_id in database_reader.get_groups_with_unfit_articles())
        grouping_list = [
            group for group in database_reader.get_grouped_articles()
            if group.get_uuid() in unfit_group_ids
        ]
    with database_utils.DatabaseConnection() as (connection, cursor):
        # NOTE(review): the progress index is 0-based here, whereas
        # write_overall_fits reports a 1-based index -- confirm which one
        # _print_status expects.
        for i, grouping in enumerate(grouping_list):
            _print_status("group fits", i, len(grouping_list))
            for article, fit in grouping.calculate_fit():
                cursor.execute(
                    "UPDATE article SET group_fit_x = ?, group_fit_y = ? WHERE link = ?",
                    (fit[0], fit[1], article.get_url()))