Пример #1
0
def create_matches_wanted_galleries_from_providers_internal(
        wanted_galleries: QuerySet,
        provider_filter: str = '', cutoff: float = 0.4, max_matches: int = 20,
        must_be_used: bool = False
) -> None:
    """Create ``GalleryMatch`` rows for wanted galleries from titles already in the database.

    A single list of ``(cleaned title, gallery pk)`` pairs is built from the
    eligible galleries (``title`` and ``title_jpn`` each contribute an entry
    when set). Every wanted gallery's ``search_title`` is then compared
    against that list, and each similar entry is stored as a ``GalleryMatch``
    unless a ``FoundGallery`` row already links the same pair.

    Args:
        wanted_galleries: Wanted galleries to look for candidates for.
        provider_filter: When non-empty, only galleries whose provider
            contains this string are considered.
        cutoff: Forwarded to ``get_list_closer_gallery_titles_from_list``.
        max_matches: Forwarded to ``get_list_closer_gallery_titles_from_list``.
        must_be_used: When true, keep only galleries that have an archive,
            or whose gallery container or magazine has an archive.

    Returns:
        None. Any exception raised while matching is logged at critical
        level and swallowed.
    """

    try:
        galleries_title_id = []

        if provider_filter:
            galleries = Gallery.objects.eligible_for_use(provider__contains=provider_filter)
        else:
            galleries = Gallery.objects.eligible_for_use()

        if must_be_used:
            galleries = galleries.filter(
                Q(archive__isnull=False)
                | (Q(gallery_container__isnull=False) & Q(gallery_container__archive__isnull=False))
                | (Q(magazine__isnull=False) & Q(magazine__archive__isnull=False))
            )

        # Both titles of a gallery point at the same pk, so either can match.
        for gallery in galleries:
            if gallery.title:
                galleries_title_id.append(
                    (clean_title(gallery.title), gallery.pk))
            if gallery.title_jpn:
                galleries_title_id.append(
                    (clean_title(gallery.title_jpn), gallery.pk))

        logger.info(
            "Trying to match against gallery database, "
            "{} wanted galleries with no match. Provider filter: {}".format(
                wanted_galleries.count(),
                provider_filter
            )
        )
        for wanted_gallery in wanted_galleries:

            similar_list = get_list_closer_gallery_titles_from_list(
                wanted_gallery.search_title, galleries_title_id, cutoff, max_matches)

            if similar_list is None:
                continue

            logger.info("Found {} matches from title for {}".format(len(similar_list), wanted_gallery.search_title))
            for similar in similar_list:
                # We filter here instead of the Gallery model, because we use the same list for every WG.
                # exists() asks the database for a boolean instead of loading the matching rows.
                already_found = FoundGallery.objects.filter(
                    wanted_gallery=wanted_gallery, gallery_id=similar[1]
                ).exists()
                if not already_found:
                    GalleryMatch.objects.get_or_create(
                        wanted_gallery=wanted_gallery,
                        gallery_id=similar[1],
                        defaults={'match_accuracy': similar[2]})

        logger.info("Matching ended")
        return
    # NOTE(review): BaseException also traps KeyboardInterrupt/SystemExit; kept
    # as-is so callers keep seeing no exception from this function.
    except BaseException:
        logger.critical(traceback.format_exc())
Пример #2
0
    def test_repeated_archives(self):
        # Exercises get_list_closer_gallery_titles_from_list against the
        # (title, pk) pairs of every gallery currently in the test database.

        title_to_check = 'my title'
        galleries_title_id = [(gallery.title, gallery.pk) for gallery in Gallery.objects.all()]
        cutoff = 0.4
        max_matches = 10

        # A title with nothing similar enough is expected to yield None.
        similar_list = get_list_closer_gallery_titles_from_list(
            title_to_check, galleries_title_id, cutoff, max_matches)

        self.assertIsNone(similar_list)

        title_to_check_2 = 'public gallery 1'

        similar_list = get_list_closer_gallery_titles_from_list(
            title_to_check_2, galleries_title_id, cutoff, max_matches)

        self.assertIsNotNone(similar_list)
        self.assertEqual(len(similar_list), 2)
        self.assertEqual(similar_list[0][0], 'sample non public gallery 1')
        # The accuracy is a computed float: compare with a tolerance rather
        # than requiring bit-for-bit equality.
        self.assertAlmostEqual(similar_list[0][2], 0.7441860465116279)
Пример #3
0
def match_internal(archives: ArchiveQuerySet,
                   providers: Iterable[str],
                   logger: OptionalLogger,
                   cutoff: float = 0.4,
                   max_matches: int = 20,
                   match_by_filesize: bool = True) -> None:
    """Record possible gallery matches for archives using only the local database.

    For every archive, a comparable title is derived from its file name and
    compared against the titles of eligible galleries, once per provider
    bucket; each similar gallery is stored as a ``'title'`` entry in
    ``ArchiveMatches``. Optionally, galleries whose ``filesize`` equals the
    archive's are stored as ``'size'`` entries with accuracy 1.

    Args:
        archives: Archives to find matches for.
        providers: Provider names; galleries are bucketed per provider
            (``provider__contains``). When empty, a single ``'all'`` bucket
            with every eligible gallery is used.
        logger: Optional logger; nothing is logged when it is falsy.
        cutoff: Forwarded to ``get_list_closer_gallery_titles_from_list``.
        max_matches: Forwarded to ``get_list_closer_gallery_titles_from_list``.
        match_by_filesize: When false, the filesize comparison is skipped.
    """

    galleries_per_provider: Dict[str, GalleryQuerySet] = {}
    galleries_title_id_per_provider: Dict[str, List[Tuple[str, str]]] = {}

    if providers:
        for provider in providers:
            galleries_per_provider[provider] = Gallery.objects.eligible_for_use(
                provider__contains=provider)
    else:
        galleries_per_provider['all'] = Gallery.objects.eligible_for_use()

    # Build the (title, pk) lists once; they are reused for every archive.
    for provider, galleries in galleries_per_provider.items():
        galleries_title_id_per_provider[provider] = []
        for gallery in galleries:
            if gallery.title:
                galleries_title_id_per_provider[provider].append(
                    (replace_illegal_name(gallery.title), gallery.pk))
            if gallery.title_jpn:
                galleries_title_id_per_provider[provider].append(
                    (replace_illegal_name(gallery.title_jpn), gallery.pk))

    for i, archive in enumerate(archives, start=1):  # type: ignore

        for provider, galleries_title_id in galleries_title_id_per_provider.items():

            # Prefer the provider's own title matcher to normalise the file
            # name; fall back to the generic path-based title otherwise.
            if provider != 'all':
                matchers = crawler_settings.provider_context.get_matchers(
                    crawler_settings,
                    logger,
                    filter_name="{}_title".format(provider),
                    force=True)
                if matchers:
                    adj_title = matchers[0][0].format_to_compare_title(
                        archive.zipped.name)
                else:
                    adj_title = get_title_from_path(archive.zipped.name)
            else:
                adj_title = get_title_from_path(archive.zipped.name)
            similar_list_provider = get_list_closer_gallery_titles_from_list(
                adj_title, galleries_title_id, cutoff, max_matches)

            if similar_list_provider is not None:

                for similar in similar_list_provider:
                    # similar[1] is the gallery pk stored in the title list;
                    # pass it as the FK id instead of fetching the gallery
                    # with one extra query per match.
                    ArchiveMatches.objects.update_or_create(
                        archive=archive,
                        gallery_id=similar[1],
                        match_type='title',
                        match_accuracy=similar[2])

                if logger:
                    logger.info(
                        "{} of {}: Found {} matches (internal search) from title for archive: {}, using provider filter: {}"
                        .format(i, archives.count(),
                                len(similar_list_provider), archive.title,
                                provider))

        # Skip archives without a positive filesize; a None value would
        # otherwise raise TypeError on the comparison.
        if not match_by_filesize or not archive.filesize or archive.filesize <= 0:
            continue

        # Evaluate the queryset once: the list serves the emptiness check,
        # the count in the log line and the iteration.
        galleries_same_size = list(Gallery.objects.filter(filesize=archive.filesize))
        if not galleries_same_size:
            continue

        if logger:
            logger.info(
                "{} of {}: Found {} matches (internal search) from filesize for archive: {}"
                .format(i, archives.count(),
                        len(galleries_same_size), archive.title))
        for similar_gallery in galleries_same_size:
            # The gallery instance is already loaded; no need to re-fetch it by pk.
            ArchiveMatches.objects.update_or_create(archive=archive,
                                                    gallery=similar_gallery,
                                                    match_type='size',
                                                    match_accuracy=1)
Пример #4
0
def match_archives_from_gallery_titles(archives: ArchiveQuerySet,
                                       logger: OptionalLogger = None,
                                       cutoff: float = 0.4,
                                       max_matches: int = 20,
                                       provider: str = '') -> None:
    """Rebuild possible gallery matches for archives from gallery titles and file sizes.

    For every archive, a comparable title is derived from its file name and
    compared against the titles of eligible galleries. When similar titles
    are found, the archive's existing possible matches are cleared and one
    ``'title'`` ``ArchiveMatches`` row is created per similar gallery.
    Galleries with the same ``filesize`` as the archive are then added as
    ``'size'`` rows with accuracy 1.

    Args:
        archives: Archives to process; when empty, every archive with
            ``match_type='non-match'`` is processed instead.
        logger: Optional logger for progress messages.
        cutoff: Forwarded to ``get_list_closer_gallery_titles_from_list``.
        max_matches: Forwarded to ``get_list_closer_gallery_titles_from_list``.
        provider: When non-empty, only galleries whose provider contains
            this string are considered; it also selects the title matcher.

    Returns:
        None. Any exception is logged to the ``viewer.threads`` logger and
        swallowed.
    """

    try:
        if not archives:
            non_match_archives = Archive.objects.filter(match_type='non-match')
        else:
            non_match_archives = archives

        if non_match_archives:

            galleries_title_id = []

            if provider:
                galleries = Gallery.objects.eligible_for_use(
                    provider__contains=provider)
            else:
                galleries = Gallery.objects.eligible_for_use()
            # Both titles of a gallery point at the same pk.
            for gallery in galleries:
                if gallery.title:
                    galleries_title_id.append(
                        (replace_illegal_name(gallery.title), gallery.pk))
                if gallery.title_jpn:
                    galleries_title_id.append(
                        (replace_illegal_name(gallery.title_jpn), gallery.pk))

            if logger:
                logger.info("Trying to match against gallery database, "
                            "{} archives with no match, matching against: {}, "
                            "number of galleries: {}, cutoff: {}".format(
                                non_match_archives.count(), provider,
                                galleries.count(), cutoff))
            for i, archive in enumerate(non_match_archives, start=1):

                matchers = crawler_settings.provider_context.get_matchers(
                    crawler_settings,
                    logger,
                    filter_name="{}_title".format(provider),
                    force=True)

                # Prefer the provider's own title matcher to normalise the
                # file name; fall back to the generic path-based title.
                if matchers:
                    adj_title = matchers[0][0].format_to_compare_title(
                        archive.zipped.name)
                else:
                    adj_title = get_title_from_path(archive.zipped.name)
                similar_list = get_list_closer_gallery_titles_from_list(
                    adj_title, galleries_title_id, cutoff, max_matches)

                if similar_list is not None:

                    # Old candidates are dropped only when new title matches
                    # exist to replace them.
                    archive.possible_matches.clear()

                    if logger:
                        logger.info(
                            "{} of {}: Found {} matches from title for {}".
                            format(i, non_match_archives.count(),
                                   len(similar_list), archive.zipped.name))
                    for similar in similar_list:
                        # similar[1] is the gallery pk stored in the title
                        # list; pass it as the FK id instead of fetching the
                        # gallery with one extra query per match.
                        ArchiveMatches.objects.create(
                            archive=archive,
                            gallery_id=similar[1],
                            match_type='title',
                            match_accuracy=similar[2])

                # Skip archives without a positive filesize; a None value
                # would otherwise raise TypeError on the comparison.
                if not archive.filesize or archive.filesize <= 0:
                    continue

                # Evaluate the queryset once: the list serves the emptiness
                # check, the count in the log line and the iteration.
                galleries_same_size = list(
                    Gallery.objects.filter(filesize=archive.filesize))
                if not galleries_same_size:
                    continue

                if logger:
                    logger.info(
                        "{} of {}: Found {} matches from filesize for {}".
                        format(i, non_match_archives.count(),
                               len(galleries_same_size),
                               archive.zipped.name))
                for similar_gallery in galleries_same_size:
                    # The gallery instance is already loaded; no re-fetch by pk.
                    ArchiveMatches.objects.create(archive=archive,
                                                  gallery=similar_gallery,
                                                  match_type='size',
                                                  match_accuracy=1)

        if logger:
            logger.info("Matching ended")
        return
    # NOTE(review): BaseException also traps KeyboardInterrupt/SystemExit; kept
    # as-is so callers keep seeing no exception from this function.
    except BaseException:
        thread_logger = logging.getLogger('viewer.threads')
        thread_logger.error(traceback.format_exc())