예제 #1
0
def strip_urls(urls, lang, strip_query_variables=False):
    """Group URLs by the form they take once language markers are stripped.

    Every URL is passed through a sequence of increasingly permissive
    stripping attempts; the first attempt that reports success wins and
    the remaining ones are not tried.  URLs for which no attempt succeeds
    are left out of the result.

    Args:
        urls: iterable of URLs to process.
        lang: language code handed to the language-specific strippers and
            used as ``expected_language`` for the first attempt.
        strip_query_variables: accepted for interface compatibility; this
            function does not read it.

    Returns:
        A ``defaultdict(set)`` mapping each stripped URL to the set of
        original URLs that reduced to it.
    """
    path_stripper = LanguageStripper(languages=[lang],
                                     strip_query_variables=False)
    query_stripper = LanguageStripper(languages=[lang],
                                      strip_query_variables=True)
    any_language_stripper = LanguageStripper(strip_query_variables=True)

    # Ordered from most to least specific.  The examples are the ones given
    # by the original author for each fallback step.
    attempts = (
        (path_stripper, {'expected_language': lang}),
        (path_stripper, {}),           # removes '/fr-FR/' and 'lang=FR'
        (query_stripper, {}),          # removes 'clang=1'
        (any_language_stripper, {}),   # removes '/en-en/fr-fr'
    )

    stripped = defaultdict(set)
    for url in urls:
        for stripper, extra_args in attempts:
            stripped_url, success = stripper.strip_uri(url, **extra_args)
            if success:
                break
        else:
            # Nothing could be removed from this URL.
            continue

        # A successful strip is expected to have changed the URL.
        assert stripped_url != url
        stripped[stripped_url].add(url)

    return stripped
def strip_urls(urls, lang, strip_query_variables=False):
    """Map language-stripped URLs to the original URLs they came from.

    For each URL a chain of stripping strategies is tried in order, and
    the first one that reports success is used.  URLs that no strategy
    can strip are omitted from the result.

    Args:
        urls: iterable of URLs to process.
        lang: language code used to configure the language-specific
            strippers and passed as ``expected_language`` on the first try.
        strip_query_variables: kept in the signature for callers; it is
            not consulted inside this function.

    Returns:
        A ``defaultdict(set)`` keyed by stripped URL, whose values are the
        sets of original URLs that produced that key.
    """
    by_language = LanguageStripper(
        languages=[lang], strip_query_variables=False)
    by_language_and_query = LanguageStripper(
        languages=[lang], strip_query_variables=True)
    without_language = LanguageStripper(
        strip_query_variables=True)

    def strip_with_fallbacks(url):
        """Return (stripped_url, success) from the first strategy that works."""
        candidate, ok = by_language.strip_uri(url, expected_language=lang)
        if ok:
            return candidate, ok

        # removes '/fr-FR/' and 'lang=FR'
        candidate, ok = by_language.strip_uri(url)
        if ok:
            return candidate, ok

        # removes 'clang=1'
        candidate, ok = by_language_and_query.strip_uri(url)
        if ok:
            return candidate, ok

        # removes '/en-en/fr-fr'
        return without_language.strip_uri(url)

    stripped = defaultdict(set)
    for url in urls:
        stripped_url, success = strip_with_fallbacks(url)
        if not success:
            continue
        # A successful strip is expected to have changed the URL.
        assert stripped_url != url
        stripped[stripped_url].add(url)

    return stripped
    def runTest(self):
        """Check that language markers are removed from URL paths."""
        french_only = LanguageStripper(languages=["fr"])
        french_cases = [
            # language directory, in any case and with a region suffix
            ("http://bla.com/fr/x", "http://bla.com/x"),
            ("http://bla.com/FR/x", "http://bla.com/x"),
            ("http://bla.com/fr-FR/x", "http://bla.com/x"),
            # several markers in one path are all removed
            ("http://bla.com/fr-FR/fr", "http://bla.com"),
            # only the language identifier goes, the rest of the part stays
            ("http://bla.com/lang-FR/x", "http://bla.com/lang/x"),
            # a trailing slash is kept if and only if it was there before
            ("http://bla.com/x/fr", "http://bla.com/x"),
            ("http://bla.com/x/fr/", "http://bla.com/x/"),
            # markers in the middle of a path part
            ("http://bla.com/lang-FR-foo/x", "http://bla.com/lang-foo/x"),
            ("http://bla.com/lang-fr-FR-foo/x", "http://bla.com/lang-foo/x"),
        ]
        for uri, expected_uri in french_cases:
            self.assertEqual(french_only.strip_uri(uri), (expected_uri, True))

        # A stripper configured with several languages removes each of them.
        french_and_english = LanguageStripper(languages=["fr", "en"])
        mixed_cases = [
            ("http://bla.com/x/fr/", "http://bla.com/x/"),
            ("http://bla.com/x/en/", "http://bla.com/x/"),
            ("http://bla.com/x/en/fr/", "http://bla.com/x/"),
        ]
        for uri, expected_uri in mixed_cases:
            self.assertEqual(
                french_and_english.strip_uri(uri), (expected_uri, True)
            )
    def runTest(self):
        """Check query-string stripping with and without strip_query_variables."""
        # Default configuration: only a query value naming the language is
        # removed; 'lang=1' is expected to be left alone and reported as a
        # failure.
        default_stripper = LanguageStripper(languages=["fr"])
        default_cases = [
            ("http://bla.com?lang=fr", ("http://bla.com", True)),
            ("http://bla.com?lang=1", ("http://bla.com?lang=1", False)),
        ]
        for uri, expected in default_cases:
            self.assertEqual(default_stripper.strip_uri(uri), expected)

        # With strip_query_variables=True the whole variable is expected to
        # be dropped regardless of its value.
        variable_stripper = LanguageStripper(languages=["fr"], strip_query_variables=True)
        variable_cases = [
            ("http://bla.com?lang=fr", ("http://bla.com", True)),
            ("http://bla.com?lang=1", ("http://bla.com", True)),
            ("http://bla.com/?clang=1", ("http://bla.com/", True)),
        ]
        for uri, expected in variable_cases:
            self.assertEqual(variable_stripper.strip_uri(uri), expected)
예제 #5
0
    def runTest(self):
        """Verify how language-related query variables are stripped."""

        def check(stripper, uri, expected_uri, expected_success):
            # Compare the full (stripped_uri, success) pair in one assertion.
            self.assertEqual(
                stripper.strip_uri(uri), (expected_uri, expected_success))

        # Without strip_query_variables only a value that names the
        # language is removed; 'lang=1' must come back unchanged.
        stripper = LanguageStripper(languages=['fr'])
        check(stripper, 'http://bla.com?lang=fr', 'http://bla.com', True)
        check(stripper, 'http://bla.com?lang=1',
              'http://bla.com?lang=1', False)

        # With strip_query_variables=True the variable is expected to be
        # removed whatever its value is.
        stripper = LanguageStripper(
            languages=['fr'], strip_query_variables=True)
        check(stripper, 'http://bla.com?lang=fr', 'http://bla.com', True)
        check(stripper, 'http://bla.com?lang=1', 'http://bla.com', True)
        check(stripper, 'http://bla.com/?clang=1', 'http://bla.com/', True)
예제 #6
0
def strip_urls(urls, lang=None):
    """Group URLs by their language-stripped form.

    Args:
        urls: iterable of URLs to process.
        lang: optional language code.  When given, only that language is
            stripped and a URL is recorded only if the stripper reports
            success.  When ``None``, a stripper with
            ``strip_query_variables=True`` is run twice per URL (once
            normally, once with ``remove_index=True``) and every result
            that differs from the input URL is recorded.

    Returns:
        A ``defaultdict(set)`` mapping each stripped URL to the set of
        original URLs that produced it.
    """
    stripped = defaultdict(set)

    if lang is None:
        generic_stripper = LanguageStripper(strip_query_variables=True)
        # Plain strip first, then the variant that also removes the index.
        variants = ({}, {'remove_index': True})
        for url in urls:
            for options in variants:
                candidate, _ = generic_stripper.strip_uri(url, **options)
                # The success flag is ignored here; any change counts.
                if candidate != url:
                    stripped[candidate].add(url)
        return stripped

    targeted_stripper = LanguageStripper(languages=[lang])
    for url in urls:
        candidate, ok = targeted_stripper.strip_uri(
            url, expected_language=lang)
        if ok:
            stripped[candidate].add(url)
    return stripped
        if candidates and uri in candidates:
            print_match(uri, uri, crawl, candidates)
            continue

        if not args.agressive:
            parsed_uri = urlparse.urlparse(uri)
            matched_languages = [language_stripper.match(parsed_uri.path),
                                 language_stripper.match(parsed_uri.query)]

            if args.lang not in matched_languages:
                # we removed a bit of the URL but it does not support our
                # hope to find args.lang, e.g. removed /fr/ when we were
                # looking for Italian pages.
                continue

        stripped_uri, success = language_stripper.strip_uri(
            uri, args.removeindex)

        print stripped_uri, success
        print k, v, tld, uri, crawl
        sys.exit()

        if candidates:
            if stripped_uri in candidates:
                print_match(stripped_uri, uri, crawl, candidates)
                continue
        else:
            if not success or stripped_uri == uri:
                continue
            try:
                sys.stdout.write("\t".join([stripped_uri, args.lang, line]))
                # line still has the newline
예제 #8
0
    def runTest(self):
        """Verify that language identifiers are stripped from URL paths."""

        def assert_stripped(stripper, cases):
            # Every case is expected to strip successfully.
            for uri, expected_uri in cases:
                self.assertEqual(
                    stripper.strip_uri(uri), (expected_uri, True))

        assert_stripped(LanguageStripper(languages=['fr']), [
            # language directory, case-insensitive, with region suffix
            ('http://bla.com/fr/x', 'http://bla.com/x'),
            ('http://bla.com/FR/x', 'http://bla.com/x'),
            ('http://bla.com/fr-FR/x', 'http://bla.com/x'),
            # multiple identifiers are all removed
            ('http://bla.com/fr-FR/fr', 'http://bla.com'),
            # only the identifier is removed from a path part
            ('http://bla.com/lang-FR/x', 'http://bla.com/lang/x'),
            # the trailing slash stays consistent with the input
            ('http://bla.com/x/fr', 'http://bla.com/x'),
            ('http://bla.com/x/fr/', 'http://bla.com/x/'),
            # identifiers in the middle of a path part
            ('http://bla.com/lang-FR-foo/x', 'http://bla.com/lang-foo/x'),
            ('http://bla.com/lang-fr-FR-foo/x', 'http://bla.com/lang-foo/x'),
        ])

        # Several configured languages are each stripped.
        assert_stripped(LanguageStripper(languages=['fr', 'en']), [
            ('http://bla.com/x/fr/', 'http://bla.com/x/'),
            ('http://bla.com/x/en/', 'http://bla.com/x/'),
            ('http://bla.com/x/en/fr/', 'http://bla.com/x/'),
        ])