def strip_urls(urls, lang, strip_query_variables=False):
    """Group URLs by the form they take once language markers are stripped.

    Each URL is run through a cascade of ``LanguageStripper`` attempts, from
    strictest to most permissive, and the first attempt that reports success
    wins. URLs for which no attempt succeeds are left out of the result.

    Args:
        urls: iterable of URL strings.
        lang: language code handed to the language-specific strippers and
            used as ``expected_language`` for the first attempt.
        strip_query_variables: accepted for interface compatibility; the
            body never reads it.

    Returns:
        A ``defaultdict(set)`` mapping each stripped URL to the set of
        original URLs that reduce to it.
    """
    strict = LanguageStripper(languages=[lang], strip_query_variables=False)
    with_query = LanguageStripper(languages=[lang], strip_query_variables=True)
    any_language = LanguageStripper(strip_query_variables=True)

    # Tried in order after the initial expected-language attempt. Per the
    # original author's notes these remove, respectively:
    #   strict        -> '/fr-FR/' and 'lang=FR'
    #   with_query    -> 'clang=1'
    #   any_language  -> '/en-en/fr-fr'
    fallbacks = (strict, with_query, any_language)

    stripped = defaultdict(set)
    for url in urls:
        candidate, ok = strict.strip_uri(url, expected_language=lang)
        for stripper in fallbacks:
            if ok:
                break
            candidate, ok = stripper.strip_uri(url)
        if not ok:
            continue
        # A successful strip is expected to have changed the URL.
        assert candidate != url
        stripped[candidate].add(url)
    return stripped
def strip_urls(urls, lang, strip_query_variables=False):
    """Map language-stripped URLs to the original URLs that produce them.

    For every URL, progressively more permissive stripping strategies are
    tried until one reports success; URLs that no strategy can strip are
    omitted from the result.

    Args:
        urls: iterable of URL strings.
        lang: language code used to configure the language-aware strippers
            and passed as ``expected_language`` on the first attempt.
        strip_query_variables: not referenced by the body; kept so existing
            callers passing it continue to work.

    Returns:
        A ``defaultdict(set)`` keyed by stripped URL, each value being the
        set of input URLs that were stripped to that key.
    """
    lang_only = LanguageStripper(
        languages=[lang], strip_query_variables=False)
    lang_and_query = LanguageStripper(
        languages=[lang], strip_query_variables=True)
    generic = LanguageStripper(
        strip_query_variables=True)

    def _strip(url):
        # Ordered from strictest to loosest; evaluation stops at the first
        # strategy that reports success.
        strategies = (
            lambda: lang_only.strip_uri(url, expected_language=lang),
            # removes '/fr-FR/' and 'lang=FR'
            lambda: lang_only.strip_uri(url),
            # removes 'clang=1'
            lambda: lang_and_query.strip_uri(url),
            # removes '/en-en/fr-fr'
            lambda: generic.strip_uri(url),
        )
        for strategy in strategies:
            new_url, ok = strategy()
            if ok:
                break
        return new_url, ok

    groups = defaultdict(set)
    for url in urls:
        new_url, ok = _strip(url)
        if ok:
            # Success is expected to imply the URL actually changed.
            assert new_url != url
            groups[new_url].add(url)
    return groups
def runTest(self):
    # Verifies that LanguageStripper.strip_uri removes language markers from
    # URL paths and reports success for every case listed below. Each entry
    # is (input URI, expected stripped URI).
    french_cases = [
        ("http://bla.com/fr/x", "http://bla.com/x"),
        ("http://bla.com/FR/x", "http://bla.com/x"),
        ("http://bla.com/fr-FR/x", "http://bla.com/x"),
        # several language markers are removed in one pass
        ("http://bla.com/fr-FR/fr", "http://bla.com"),
        # only the language identifier is removed from a path part
        ("http://bla.com/lang-FR/x", "http://bla.com/lang/x"),
        # a trailing '/' (or its absence) is preserved
        ("http://bla.com/x/fr", "http://bla.com/x"),
        ("http://bla.com/x/fr/", "http://bla.com/x/"),
        # markers in the middle of a path part are removed
        ("http://bla.com/lang-FR-foo/x", "http://bla.com/lang-foo/x"),
        ("http://bla.com/lang-fr-FR-foo/x", "http://bla.com/lang-foo/x"),
    ]
    french_only = LanguageStripper(languages=["fr"])
    for uri, expected in french_cases:
        self.assertEqual(french_only.strip_uri(uri), (expected, True))

    # A stripper configured for several languages removes any of them.
    mixed_cases = [
        ("http://bla.com/x/fr/", "http://bla.com/x/"),
        ("http://bla.com/x/en/", "http://bla.com/x/"),
        ("http://bla.com/x/en/fr/", "http://bla.com/x/"),
    ]
    french_and_english = LanguageStripper(languages=["fr", "en"])
    for uri, expected in mixed_cases:
        self.assertEqual(french_and_english.strip_uri(uri), (expected, True))
def runTest(self):
    # Verifies query-string handling of LanguageStripper.strip_uri, first
    # with the default configuration and then with
    # strip_query_variables=True. Each entry is
    # (input URI, expected (stripped URI, success) result).

    # Default: a language-valued 'lang' is removed, a numeric one is kept
    # and the call reports failure.
    default_cases = [
        ("http://bla.com?lang=fr", ("http://bla.com", True)),
        ("http://bla.com?lang=1", ("http://bla.com?lang=1", False)),
    ]
    default_stripper = LanguageStripper(languages=["fr"])
    for uri, expected in default_cases:
        self.assertEqual(default_stripper.strip_uri(uri), expected)

    # With strip_query_variables=True the numeric 'lang' and 'clang'
    # variables are removed as well.
    query_cases = [
        ("http://bla.com?lang=fr", ("http://bla.com", True)),
        ("http://bla.com?lang=1", ("http://bla.com", True)),
        ("http://bla.com/?clang=1", ("http://bla.com/", True)),
    ]
    query_stripper = LanguageStripper(languages=["fr"],
                                      strip_query_variables=True)
    for uri, expected in query_cases:
        self.assertEqual(query_stripper.strip_uri(uri), expected)
def runTest(self):
    # Exercises LanguageStripper.strip_uri on query strings, with and
    # without strip_query_variables enabled.

    def check(stripper, uri, stripped_uri, succeeded):
        # Compare the full (uri, success) result in a single assertion.
        self.assertEqual(stripper.strip_uri(uri), (stripped_uri, succeeded))

    # Default configuration: 'lang=fr' is removed; 'lang=1' is left
    # untouched and the strip is reported as unsuccessful.
    plain = LanguageStripper(languages=['fr'])
    check(plain, 'http://bla.com?lang=fr', 'http://bla.com', True)
    check(plain, 'http://bla.com?lang=1', 'http://bla.com?lang=1', False)

    # strip_query_variables=True: numeric 'lang' and 'clang' variables are
    # removed too.
    aggressive = LanguageStripper(languages=['fr'],
                                  strip_query_variables=True)
    check(aggressive, 'http://bla.com?lang=fr', 'http://bla.com', True)
    check(aggressive, 'http://bla.com?lang=1', 'http://bla.com', True)
    check(aggressive, 'http://bla.com/?clang=1', 'http://bla.com/', True)
def strip_urls(urls, lang=None):
    """Group URLs by their language-stripped form.

    Args:
        urls: iterable of URL strings.
        lang: optional language code. When given, only strips reported as
            successful for that expected language are recorded. When
            ``None``, a language-agnostic stripper with
            ``strip_query_variables=True`` is used and every result that
            differs from the input is recorded.

    Returns:
        A ``defaultdict(set)`` mapping each stripped URL to the set of
        original URLs that reduce to it.
    """
    groups = defaultdict(set)

    if lang is None:
        stripper = LanguageStripper(strip_query_variables=True)
        for url in urls:
            # The success flag is ignored in this mode; a result is kept
            # whenever it differs from the input.
            plain, _ = stripper.strip_uri(url)
            if plain != url:
                groups[plain].add(url)
            # Second pass with remove_index=True; presumably this also drops
            # index-page names -- confirm against LanguageStripper.strip_uri.
            without_index, _ = stripper.strip_uri(url, remove_index=True)
            if without_index != url:
                groups[without_index].add(url)
        return groups

    stripper = LanguageStripper(languages=[lang])
    for url in urls:
        new_url, ok = stripper.strip_uri(url, expected_language=lang)
        if ok:
            groups[new_url].add(url)
    return groups
if candidates and uri in candidates: print_match(uri, uri, crawl, candidates) continue if not args.agressive: parsed_uri = urlparse.urlparse(uri) matched_languages = [language_stripper.match(parsed_uri.path), language_stripper.match(parsed_uri.query)] if args.lang not in matched_languages: # we removed a bit of the URL but is does not support our # hope to find args.lang, e.g. removed /fr/ when we were # looking for Italian pages. continue stripped_uri, success = language_stripper.strip_uri( uri, args.removeindex) print stripped_uri, success print k, v, tld, uri, crawl sys.exit() if candidates: if stripped_uri in candidates: print_match(stripped_uri, uri, crawl, candidates) continue else: if not success or stripped_uri == uri: continue try: sys.stdout.write("\t".join([stripped_uri, args.lang, line])) # line still has the newline
def runTest(self):
    # Exercises LanguageStripper.strip_uri on URL paths; every case below
    # is expected to report success.

    def expect_stripped(stripper, uri, stripped_uri):
        self.assertEqual(stripper.strip_uri(uri), (stripped_uri, True))

    french = LanguageStripper(languages=['fr'])
    expect_stripped(french, 'http://bla.com/fr/x', 'http://bla.com/x')
    expect_stripped(french, 'http://bla.com/FR/x', 'http://bla.com/x')
    expect_stripped(french, 'http://bla.com/fr-FR/x', 'http://bla.com/x')

    # several language markers are removed in one pass
    expect_stripped(french, 'http://bla.com/fr-FR/fr', 'http://bla.com')

    # only the language identifier is removed from a path part
    expect_stripped(french, 'http://bla.com/lang-FR/x',
                    'http://bla.com/lang/x')

    # the presence or absence of a trailing '/' is preserved
    expect_stripped(french, 'http://bla.com/x/fr', 'http://bla.com/x')
    expect_stripped(french, 'http://bla.com/x/fr/', 'http://bla.com/x/')

    # markers in the middle of a path part are removed
    expect_stripped(french, 'http://bla.com/lang-FR-foo/x',
                    'http://bla.com/lang-foo/x')
    expect_stripped(french, 'http://bla.com/lang-fr-FR-foo/x',
                    'http://bla.com/lang-foo/x')

    # a stripper configured for several languages removes any of them
    bilingual = LanguageStripper(languages=['fr', 'en'])
    expect_stripped(bilingual, 'http://bla.com/x/fr/', 'http://bla.com/x/')
    expect_stripped(bilingual, 'http://bla.com/x/en/', 'http://bla.com/x/')
    expect_stripped(bilingual, 'http://bla.com/x/en/fr/', 'http://bla.com/x/')