def strip_urls(urls, lang, strip_query_variables=False):
    """Group URLs by the form they take once language markers are stripped.

    Every URL is passed through a fixed chain of stripping attempts; the
    result of the first attempt that reports success is recorded. URLs for
    which no attempt succeeds do not appear in the result.

    Args:
        urls: iterable of URLs to process.
        lang: language code used to configure the language-specific
            strippers and passed as ``expected_language`` on the first try.
        strip_query_variables: accepted for interface compatibility; this
            function does not consult it.

    Returns:
        A ``defaultdict(set)`` mapping each stripped URL to the set of
        original URLs that reduced to it.
    """
    lang_stripper = LanguageStripper(
        languages=[lang], strip_query_variables=False)
    lang_query_stripper = LanguageStripper(
        languages=[lang], strip_query_variables=True)
    generic_stripper = LanguageStripper(strip_query_variables=True)

    # Tried in order; the first attempt that succeeds wins.
    attempts = (
        lambda u: lang_stripper.strip_uri(u, expected_language=lang),
        # removes '/fr-FR/' and 'lang=FR'
        lambda u: lang_stripper.strip_uri(u),
        # removes 'clang=1'
        lambda u: lang_query_stripper.strip_uri(u),
        # removes '/en-en/fr-fr'
        lambda u: generic_stripper.strip_uri(u),
    )

    groups = defaultdict(set)
    for original in urls:
        for attempt in attempts:
            candidate, ok = attempt(original)
            if ok:
                # A successful strip is expected to have changed the URL.
                assert candidate != original
                groups[candidate].add(original)
                break
    return groups
def strip_urls(urls, lang, strip_query_variables=False):
    """Map stripped URLs to the original URLs they were derived from.

    For each URL a sequence of increasingly permissive strippers is tried
    until one reports success. URLs that no stripper can handle are omitted.

    Args:
        urls: iterable of URLs.
        lang: language code given to the language-specific strippers; also
            passed as ``expected_language`` for the first attempt.
        strip_query_variables: unused by this function; kept so existing
            callers keep working.

    Returns:
        ``defaultdict(set)``: stripped URL -> set of source URLs.
    """
    base = LanguageStripper(
        languages=[lang], strip_query_variables=False)
    with_query = LanguageStripper(
        languages=[lang], strip_query_variables=True)
    no_lang = LanguageStripper(
        strip_query_variables=True)

    # (stripper, extra keyword arguments) pairs, in the order they are tried.
    fallbacks = [
        (base, {'expected_language': lang}),
        (base, {}),        # removes '/fr-FR/' and 'lang=FR'
        (with_query, {}),  # removes 'clang=1'
        (no_lang, {}),     # removes '/en-en/fr-fr'
    ]

    result = defaultdict(set)
    for url in urls:
        for stripper, extra in fallbacks:
            new_url, succeeded = stripper.strip_uri(url, **extra)
            if succeeded:
                break
        if not succeeded:
            continue
        # A successful strip is expected to have modified the URL.
        assert new_url != url
        result[new_url].add(url)
    return result
def runTest(self):
    # Query-variable handling of LanguageStripper.strip_uri, first with the
    # default settings and then with strip_query_variables=True.
    # Each case is (input url, expected (stripped url, success) tuple).
    default_cases = [
        ('http://bla.com?lang=fr', ('http://bla.com', True)),
        ('http://bla.com?lang=1', ('http://bla.com?lang=1', False)),
    ]
    query_cases = [
        ('http://bla.com?lang=fr', ('http://bla.com', True)),
        ('http://bla.com?lang=1', ('http://bla.com', True)),
        ('http://bla.com/?clang=1', ('http://bla.com/', True)),
    ]

    stripper = LanguageStripper(languages=['fr'])
    for url, expected in default_cases:
        self.assertEqual(stripper.strip_uri(url), expected)

    stripper = LanguageStripper(
        languages=['fr'], strip_query_variables=True)
    for url, expected in query_cases:
        self.assertEqual(stripper.strip_uri(url), expected)
def runTest(self):
    # Exercises strip_uri on URLs carrying language information in the
    # query string, with and without strip_query_variables enabled.
    def check(stripper, url, expected_url, expected_success):
        self.assertEqual(stripper.strip_uri(url), (expected_url, expected_success))

    # Default configuration: 'lang=fr' is removed, 'lang=1' is left alone.
    plain = LanguageStripper(languages=["fr"])
    check(plain, "http://bla.com?lang=fr", "http://bla.com", True)
    check(plain, "http://bla.com?lang=1", "http://bla.com?lang=1", False)

    # With strip_query_variables=True the numeric variants are removed too.
    aggressive = LanguageStripper(languages=["fr"], strip_query_variables=True)
    check(aggressive, "http://bla.com?lang=fr", "http://bla.com", True)
    check(aggressive, "http://bla.com?lang=1", "http://bla.com", True)
    check(aggressive, "http://bla.com/?clang=1", "http://bla.com/", True)
def strip_urls(urls, lang=None):
    """Group URLs by their stripped form.

    Args:
        urls: iterable of URLs.
        lang: optional language code. When given, a stripper configured for
            that language is used and only strips reported as successful are
            recorded. When ``None``, a stripper without a language list and
            with ``strip_query_variables=True`` is used, and each URL is
            stripped twice (normally and with ``remove_index=True``); a
            result is recorded whenever it differs from the input URL,
            regardless of the reported success flag.

    Returns:
        ``defaultdict(set)`` mapping stripped URL -> set of original URLs.
    """
    groups = defaultdict(set)

    if lang is None:
        generic = LanguageStripper(strip_query_variables=True)
        for url in urls:
            # First the plain strip, then the variant that also drops the
            # index part of the URL.
            for options in ({}, {'remove_index': True}):
                candidate, _ = generic.strip_uri(url, **options)
                if candidate != url:
                    groups[candidate].add(url)
        return groups

    stripper = LanguageStripper(languages=[lang])
    for url in urls:
        candidate, ok = stripper.strip_uri(url, expected_language=lang)
        if ok:
            groups[candidate].add(url)
    return groups
source_url, target_url, source_page, target_page): print "\t".join([stripped_url, source_url, target_url, source_page, target_page]) if __name__ == "__main__": import argparse parser = argparse.ArgumentParser() parser.add_argument('-lang', help='language codes') parser.add_argument('-candidates', help='candidates from first pass', type=argparse.FileType('r')) args = parser.parse_args(sys.argv[1:]) language_stripper = LanguageStripper(languages=[args.lang]) candidates = {} if args.candidates: candidates = read_candidates(args.candidates) for line in sys.stdin: line = line.decode('utf-8').split('\t') if not len(line) == 3: # broken line continue page_url, href, link_text = line if not href.lower().endswith('.pdf'): # broken continue try: joined_link = urlparse.urljoin(page_url, href)
help='candidates from url strippper', type=argparse.FileType('r')) parser.add_argument('-nostrip', help='accept only exact matches', action='store_true') parser.add_argument('-agressive', help='remove all locale info', action='store_true') parser.add_argument('-removeindex', help='remove /index.html at end of url', action='store_true') args = parser.parse_args(sys.argv[1:]) candidates = {} if args.candidates: candidates = read_candidates(args.candidates) language_stripper = LanguageStripper(languages=[args.lang]) if args.agressive: language_stripper = LanguageStripper(strip_query_variables=True) for line in sys.stdin: split_line = line.rstrip().split('\t') k, v = split_line[-2:] tld, uri, crawl = k.split(' ') if len(split_line) == 2 and args.nostrip \ and candidates and uri not in candidates: # We're matching candidates against a KV list without stripping, # i.e. target candidate is stripped and source candidate isn't # This allows for a cheap reject. continue
def runTest(self):
    # Path-based language stripping. Each case is
    # (input url, expected (stripped url, success) tuple).
    french_cases = [
        ('http://bla.com/fr/x', ('http://bla.com/x', True)),
        ('http://bla.com/FR/x', ('http://bla.com/x', True)),
        ('http://bla.com/fr-FR/x', ('http://bla.com/x', True)),
        # remove multiple
        ('http://bla.com/fr-FR/fr', ('http://bla.com', True)),
        # removing only the language identifier
        ('http://bla.com/lang-FR/x', ('http://bla.com/lang/x', True)),
        # keep / at the end consistent
        ('http://bla.com/x/fr', ('http://bla.com/x', True)),
        ('http://bla.com/x/fr/', ('http://bla.com/x/', True)),
        # removing bits from the middle of a path part
        ('http://bla.com/lang-FR-foo/x', ('http://bla.com/lang-foo/x', True)),
        ('http://bla.com/lang-fr-FR-foo/x',
         ('http://bla.com/lang-foo/x', True)),
    ]
    # mixed languages
    mixed_cases = [
        ('http://bla.com/x/fr/', ('http://bla.com/x/', True)),
        ('http://bla.com/x/en/', ('http://bla.com/x/', True)),
        ('http://bla.com/x/en/fr/', ('http://bla.com/x/', True)),
    ]

    stripper = LanguageStripper(languages=['fr'])
    for url, expected in french_cases:
        self.assertEqual(stripper.strip_uri(url), expected)

    stripper = LanguageStripper(languages=['fr', 'en'])
    for url, expected in mixed_cases:
        self.assertEqual(stripper.strip_uri(url), expected)
def runTest(self):
    # Verifies that strip_uri removes language markers from URL paths.
    def expect(stripper, url, stripped_url):
        # Every case in this test expects a successful strip.
        self.assertEqual(stripper.strip_uri(url), (stripped_url, True))

    french = LanguageStripper(languages=["fr"])
    expect(french, "http://bla.com/fr/x", "http://bla.com/x")
    expect(french, "http://bla.com/FR/x", "http://bla.com/x")
    expect(french, "http://bla.com/fr-FR/x", "http://bla.com/x")

    # remove multiple
    expect(french, "http://bla.com/fr-FR/fr", "http://bla.com")

    # removing only the language identifier
    expect(french, "http://bla.com/lang-FR/x", "http://bla.com/lang/x")

    # keep / at the end consistent
    expect(french, "http://bla.com/x/fr", "http://bla.com/x")
    expect(french, "http://bla.com/x/fr/", "http://bla.com/x/")

    # removing bits from the middle of a path part
    expect(french, "http://bla.com/lang-FR-foo/x", "http://bla.com/lang-foo/x")
    expect(french, "http://bla.com/lang-fr-FR-foo/x", "http://bla.com/lang-foo/x")

    # mixed languages
    french_english = LanguageStripper(languages=["fr", "en"])
    expect(french_english, "http://bla.com/x/fr/", "http://bla.com/x/")
    expect(french_english, "http://bla.com/x/en/", "http://bla.com/x/")
    expect(french_english, "http://bla.com/x/en/fr/", "http://bla.com/x/")