Пример #1
0
def strip_urls(urls, lang, strip_query_variables=False):
    """Group URLs by the form they take after language stripping.

    Every URL is passed through up to four ``strip_uri`` attempts, in a
    fixed order, stopping at the first one that reports success:

    1. stripper built for ``lang`` (query-variable stripping off), called
       with ``expected_language=lang``;
    2. the same stripper, called without ``expected_language``;
    3. stripper built for ``lang`` with ``strip_query_variables=True``;
    4. stripper built without a ``languages`` argument and with
       ``strip_query_variables=True``.

    Args:
        urls: iterable of URLs to process.
        lang: language code handed to ``LanguageStripper``.
        strip_query_variables: accepted for interface compatibility; this
            function never reads it.

    Returns:
        A ``defaultdict(set)`` mapping each stripped URL to the set of
        original URLs that reduced to it. URLs for which no attempt
        succeeded do not appear.
    """
    by_lang = LanguageStripper(languages=[lang],
                               strip_query_variables=False)
    by_lang_query = LanguageStripper(languages=[lang],
                                     strip_query_variables=True)
    without_lang = LanguageStripper(strip_query_variables=True)

    # Ordered fallbacks; the trailing notes are the original author's
    # examples of what each later step is meant to remove.
    attempts = (
        lambda u: by_lang.strip_uri(u, expected_language=lang),
        lambda u: by_lang.strip_uri(u),        # '/fr-FR/' and 'lang=FR'
        lambda u: by_lang_query.strip_uri(u),  # 'clang=1'
        lambda u: without_lang.strip_uri(u),   # '/en-en/fr-fr'
    )

    groups = defaultdict(set)
    for url in urls:
        for attempt in attempts:
            candidate, ok = attempt(url)
            if ok:
                # A successful strip is expected to have changed the URL.
                assert candidate != url
                groups[candidate].add(url)
                break

    return groups
def strip_urls(urls, lang, strip_query_variables=False):
    """Bucket URLs under their language-stripped counterpart.

    For each URL a sequence of progressively different ``strip_uri`` calls
    is tried and the first successful result is used. URLs that no call
    manages to strip are dropped from the result.

    Args:
        urls: iterable of URLs.
        lang: language code used to configure the strippers and passed as
            ``expected_language`` on the first attempt.
        strip_query_variables: unused by this function; kept so existing
            callers keep working.

    Returns:
        ``defaultdict(set)``: stripped URL -> set of original URLs.
    """
    lang_only = LanguageStripper(
        languages=[lang], strip_query_variables=False)
    lang_and_query = LanguageStripper(
        languages=[lang], strip_query_variables=True)
    no_lang = LanguageStripper(
        strip_query_variables=True)

    def first_hit(address):
        # Return the (stripped, success) pair of the first attempt that
        # succeeds, or the pair of the last attempt if none does.
        reduced, ok = lang_only.strip_uri(address, expected_language=lang)
        if ok:
            return reduced, ok
        # original note: removes '/fr-FR/' and 'lang=FR'
        reduced, ok = lang_only.strip_uri(address)
        if ok:
            return reduced, ok
        # original note: removes 'clang=1'
        reduced, ok = lang_and_query.strip_uri(address)
        if ok:
            return reduced, ok
        # original note: removes '/en-en/fr-fr'
        reduced, ok = no_lang.strip_uri(address)
        return reduced, ok

    buckets = defaultdict(set)
    for url in urls:
        reduced_url, ok = first_hit(url)
        if not ok:
            continue
        # A successful strip is expected to differ from its input.
        assert reduced_url != url
        buckets[reduced_url].add(url)

    return buckets
Пример #3
0
    def runTest(self):
        # Checks strip_uri on query strings, first with a stripper built
        # only with languages=['fr'], then with strip_query_variables=True
        # added. Each scenario is (constructor kwargs, [(uri, expected)]).
        scenarios = [
            ({'languages': ['fr']}, [
                ('http://bla.com?lang=fr', ('http://bla.com', True)),
                # a non-language value is left untouched here
                ('http://bla.com?lang=1', ('http://bla.com?lang=1', False)),
            ]),
            ({'languages': ['fr'], 'strip_query_variables': True}, [
                ('http://bla.com?lang=fr', ('http://bla.com', True)),
                ('http://bla.com?lang=1', ('http://bla.com', True)),
                ('http://bla.com/?clang=1', ('http://bla.com/', True)),
            ]),
        ]

        for options, cases in scenarios:
            stripper = LanguageStripper(**options)
            for uri, expected in cases:
                self.assertEqual(stripper.strip_uri(uri), expected)
    def runTest(self):
        # Exercises strip_uri on query-string language markers, with and
        # without strip_query_variables enabled.
        def expect(stripper, uri, stripped_uri, changed):
            # strip_uri is compared against a (stripped_uri, changed) pair.
            self.assertEqual(stripper.strip_uri(uri), (stripped_uri, changed))

        lang_only = LanguageStripper(languages=["fr"])
        expect(lang_only, "http://bla.com?lang=fr", "http://bla.com", True)
        expect(lang_only, "http://bla.com?lang=1", "http://bla.com?lang=1", False)

        with_query = LanguageStripper(languages=["fr"], strip_query_variables=True)
        expect(with_query, "http://bla.com?lang=fr", "http://bla.com", True)
        expect(with_query, "http://bla.com?lang=1", "http://bla.com", True)
        expect(with_query, "http://bla.com/?clang=1", "http://bla.com/", True)
Пример #5
0
def strip_urls(urls, lang=None):
    """Group URLs by their stripped form.

    Two modes, selected by ``lang``:

    * ``lang`` given: a stripper built for that language is called with
      ``expected_language=lang``; a URL is recorded only when ``strip_uri``
      reports success.
    * ``lang`` is ``None``: a stripper built with
      ``strip_query_variables=True`` is called twice per URL, once plainly
      and once with ``remove_index=True``; each result is recorded whenever
      it differs from the input URL (the success flag is ignored).

    Args:
        urls: iterable of URLs.
        lang: language code, or ``None`` for the language-agnostic mode.

    Returns:
        ``defaultdict(set)`` mapping stripped URL -> set of original URLs.
    """
    groups = defaultdict(set)

    if lang is None:
        stripper = LanguageStripper(strip_query_variables=True)
        for url in urls:
            # First the plain strip, then the variant that also passes
            # remove_index=True; both may contribute an entry.
            for extra in ({}, {'remove_index': True}):
                candidate, _ = stripper.strip_uri(url, **extra)
                if candidate != url:
                    groups[candidate].add(url)
        return groups

    stripper = LanguageStripper(languages=[lang])
    for url in urls:
        candidate, ok = stripper.strip_uri(url, expected_language=lang)
        if ok:
            groups[candidate].add(url)
    return groups
                source_url, target_url,
                source_page, target_page):
    print "\t".join([stripped_url,
                     source_url, target_url,
                     source_page, target_page])

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-lang', help='language codes')
    parser.add_argument('-candidates',
                        help='candidates from first pass',
                        type=argparse.FileType('r'))
    args = parser.parse_args(sys.argv[1:])

    language_stripper = LanguageStripper(languages=[args.lang])

    candidates = {}
    if args.candidates:
        candidates = read_candidates(args.candidates)

    for line in sys.stdin:
        line = line.decode('utf-8').split('\t')
        if not len(line) == 3:  # broken line
            continue
        page_url, href, link_text = line
        if not href.lower().endswith('.pdf'):  # broken
            continue

        try:
            joined_link = urlparse.urljoin(page_url, href)
                        help='candidates from url strippper',
                        type=argparse.FileType('r'))
    parser.add_argument('-nostrip', help='accept only exact matches',
                        action='store_true')
    parser.add_argument('-agressive', help='remove all locale info',
                        action='store_true')
    parser.add_argument('-removeindex',
                        help='remove /index.html at end of url',
                        action='store_true')
    args = parser.parse_args(sys.argv[1:])

    candidates = {}
    if args.candidates:
        candidates = read_candidates(args.candidates)

    language_stripper = LanguageStripper(languages=[args.lang])
    if args.agressive:
        language_stripper = LanguageStripper(strip_query_variables=True)

    for line in sys.stdin:
        split_line = line.rstrip().split('\t')
        k, v = split_line[-2:]
        tld, uri, crawl = k.split(' ')

        if len(split_line) == 2 and args.nostrip \
                and candidates and uri not in candidates:
            # We're matching candidates against a KV list without stripping,
            # i.e. target candidate is stripped and source candidate isn't
            # This allows for a cheap reject.
            continue
Пример #8
0
    def runTest(self):
        # Checks strip_uri on language markers inside the URL path, first
        # for a single language and then for a stripper built with two.
        # Every expected value is a (stripped uri, True) pair.
        single_language = [
            ('http://bla.com/fr/x', 'http://bla.com/x'),
            ('http://bla.com/FR/x', 'http://bla.com/x'),
            ('http://bla.com/fr-FR/x', 'http://bla.com/x'),
            # remove multiple
            ('http://bla.com/fr-FR/fr', 'http://bla.com'),
            # removing only the language identifier
            ('http://bla.com/lang-FR/x', 'http://bla.com/lang/x'),
            # keep / at the end consistent
            ('http://bla.com/x/fr', 'http://bla.com/x'),
            ('http://bla.com/x/fr/', 'http://bla.com/x/'),
            # removing bits from the middle of a path part
            ('http://bla.com/lang-FR-foo/x', 'http://bla.com/lang-foo/x'),
            ('http://bla.com/lang-fr-FR-foo/x', 'http://bla.com/lang-foo/x'),
        ]
        # mixed languages
        two_languages = [
            ('http://bla.com/x/fr/', 'http://bla.com/x/'),
            ('http://bla.com/x/en/', 'http://bla.com/x/'),
            ('http://bla.com/x/en/fr/', 'http://bla.com/x/'),
        ]

        for languages, cases in ((['fr'], single_language),
                                 (['fr', 'en'], two_languages)):
            stripper = LanguageStripper(languages=languages)
            for uri, stripped_uri in cases:
                self.assertEqual(stripper.strip_uri(uri),
                                 (stripped_uri, True))
    def runTest(self):
        # Exercises strip_uri on language markers embedded in URL paths.
        def expect_stripped(stripper, uri, stripped_uri):
            # Every case here expects a successful strip.
            self.assertEqual(stripper.strip_uri(uri), (stripped_uri, True))

        french = LanguageStripper(languages=["fr"])

        expect_stripped(french, "http://bla.com/fr/x", "http://bla.com/x")
        expect_stripped(french, "http://bla.com/FR/x", "http://bla.com/x")
        expect_stripped(french, "http://bla.com/fr-FR/x", "http://bla.com/x")

        # remove multiple
        expect_stripped(french, "http://bla.com/fr-FR/fr", "http://bla.com")

        # removing only the language identifier
        expect_stripped(french, "http://bla.com/lang-FR/x", "http://bla.com/lang/x")

        # keep / at the end consistent
        expect_stripped(french, "http://bla.com/x/fr", "http://bla.com/x")
        expect_stripped(french, "http://bla.com/x/fr/", "http://bla.com/x/")

        # removing bits from the middle of a path part
        expect_stripped(french, "http://bla.com/lang-FR-foo/x", "http://bla.com/lang-foo/x")
        expect_stripped(french, "http://bla.com/lang-fr-FR-foo/x", "http://bla.com/lang-foo/x")

        # mixed languages
        french_english = LanguageStripper(languages=["fr", "en"])
        expect_stripped(french_english, "http://bla.com/x/fr/", "http://bla.com/x/")
        expect_stripped(french_english, "http://bla.com/x/en/", "http://bla.com/x/")
        expect_stripped(french_english, "http://bla.com/x/en/fr/", "http://bla.com/x/")