def weblinksIn(text, withoutBracketed=False, onlyBracketed=False):
    """Yield the URL of every external link matched in ``text``.

    MediaWiki expands templates before it looks for external links, so a
    ``|`` or ``}}`` that directly follows a URL inside a template is not
    part of that URL. The wikitext is therefore padded with spaces around
    template delimiters before the link pattern is applied.

    ``withoutBracketed`` and ``onlyBracketed`` are handed unchanged to
    ``pywikibot.compileLinkR``, which builds the link pattern.

    This is a generator: nothing is processed until it is iterated.
    """
    wikitext = pywikibot.removeDisabledParts(text)

    # Step 1: flatten nested templates by dropping the inner pair of curly
    # braces, repeating until no nested template is left.
    inner_template = re.compile(r'{{([^}]*?){{(.*?)}}(.*?)}}')
    replaced = 1
    while replaced:
        wikitext, replaced = inner_template.subn(r'{{\1 \2 \3}}', wikitext)

    # Step 2: surround the parameter separator and the template delimiters
    # with blanks so that neither | nor }} can be read as part of a link.
    # One pass pads a single | per template, hence the loop.
    unpadded_param = re.compile(r'{{([^}]*?[^ ])\|([^ ][^}]*?)}}', re.DOTALL)
    replaced = 1
    while replaced:
        wikitext, replaced = unpadded_param.subn(r'{{ \1 | \2 }}', wikitext)

    # Step 3: put a blank in front of every template end. Without it a URL
    # that is the last parameter of a multi-line template would come out
    # wrong.
    wikitext = wikitext.replace('}}', ' }}')

    # Step 4: strip HTML comments inside URLs and URLs inside HTML comments,
    # as well as text inside nowiki sections and the like.
    wikitext = pywikibot.removeDisabledParts(wikitext)

    link_pattern = pywikibot.compileLinkR(withoutBracketed, onlyBracketed)
    for match in link_pattern.finditer(wikitext):
        # Use the 'url' group when it matched something, else 'urlb'.
        yield match.group('url') or match.group('urlb')
def weblinksIn(text, withoutBracketed=False, onlyBracketed=False):
    """Yield the URL of every external link matched in ``text``.

    MediaWiki parses templates before parsing external links, so there
    might be a ``|`` or a ``}`` directly after a URL which does not belong
    to the URL itself. The text is padded with spaces around template
    delimiters before the link pattern is applied.

    ``withoutBracketed`` and ``onlyBracketed`` are passed unchanged to
    ``pywikibot.compileLinkR``, which builds the link pattern.

    This is a generator: nothing is processed until it is iterated.
    """
    text = pywikibot.removeDisabledParts(text)

    # First, remove the curly braces of inner templates:
    nestedTemplateR = re.compile(r'{{([^}]*?){{(.*?)}}(.*?)}}')
    while nestedTemplateR.search(text):
        text = nestedTemplateR.sub(r'{{\1 \2 \3}}', text)

    # Then blow up the templates with spaces so that the | and }} will not
    # be regarded as part of the link. Each pass pads one | per template,
    # so repeat until nothing matches any more.
    templateWithParamsR = re.compile(r'{{([^}]*?[^ ])\|([^ ][^}]*?)}}',
                                     re.DOTALL)
    while templateWithParamsR.search(text):
        text = templateWithParamsR.sub(r'{{ \1 | \2 }}', text)

    # The pattern above only fires when the | has a non-space character on
    # both sides, so templates whose parameters are already spaced out or
    # laid out over several lines are left untouched. Put a blank in front
    # of every template end so that a URL which is the last parameter is
    # still separated from the closing }}.
    text = text.replace('}}', ' }}')

    # Remove HTML comments in URLs as well as URLs in HTML comments.
    # Also remove text inside nowiki links etc.
    text = pywikibot.removeDisabledParts(text)

    linkR = pywikibot.compileLinkR(withoutBracketed, onlyBracketed)
    for m in linkR.finditer(text):
        if m.group('url'):
            yield m.group('url')
        else:
            yield m.group('urlb')