def doFillAbbrevs(scrapeLimit: Optional[int] = None) -> None:
    """Fill empty abbreviations in some automatizable cases.

    Currently handled case:
    * the abbreviation equals the title, possibly minus leading
      articles (a/the).
    """
    category = pywikibot.Category(
        Site(),
        'Category:Infobox journals with missing ISO 4 abbreviations')
    pages = category.articles(namespaces=0, total=scrapeLimit, content=True)
    for idx, page in enumerate(pages):
        print(f'--Scraping:\t{idx}:\t[[{page.title()}]]', flush=True)
        for boxIndex, box in enumerate(getInfoboxJournals(page)):
            # Only touch infoboxes whose abbreviation field is empty.
            if box.get('abbreviation', '') != '':
                print('--Skipping infobox that actually has non-empty abbrev')
                continue
            strippedTitle = abbrevUtils.stripTitle(page.title())
            if 'title' in box and box['title'] != strippedTitle:
                print('--Skipping infobox with different title than article',
                      box['title'])
                continue
            language = abbrevUtils.getLanguage(box)
            abbrev = state.tryGetAbbrev(strippedTitle, language)
            if abbrev is None:
                continue
            # If abbreviation is equal to title, up to "a/the" articles:
            if abbrev == re.sub(r'(The|the|A|a)\s+', '', strippedTitle):
                print('--Filling "{}" with abbrev "{}"'.format(strippedTitle,
                                                               abbrev))
                trySaving(page,
                          fillAbbreviation(page.text, boxIndex, abbrev),
                          'Filling trivial ISO-4 abbreviation. ',
                          overwrite=True)
def addOmicsHatnote(aTitle: str, title: str, publisher: str) -> None:
    """Add hatnote to [[aTitle]] about confusion risk with OMICS [[title]]."""
    page = pywikibot.Page(Site(), aTitle)
    # Don't stack a second {{Confused}} hatnote onto the page.
    alreadyTagged = ('{{Confused|' in page.text
                     or '{{confused|' in page.text)
    if alreadyTagged:
        print(f'Skip: {{{{confused}}}} hatnote already on [[{aTitle}]]')
        return
    print(f'Adding hatnote to [[{aTitle}]]')
    note = (f'{{{{Confused|text=[[{title}]],'
            f' published by the [[{publisher}]]}}}}\n')
    trySaving(page, note + page.text,
              overwrite=True, limitType='hatnote',
              summary='Add hatnote to predatory journal clone.')
def makeAmpersandRedirects(pageTitle: str, foreign: Set[str],
                           targetPageTitle: Optional[str] = None,
                           andToAmpersand: bool = True,
                           ampersandToAnd: bool = True) -> bool:
    """If pageTitle contains 'and'/'&', try creating redirect from '&'/'and'.

    `foreign` is a set of foreign-language titles to avoid.
    Return whether any edits made.
    """
    # Overly long titles are not worth a redirect.
    if len(pageTitle) > 95:
        print('Skipping (length): ', pageTitle)
        return False
    if not targetPageTitle:
        targetPageTitle = pageTitle
    rTitle = ''
    if andToAmpersand and ' and ' in pageTitle:
        rTitle = pageTitle.replace(' and ', ' & ').replace(', & ', ' & ')
    if ampersandToAnd and ' & ' in pageTitle:
        # NOTE(review): this recomputes from pageTitle, discarding any
        # 'and'->'&' variant built just above when both patterns occur;
        # confirm that is intended.
        rTitle = pageTitle.replace(' & ', ' and ')
    # Exclude possibly-foreign titles based on categories and
    # on language detection.
    if pageTitle in foreign:
        print('Skipping (lang category): ', pageTitle)
        return False
    if not EnglishWordList.check(pageTitle):
        isReliable, _, details = \
            pycld2.detect(pageTitle, isPlainText=True)
        if not isReliable or details[0][0] != 'ENGLISH':
            print('Skipping (lang detect): ', pageTitle)
            print(isReliable, str(details))
            return False
    if not rTitle:
        return False
    # Try creating a redirect from rTitle to pageTitle.
    rPage = pywikibot.Page(Site(), rTitle)
    # Never clobber an existing page.
    if rPage.exists():
        print('Skipping (already exists): ', rTitle)
        return False
    print(f'Creating redirect from [[{rTitle}]] to [[{targetPageTitle}]]')
    content = (f'#REDIRECT [[{targetPageTitle}]]\n'
               f'{{{{R from modification}}}}\n')
    return trySaving(rPage, content,
                     'Redirect between ampersand/and variant.',
                     overwrite=False)
def makeVariantRedirect(vTitle: str, targetArticle: str) -> bool:
    """Try creating a redirect from vTitle to targetArticle."""
    rPage = pywikibot.Page(Site(), vTitle)
    # Never clobber an existing page.
    if rPage.exists():
        print('Skipping variant (already exists): ', vTitle)
        return False
    print(f'Creating redirect from [[{vTitle}]] to [[{targetArticle}]]')
    # NOTE: a disabled Google-results popularity check used to live here
    # (scraping result counts, only feasible for <100 requests); it was
    # removed as dead code.
    content = ('#REDIRECT [[' + targetArticle + ']]\n'
               '{{R from abbreviation}}\n')
    return utils.trySaving(rPage, content,
                           'Redirect from variant abbreviation.',
                           overwrite=False)
def fixPageRedirects(page: pywikibot.Page) -> int:
    """Fix redirects to given page.

    Creates missing required redirects, replaces replaceable ones,
    reports dubious existing ones, then reports existing ISO-4-tagged
    redirects that we would not have created ourselves.
    Returns the number of pages edited.
    """
    title = page.title()
    pageData = state.getPageData(title)
    (requiredRedirects, skip) = getRequiredRedirects(page)
    nEditedPages = 0
    for rTitle, rCats in requiredRedirects.items():
        rNewContent = rcatSetToRedirectContent(title, rCats)
        # Attempt to create new redirect.
        if rTitle not in pageData['redirects']:
            # Some required titles are malformed; treat them as absent.
            try:
                exists = pywikibot.Page(Site(), rTitle).exists()
            except pywikibot.exceptions.InvalidTitle:
                exists = False
            if exists:
                print(f'--Skipping existing page [[{rTitle}]] '
                      f'(not a redirect to [[{title}]]).')
                if title == rTitle:
                    continue
                # Only report when the page doesn't even mention us.
                if title not in pywikibot.Page(Site(), rTitle).text:
                    reports.reportExistingOtherPage(title, rTitle)
            else:
                print(f'--Creating redirect '
                      f'from [[{rTitle}]] to [[{title}]]. '
                      f'Created content:\n{rNewContent}\n-----', flush=True)
                nEditedPages += 1
                rPage = pywikibot.Page(Site(), rTitle)
                trySaving(rPage, rNewContent,
                          'Creating redirect from standard abbreviation. ',
                          overwrite=False)
        else:
            rOldContent = pageData['redirects'][rTitle]
            if isValidISO4Redirect(rOldContent, title, rCats):
                print(f'--Skipping existing valid redirect '
                      f'from [[{rTitle}]] to [[{title}]].')
            elif isReplaceableRedirect(rOldContent, title,
                                       rCats | RCatSet.ISO4):
                # Don't log nor edit redirects that would be replaceable
                # except they have ISO4 and we're not sure it should have.
                if not (rCats & RCatSet.ISO4):
                    continue
                print(f'--Replacing existing redirect '
                      f'from [[{rTitle}]] to [[{title}]].\n'
                      f'RCatSet: {rCats}\n'
                      f'Original content:\n{rOldContent}\n----- '
                      f'New content:\n{rNewContent}\n-----', flush=True)
                nEditedPages += 1
                rPage = pywikibot.Page(Site(), rTitle)
                trySaving(rPage, rNewContent,
                          'Marking standard abbrev rcat. ',
                          overwrite=True)
            elif not skip:
                print(f'--Skipping existing dubious redirect '
                      f'from [[{rTitle}]] to [[{title}]].\n'
                      f'RCatSet: {rCats}\n'
                      f'Original content:\n{rOldContent}\n----- ')
                reports.reportExistingOtherRedirect(title, rTitle,
                                                    rOldContent)
    # Purge page cache to remove warnings about missing redirects.
    if nEditedPages > 0:
        tryPurging(page)
    # Report redirects that we wouldn't add, but exist and are marked
    # as ISO-4.
    if requiredRedirects and not skip:
        # Dotless forms of the abbreviations we expect to see.
        expectedAbbrevs = \
            [r.replace('.', '') for r in requiredRedirects]
        potentialAbbrevs = []
        for rTitle, rContent in pageData['redirects'].items():
            if 'from former name' in rContent or '.' not in rTitle:
                # Compute the abbreviation both as-English and as-any
                # language; empty string when no abbreviation is known.
                cAbbrevEng = state.tryGetAbbrev(
                    abbrevUtils.stripTitle(rTitle), 'eng') or ''
                cAbbrevAll = state.tryGetAbbrev(
                    abbrevUtils.stripTitle(rTitle), 'all') or ''
                cAbbrevEng = cAbbrevEng.replace('.', '')
                cAbbrevAll = cAbbrevAll.replace('.', '')
                if 'from former name' in rContent:
                    # Former names contribute additional expected abbrevs.
                    if cAbbrevEng != rTitle.replace('.', ''):
                        expectedAbbrevs.append(cAbbrevEng)
                    if cAbbrevAll != rTitle.replace('.', ''):
                        expectedAbbrevs.append(cAbbrevAll)
                elif '.' not in rTitle:
                    if cAbbrevEng != rTitle.replace('.', ''):
                        potentialAbbrevs.append((cAbbrevEng, rTitle))
                    if cAbbrevAll != rTitle.replace('.', ''):
                        potentialAbbrevs.append((cAbbrevAll, rTitle))
        # Drop entries with empty abbreviations.
        expectedAbbrevs = [a for a in expectedAbbrevs if a]
        potentialAbbrevs = [(a, t) for (a, t) in potentialAbbrevs if a]
        for rTitle, rContent in pageData['redirects'].items():
            if not re.search(r'R from ISO 4', rContent):
                continue
            # Ignore rTitle that contain a computed abbreviation as a
            # substring, assume that it's some valid variation on a
            # subtitle.
            isExpected = False
            rTitleDotless = rTitle.replace('.', '')
            for computedAbbrev in expectedAbbrevs:
                # Strip any subtitle (after ':' or '(') before matching.
                if re.sub(r'\s*[:(].*', '', computedAbbrev) in rTitleDotless:
                    isExpected = True
                    break
            if not isExpected:
                # Find other titles in existing redirects
                # that would ISO-4 abbreviate to it
                potentials = [t for (a, t) in potentialAbbrevs
                              if abbrevUtils.isSoftMatch(rTitleDotless, a)]
                potentials = list(sorted(set(potentials)))
                # Find closest computed abbrev.
                bestAbbrev = ''
                bestDist = len(rTitle)
                for computedAbbrev in sorted(requiredRedirects):
                    dist = Levenshtein.distance(rTitle, computedAbbrev)
                    if dist < bestDist:
                        bestDist = dist
                        bestAbbrev = computedAbbrev
                # Skip if closest abbrev. is far (assume it's from a former
                # title, since there's a ton of cases like that).
                if bestDist <= 8:
                    reports.reportSuperfluousRedirect(
                        title, rTitle, rContent, bestAbbrev, potentials)
    return nEditedPages
def fixRedirectAnchor(rTitle: str, anchor: str, target: str) -> bool:
    """Add an anchor to given redirect page.

    Returns True when the redirect already has (or now has) the right
    anchor, False when the page is missing, is not a redirect to
    `target`, or has a conflicting anchor.
    """
    rPage = pywikibot.Page(Site(), rTitle)
    addJournal = False
    # If the bare title exists as a non-redirect article, consider the
    # " (journal)" disambiguated title instead — unless "journal" already
    # appears in the title or its categories.
    if rPage.exists() and not rPage.isRedirectPage():
        addJournal = True
        if 'journal' in rTitle.lower():
            print(f'Skip: [[{rTitle}]] already exists, '
                  'title already has "journal".', flush=True)
            return False
        for cat in rPage.categories():
            if 'journal' in cat.title().lower():
                print(f'Skip: [[{rTitle}]] already exists, '
                      'has category containing "journal".', flush=True)
                return False
    if addJournal:
        rPage = pywikibot.Page(Site(), rTitle + ' (journal)')
    if not rPage.exists() or not rPage.isRedirectPage():
        print(f'Not exists/not a redirect: [[{rPage.title()}]]', flush=True)
        return False
    # Page.title() actually contains anchor, if redirect had one.
    actualTarget = rPage.getRedirectTarget().title().split('#', 1)
    if actualTarget[0] != target:
        print(f'Not a redirect to this list: '
              f'[[{rPage.title()}]] -> [[{actualTarget[0]}]]', flush=True)
        return False
    # An anchor already present: accept if it matches, warn otherwise.
    if len(actualTarget) > 1:
        if actualTarget[1] != anchor:
            print(f'WARNING: Anchor mismatch: '
                  f'[[{rPage.title()}]] -> [[{actualTarget[0]}]].'
                  f'Is "{actualTarget[1]}" should be "{anchor}".')
            return False
        else:
            return True
    # Sanity check: the anchor we would add should match the prediction
    # derived from the redirect title itself.
    predictedAnchor = getPredictedAnchor(rTitle)
    if predictedAnchor != anchor:
        print(f'WARNING: Anchor mismatch: '
              f'[[{rPage.title()}]] -> [[{actualTarget[0]}]].'
              f'Predicted "{predictedAnchor}" should be "{anchor}".')
        return False
    rText = rPage.text
    # Insert/replace the anchor in the first #REDIRECT [[...]] link.
    rNewText = re.sub(r'''(
        \#\s*REDIRECT\s*\[\[
        [^\]\#]+     # title
        )
        (\#[^\]]*)?  # anchor
        \]\]''',
                      '\\1#' + anchor + ']]',
                      rText, count=1, flags=re.VERBOSE)
    if rText == rNewText:
        print(f'Nothing to do on: [[{rPage.title()}]]')
        return True
    print(f'===CHANGING [[{rPage.title()}]] FROM==================')
    print(rText)
    print('==========TO===========')
    print(rNewText + '\n\n', flush=True)
    trySaving(rPage, rNewText,
              'Add anchor to redirect, as it points to a long list.',
              overwrite=True)
    return True
def createOrFixOmicsRedirect(title: str, rType: str, config: Config,
                             tryOnly: bool) -> str:
    """Attempt to create or fix redirect from [[title]] to [[target]].

    We return 'create' if non-existing, 'done' if basically equal to
    what we would add, 'fix' if exists but looks fixable, 'unfixable'
    otherwise ('ignore' for a non-existing 'uniso4' request).

    Also create talk page with {{WPJournals}} when non-existing.
    If `tryOnly` is True, only classify — make no edits.
    """
    rText = '#REDIRECT[[' + config.rTarget + ']]\n'
    rCat = '[[Category:' + config.rCat + ']]\n' if config.rCat else ''
    rIsoCat = '{{R from ISO 4}}\n'
    # Compute the sort key: move a leading "The" to the end, and use
    # "and" rather than "&".
    rSortTitle = title
    if rSortTitle.startswith('The ') and '(' not in title:
        rSortTitle = rSortTitle.replace('The ', '') + ', The'
    if ' & ' in rSortTitle:
        rSortTitle = rSortTitle.replace(' & ', ' and ')
    if rSortTitle != title:
        rSort = '{{DEFAULTSORT:' + rSortTitle + '}}\n'
    if config.anchor:
        # Redirect into the alphabetical section of the target list.
        rText = '#REDIRECT[[' + config.rTarget + '#' + rSortTitle[0] + ']]\n'
    rNewContent = rText
    if rSortTitle != title:
        rNewContent += rSort
    if rType == 'plain':
        rNewContent += rCat
    if rType == 'iso4':
        rNewContent += '{{R from ISO 4}}\n'
    rPage = pywikibot.Page(Site(), title)
    rTalkPage = rPage.toggleTalkPage()
    if not rPage.exists():
        if rType == 'uniso4':
            return 'ignore'
        if not tryOnly:
            print(f'Creating redirect from: [[{title}]].')
            trySaving(rPage, rNewContent,
                      'Create redirect from journal to publisher.',
                      overwrite=False, limitType='create')
            if rType == 'plain' and not rTalkPage.exists():
                content = '{{WPJournals|class=redirect}}'
                trySaving(rTalkPage, content,
                          'Mark new redirect into {{WPJournals}}.',
                          overwrite=False, limitType='talk')
        return 'create'
    # If rPage exists, check if we would add basically the same.
    # BUGFIX: the 4th positional argument of re.sub is `count`, not
    # `flags`; the original passed `re.M` (== 8) there, silently limiting
    # each substitution to the first 8 matches.  No flag is needed for
    # these patterns, so the argument is simply dropped.
    text = rPage.text
    textStripped = re.sub(r'\s', '', text)
    rNewStripped = re.sub(r'\s', '', rNewContent)
    if textStripped == rNewStripped:
        if not tryOnly:
            if rTalkPage.exists():
                print(f'Done: [[{title}]].')
            elif rType == 'plain':
                print(f'Done, but creating talk page: [[{title}]].')
                content = '{{WPJournals|class=redirect}}'
                trySaving(rTalkPage, content,
                          'Mark redirect into {{WPJournals}}.',
                          overwrite=False, limitType='talk')
        return 'done'
    # If rPage exists but not the same, check if it is a fixable case:
    # remove the bits we manage (category, ISO-4 rcat, DEFAULTSORT) and
    # compare what remains against the bare redirect line.
    if rCat:
        text = text.replace(rCat.strip(), '')
    text = text.replace(rIsoCat.strip(), '')
    text = re.sub(r'\{\{DEFAULTSORT:[^\}]*\}\}', '', text)
    # Strip link anchors and whitespace before comparing.
    regex = r'(' + re.escape(config.rTarget) + r')\#.'
    textStripped = re.sub(regex, r'\1', text)
    textStripped = re.sub(r'\s', '', textStripped)
    rTextStripped = re.sub(regex, r'\1', rText)
    rTextStripped = re.sub(r'\s', '', rTextStripped)
    if textStripped != rTextStripped:
        print(f'Not fixable: [[{title}]] (type={rType}).')
        print('---IS-------------')
        print(rPage.text)
        print('---SHOULD BE------')
        print(rNewContent)
        print('==================')
        return 'unfixable'
    # If it is fixable, fix it.
    if not tryOnly:
        if rType == 'uniso4':
            print(f'Removing iso4 tag from: [[{title}]].')
        print(f'Fixing redirect from: [[{title}]] (type={rType}).')
        print('---WAS------------')
        print(rPage.text)
        print('---WILL BE--------')
        print(rNewContent)
        print('==================')
        trySaving(rPage, rNewContent,
                  'Fix redirect from journal to publisher.',
                  overwrite=True, limitType='fix')
        if rType == 'plain' and not rTalkPage.exists():
            content = '{{WPJournals|class=redirect}}'
            trySaving(rTalkPage, content,
                      'Fix redirect from journal to publisher.',
                      overwrite=False, limitType='talk')
    return 'fix'