def get_url_filename(url=None, driveid=None):
    """Return the file name shown in the page title of a Google Drive URL.

    Args:
        url: Full URL of the shared file. When falsy, a Drive "open" URL is
            built from ``driveid`` instead.
        driveid: Google Drive file id; only used when ``url`` is falsy.

    Returns:
        The title reported by ``get_url_title`` with a trailing
        "Google Drive" suffix (and the dash or colon separating it) removed.
        A falsy title is returned unchanged. ``None`` is returned for URLs
        that do not start with ``https://drive.google.com``.
    """
    url = url or 'https://drive.google.com/open?id={}'.format(driveid)
    if not url.startswith('https://drive.google.com'):
        return None

    filename = get_url_title(url)
    if not filename:
        # The title lookup produced nothing usable, so there is no suffix to
        # trim (and calling .endswith on None would raise AttributeError).
        return filename

    suffix = 'Google Drive'
    if filename.endswith(suffix):
        # Drop the suffix, then the separator before it, then leftover spaces.
        filename = filename[:-len(suffix)].rstrip().rstrip('-:').rstrip()
    return filename
def infer_url_title(url):
    """ Guess what the page title is going to be from the path and FQDN in the URL

    For a drive.google.com URL the title comes from `get_url_title`. For any
    other host the file name found in the URL metadata (or, failing that, the
    hostname) is used with its extension removed. The result is passed through
    `delimit_slug(title, ' ')`.

    Returns None (after logging an error) when `get_url_filemeta` yields
    nothing for the URL, and None when no title is available for a Google
    Drive URL.

    >>> infer_url_title('https://ai.googleblog.com/2018/09/the-what-if-tool-code-free-probing-of.html')
    'the what if tool code free probing of'
    """
    meta = get_url_filemeta(url)
    if not meta:
        logging.error('Unable to retrieve URL: {}'.format(url))
        return None

    if meta.get('hostname', url) == 'drive.google.com':
        title = get_url_title(url)
        if title is None:
            # No title could be found; report failure the same way as above
            # instead of handing None to delimit_slug.
            return None
    else:
        # Look the hostname up lazily: an eager meta['hostname'] default would
        # raise KeyError even when a file name is present.
        title = meta.get('filename') or meta.get('hostname') or ''
        title = splitext(title)[0]
    return delimit_slug(title, ' ')
def translate_line_footnotes(line, tag=None, default_title='<NOT_FOUND>'):
    r""" Rewrite bare-url footnotes, like "footnote:[moz.org]", so that they cite the page title

    The URLs come from `get_line_bad_footnotes(line, tag=tag)`; its first element is skipped.
    For each URL the title is taken from `get_url_title`, falling back to `infer_url_title`.
    A title is only used if it still contains a space after punctuation and whitespace are
    stripped from its ends; otherwise `default_title` is used instead.

    Replacement text, per URL:
      * truthy title (or truthy `default_title`): 'footnote:[See the web page titled "TITLE" (URL).]'
      * no title and `default_title` is None: an error is logged and the footnote is left as is
      * no title and any other falsy `default_title`: 'footnote:[See the web page (URL).]'

    >>> translate_line_footnotes('*Morphemes*:: Parts of tokens or words that contain meaning in and of themselves.'\
    ...     'footnote:[https://spacy.io/usage/linguistic-features#rule-based-morphology]')
    '*Morphemes*:: Parts of tokens or words that contain meaning in and of themselves.footnote:[See the web page titled "Linguistic Features : spaCy Usage Documentation" (https://spacy.io/usage/linguistic-features#rule-based-morphology).]'
    """
    bad_footnotes = get_line_bad_footnotes(line, tag=tag)
    if not bad_footnotes:
        return line

    edge_chars = ' \t\n\r\f-_:|="\'/\\'
    for url in bad_footnotes[1:]:
        old_note = 'footnote:[{url}]'.format(url=url)

        # TODO: use these to extract name from hyperlinks
        candidate = get_url_title(url) or infer_url_title(url)
        candidate = (candidate or '').strip(edge_chars)

        page_title = None
        if ' ' in candidate:
            # Shortened form: the text before the first newline, pipe or dot separator.
            # ('Â' is presumably mojibake left over from a mis-decoded middle dot.)
            brief = candidate
            for separator in ('\n', '|', 'Â', '·'):
                brief = brief.split(separator)[0].strip()

            logging.info('URL: {}'.format(url))
            logging.info('TITLE: {}'.format(candidate))

            # Only very long titles are swapped for their shortened form.
            if len(brief) > 3 and len(candidate) > 55:
                page_title = brief
            else:
                page_title = candidate
            for old, new in (('Â', ''), ('·', ':'), ('|', ':'), ('\n', '--')):
                page_title = page_title.replace(old, new)
            logging.info('FINAL: {}'.format(page_title))

        page_title = page_title or default_title
        if page_title:
            new_note = 'footnote:[See the web page titled "{title}" ({url}).]'.format(
                title=page_title, url=url)
        elif page_title is None:
            logging.error('Unable to find a title for url: {}'.format(url))
            new_note = old_note
        else:
            new_note = 'footnote:[See the web page ({url}).]'.format(url=url)

        line = line.replace(old_note, new_note)
    return line