def parse_pinboard_json_export(json_file):
    """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag).

    The file must contain a JSON array of objects. Falsy entries (``null``,
    ``{}``, ...) are skipped. For every remaining entry one link dict is
    yielded with the keys ``url``, ``timestamp``, ``tags``, ``title``,
    ``sources`` and ``type``.

    The timestamp is taken from the first of these fields that is present and
    truthy: ``timestamp`` (numeric, scaled down), ``time``
    (``YYYY-MM-DDTHH:MM:SSZ``, UTC), ``created_at`` (ISO date with a numeric
    UTC offset); otherwise the current time is used.

    Raises:
        KeyError: if an entry has neither a truthy ``href`` nor a ``url`` key.
        ValueError: if a date field does not match its expected format.
    """
    # Local import: only the ``datetime`` class is known to be in scope here.
    from datetime import timezone

    json_file.seek(0)
    json_content = json.load(json_file)
    for line in json_content:
        # example line
        # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
        if not line:
            continue
        erg = line

        if erg.get('timestamp'):
            # chrome/ff histories use a very precise timestamp
            timestamp = str(erg['timestamp'] / 10000000)
        elif erg.get('time'):
            parsed = datetime.strptime(erg['time'].split(',', 1)[0], '%Y-%m-%dT%H:%M:%SZ')
            # The trailing "Z" designates UTC; without an explicit tzinfo the
            # naive datetime would be interpreted in the machine's local zone.
            timestamp = str(parsed.replace(tzinfo=timezone.utc).timestamp())
        elif erg.get('created_at'):
            timestamp = str(datetime.strptime(erg['created_at'], '%Y-%m-%dT%H:%M:%S%z').timestamp())
        else:
            timestamp = str(datetime.now().timestamp())

        url = erg['href'] if erg.get('href') else erg['url']

        if erg.get('description'):
            title = erg['description'].replace(' — Readability', '')
        else:
            # A missing or null title must not abort the whole import; it
            # simply becomes None in the resulting link.
            title = (erg.get('title') or '').strip()

        info = {
            'url': url,
            'timestamp': timestamp,
            'tags': erg.get('tags') or '',
            'title': title or None,
            'sources': [json_file.name],
        }
        info['type'] = get_link_type(info)
        yield info
def basic_link_info(url, f, title=None, time=None, tags=""):
    """Build a link info dict for ``url`` found in the source file ``f``.

    Args:
        url: the link URL.
        f: the file object the link came from; its ``name`` is recorded
            under ``sources``.
        title: optional title for the link.
        time: ``datetime`` used for the link timestamp. Defaults to the
            current time at the moment of the call.
        tags: tags value stored as-is under ``tags``.

    Returns:
        A dict with the keys ``url``, ``domain``, ``base_url``, ``timestamp``,
        ``tags``, ``title``, ``sources`` and ``type``.
    """
    # A ``datetime.now()`` default would be evaluated only once, when the
    # function is defined, and every later call would reuse that stale value.
    if time is None:
        time = datetime.now()

    info = {
        'url': url,
        'domain': domain(url),
        'base_url': base_url(url),
        'timestamp': str(time.timestamp()),
        'tags': tags,
        'title': title,
        'sources': [f.name],
    }
    info['type'] = get_link_type(info)
    return info
def parse_atom_export(rss_file):
    """Yield one link dict per entry of an Atom XML feed file.

    The whole file is read and handed to ``feedparser``; each entry's
    ``link``, ``title`` and ``published_parsed`` attributes are used to build
    the link. Tags are always left empty.
    """
    rss_file.seek(0)
    feed = feedparser.parse(rss_file.read())

    for entry in feed.entries:
        link = entry.link
        info = dict(
            url=link,
            domain=domain(link),
            base_url=base_url(link),
            # published_parsed is converted to seconds with time.mktime
            timestamp=str(mktime(entry.published_parsed)),
            tags='',
            title=entry.title,
            sources=[rss_file.name],
        )
        info['type'] = get_link_type(info)
        yield info
def parse_plain_text_export(text_file):
    """Yield a link dict for every URL matched by URL_REGEX in a text file.

    The file is scanned line by line. Each match is stripped of surrounding
    whitespace and stamped with the current time; title is None and tags are
    empty.
    """
    text_file.seek(0)
    all_lines = text_file.readlines()

    for raw_line in all_lines:
        if not raw_line:
            continue
        for match in re.findall(URL_REGEX, raw_line):
            info = dict(
                url=match.strip(),
                timestamp=str(datetime.now().timestamp()),
                tags='',
                title=None,
                sources=[text_file.name],
            )
            info['type'] = get_link_type(info)
            yield info
def parse_rss_export(rss_file):
    """Yield one link dict per ``<item>`` of an RSS XML file.

    The file is split textually on item boundaries rather than parsed as XML.
    From each item the title (CDATA), link and pubDate rows are extracted.
    When the title is empty, ``fetch_page_title`` is called for the URL.

    Raises:
        IndexError: if an item has no row starting with one of the expected tags.
        ValueError: if pubDate does not match ``"%a, %d %b %Y %H:%M:%S %z"``.
    """
    rss_file.seek(0)
    chunks = rss_file.read().split('</item>\n<item>')

    for chunk in chunks:
        # example item:
        # <item>
        # <title><![CDATA[How JavaScript works: inside the V8 engine]]></title>
        # <category>Unread</category>
        # <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
        # <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid>
        # <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
        # </item>

        # Drop everything after the closing tag, then everything up to the opening tag.
        body = chunk.split('</item>', 1)[0].split('<item>', 1)[-1]
        rows = body.split('\n')

        def get_row(key):
            """Return the first row whose stripped text starts with ``<key>``."""
            opening = '<{}>'.format(key)
            matching = [row for row in rows if row.strip().startswith(opening)]
            return matching[0]

        title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
        url = str_between(get_row('link'), '<link>', '</link>')
        ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
        published = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")

        info = dict(
            url=url,
            domain=domain(url),
            base_url=base_url(url),
            timestamp=str(published.timestamp()),
            tags='',
            title=title or fetch_page_title(url),
            sources=[rss_file.name],
        )
        info['type'] = get_link_type(info)
        yield info
def parse_plain_text(text_file):
    """Yield a link dict for every URL matched by URL_REGEX in a text file.

    Each link is stamped with the current time, carries its domain and base
    URL, and gets its title from ``fetch_page_title``. Tags are left empty.
    """
    text_file.seek(0)
    all_lines = text_file.readlines()

    for raw_line in all_lines:
        if not raw_line:
            continue
        for link_url in re.findall(URL_REGEX, raw_line):
            info = dict(
                url=link_url,
                domain=domain(link_url),
                base_url=base_url(link_url),
                timestamp=str(datetime.now().timestamp()),
                tags='',
                title=fetch_page_title(link_url),
                sources=[text_file.name],
            )
            info['type'] = get_link_type(info)
            yield info
def parse_medium_rss_export(rss_file):
    """Yield one link dict per ``<item>`` in the ``<channel>`` of a Medium RSS feed.

    The ``link``, ``title`` and ``pubDate`` child elements of each item are
    read; an empty title becomes None and tags are left empty.
    """
    rss_file.seek(0)
    channel = etree.parse(rss_file).getroot().find("channel")

    for entry in channel.findall("item"):
        url = entry.find("link").text
        title = entry.find("title").text.strip()
        # NOTE(review): %Z produces a naive datetime, so .timestamp() reads it
        # as local time; confirm that is intended for these feeds.
        published = datetime.strptime(
            entry.find("pubDate").text,
            "%a, %d %b %Y %H:%M:%S %Z",
        )

        info = dict(
            url=url,
            timestamp=str(published.timestamp()),
            tags='',
            title=title or None,
            sources=[rss_file.name],
        )
        info['type'] = get_link_type(info)
        yield info
def parse_json_export(json_file):
    """Parse JSON-format bookmarks export files (produced by pinboard.in/export/).

    The file must contain a JSON array of objects. Falsy entries are skipped.
    For every remaining entry one link dict is yielded with the keys ``url``,
    ``domain``, ``base_url``, ``timestamp``, ``tags``, ``title``, ``sources``
    and ``type``.

    Raises:
        KeyError: if an entry lacks ``time``, ``href``, ``tags`` or ``description``.
        ValueError: if ``time`` does not match ``YYYY-MM-DDTHH:MM:SSZ``.
    """
    # Local import: only the ``datetime`` class is known to be in scope here.
    from datetime import timezone

    json_file.seek(0)
    json_content = json.load(json_file)
    for line in json_content:
        # example line
        # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
        if not line:
            continue
        erg = line

        # The trailing "Z" designates UTC; attach it explicitly so that
        # .timestamp() does not interpret the value in the local time zone.
        time = datetime.strptime(
            erg['time'].split(',', 1)[0],
            '%Y-%m-%dT%H:%M:%SZ',
        ).replace(tzinfo=timezone.utc)

        info = {
            'url': erg['href'],
            'domain': domain(erg['href']),
            'base_url': base_url(erg['href']),
            'timestamp': str(time.timestamp()),
            'tags': erg['tags'],
            'title': erg['description'].replace(' — Readability', ''),
            'sources': [json_file.name],
        }
        info['type'] = get_link_type(info)
        yield info