def __main__():
    def get_content():
        # Return the cached copy if it exists; otherwise download and cache it.
        try:
            return open(FILE)
        except FileNotFoundError:
            resp = requests.get(URL)
            with open(FILE, 'wb') as f:
                f.write(resp.content)
            return resp.content

    soup = bs4.BeautifulSoup(get_content(), "lxml")
    lis = soup.select('ul#container > li')
    events = [li_to_event(li) for li in lis]

    c = UnscrollClient()
    TITLE = 'W3C Web Standards'
    favthumb = c.cache_thumbnail(
        'https://2.bp.blogspot.com/-70GFD8HsG3I/VMKLC7IoiBI/AAAAAAAAIck/GCu0LIY3PCU/s1600/Logo%2BW3C.png')
    c.delete_scroll_with_title(TITLE)
    c.__batch__(scroll_title=TITLE,
                thumbnail=favthumb['url'],
                events=events)
    print(len(events))
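# A minimal sketch of the li_to_event() helper assumed above. Hypothetical:
# the real parser isn't shown in this excerpt, and the W3C page structure is
# a guess. It maps one <li> to the event-dict shape the other loaders here
# use, leaning on datefinder (already a dependency elsewhere in this repo).
def li_to_event(li):
    link = li.find('a')
    dates = list(datefinder.find_dates(li.text))
    return {
        'title': link.text if link is not None else li.text[:80],
        'text': None,
        'content_url': link['href'] if link is not None else None,  # may be relative
        'source_url': URL,
        'source_name': 'W3C Web Standards',
        'when_happened': dates[0].isoformat(' ') if dates else None,
        'when_original': None,
    }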
def create(newsgroup, dir, maxyear):
    _title = '{}'.format(newsgroup)
    api = UnscrollClient()
    api.delete_scroll_with_title(_title)
    favthumb = api.cache_thumbnail(THUMBNAIL_URL)
    with_thumbnail = favthumb.get('url')
    scroll = api.create_or_retrieve_scroll(
        _title,
        description='Usenet message board archives',
        link='https://archive.org/details/usenethistorical',
        with_thumbnail=with_thumbnail,
        subtitle='Collection via Usenet Historical Collection',
    )
    newsgroup_to_events(newsgroup, scroll, api, dir, maxyear)
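# Usage sketch for create(). The newsgroup name, cache path, and cutoff year
# are illustrative values only, not taken from the scripts above.
if __name__ == '__main__':
    create('comp.lang.python', 'cache/usenet/comp.lang.python', 1995)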
def load_data(begin=None, end=None, title=None, slug=None,
              thumbnail_url=None, delete=False):
    # Get the file listing and metadata for the Archive.org item.
    _link = 'https://archive.org/details/{}'.format(slug)
    _r = requests.get('https://archive.org/metadata/{}'.format(slug))
    _data = _r.json()
    _md = _data.get('metadata')

    _title = title if title is not None else _md.get('title')
    _description = _md.get('description')
    _events = extract_events(_data, 'MP3', _link, begin, end)

    _thumbnail_url = thumbnail_url if thumbnail_url is not None \
        else extract_thumbnail(_data)
    api = UnscrollClient()
    _thumb = api.cache_thumbnail(_thumbnail_url)
    _with_thumbnail = _thumb.get('url')

    if delete:
        api.delete_scroll_with_title(_title)

    scroll = api.create_or_retrieve_scroll(
        _title,
        subtitle='via Archive.org',
        public=True,
        description=_description,
        link=_link,
        citation='',
        with_thumbnail=_with_thumbnail,
    )
    for event in _events:
        j = api.create_event(event, scroll)
        pprint.pprint(j.json())
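# Usage sketch for load_data(), pulling one Archive.org audio item into a
# fresh scroll. The slug and date bounds are illustrative, not taken from
# the scripts above.
if __name__ == '__main__':
    load_data(slug='otr_suspense', begin=1940, end=1962, delete=True)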
def __main__():
    c = UnscrollClient()
    c.login()
    c.delete_scroll_with_title('Amazon PR')
    thumbnail = 'http://media.corporate-ir.net/media_files/IROL/17/176060/img/logos/amazon_logo_RGB.jpg'
    favthumb = c.cache_thumbnail(thumbnail)
    scroll = c.create_or_retrieve_scroll(
        'Amazon PR',
        description='A set of press releases from the Amazon Press Room.',
        link='http://phx.corporate-ir.net/phoenix.zhtml?c=176060&p=irol-news&nyo=0',
        citation='Amazon Press Room',
        with_thumbnail=favthumb.get('url'))
    print(scroll)
    get_releases(c, scroll)
def __main__():
    title = 'IETF RFCs'
    c = UnscrollClient()
    c.delete_scroll_with_title(title)
    favthumb = c.cache_thumbnail(
        'https://ietf.org/media/images/ietf-logo.original.png')

    # Load the cached RFC index and convert each entry to an event.
    with open('cache/rfc/rfc-index.xml', 'r') as f:
        parsed = xmltodict.parse(f.read())
    docs = parsed['rfc-index']['rfc-entry']
    events = [rfc_to_event(x) for x in docs]

    # Post the batch.
    c.__batch__(scroll_title=title,
                thumbnail=favthumb['url'],
                events=events)
    print(len(events))
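# A minimal sketch of the rfc_to_event() helper assumed above. Hypothetical:
# it assumes each xmltodict rfc-entry carries 'doc-id', 'title', and a 'date'
# dict with 'month'/'year', which matches the published rfc-index.xml layout
# but should be verified against the cached file.
from datetime import datetime

def rfc_to_event(entry):
    doc_id = entry.get('doc-id')
    d = entry.get('date', {})
    when = datetime.strptime(
        '{} {}'.format(d.get('month'), d.get('year')), '%B %Y')
    return {
        'title': '{}: {}'.format(doc_id, entry.get('title')),
        'text': None,
        'content_url': 'https://tools.ietf.org/html/{}'.format(doc_id.lower()),
        'source_url': 'https://www.rfc-editor.org/rfc-index.xml',
        'source_name': 'IETF RFC Index',
        'when_happened': when.isoformat(' '),
        'when_original': '{} {}'.format(d.get('month'), d.get('year')),
    }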
def __main__():
    scroll_thumb = "https://upload.wikimedia.org/wikipedia/commons/0/0b/Studs_Terkel_-_1979-1.jpg"
    api = UnscrollClient()
    title = "Studs Terkel Interviews"
    favthumb = api.cache_thumbnail(scroll_thumb)
    with_thumbnail = favthumb.get('url')
    api.delete_scroll_with_title(title)
    scroll = api.create_or_retrieve_scroll(
        title,
        description='<b>Via the Studs Terkel Radio Archive at WFMT</b>: '
                    'In his 45 years on WFMT radio, Studs Terkel talked to the 20th '
                    'century’s most interesting people.',
        link='https://studsterkel.wfmt.com/',
        with_thumbnail=with_thumbnail,
        subtitle='Collection via WFMT',
    )
    post_shows(api, scroll)
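# A minimal sketch of the post_shows() helper assumed above. Hypothetical:
# the WFMT archive markup isn't shown in this excerpt, so the selector and
# date attribute below are placeholders; only the create_event() call
# mirrors the API used by the rest of these scripts.
import requests
from bs4 import BeautifulSoup
from dateparser import parse

def post_shows(api, scroll):
    r = requests.get('https://studsterkel.wfmt.com/explore')
    parsed = BeautifulSoup(r.content, 'html.parser')
    for a in parsed.find_all('a', class_='program'):    # placeholder selector
        when = parse(a.get('data-date') or '')          # placeholder attribute
        if when is None:
            continue
        api.create_event({
            'title': a.text.strip(),
            'text': None,
            'content_url': a.get('href'),
            'source_url': 'https://studsterkel.wfmt.com/',
            'source_name': 'Studs Terkel Radio Archive',
            'when_happened': when.isoformat(' '),
            'when_original': a.get('data-date'),
        }, scroll)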
from bs4 import BeautifulSoup
import requests
from pprint import pprint
from unscroll import UnscrollClient
import datefinder
from random import random

ADOBE_URL = "http://news.adobe.com/views/ajax?js=1&page={}&view_name=bw_press_release&view_display_id=panel_pane_7&view_args=all%2Fall&view_path=news&view_base_path=null&view_dom_id=1&pager_element=0"

c = UnscrollClient(api='http://127.0.0.1',
                   username='******',
                   password='******')
c.login()
favicon_url = c.fetch_favicon_url('https://www.adobe.com')
favthumb = c.cache_thumbnail(favicon_url['url'])
c.create_or_retrieve_scroll('Adobe PR', thumbnail=favthumb['url'])

for i in range(1, 92):
    pr_url = ADOBE_URL.format(i)
    r = requests.get(pr_url)
    r_as_data = r.json()
    r_html = r_as_data['display']
    parsed = BeautifulSoup(r_html, 'html.parser')
    els = parsed.find_all('div', class_='view-inner-wrapper')
    events = []
    for el in els:
        date_source = el.find('div', class_='views-field-created')
        date_source_txt = date_source.text
from bs4 import BeautifulSoup
import requests
from unscroll import UnscrollClient
from dateparser import parse
import datefinder
from random import random
import re

APPLE_URL = 'https://www.apple.com'
APPLE_PR_URL = 'https://www.apple.com/pr/library'

c = UnscrollClient(api='http://127.0.0.1',
                   username='******',
                   password='******')
c.login()
favicon_url = c.fetch_favicon_url(APPLE_URL)
favthumb = c.cache_thumbnail(favicon_url['url'])
print(favthumb)
c.create_or_retrieve_scroll('Apple Press Releases, 2000-2017',
                            thumbnail=favthumb['url'])

for i in range(1, 66):
    pr_url = 'https://www.apple.com/newsroom/archive/?page={}'.format(i)
    print(pr_url)
    r = requests.get(pr_url)
    parsed = BeautifulSoup(r.content, 'html.parser')
    dts = parsed.find_all('a', class_='result__item')
    events = []
    for dt in dts:
        title = dt.find('h3').text
        text = dt.find('p').text
class WikipediaText():
    year = None
    events = []
    subject = None
    parsed = None
    unscroll_client = None
    scroll = None

    def __init__(self, year=None, subject=None):
        self.year = year
        self.subject = subject
        self.wiki_url = 'https://en.wikipedia.org/wiki/{}'.format(year)
        if subject is not None:
            self.wiki_url = 'https://en.wikipedia.org/wiki/{}_in_{}'.format(
                year, subject)
        r = requests.get(self.wiki_url)
        self.parsed = BeautifulSoup(r.content, 'html.parser')
        self.unscroll_client = UnscrollClient()
        self.unscroll_client.login()
        favthumb = self.unscroll_client.cache_thumbnail(THUMBNAIL_URL)
        subject_title = subject if subject is not None else 'Review'
        self.scroll = self.unscroll_client.create_or_retrieve_scroll(
            'Wiki Years in {}'.format(subject_title),
            description='Events spidered from the English Wikipedia pages.',
            link='https://en.wikipedia.org/wiki/List_of_years',
            with_thumbnail=favthumb.get('url'))

    def tidy(self, txt=None):
        return re.sub(r'\[edit\]\s*', '', txt)

    def realday(self, monthname=None, day=None):
        month = MONTHS_HASH[monthname]
        return date(self.year, month, int(day))

    def wikihtml_to_event(self, date=None, wikihtml=None, kind=None):
        # Strip footnote markers, then flatten the <li> into linked,
        # bleached HTML with the leading date prefix removed.
        sup = wikihtml.find('sup')
        if sup is not None:
            sup.extract()
        contents = [str(x) for x in wikihtml.children]
        joined = "".join(contents)
        linked = re.sub(r'/wiki/', 'http://en.wikipedia.org/wiki/', joined)
        targeted = re.sub(r'href=', 'target="_blank" href=', linked)
        bleached = bleach.clean(targeted,
                                tags=['b', 'i', 'strong', 'em'],
                                strip=True)
        pass1 = re.sub(MONTHS_PREFIX, '', bleached)
        pass2 = re.sub(MONTHS_PREFIX, '', pass1)
        lastpass = re.sub(r'^\s*\d+\s*[-–—]\s*', '', pass2)

        # Use the first non-month link title as the event's subject.
        titles = [x['title'] for x in wikihtml.find_all('a')
                  if x.has_attr('title')]
        filtered = [x for x in titles if not MONTH_REGEX.match(x)]
        title = None
        subject = None
        if len(filtered) == 0:
            title = " ".join(bleached.split(" ")[0:4]) + '...'
        else:
            title = filtered[0]
            subject = title

        thumbnail = None
        if subject is not None:
            image_d = self.unscroll_client.fetch_wiki_thumbnail_data(
                title=subject)
            image_url = image_d.get('url') if image_d is not None else None
            if image_url is not None:
                thumbnail_local = self.unscroll_client.cache_local(image_url)
                thumbnail_d = self.unscroll_client.post_thumbnail(
                    thumbnail_local)
                if thumbnail_d is not None:
                    thumbnail = thumbnail_d['url']

        if kind == 'birth':
            lastpass = '******'.format(lastpass)
        elif kind == 'death':
            lastpass = '******'.format(lastpass)

        ranking = 0
        if kind == 'world event':
            ranking = 0.9
        elif kind == 'birth':
            ranking = 0.1
        elif kind == 'death':
            ranking = 0.5

        dt = datetime.combine(date, datetime.max.time()).isoformat(' ')
        wiki_subject = None
        if subject is not None:
            subject = re.sub(r'\s', '_', subject)
            wiki_subject = 'https://en.wikipedia.org/wiki/{}'.format(subject)

        event = {
            'title': lastpass,
            'text': None,
            'resolution': 10,
            'ranking': ranking,
            'when_happened': dt,
            'when_original': None,
            'with_thumbnail': thumbnail,
            'content_url': wiki_subject,
            'source_url': self.wiki_url,
            'source_name': 'Wikipedia Event Pages',
            'content_type': kind,
        }
        e = self.unscroll_client.create_event(event, self.scroll)
        pprint.pprint(e.json())
        return event

    def descend(self, ul=None, kind=None):
        # Walk the list items, tracking the most recent date seen so that
        # undated items inherit it.
        last_date = None
        events = []
        for d in ul:
            if d.name == 'ul':
                pass
            elif d.name == 'li':
                t = re.findall(MONTHS_DAYS, d.text)
                if len(t) > 0:
                    last_date = t[0]
                    if not d.find('ul'):
                        date = self.realday(monthname=last_date[0],
                                            day=last_date[1])
                        e = self.wikihtml_to_event(date=date, wikihtml=d,
                                                   kind=kind)
                        events.append(e)
                elif last_date is not None:
                    date = self.realday(last_date[0], last_date[1])
                    e = self.wikihtml_to_event(date=date, wikihtml=d,
                                               kind=kind)
                    events.append(e)
        if len(events) > 0:
            return events

    def get_events(self):
        event_types = {
            '#Events': 'world event',
            '#Births': 'birth',
            '#Deaths': 'death',
        }
        events = []
        for keytype in event_types:
            try:
                events_h2 = self.parsed.select(keytype)[0].parent
                for event in events_h2.next_siblings:
                    if event.name == "h2":
                        break
                    if event.name == 'ul':
                        es = self.descend(ul=event.descendants,
                                          kind=event_types[keytype])
                        if es is not None:
                            events += es
            except IndexError:
                print('No {}'.format(keytype))
        return events
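# Usage sketch for WikipediaText: spider one year-in-subject page and post
# its events (wikihtml_to_event() posts each one as it goes). The year and
# subject are illustrative values.
if __name__ == '__main__':
    wt = WikipediaText(year=1945, subject='science')
    posted = wt.get_events()
    print('{} events posted'.format(len(posted)))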
    'https://en.wikipedia.org/wiki/Timeline_of_World_War_II_(1945)',
    'https://en.wikipedia.org/wiki/Timeline_of_World_War_II_(1945%E2%80%931991)',
    'https://en.wikipedia.org/wiki/Timeline_of_the_Manhattan_Project'
]

api = UnscrollClient()
scroll = api.create_or_retrieve_scroll('WWII audio')
for page in pages:
    items = extract_list('', page)
    for item in items:
        if item is not None:
            thumb_url = None
            wiki_thumb = api.fetch_wiki_thumbnail_data(item.get('item'))
            if wiki_thumb is not None:
                thumb = api.cache_thumbnail(wiki_thumb.get('url'))
                if thumb is not None:
                    thumb_url = thumb.get('url')
            content_url = item.get('content_url')
            if content_url is None:
                content_url = page
            event = {
                'title': item.get('title'),
                'text': item.get('text'),
                'content_url': content_url,
                'source_url': page,
                'with_thumbnail': thumb_url,
                'when_happened': item.get('when_happened'),
                'when_original': item.get('when_original'),
from unscroll import UnscrollClient

c = UnscrollClient()
p = c.cache_thumbnail(
    'https://upload.wikimedia.org/wikipedia/commons/b/b2/Donnchadh_mac_Gille-Brighdhe_Seal.jpg')
print(p)
p2 = c.fetch_wiki_thumbnail('George_Orwell')
print(p2)
def __main__():
    c = UnscrollClient()
    c.login()
    favthumb = c.cache_thumbnail(THUMBNAIL_IMAGE)
    scroll = c.create_or_retrieve_scroll(
        'Cooper-Hewitt',
        description='Items from the Cooper Hewitt',
        link='https://github.com/cooperhewitt/collection',
        citation='Cooper-Hewitt Museum Collection',
        with_thumbnail=favthumb.get('url'))

    conn = sqlite3.connect('/home/unscroll/cache/cooper/objects.db')
    conn.row_factory = sqlite3.Row
    sqlc = conn.cursor()
    i = 0
    sqlc.execute("SELECT * FROM objects LIMIT -1 OFFSET {}".format(i))
    for row in sqlc.fetchall():
        if row['primary_image'] is not None and row['date'] is not None:
            # Switch to the 300x300 square thumbnail.
            sq = re.sub(r'z\.jpg', 'sq.jpg', row['primary_image'])
            local_sq = re.sub(r'https?://', '', sq)
            local = '/home/unscroll/cache/cooper/{}'.format(local_sq)
            i = i + 1
            found = False
            try:
                # Use the cached image if it has already been downloaded.
                f = open(local, 'r')
                f.close()
                found = True
            except FileNotFoundError:
                try:
                    r = requests.get(sq)
                    p = pathlib.Path(local)
                    p.parent.mkdir(parents=True, exist_ok=True)
                    with open(local, 'wb') as f:
                        f.write(r.content)
                    found = True
                except ConnectionError as e:
                    print('[cooperhewitt2.py] ConnectionError: {}'.format(e))
            print('{}: {}/{}'.format(i, local, found))

            ud = UnscrollDate(row['date'], begin=-4000, end=2018)
            if ud.is_okay():
                with_thumbnail = None
                if found:
                    thumb = c.post_thumbnail(local)
                    if thumb is not None:
                        with_thumbnail = thumb.get('url')
                d = {
                    'title': row['title'],
                    'text': row['description'],
                    'resolution': ud.resolution,
                    'ranking': 0,
                    'content_url':
                    'https://collection.cooperhewitt.org/objects/{}/'.format(
                        row['id']),
                    'with_thumbnail': with_thumbnail,
                    'source_name':
                    'Collection Data for Cooper Hewitt, Smithsonian Design Museum',
                    'source_url': 'https://github.com/cooperhewitt/collection',
                    'when_happened': ud.when_happened,
                    'when_original': ud.when_original,
                }
                e = c.create_event(d, scroll)