def load_data(begin=None, end=None, title=None, slug=None, thumbnail_url=None, delete=False):
    """Import an Archive.org item's MP3 listing into an Unscroll scroll.

    Fetches https://archive.org/metadata/<slug>, turns the MP3 files into
    events via extract_events(), creates (or retrieves) a scroll, and posts
    one event per file.

    Args:
        begin, end: passed through to extract_events() (presumably date
            bounds — confirm against extract_events' signature).
        title: scroll title; defaults to the item's own metadata title.
        slug: Archive.org item identifier.
        thumbnail_url: explicit thumbnail URL; defaults to
            extract_thumbnail()'s pick from the item metadata.
        delete: when truthy, delete any existing scroll with this title
            before creating it.
    """
    link = 'https://archive.org/details/{}'.format(slug)
    response = requests.get('https://archive.org/metadata/{}'.format(slug))
    data = response.json()

    md = data.get('metadata')
    # Prefer caller-supplied values; fall back to the item's own metadata.
    scroll_title = title if title is not None else md.get('title')
    description = md.get('description')
    events = extract_events(data, 'MP3', link, begin, end)

    if thumbnail_url is None:
        thumbnail_url = extract_thumbnail(data)

    api = UnscrollClient()
    thumb = api.cache_thumbnail(thumbnail_url)
    with_thumbnail = thumb.get('url')

    # Idiomatic truthiness test (was `delete is True`); the stray
    # 'XXXXXXX…' debug print has been removed.
    if delete:
        api.delete_scroll_with_title(scroll_title)

    scroll = api.create_or_retrieve_scroll(
        scroll_title,
        subtitle='via Archive.org',
        public=True,
        description=description,
        link=link,
        citation='',
        with_thumbnail=with_thumbnail,
    )
    for event in events:
        pprint.pprint(event)
        created = api.create_event(event, scroll)
        pprint.pprint(created.json())
class WikipediaText():
    """Spider an English Wikipedia "year" page (e.g. /wiki/1944 or
    /wiki/1944_in_music) and post its Events/Births/Deaths entries as
    events to an Unscroll scroll.
    """

    # NOTE(review): `events` is a mutable class-level attribute shared by
    # all instances; methods here only use a local `events`, so it appears
    # unused — confirm before relying on it.
    year = None
    events = []
    subject = None
    parsed = None
    unscroll_client = None
    scroll = None

    def __init__(self, year=None, subject=None):
        """Fetch and parse the year page, log in, and create the target scroll.

        Args:
            year: integer year used to build the wiki URL and event dates.
            subject: optional topic (e.g. 'music') selecting the
                '<year>_in_<subject>' page variant.
        """
        self.year = year
        self.subject = subject
        self.wiki_url = 'https://en.wikipedia.org/wiki/{}'.format(year)
        if subject is not None:
            self.wiki_url = 'https://en.wikipedia.org/wiki/{}_in_{}'.format(
                year, subject)
        r = requests.get(self.wiki_url)
        self.parsed = BeautifulSoup(r.content, 'html.parser')
        self.unscroll_client = UnscrollClient()
        self.unscroll_client.login()
        favthumb = self.unscroll_client.cache_thumbnail(THUMBNAIL_URL)
        # Scroll title falls back to 'Review' for the plain year pages.
        subject_title = subject
        if subject is None:
            subject_title = 'Review'
        self.scroll = self.unscroll_client.create_or_retrieve_scroll(
            'Wiki Years in {}'.format(subject_title),
            description='Events spidered from the English Wikipedia pages.',
            link='https://en.wikipedia.org/wiki/List_of_years',
            with_thumbnail=favthumb.get('url'))

    def tidy(self, txt=None):
        """Strip Wikipedia '[edit]' markers (and trailing whitespace) from txt."""
        return re.sub('\[edit\]\s*', '', txt)

    def realday(self, monthname=None, day=None):
        """Build a datetime.date from a month name and day string.

        Raises KeyError if monthname is not in MONTHS_HASH and ValueError
        if day is not an integer string.
        """
        month = MONTHS_HASH[monthname]
        day = int(day)
        return date(self.year, month, day)

    def wikihtml_to_event(self, date=None, wikihtml=None, kind=None):
        """Convert one parsed <li> entry into an event dict and POST it.

        Args:
            date: datetime.date the entry belongs to.
            wikihtml: BeautifulSoup element for the list item (mutated:
                footnote <sup> tags are removed in place).
            kind: 'world event', 'birth', or 'death' — drives ranking and
                title prefixing.

        Returns the event dict that was posted.
        """
        # Drop footnote markers like [1] before serializing the entry.
        sup = wikihtml.find('sup')
        if sup is not None:
            _ = sup.extract()
        contents = [str(x) for x in wikihtml.children]
        joined = "".join(contents)
        # Absolutize wiki links and force them to open in a new tab, then
        # strip everything except basic emphasis tags.
        linked = re.sub(r'/wiki/', 'http://en.wikipedia.org/wiki/', joined)
        targeted = re.sub(r'href=', 'target="_blank" href=', linked)
        bleached = bleach.clean(targeted,
                                tags=['b', 'i', 'strong', 'em'],
                                strip=True)
        # Two passes remove doubled month prefixes; the last pass removes a
        # leading 'day – ' (hyphen/en/em dash) remnant.
        pass1 = re.sub(MONTHS_PREFIX, '', bleached)
        pass2 = re.sub(MONTHS_PREFIX, '', pass1)
        lastpass = re.sub('^\s*\d+\s*[-–—]\s*', '', pass2)
        # First non-month link title becomes the event's subject.
        titles = [
            x['title'] for x in wikihtml.find_all('a') if x.has_attr('title')
        ]
        filtered = [x for x in titles if not MONTH_REGEX.match(x)]
        title = None
        subject = None
        if len(filtered) == 0:
            # No usable link: fall back to the first four words of the text.
            title = " ".join(bleached.split(" ")[0:4]) + '...'
        else:
            title = filtered[0]
            subject = title
        thumbnail = None
        if subject is not None:
            image_d = self.unscroll_client.fetch_wiki_thumbnail_data(
                title=subject)
            image_url = image_d.get('url') if image_d is not None else None
            if image_url is not None:
                thumbnail_local = self.unscroll_client.cache_local(image_url)
                thumbnail_d = self.unscroll_client.post_thumbnail(
                    thumbnail_local)
                if thumbnail_d is not None:
                    thumbnail = thumbnail_d['url']
        # NOTE(review): these format strings contain no '{}' placeholder, so
        # both branches replace the text with literal asterisks and discard
        # lastpass. This looks like redacted/garbled source (perhaps once
        # 'b. {}' / 'd. {}') — confirm the intended prefixes.
        if kind == 'birth':
            lastpass = '******'.format(lastpass)
        elif kind == 'death':
            lastpass = '******'.format(lastpass)
        # Relative prominence by entry type.
        ranking = 0
        if kind == 'world event':
            ranking = 0.9
        if kind == 'birth':
            ranking = 0.1
        if kind == 'death':
            ranking = 0.5
        # End-of-day timestamp, 'YYYY-MM-DD HH:MM:SS.ffffff' form.
        dt = datetime.combine(date, datetime.max.time()).isoformat(' ')
        wiki_subject = None
        if subject is not None:
            subject = re.sub(r'\s', '_', subject)
            wiki_subject = 'https://en.wikipedia.org/wiki/{}'.format(subject, )
        event = {
            'title': lastpass,
            'text': None,
            'resolution': 10,
            'ranking': ranking,
            'when_happened': dt,
            'when_original': None,
            'with_thumbnail': thumbnail,
            'content_url': wiki_subject,
            'source_url': self.wiki_url,
            'source_name': 'Wikipedia Event Pages',
            'content_type': kind
        }
        e = self.unscroll_client.create_event(event, self.scroll)
        pprint.pprint(e.json())
        return event

    def descend(self, ul=None, kind=None):
        """Walk a section's list elements, posting one event per entry.

        Tracks the most recent '<Month> <day>' match so entries without
        their own date inherit the last seen one.

        Returns the list of event dicts, or None if nothing was produced.
        """
        last_date = None
        events = []
        for d in ul:
            if d.name == 'ul':
                pass
            elif d.name == 'li':
                t = re.findall(MONTHS_DAYS, d.text)
                if len(t) > 0:
                    # Entry carries its own date.
                    last_date = t[0]
                    # Skip items that contain a nested <ul>: their children
                    # will be visited separately by this same traversal.
                    if not (d.find('ul')):
                        date = self.realday(monthname=last_date[0],
                                            day=last_date[1])
                        e = self.wikihtml_to_event(date=date,
                                                   wikihtml=d,
                                                   kind=kind)
                        # print("A: {}\n".format(e.get('title')))
                        events.append(e)
                elif last_date is not None:
                    # Dateless entry: inherit the previous date.
                    date = self.realday(last_date[0], last_date[1])
                    e = self.wikihtml_to_event(date=date,
                                               wikihtml=d,
                                               kind=kind)
                    # print("B: {}\n".format(e.get('title')))
                    events.append(e)
        if len(events) > 0:
            return events

    def get_events(self):
        """Process the Events, Births, and Deaths sections of the page.

        Returns the combined list of posted event dicts; sections missing
        from the page are reported and skipped.
        """
        event_types = {
            '#Events': 'world event',
            '#Births': 'birth',
            '#Deaths': 'death'
        }
        events = []
        for keytype in event_types:
            try:
                # The section anchor's parent is its heading; scan the
                # siblings that follow it until the next <h2> section.
                events_h2 = self.parsed.select(keytype)[0].parent
                for event in events_h2.next_siblings:
                    if event.name == "h2":
                        break
                    else:
                        if event.name == "h3":
                            pass
                        if event.name == 'ul':
                            es = self.descend(ul=event.descendants,
                                              kind=event_types[keytype])
                            if es is not None:
                                events += es
            except IndexError:
                # Anchor not found: the page has no such section.
                print('No {}'.format(keytype, ))
        return events
def save_met():
    """Import The Met's collection (from a local sqlite cache) into a scroll.

    Reads every row of the `collection` table in /home/unscroll/cache/met.db,
    attaches a locally cached image when one exists, and posts one event per
    object whose date parses within [-2000, 2018].
    """
    c = UnscrollClient()
    c.login()
    # Start clean: a scroll of this title may exist from a previous run.
    c.delete_scroll_with_title('The Met')
    scroll = c.create_or_retrieve_scroll('The Met')

    conn = sqlite3.connect('/home/unscroll/cache/met.db')
    conn.row_factory = sqlite3.Row
    try:
        sqlc = conn.cursor()
        # LIMIT -1 means "no limit" in SQLite.
        sqlc.execute("SELECT * FROM collection LIMIT -1 OFFSET 0")
        for row in sqlc.fetchall():
            ud = UnscrollDate(row['date'], begin=-2000, end=2018)
            if not ud.is_okay():
                continue
            with_thumbnail = None
            img = row['image']
            local_img = re.sub(r'https?://images.metmuseum.org/', '', img)
            # BUG FIX: `'medium' in row` on a sqlite3.Row iterates the row's
            # *values* (Row has no key containment), so it only matched rows
            # where some column literally equals 'medium'. Check the column
            # names instead.
            medium = ''
            if 'medium' in row.keys():
                medium = ' ({})'.format(row['medium'])
            if img is not None and row['date'] is not None:
                local = '/home/unscroll/cache/met-images/{}'.format(
                    local_img, )
                if file_exists(local):
                    thumb = c.post_thumbnail(local)
                    if thumb is not None:
                        with_thumbnail = thumb.get('url')
            d = {
                'title': row['title'] + medium,
                'text': row['description'],
                'resolution': ud.resolution,
                'ranking': 0,
                'content_url':
                'https://www.metmuseum.org{}'.format(row['url'], ),
                'with_thumbnail': with_thumbnail,
                'source_name': 'The Met',
                'source_url': 'https://www.metmuseum.org/',
                'when_happened': ud.when_happened,
                'when_original': ud.when_original
            }
            e = c.create_event(d, scroll)
            print(e)
    finally:
        # The original leaked the connection; always release it.
        conn.close()
# Build the "WWII audio" scroll and post one event per extracted list item.
scroll = api.create_or_retrieve_scroll('WWII audio')
for page in pages:
    for item in extract_list('', page):
        if item is None:
            continue
        # Try to attach a Wikipedia-derived thumbnail for this item.
        thumb_url = None
        wiki_thumb = api.fetch_wiki_thumbnail_data(item.get('item'))
        if wiki_thumb is not None:
            cached = api.cache_thumbnail(wiki_thumb.get('url'))
            if cached is not None:
                thumb_url = cached.get('url')
        # Items without their own link fall back to the page they came from.
        item_link = item.get('content_url')
        if item_link is None:
            item_link = page
        payload = {
            'title': item.get('title'),
            'text': item.get('text'),
            'content_url': item_link,
            'source_url': page,
            'with_thumbnail': thumb_url,
            'when_happened': item.get('when_happened'),
            'when_original': item.get('when_original'),
            'resolution': item.get('resolution'),
        }
        response = api.create_event(payload, scroll)
        if response.status_code != 200:
            print(response.json())
def __main__():
    """Import Cooper Hewitt collection objects into a scroll.

    Reads the `objects` table from a local sqlite cache, downloads each
    object's 300x300 thumbnail image into a local cache if not already
    present, and posts one event per object whose date parses within
    [-4000, 2018].
    """
    c = UnscrollClient()
    c.login()
    favthumb = c.cache_thumbnail(THUMBNAIL_IMAGE)
    scroll = c.create_or_retrieve_scroll(
        'Cooper-Hewitt',
        description='Items from the Cooper Hewitt',
        link='https://github.com/cooperhewitt/collection',
        citation='Cooper-Hewitt Museum Collection',
        with_thumbnail=favthumb.get('url'))

    conn = sqlite3.connect('/home/unscroll/cache/cooper/objects.db')
    conn.row_factory = sqlite3.Row
    sqlc = conn.cursor()
    i = 0
    # LIMIT -1 means "no limit" in SQLite; OFFSET kept for resumability.
    sqlc.execute("SELECT * FROM objects LIMIT -1 OFFSET {}".format(i))
    for row in sqlc.fetchall():
        if row['primary_image'] is not None and row['date'] is not None:
            # switch to the 300x300 thumbnail
            sq = re.sub('z\.jpg', 'sq.jpg', row['primary_image'])
            local_sq = re.sub(r'https?://', '', sq)
            local = '/home/unscroll/cache/cooper/{}'.format(local_sq, )
            i = i + 1
            # Simpler existence check than the original open()/close() probe.
            local_path = pathlib.Path(local)
            found = local_path.exists()
            if not found:
                try:
                    r = requests.get(sq)
                    local_path.parent.mkdir(parents=True, exist_ok=True)
                    # Context manager: file is closed even if write fails.
                    with open(local, 'wb') as f:
                        f.write(r.content)
                    found = True
                # BUG FIX: the builtin ConnectionError is not what requests
                # raises (its ConnectionError subclasses RequestException),
                # so network failures escaped this handler.
                except requests.exceptions.ConnectionError as err:
                    print('[cooperhewitt2.py] ConnectionError: {}'.format(
                        err, ))
            print('{}: {}/{}'.format(i, local, found))
            ud = UnscrollDate(row['date'], begin=-4000, end=2018)
            if ud.is_okay():
                with_thumbnail = None
                if found:
                    thumb = c.post_thumbnail(local)
                    if thumb is not None:
                        with_thumbnail = thumb.get('url')
                d = {
                    'title': row['title'],
                    'text': row['description'],
                    'resolution': ud.resolution,
                    'ranking': 0,
                    'content_url':
                    'https://collection.cooperhewitt.org/objects/{}/'.format(
                        row['id'], ),
                    'with_thumbnail': with_thumbnail,
                    'source_name':
                    'Collection Data for Cooper Hewitt, Smithsonian Design Museum',
                    'source_url': 'https://github.com/cooperhewitt/collection',
                    'when_happened': ud.when_happened,
                    'when_original': ud.when_original
                }
                e = c.create_event(d, scroll)
    conn.close()