def add_presentation(url, category):
    """Scrape a PyCon talk-list page and store every non-canceled talk.

    Each entry's detail page is fetched for speakers, abstract and
    audience level; a ``Talk`` of *category* is recorded against the
    module-level ``conference``.
    """
    print("Collecting from {}".format(url))
    xpath = '//div[contains(@class,"presentation")]/h3/a'
    entries = html.fromstring(requests.get(url).text).xpath(xpath)
    ## Iterate through and extract the relevant content
    for a in entries:
        title = a.text
        if 'canceled' in title.lower():
            continue
        root = html.fromstring(
            requests.get('https://us.pycon.org' + a.get('href')).text)
        speakers = root.xpath('//h4/a/text()')
        abstract = root.xpath('//div[@class="abstract"]')[0].text_content()
        try:
            level = root.xpath('//dl/dd/text()')[0]
        except IndexError:
            # BUG FIX: indexing an empty xpath() result raises IndexError,
            # not ValueError -- the old `except ValueError` never fired, so
            # pages without a <dl>/<dd> level crashed instead of being
            # skipped as intended.
            continue
        level = 'Beginner' if level == 'Novice' else level
        talk = Talk(category=category, conference_id=conference.id, title=title)
        data = db.TalkData(speakers, [], [])
        talk.abstract = abstract[:10000]
        talk.level = level
        db.add_talk(talk, **data._asdict())
def add_presentation_from_table(url, category):
    """Scrape talks laid out as schedule-table cells and store each one.

    Expects *url* to render ``<td class="slot">`` cells whose title link
    carries the abstract in its ``title`` attribute, with speaker and
    level information in sibling spans / an HTML comment.
    """
    print("Collecting from {}".format(url))
    xpath = '//td[contains(@class,"slot")]'
    entries = html.fromstring(requests.get(url).text).xpath(xpath)
    ## Iterate through and extract the relevant content
    for td in entries:
        a = td.find('./span[@class="title"]/a')
        if a is None:
            # Slot without a linked title (e.g. a break cell) -- skip it.
            print('skipping...')
            continue
        title = a.text
        abstract = a.get('title')
        if 'canceled' not in title.lower():
            if 'Jasmine Hsu' in title:
                # Data quirk on this particular schedule page: the title
                # cell actually holds the speaker list, so both the
                # speakers and the real title are recovered by hand.
                speakers = title.split(',')
                title = 'Fire, bullets, and productivity'
                level = 'Beginner'
            else:
                speakers = td.findtext('./span[@class="speaker"]').strip()
                # Speaker lists are delimited by ',' or, failing that, '&'.
                speakers = speakers.split(
                    ',') if ',' in speakers else speakers.split('&')
                speakers = [s for s in speakers if s.strip() and not '?' in s]
                # The audience level is only present inside an HTML comment
                # node; its second line carries the level text.
                level = td.xpath('./comment()')[0].text.splitlines()[1].strip()
            # Normalize the legacy 'Novice' label.
            level = 'Beginner' if level == 'Novice' else level
            talk = Talk(category=category, conference_id=conference.id, title=title)
            data = db.TalkData(speakers, [], [])
            # Abstract column is capped at 10000 characters.
            talk.abstract = abstract[:10000]
            talk.level = level
            db.add_talk(talk, **data._asdict())
def add_presentation(url, category):
    """Scrape talks rendered as repeating <h2>/<p>/<div> element triples
    and record each non-canceled one as a ``Talk`` of *category*.
    """
    print("Collecting from {}".format(url))
    xpath = '//div[contains(@class,"box-content")]/*'
    entries = html.fromstring(requests.get(url).text).xpath(xpath)
    # Everything before the first <h2> is page chrome; talks start there.
    first = next(i for i, e in enumerate(entries) if e.tag == 'h2')
    # Walk the remaining elements three at a time:
    # heading, speaker paragraph, abstract div.
    talk_elements = entries[first:]
    for h2, p, div in zip(talk_elements[0::3],
                          talk_elements[1::3],
                          talk_elements[2::3]):
        title = h2.text_content()
        if 'canceled' in title.lower():
            continue
        # Only the first line of the paragraph names the speakers.
        speaker_line = p.text_content().strip('\n ').split('\n', 1)[0]
        speakers = [name for name in speaker_line.split(',')
                    if name.strip() and '?' not in name]
        talk = Talk(category=category, conference_id=conference.id, title=title)
        data = db.TalkData(speakers, [], [])
        talk.abstract = div.text_content().strip()[:10000]
        db.add_talk(talk, **data._asdict())
def add_presentation_from_table(url, category):
    """Scrape <td class="slot"> schedule cells at *url* and store every
    non-canceled talk as a ``Talk`` of *category*.
    """
    print("Collecting from {}".format(url))
    xpath = '//td[contains(@class,"slot")]'
    entries = html.fromstring(requests.get(url).text).xpath(xpath)
    ## Iterate through and extract the relevant content
    for cell in entries:
        link = cell.find('./span[@class="title"]/a')
        if link is None:
            # Not a talk cell (e.g. a break) -- nothing to record.
            continue
        title = link.text
        if 'canceled' in title.lower():
            continue
        # The abstract is stashed in the link's title attribute.
        abstract = link.get('title')
        speaker_text = cell.findtext('./span[@class="speaker"]').strip()
        if ',' in speaker_text:
            speakers = speaker_text.split(',')
        else:
            speakers = speaker_text.split('&')
        level = cell.findtext('./span[@class="audience_level"]').strip()
        if level == 'Novice':
            level = 'Beginner'
        talk = Talk(category=category, conference_id=conference.id, title=title)
        talk.abstract = abstract[:10000]
        talk.level = level
        data = db.TalkData(speakers, [], [])
        db.add_talk(talk, **data._asdict())
db.session.commit()

## Talks
# Hard-coded keynotes: (speaker names, title, optional abstract).
keynotes = (
    (['Guido van Rossum'], 'Update on the state of Python', None),
    (['Steve Huffman', 'Alexis Ohanian'], 'Reddit',
     "Reddit's origin and the switch to Python")
)
for speaker_names, title, abstract in keynotes:
    talk = Talk(category=Talk.KEYNOTE, conference_id=conference.id)
    talk.title = title
    # BUG FIX: build this keynote's TalkData *before* attaching the
    # organization. Previously the append ran against the previous
    # iteration's (already-saved) TalkData -- or an unset name on the
    # first pass -- so 'Reddit' was never linked to its own talk.
    data = db.TalkData(speaker_names, [], [])
    if title == 'Reddit':
        data.organization_names.append('Reddit')
    if abstract:
        talk.abstract = abstract
    db.add_talk(talk, **data._asdict())

## Tutorials
## ==> Ignore these...the links are broken and only the presenters'
##     last names are given, so it is hard to create an entry.
##
#wayback = 'https://web.archive.org/web/20090518174359/'
#url = wayback + 'http://us.pycon.org:80/2009/tutorials/schedule'
#xpath = '//div[@id="tutorials"]//li'
#entries = html.fromstring(requests.get(url).text).xpath(xpath)
## Iterate through and extract the relevant content
#for e in entries:
#    tmp = e.text_content()
# Tutorials: each presentation div links to a detail page.
xpath = '//div[contains(@class,"presentation")]'
entries = html.fromstring(requests.get(url).text).xpath(xpath)
print('tutorials')
print(url)
## Iterate through and extract the relevant content
for e in entries:
    a = e.find('./h3/a')
    title = a.text
    # Follow the detail link for abstract, speakers, level and category.
    root = html.fromstring(requests.get('https://us.pycon.org'+a.get('href')).text)
    abstract = root.xpath('//div[@class="abstract"]')[0].text_content()
    speakers = root.xpath('//h4/a/text()')
    # The definition list is expected to hold exactly two <dd> entries:
    # audience level followed by category.
    level, category = root.xpath('//dl/dd/text()')
    level = 'Beginner' if level == 'Novice' else level
    talk = Talk(category=Talk.TUTORIAL, conference_id=conference.id, title=title)
    data = db.TalkData(speakers, [category], [])
    # Abstract column is capped at 10000 characters.
    talk.abstract = abstract[:10000]
    talk.level = level
    db.add_talk(talk, **data._asdict())

# Talks (PyCon 2013 schedule list).
url = 'https://us.pycon.org/2013/schedule/talks/list/'
xpath = '//div[contains(@class,"presentation")]/h3/a'
entries = html.fromstring(requests.get(url).text).xpath(xpath)
print('talks')
print(url)
# Iterate through and extract the relevant content
for a in entries:
    title = a.text
    if 'canceled' in title.lower():
        continue
    # NOTE(review): this loop body continues past this chunk of the file.
    root = html.fromstring(requests.get('https://us.pycon.org'+a.get('href')).text)
# PyCon 2007 talks via the Wayback Machine.
wayback = 'https://web.archive.org/web/20070213073856/'
url = wayback + 'http://us.pycon.org:80/apps07/talks/'
# Flat list of sibling elements; one talk spans several of them
# (an <h2> heading, a <div> abstract and assorted <span>s).
xpath = '//*[contains(@class, "proposal_list_summary")]/*[not(self::br)]'
entries = html.fromstring(requests.get(url).text).xpath(xpath)
print('talks')
print(url)
# NOTE(review): `talk` and `data` must already be bound when the first
# <h2> is seen -- presumably initialized earlier in this file; confirm.
for e in entries:
    if e.tag == 'h2':
        # An <h2> starts a new talk; flush the previous one first.
        if talk.title is not None:
            # Finished one
            db.add_talk(talk, **data._asdict())
        talk = Talk(category=Talk.TALK, conference_id=conference.id)
        data = db.TalkData([], [], [])
        # Strip the leading "NN." numbering from the heading text.
        talk.title = e.text_content().split('.', 1)[-1].strip()
    elif e.tag == 'div':
        talk.abstract = e.text_content().strip()
    else:
        # span...
        tc = e.text_content()
        if tc.endswith('audio and materials)'):
            # e.g. "(Intermediate talk; audio and materials)".
            talk.level = tc.split()[1]
        elif tc.startswith('categories'):
            data.topic_names.extend(tc.split(':')[-1].split(','))
        else:
            # Speaker names
            # `separators` / `org_matcher` are regexes defined elsewhere
            # in this file (not visible in this chunk).
            speaker = tc.strip('; ').split('(', 1)[0]
            data.speaker_names.extend(separators.split(speaker))
            data.organization_names.extend(org_matcher.findall(tc))
# don't forget the last one..
if talk.title is not None:
    db.add_talk(talk, **data._asdict())
if e.tag == 'h2': if talk.title: # new talk db.add_talk(talk, **data._asdict()) talk = Talk(category=Talk.PLENARY, conference_id=conference.id) talk.title = e.text data = db.TalkData([], ["startup"], []) elif e.tag == 'ul': names = [n.split('-') for n in e.xpath('li/text()')] org = names[0][1].strip() people = [n[0].split(',')[0].strip() for n in names] data.speaker_names.extend(people) data.organization_names.append(org) else: if talk.abstract: talk.abstract = '\n'.join((talk.abstract, e.text)) else: talk.abstract = e.text # Last one db.add_talk(talk, **data._asdict()) ## Tutorials ## ==> Ignore these...the links are broken and only the presenters' ## last names are given, so it is hard to create an entry. ## wayback = 'https://web.archive.org/web/20110112152542/' url = wayback + 'http://us.pycon.org:80/2011/schedule/lists/tutorials/' xpath = '//div[@class="session"]' entries = html.fromstring(requests.get(url).text).xpath(xpath) print('tutorials')
print(data) print("*" * 10) talk = Talk(category=Talk.TALK, conference_id=conference.id) data = db.TalkData([], [], []) names = e.text_content() if '&' in names or ' and ' in names or ',' in names: names = names_splitter.sub('|', e.text_content()).split('|') data.speaker_names.extend(names) else: data.speaker_names.append(names) else: title = e.findtext('./strong') if title is None: # abstract if talk.abstract is None: talk.abstract = e.text_content() else: talk.abstract = '\n'.join((talk.abstract, e.text_content())) else: talk.title = title # don't forget last one if talk.title is not None: if ('tutorial' in talk.title.lower() or (talk.abstract is not None and 'tutorial' in talk.abstract.lower())): talk.category = Talk.TUTORIAL db.add_talk(talk, **data._asdict()) print(talk) print(data) print("*" * 10)
# Special case 'Vic / Kelson' if tmp.startswith('Vic'): data.speaker_names.append('Vic Kelson') else: speaker_name, org_name = tmp.split('/') if '&' in speaker_name: data.speaker_names.extend(speaker_name.split('&')) else: data.speaker_names.append(speaker_name) data.organization_names.append(org_name) elif '&' in tmp: data.speaker_names.extend(tmp.split('&')) elif ',' in tmp: data.speaker_names.extend(tmp.split(',')) else: if tmp.lower() != talk.title.lower() and len(tmp): data.speaker_names.append(tmp) else: # The abstract if talk.abstract is None: talk.abstract = tmp else: talk.abstract = "\n".join((talk.abstract, tmp)) i += 1 i += 1 counter += 1 print("Finished talk. Counter = {}".format(counter)) print(talk) print(data) db.add_talk(talk, **data._asdict()) print("*" * 10)
# Keynotes: one <div class="section"> per keynote speaker.
xpath = '//div[@id="keynote-talks"]/div[@class="section"]'
entries = html.fromstring(requests.get(url).text).xpath(xpath)
print('talks')
print(url)
for e in entries:
    talk = Talk(category=Talk.KEYNOTE, conference_id=conference.id)
    data = db.TalkData([], [], [])
    # The <h1> inside each section names the keynote speaker.
    data.speaker_names.append(e.findtext('h1'))
    # Split off the abstract, and remove the 'Topic:' prefix
    tmp = e.xpath('*[text()[contains(.,"Topic")]]')
    if len(tmp) == 0:
        # No "Topic:" element -- keynote without an announced title.
        talk.title = "Keynote"
    else:
        # Drop the 7-char "Topic: " prefix, trim a trailing ')', then
        # split on '(' or ':': the first piece is the title and the
        # remainder (newlines collapsed) becomes the abstract.
        tmp = re.split('[(:]', tmp[0].text_content()[7:].strip(')'))
        talk.title = tmp[0].strip()
        talk.abstract = ' '.join(tt for t in tmp[1:] for tt in t.split('\n'))
    db.add_talk(talk, **data._asdict())

# Tutorials
wayback = 'https://web.archive.org/web/20090202113211/'
url = wayback + 'http://us.pycon.org:80/2008/tutorials/schedule/'
xpath = '//div[@id="content"]//li'
entries = html.fromstring(requests.get(url).text).xpath(xpath)
# Iterate through and extract the relevant content
print('tutorials')
print(url)
for e in entries:
    tmp = e.text_content()
    if 'cancel' in tmp.lower():
        continue
    else:
        # NOTE(review): the else-branch body continues in a later chunk
        # of this file.
i = 0 else: title_list = [t.title for t, d in speaker_lookup[speaker]] # Error if not find anything -- so I'll know whether this method is bad try: best_title = difflib.get_close_matches( title, title_list, 1)[0] i = next(i for i, t in enumerate(title_list) if t == best_title) except IndexError: add_new_talk(title, abstract, speaker, topic) continue talk, data = speaker_lookup[speaker].pop(i) if len(speaker_lookup[speaker]) == 0: del speaker_lookup[speaker] talk.abstract = abstract data.topic_names.append(topic) db.add_talk(talk, **data._asdict()) print(talk) print(data) print("*" * 10) else: db.add_talk( Talk(category=Talk.TALK, conference_id=conference.id, title=title, abstract=abstract), **db.TalkData([speaker_name], [topic], [])._asdict()) print("adding new one not in list:", speaker_name, title, "\n***\n")