def add_presentation(url, category):
    """Scrape a talk listing page and store every presentation it links to.

    The listing at ``url`` is fetched and each ``<a>`` found under the
    ``<h3>`` of a "presentation" div is followed (relative to
    ``https://us.pycon.org``) to the talk's detail page, from which the
    speakers, abstract and audience level are read.  Each talk is stored
    through ``db.add_talk``.

    Talks whose title contains "canceled" are skipped, as are talks whose
    detail page yields no ``<dl>/<dd>`` text to use as the level.

    Args:
        url: Address of the listing page to fetch.
        category: Value passed as ``category`` to every ``Talk`` created.
    """
    print("Collecting from {}".format(url))
    xpath = '//div[contains(@class,"presentation")]/h3/a'
    entries = html.fromstring(requests.get(url).text).xpath(xpath)
    ## Iterate through and extract the relevant content
    for a in entries:
        title = a.text
        if 'canceled' in title.lower():
            continue
        root = html.fromstring(
            requests.get('https://us.pycon.org' + a.get('href')).text)
        speakers = root.xpath('//h4/a/text()')
        abstract = root.xpath('//div[@class="abstract"]')[0].text_content()
        try:
            level = root.xpath('//dl/dd/text()')[0]
        except IndexError:
            # Indexing an empty xpath result raises IndexError (not
            # ValueError), so this is the branch that skips talks with no
            # level information.
            continue
        # The stored level uses "Beginner" where the site says "Novice".
        level = 'Beginner' if level == 'Novice' else level
        talk = Talk(category=category,
                    conference_id=conference.id,
                    title=title)
        data = db.TalkData(speakers, [], [])
        # Only the first 10000 characters of the abstract are kept.
        talk.abstract = abstract[:10000]
        talk.level = level
        db.add_talk(talk, **data._asdict())
def add_presentation_from_table(url, category):
    """Scrape a schedule table and store the talk found in each slot cell.

    Every ``<td>`` whose class contains "slot" is inspected.  Cells without
    a title link are reported with 'skipping...' and ignored; talks whose
    title contains "canceled" are ignored silently.  The abstract is read
    from the link's ``title`` attribute and the level from the second line
    of the cell's first comment node.

    Args:
        url: Address of the schedule page to fetch.
        category: Value passed as ``category`` to every ``Talk`` created.
    """
    print("Collecting from {}".format(url))
    xpath = '//td[contains(@class,"slot")]'
    cells = html.fromstring(requests.get(url).text).xpath(xpath)
    for cell in cells:
        link = cell.find('./span[@class="title"]/a')
        if link is None:
            print('skipping...')
            continue
        title = link.text
        abstract = link.get('title')
        if 'canceled' in title.lower():
            continue

        if 'Jasmine Hsu' in title:
            # Special case: this entry's title text is used as the
            # comma-separated speaker list, and the title and level are
            # hard-coded.
            speakers = title.split(',')
            title = 'Fire, bullets, and productivity'
            level = 'Beginner'
        else:
            raw_speakers = cell.findtext('./span[@class="speaker"]').strip()
            # Names are comma-separated when a comma is present, otherwise
            # '&'-separated.
            separator = ',' if ',' in raw_speakers else '&'
            speakers = [
                name for name in raw_speakers.split(separator)
                if name.strip() and '?' not in name
            ]
            comment_text = cell.xpath('./comment()')[0].text
            level = comment_text.splitlines()[1].strip()
            if level == 'Novice':
                level = 'Beginner'

        talk = Talk(category=category,
                    conference_id=conference.id,
                    title=title)
        # Only the first 10000 characters of the abstract are kept.
        talk.abstract = abstract[:10000]
        talk.level = level
        db.add_talk(talk, **db.TalkData(speakers, [], [])._asdict())
def add_presentation(url, category):
    """Scrape a flat "box-content" page and store each talk it lists.

    The direct children of the "box-content" div are read starting at the
    first ``<h2>`` and consumed in consecutive groups of three elements,
    used respectively as title, speaker line and abstract.  Talks whose
    title contains "canceled" are skipped.

    Args:
        url: Address of the page to fetch.
        category: Value passed as ``category`` to every ``Talk`` created.
    """
    print("Collecting from {}".format(url))
    xpath = '//div[contains(@class,"box-content")]/*'
    nodes = html.fromstring(requests.get(url).text).xpath(xpath)
    first = next(idx for idx, node in enumerate(nodes) if node.tag == 'h2')
    # Number of complete three-element groups available after `first`.
    group_count = (len(nodes) - first) // 3
    for start in range(first, first + 3 * group_count, 3):
        heading, byline, body = nodes[start:start + 3]
        title = heading.text_content()
        if 'canceled' in title.lower():
            continue
        # Only the first line of the byline holds the comma-separated names.
        first_line = byline.text_content().strip('\n ').split('\n', 1)[0]
        speakers = [
            name for name in first_line.split(',')
            if name.strip() and '?' not in name
        ]
        abstract = body.text_content().strip()
        talk = Talk(category=category,
                    conference_id=conference.id,
                    title=title)
        # Only the first 10000 characters of the abstract are kept.
        talk.abstract = abstract[:10000]
        db.add_talk(talk, **db.TalkData(speakers, [], [])._asdict())
def add_presentation_from_table(url, category):
    """Scrape a schedule table and store the talk found in each slot cell.

    Every ``<td>`` whose class contains "slot" is inspected.  Cells without
    a title link and talks whose title contains "canceled" are ignored.
    The abstract is read from the link's ``title`` attribute and the level
    from the cell's "audience_level" span.

    Args:
        url: Address of the schedule page to fetch.
        category: Value passed as ``category`` to every ``Talk`` created.
    """
    print("Collecting from {}".format(url))
    xpath = '//td[contains(@class,"slot")]'
    cells = html.fromstring(requests.get(url).text).xpath(xpath)
    for cell in cells:
        link = cell.find('./span[@class="title"]/a')
        if link is None:
            continue
        title = link.text
        abstract = link.get('title')
        if 'canceled' in title.lower():
            continue

        raw_speakers = cell.findtext('./span[@class="speaker"]').strip()
        # Names are comma-separated when a comma is present, otherwise
        # '&'-separated.
        separator = ',' if ',' in raw_speakers else '&'
        speakers = raw_speakers.split(separator)

        level = cell.findtext('./span[@class="audience_level"]').strip()
        if level == 'Novice':
            level = 'Beginner'

        talk = Talk(category=category,
                    conference_id=conference.id,
                    title=title)
        # Only the first 10000 characters of the abstract are kept.
        talk.abstract = abstract[:10000]
        talk.level = level
        db.add_talk(talk, **db.TalkData(speakers, [], [])._asdict())
db.session.commit()


## Talks
# Hand-entered keynotes as (speaker names, title, abstract-or-None) tuples.
keynotes = (
    (['Guido van Rossum'], 'Update on the state of Python', None),
    (['Steve Huffman', 'Alexis Ohanian'], 'Reddit', "Reddit's origin and the switch to Python")
)
for speaker_names, title, abstract in keynotes:
    talk = Talk(category=Talk.KEYNOTE, conference_id=conference.id)
    talk.title = title
    # Build this talk's TalkData *before* touching it.  Previously the
    # 'Reddit' organization was appended to whatever `data` was left over
    # from the previous iteration (already stored), so the Reddit keynote
    # itself was saved without its organization.
    data = db.TalkData(speaker_names, [], [])
    if title == 'Reddit':
        data.organization_names.append('Reddit')
    if abstract:
        talk.abstract = abstract
    db.add_talk(talk, **data._asdict())
    

## Tutorials
##  ==> Ignore these...the links are broken and only the presenters'
##      last names are given, so it is hard to create an entry.
##
#wayback = 'https://web.archive.org/web/20090518174359/'
#url = wayback + 'http://us.pycon.org:80/2009/tutorials/schedule'
#xpath = '//div[@id="tutorials"]//li'
#entries = html.fromstring(requests.get(url).text).xpath(xpath)
## Iterate through and extract the relevant content
#for e in entries:
#    tmp = e.text_content()
# Example #6
# 0
# Collect tutorials: every "presentation" div on the page at `url` links to
# a detail page holding the abstract, speakers, level and category.
# NOTE(review): `url` is assigned outside the visible part of the file.
xpath = '//div[contains(@class,"presentation")]'
listing = requests.get(url)
entries = html.fromstring(listing.text).xpath(xpath)
print('tutorials')
print(url)
for e in entries:
    a = e.find('./h3/a')
    title = a.text
    detail = requests.get('https://us.pycon.org' + a.get('href'))
    root = html.fromstring(detail.text)
    abstract = root.xpath('//div[@class="abstract"]')[0].text_content()
    speakers = root.xpath('//h4/a/text()')
    # The detail page is expected to yield exactly two <dd> texts; any
    # other count makes this unpacking raise.
    level, category = root.xpath('//dl/dd/text()')
    if level == 'Novice':
        level = 'Beginner'
    talk = Talk(category=Talk.TUTORIAL,
                conference_id=conference.id,
                title=title)
    data = db.TalkData(speakers, [category], [])
    # Only the first 10000 characters of the abstract are kept.
    talk.abstract = abstract[:10000]
    talk.level = level
    db.add_talk(talk, **data._asdict())


# Talk listing page: each "presentation" div's <h3> link points at the
# talk's detail page on us.pycon.org.
url = 'https://us.pycon.org/2013/schedule/talks/list/'
xpath = '//div[contains(@class,"presentation")]/h3/a'
entries = html.fromstring(requests.get(url).text).xpath(xpath)
print('talks')
print(url)
# Iterate through and extract the relevant content
for a in entries:
    title = a.text
    # Skip talks whose title marks them as canceled.
    if 'canceled' in title.lower():
        continue
    # Fetch and parse the talk's detail page.
    # NOTE(review): `root` is not used after this line in the visible code;
    # the rest of the loop body appears to be missing -- confirm.
    root = html.fromstring(requests.get('https://us.pycon.org'+a.get('href')).text)
# Talk list read from a Wayback Machine snapshot of the apps07 talks page.
wayback = 'https://web.archive.org/web/20070213073856/'
url = wayback + 'http://us.pycon.org:80/apps07/talks/'
# All children of the proposal summary container except <br> elements.
xpath = '//*[contains(@class, "proposal_list_summary")]/*[not(self::br)]'
entries = html.fromstring(requests.get(url).text).xpath(xpath)
print('talks')
print(url)
# The elements form a flat sequence: an <h2> opens a talk, and the elements
# that follow fill in `talk` / `data` until the next <h2>.
# NOTE(review): `talk`, `data`, `separators` and `org_matcher` must already
# exist before this loop; their definitions are not visible here.
for e in entries:
    if e.tag == 'h2':
        if talk.title is not None:
            # Finished one
            db.add_talk(talk, **data._asdict())
            talk = Talk(category=Talk.TALK, conference_id=conference.id)
            data = db.TalkData([], [], [])
        # Keep the text after the first '.' of the heading as the title.
        talk.title = e.text_content().split('.', 1)[-1].strip()
    elif e.tag == 'div':
        talk.abstract = e.text_content().strip()
    else:  # span...
        tc = e.text_content()
        if tc.endswith('audio and materials)'):
            # The second whitespace-separated word is stored as the level.
            talk.level = tc.split()[1]
        elif tc.startswith('categories'):
            # Comma-separated topics follow the last ':'.
            data.topic_names.extend(tc.split(':')[-1].split(','))
        else:  # Speaker names
            # Names are the text before the first '('; organizations are
            # whatever `org_matcher` finds in the full text.
            speaker = tc.strip('; ').split('(', 1)[0]
            data.speaker_names.extend(separators.split(speaker))
            data.organization_names.extend(org_matcher.findall(tc))

# don't forget the last one..
if talk.title is not None:
    db.add_talk(talk, **data._asdict())
    # NOTE(review): this is the body of a loop over elements `e` whose
    # header is not visible here -- confirm the surrounding context.
    # An <h2> opens a new plenary talk, a <ul> lists its speakers, and any
    # other element contributes a line to the abstract.
    if e.tag == 'h2':
        if talk.title:
            # new talk
            db.add_talk(talk, **data._asdict())
            talk = Talk(category=Talk.PLENARY, conference_id=conference.id)
        talk.title = e.text
        # Every talk in this section is given the "startup" topic.
        data = db.TalkData([], ["startup"], [])
    elif e.tag == 'ul':
        # Each <li> text is split on '-'.
        names = [n.split('-') for n in e.xpath('li/text()')]
        # The organization is the second part of the first item only.
        org = names[0][1].strip()
        # The speaker name is the first part of each item, up to any comma.
        people = [n[0].split(',')[0].strip() for n in names]
        data.speaker_names.extend(people)
        data.organization_names.append(org)
    else:
        # Append this element's text to the abstract, one line per element.
        if talk.abstract:
            talk.abstract = '\n'.join((talk.abstract, e.text))
        else:
            talk.abstract = e.text

# Last one
db.add_talk(talk, **data._asdict())

## Tutorials
##  ==> Ignore these...the links are broken and only the presenters'
##      last names are given, so it is hard to create an entry.
##
# Fetch the tutorial list from a Wayback Machine snapshot and select every
# "session" div on it.
wayback = 'https://web.archive.org/web/20110112152542/'
url = '{}{}'.format(
    wayback, 'http://us.pycon.org:80/2011/schedule/lists/tutorials/')
xpath = '//div[@class="session"]'
response = requests.get(url)
entries = html.fromstring(response.text).xpath(xpath)
print('tutorials')
            # NOTE(review): this fragment starts in the middle of a branch;
            # the enclosing loop over `e` and the conditions leading here
            # are not visible -- confirm the surrounding context.
            print(data)
            print("*" * 10)
            # Start accumulating a fresh talk.
            talk = Talk(category=Talk.TALK, conference_id=conference.id)
            data = db.TalkData([], [], [])
        names = e.text_content()
        # Several names in one element: normalise the separators to '|'
        # with `names_splitter` (defined outside this view), then split.
        if '&' in names or ' and ' in names or ',' in names:
            names = names_splitter.sub('|', e.text_content()).split('|')
            data.speaker_names.extend(names)
        else:
            data.speaker_names.append(names)
    else:
        # An element with a <strong> child supplies the title; any other
        # element is treated as abstract text.
        title = e.findtext('./strong')
        if title is None:
            # abstract
            if talk.abstract is None:
                talk.abstract = e.text_content()
            else:
                talk.abstract = '\n'.join((talk.abstract, e.text_content()))
        else:
            talk.title = title

# don't forget last one
if talk.title is not None:
    # Reclassify as a tutorial when the word appears in the title or abstract.
    if ('tutorial' in talk.title.lower() or
        (talk.abstract is not None and 'tutorial' in talk.abstract.lower())):
        talk.category = Talk.TUTORIAL
    db.add_talk(talk, **data._asdict())
    print(talk)
    print(data)
    print("*" * 10)
                    # NOTE(review): this fragment starts in the middle of a
                    # branch; the enclosing loops and the test that leads
                    # here are not visible -- confirm the context.
                    # Special case 'Vic / Kelson'
                    if tmp.startswith('Vic'):
                        data.speaker_names.append('Vic Kelson')
                    else:
                        # "speaker / organization"; this unpacking raises
                        # unless `tmp` contains exactly one '/'.
                        speaker_name, org_name = tmp.split('/')
                        if '&' in speaker_name:
                            data.speaker_names.extend(speaker_name.split('&'))
                        else:
                            data.speaker_names.append(speaker_name)
                        data.organization_names.append(org_name)
            elif '&' in tmp:
                data.speaker_names.extend(tmp.split('&'))
            elif ',' in tmp:
                data.speaker_names.extend(tmp.split(','))
            else:
                # A single name: ignore empty text and text that merely
                # repeats the talk title.
                if tmp.lower() != talk.title.lower() and len(tmp):
                    data.speaker_names.append(tmp)
        else: # The abstract
            # Accumulate abstract text, one line per piece.
            if talk.abstract is None:
                talk.abstract = tmp
            else:
                talk.abstract = "\n".join((talk.abstract, tmp))
        i += 1
    i += 1
    counter += 1
    # Report and store the talk that was just assembled.
    print("Finished talk. Counter = {}".format(counter))
    print(talk)
    print(data)
    db.add_talk(talk, **data._asdict())
    print("*" * 10)
# Keynotes: one "section" div per speaker inside the "keynote-talks" div.
# NOTE(review): `url` is assigned outside the visible part of the file.
xpath = '//div[@id="keynote-talks"]/div[@class="section"]'
page = requests.get(url)
entries = html.fromstring(page.text).xpath(xpath)
print('talks')
print(url)
for e in entries:
    talk = Talk(category=Talk.KEYNOTE, conference_id=conference.id)
    data = db.TalkData([], [], [])
    # The section's <h1> text is recorded as the speaker name.
    data.speaker_names.append(e.findtext('h1'))
    # Child elements having a text node that contains "Topic".
    tmp = e.xpath('*[text()[contains(.,"Topic")]]')
    if tmp:
        # Drop the first 7 characters (the 'Topic:' prefix) and any
        # trailing ')', then split on '(' or ':' -- the first piece is the
        # title and the remaining pieces form the abstract.
        topic_text = tmp[0].text_content()[7:].strip(')')
        tmp = re.split('[(:]', topic_text)
        talk.title = tmp[0].strip()
        # Newlines inside the abstract pieces become single spaces.
        talk.abstract = ' '.join(piece.replace('\n', ' ') for piece in tmp[1:])
    else:
        talk.title = "Keynote"
    db.add_talk(talk, **data._asdict())

# Tutorials
# Tutorial schedule read from a Wayback Machine snapshot; every <li> under
# the "content" div is one candidate entry.
wayback = 'https://web.archive.org/web/20090202113211/'
url = wayback + 'http://us.pycon.org:80/2008/tutorials/schedule/'
xpath = '//div[@id="content"]//li'
entries = html.fromstring(requests.get(url).text).xpath(xpath)
# Iterate through and extract the relevant content
print('tutorials')
print(url)
for e in entries:
    tmp = e.text_content()
    # Skip entries whose text mentions a cancellation.
    if 'cancel' in tmp.lower():
        continue
    # NOTE(review): the body of the `else` branch below is not visible in
    # this view; the snippet is cut off here.
    else:
# Example #12
# 0
                    # NOTE(review): this fragment starts in the middle of a
                    # branch; the enclosing loop and conditions are not
                    # visible -- confirm the surrounding context.
                    i = 0
                else:
                    # Titles of the (talk, data) pairs still held for this
                    # speaker in `speaker_lookup`.
                    title_list = [t.title for t, d in speaker_lookup[speaker]]
                    # Error if not find anything -- so I'll know whether this method is bad
                    try:
                        # Closest stored title to the scraped one; with no
                        # close match the list is empty and [0] raises
                        # IndexError.
                        best_title = difflib.get_close_matches(
                            title, title_list, 1)[0]
                        i = next(i for i, t in enumerate(title_list)
                                 if t == best_title)
                    except IndexError:
                        # No stored title is close enough: hand the entry to
                        # add_new_talk and move on to the next one.
                        add_new_talk(title, abstract, speaker, topic)
                        continue
                # Take the matched pair out of the lookup so it cannot be
                # matched again, dropping the speaker once nothing is left.
                talk, data = speaker_lookup[speaker].pop(i)
                if len(speaker_lookup[speaker]) == 0:
                    del speaker_lookup[speaker]
                talk.abstract = abstract
                data.topic_names.append(topic)
                db.add_talk(talk, **data._asdict())
                print(talk)
                print(data)
                print("*" * 10)
            else:
                # Store a brand-new talk built directly from the scraped
                # values.
                db.add_talk(
                    Talk(category=Talk.TALK,
                         conference_id=conference.id,
                         title=title,
                         abstract=abstract),
                    **db.TalkData([speaker_name], [topic], [])._asdict())
                print("adding new one not in list:", speaker_name, title,
                      "\n***\n")