Example #1
def scrape_song(url):
    """ Gets the lyrics and song info from a wiki url. """
    soup = scrapekit.handle_url(url)

    contents = scrape_id_to_div(soup, "Lyrics")
    if not contents:
        return None

    filetext = ''.join(c.text for c in contents)

    # Check if there is a reprise
    REPRISE = 'Reprise'

    reprise = soup.find(id=REPRISE)
    if reprise:
        filetext += '\n\n'
        filetext += REPRISE + ':\n\n'

        contents = scrape_id_to_div(soup, REPRISE)
        if contents:
            filetext += ''.join(c.text for c in contents)

    # Get the song title from the page title and trim stray whitespace
    songtitle = soup.title.text.split('|')[0].strip()

    song_text = ''
    song_text += 'Song: {}\n'.format(songtitle)
    song_text += get_infobox_info(soup)
    song_text += '\n\n'
    song_text += filetext

    return song_text
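
The helper scrape_id_to_div is called here but not shown. A minimal sketch of what it presumably does, collecting the elements that follow the headline with the given id until the next <div> (the exact traversal is an assumption):

def scrape_id_to_div(soup, element_id):
    """ Hypothetical sketch: gather the siblings that follow the element
    with the given id, stopping at the next <div>; returns None when the
    id is absent from the page. """
    anchor = soup.find(id=element_id)
    if not anchor:
        return None
    contents = []
    for sibling in anchor.parent.find_next_siblings():
        if sibling.name == 'div':
            break
        contents.append(sibling)
    return contents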
Example #2
def main():
    """ Main entry point. """
    parser = argparse.ArgumentParser(
        description="Get all links for My Little Pony transcripts "
                    "from mlp.wikia.com.")
    parser.add_argument('-d',
                        '--download',
                        action="store_true",
                        help="Download all transcripts to text files.")
    parser.add_argument('url',
                        nargs='?',
                        default=None,
                        help="Add a url to download a single transcript.")
    args = parser.parse_args()

    if args.url:
        print(scrape_transcript(args.url))
        return

    soup = scrapekit.handle_url(URL)
    transcript_links = scrapekit.find_links_by_regex(soup, r'Transcript')
    print('Found {} transcript links.'.format(len(transcript_links)))

    for t in transcript_links:
        print(t)

        if args.download:
            scrape_transcript(PREFIX + t)
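
scrapekit.find_links_by_regex is project-local and not shown. A plausible sketch, assuming it returns the href of each anchor whose target matches the pattern (BeautifulSoup accepts a compiled regex as an attribute filter):

import re

def find_links_by_regex(soup, pattern):
    """ Hypothetical sketch of the scrapekit helper. """
    regex = re.compile(pattern)
    return [a['href'] for a in soup.find_all('a', href=regex)]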
Example #3
def scrape_transcript(url):
    """ Scrapes the transcript at the given URL and saves it to a text file. """
    ID = 'WikiaArticle'
    soup = scrapekit.handle_url(url)
    text = soup.find(id=ID).text
    filename = TRANSCRIPT_DIR + url.split('/')[-1]
    scrapekit.ensure_dir(TRANSCRIPT_DIR)
    scrapekit.write_to_file(filename, text)
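
ensure_dir and write_to_file are small scrapekit utilities whose bodies are not shown. Minimal sketches, assuming they are thin wrappers around os.makedirs and open:

import os

def ensure_dir(path):
    """ Hypothetical sketch: create the directory if it is missing. """
    if not os.path.exists(path):
        os.makedirs(path)

def write_to_file(filename, text):
    """ Hypothetical sketch: dump the scraped text to disk as UTF-8. """
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(text)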
Example #4
def get_rows(urls):
    """ Collects the rows from each pony-list table at the given URLs. """
    rows = []
    for url in urls:
        print('Scraping {}'.format(url))
        soup = scrapekit.handle_url(url)
        table = soup.find('table', {'class': 'listofponies'})
        if table:
            rows.extend(table_to_list(table))
    return rows
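
table_to_list is not defined in this snippet. A sketch of the assumed behavior, flattening an HTML table into a list of rows, each row a list of cell strings:

def table_to_list(table):
    """ Hypothetical sketch: turn a BeautifulSoup <table> into nested lists. """
    rows = []
    for tr in table.find_all('tr'):
        cells = [cell.get_text(strip=True) for cell in tr.find_all(['td', 'th'])]
        if cells:
            rows.append(cells)
    return rows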
Example #5
def scrape_all_songs():
    """ Gets all lyrics from all available songs on the wiki. """
    print('Scraping all songs from {}'.format(URL))

    soup = scrapekit.handle_url(URL)
    song_elements = []
    tables = soup.findAll('table')

    for t in tables:
        field_index = scrapekit.get_col_index(t, field_name="Song")

        if field_index is not None:
            song_elements.extend(scrapekit.scrape_table_col(t, field_index))

    links = []
    for element in song_elements:
        link = element.find('a')
        if link:
            links.append(PREFIX + link.attrs.get('href', ''))
    return links
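
get_col_index and scrape_table_col are scrapekit helpers that are not shown here. Plausible sketches, assuming get_col_index returns None for a missing column (which is why the loop above checks field_index is not None):

def get_col_index(table, field_name):
    """ Hypothetical sketch: index of the header cell matching field_name,
    or None when the table has no such column. """
    headers = [th.get_text(strip=True) for th in table.find_all('th')]
    return headers.index(field_name) if field_name in headers else None

def scrape_table_col(table, col_index):
    """ Hypothetical sketch: the cell at col_index from every data row. """
    cells = []
    for tr in table.find_all('tr'):
        tds = tr.find_all('td')
        if len(tds) > col_index:
            cells.append(tds[col_index])
    return cells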
Example #6
def main():
    """ Main entry point. """
    parser = get_parser()
    args = parser.parse_args()

    soup = scrapekit.handle_url(URL)
    spans = soup.findAll('span', {'class': 'mw-headline'})

    lessons = []

    # Collect lessons text
    for span in spans:
        h3 = span.parent
        dl = h3.findNext('dl')
        if dl:
            text = dl.text
            lessons.append(text)

    if args.format:
        print('Formatting text...')

        for x, lesson in enumerate(lessons[:]):
            text = ''
            for line in lesson.split('\n'):
                for wrapped in textwrap.wrap(line, 78):
                    text += wrapped + '\n'
            lessons[x] = text

    for lesson in lessons:
        print(lesson)

    if args.download:
        filename = scrapekit.DATADIR + 'lessons.txt'

        with open(filename, 'w', encoding='utf-8') as f:
            for lesson in lessons:
                f.write(lesson)
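
get_parser is referenced but not shown. A sketch inferred from the flags main() reads (args.format and args.download); the option names and help strings are assumptions:

import argparse

def get_parser():
    """ Hypothetical sketch of the argument parser used above. """
    parser = argparse.ArgumentParser(
        description="Scrape friendship lessons from the wiki.")
    parser.add_argument('-f', '--format', action='store_true',
                        help="Re-wrap the lesson text to 78 columns.")
    parser.add_argument('-d', '--download', action='store_true',
                        help="Save all lessons to a text file.")
    return parser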