def scrape_song(url):
    """ Gets the lyrics and song info from a wiki url. """
    soup = scrapekit.handle_url(url)
    contents = scrape_id_to_div(soup, "Lyrics")
    if not contents:
        return None
    filetext = ''.join(c.text for c in contents)

    # Check if there is a reprise
    REPRISE = 'Reprise'
    reprise = soup.find(id=REPRISE)
    if reprise:
        filetext += '\n\n'
        filetext += REPRISE + ':\n\n'
        contents = scrape_id_to_div(soup, REPRISE)
        # Guard against a missing reprise section so the join cannot crash.
        if contents:
            filetext += ''.join(c.text for c in contents)

    # Get the song title; strip stray whitespace so it is safe for file names.
    songtitle = soup.title.text.split('|')[0].strip()

    song_text = ''
    song_text += 'Song: {}\n'.format(songtitle)
    song_text += get_infobox_info(soup)
    song_text += '\n\n'
    song_text += filetext
    return song_text
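
# scrape_id_to_div is called above but not defined here. A minimal sketch of
# the assumed behavior: collect the elements between the heading that contains
# the given id and the next <div> that closes the section. The real helper in
# this repo may select the boundary differently.
def scrape_id_to_div(soup, span_id):
    """Return the sibling elements following the heading that contains
    ``span_id``, stopping at the next <div>, or None if the id is absent."""
    anchor = soup.find(id=span_id)
    if not anchor:
        return None
    contents = []
    # Walk forward from the enclosing heading until a <div> ends the section.
    for sibling in anchor.parent.find_next_siblings():
        if sibling.name == 'div':
            break
        contents.append(sibling)
    return contents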
def main():
    """ Main entry point. """
    parser = argparse.ArgumentParser(
        description="Get all links for My Little Pony transcripts "
                    "from mlp.wikia.com.")
    parser.add_argument('-d', '--download', action="store_true",
                        help="Download all transcripts to text files.")
    parser.add_argument('url', nargs='?', default=None,
                        help="Add a url to download a single transcript.")
    args = parser.parse_args()

    if args.url:
        print(scrape_transcript(args.url))
        return

    soup = scrapekit.handle_url(URL)
    transcript_links = scrapekit.find_links_by_regex(soup, r'Transcript')

    for t in transcript_links:
        print(t)
        if args.download:
            scrape_transcript(PREFIX + t)
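
# scrapekit.find_links_by_regex comes from the local scrapekit module; a
# plausible sketch, assuming it returns the (relative) href of every anchor
# whose href matches the pattern -- the caller above prepends PREFIX to each
# result, so relative hrefs are the likely return value:
import re

def find_links_by_regex(soup, pattern):
    """Return hrefs for all anchors whose href matches the regex pattern."""
    return [a['href'] for a in soup.findAll('a', href=re.compile(pattern))]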
def scrape_transcript(url):
    """ Downloads the transcript at the url to a text file and returns it. """
    ID = 'WikiaArticle'
    soup = scrapekit.handle_url(url)
    text = soup.find(id=ID).text
    filename = TRANSCRIPT_DIR + url.split('/')[-1]
    scrapekit.ensure_dir(TRANSCRIPT_DIR)
    scrapekit.write_to_file(filename, text)
    # Return the text so callers (like main) can print it.
    return text
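
# scrapekit.ensure_dir and scrapekit.write_to_file are assumed helpers from
# the local scrapekit module; a minimal sketch of each, not the actual code:
import os

def ensure_dir(path):
    """Create the directory if it does not already exist."""
    if not os.path.exists(path):
        os.makedirs(path)

def write_to_file(filename, text):
    """Write text to filename, overwriting any existing file."""
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(text)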
def get_rows(urls):
    """ Collects the rows of every 'listofponies' table at the given urls. """
    rows = []
    for url in urls:
        print('Scraping {}'.format(url))
        soup = scrapekit.handle_url(url)
        table = soup.find('table', {'class': 'listofponies'})
        # Skip pages that lack the expected table instead of crashing.
        if table:
            rows.extend(table_to_list(table))
    return rows
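
# table_to_list is referenced above but defined elsewhere; a minimal sketch,
# assuming it flattens an HTML table into a list of row-lists of cell text:
def table_to_list(table):
    """Convert a BeautifulSoup <table> into a list of rows, where each row
    is a list of stripped cell strings (header cells included)."""
    rows = []
    for tr in table.findAll('tr'):
        cells = [td.text.strip() for td in tr.findAll(['td', 'th'])]
        if cells:
            rows.append(cells)
    return rows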
def scrape_all_songs():
    """ Collects links to all available songs on the wiki. """
    print('Scraping all songs from {}'.format(URL))
    soup = scrapekit.handle_url(URL)

    song_elements = []
    tables = soup.findAll('table')
    for t in tables:
        field_index = scrapekit.get_col_index(t, field_name="Song")
        # Compare against None explicitly; a valid column index of 0 is falsy.
        if field_index is not None:
            song_elements.extend(scrapekit.scrape_table_col(t, field_index))

    links = []
    for element in song_elements:
        link = element.find('a')
        if link:
            links.append(PREFIX + link.attrs.get('href', ''))
    return links
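
# scrapekit.get_col_index and scrapekit.scrape_table_col also come from the
# local scrapekit module. Hedged sketches of the expected behavior follow;
# the signatures match the calls above, but the internals are assumptions.
def get_col_index(table, field_name):
    """Return the 0-based index of the header column whose text matches
    field_name, or None if the table has no such column."""
    header = table.find('tr')
    if not header:
        return None
    for i, cell in enumerate(header.findAll(['th', 'td'])):
        if cell.text.strip() == field_name:
            return i
    return None

def scrape_table_col(table, col_index):
    """Return the cell elements in the given column, skipping the header row.
    Elements (not text) are returned so callers can search them for links."""
    cells = []
    for tr in table.findAll('tr')[1:]:
        cols = tr.findAll(['td', 'th'])
        if col_index < len(cols):
            cells.append(cols[col_index])
    return cells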
def main():
    """ Main entry point. """
    parser = get_parser()
    args = parser.parse_args()
    soup = scrapekit.handle_url(URL)
    spans = soup.findAll('span', {'class': 'mw-headline'})
    lessons = []

    # Collect lesson text: each headline's parent heading is followed by a
    # <dl> containing the lesson. Keep the text as str; encoding to bytes
    # here would break the split() and write() calls below.
    for span in spans:
        heading = span.parent
        dl = heading.findNext('dl')
        if dl:
            lessons.append(dl.text)

    if args.format:
        print('Formatting text...')
        for x, lesson in enumerate(lessons):
            text = ''
            for line in lesson.split('\n'):
                for wrapped in textwrap.wrap(line, 78):
                    text += wrapped + '\n'
            lessons[x] = text

    for lesson in lessons:
        print(lesson)

    if args.download:
        filename = scrapekit.DATADIR + 'lessons.txt'
        with open(filename, 'w', encoding='utf-8') as f:
            for lesson in lessons:
                f.write(lesson)
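
# get_parser is referenced above but not shown; a minimal sketch, assuming
# only the -f/--format and -d/--download flags that main() actually reads:
def get_parser():
    """Build the argument parser for the lesson scraper."""
    parser = argparse.ArgumentParser(
        description="Scrape friendship lessons from the wiki.")
    parser.add_argument('-f', '--format', action="store_true",
                        help="Wrap lesson text to 78 columns.")
    parser.add_argument('-d', '--download', action="store_true",
                        help="Save lessons to a text file.")
    return parser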