import os
import re
import string

import requests
from bs4 import BeautifulSoup as bs
from tqdm import tqdm

# `prep` (the proxy-aware request helper) and `paths` (the output-directory
# mapping) are project-level objects assumed to be defined elsewhere in the repo.


def usa_down(usa_files):
    """Download the '100 Great Short Stories' collection from americanliterature.com."""
    usa_path = 'https://americanliterature.com/100-great-short-stories'
    root = 'https://americanliterature.com'
    original_content = prep.get_proxies(usa_path)
    soup = bs(original_content.content, "lxml")

    # Collect the relative story links listed inside <span> elements.
    links = []
    for url in soup.find_all('span'):
        for a in url.find_all('a', href=True):
            links.append(a['href'])

    # Fetch every story page and keep its paragraph text.
    corpus = []
    for i in tqdm(links):
        more_content = prep.get_proxies(root + str(i))
        soup = bs(more_content.content, "lxml")
        sents = []
        for parag in soup.find_all('p'):
            sents.append(str(parag.string))
        corpus.append(' '.join(sents))

    # Write one file per story, named '<author>__<short_story>.txt'.
    for elem in range(len(corpus)):
        name = links[elem].split('/author/')[1].replace(
            '/short-story/', '__').replace('-', '_')
        with open(os.path.join(usa_files, name + '.txt'), 'w') as file:
            file.write(str(corpus[elem]))
    return corpus
def ny_fiction_down(ny_fiction_files):
    """Download fiction pieces from the New Yorker magazine archive."""
    ny_fiction_path = 'https://www.newyorker.com/magazine/fiction'
    root = "https://www.newyorker.com"

    # Walk the paginated archive and collect article links.
    links = []
    for i in range(1, 95):
        if i == 1:
            path = ny_fiction_path
        else:
            path = ny_fiction_path + '/page/' + str(i)
        original_content = prep.get_proxies(path)
        soup = bs(original_content.content, "lxml")
        for a in soup.find_all('a', href=True):
            href = str(a['href'])
            if re.search(r'magazine/20\d{2}/\d{2}/\d{2}/\w+', href):
                links.append(href)
    links = list(set(links))

    # Fetch every article and keep its paragraph text.
    corpus = []
    names = []
    for url in tqdm(links):
        try:
            url = root + url
            new_content = prep.get_proxies(url)
            soup = bs(new_content.content, "lxml")
            sents = []
            for parag in soup.find_all('p'):
                sents.append(str(parag.string))
            corpus.append(' '.join(sents))
            names.append(url)
        except Exception:
            continue

    # Write one file per article, named after the URL slug.
    for elem in tqdm(range(len(corpus))):
        names[elem] = re.sub(r'\-\d+\.*', '', names[elem])
        with open(
                os.path.join(ny_fiction_files,
                             names[elem].split('/')[-1] + '.txt'),
                'w') as file:
            file.write(str(corpus[elem]))
    return corpus
def collins_down(paths):
    """Download posts from the Collins Dictionary word-lovers blog."""
    site = "https://www.collinsdictionary.com/word-lovers-blog/new/?pageNo="
    root = "https://www.collinsdictionary.com"

    # Collect every href that appears on the paginated blog index.
    page = []
    for i in tqdm(range(66)):
        site1 = site + str(i)
        r_obj = prep.get_proxies(site1)
        soup = bs(r_obj.content, 'lxml')
        for a in soup.find_all('a'):
            page.append(a.get('href'))
    links = list(set(page))

    # Keep relative post links (skip pagination links), then fetch and save each post.
    collins_corpus = []
    for link in tqdm(links):
        link = str(link)
        if re.match('/', link) and re.search(r'\?pageNo', link) is None:
            link = root + str(link)
            print(link)
            try:
                r_obj1 = prep.get_proxies(link)
                soup1 = bs(r_obj1.content, "lxml")
                strings = []
                for link1 in soup1.find_all('p'):
                    for i in link1.contents:
                        strings.append(str(i.string))
                corpus = ' '.join(strings)
                name = link.split('/')[-1].replace('.html', '') + '.txt'
                with open(os.path.join(paths['collins_path'], name), 'w') as file:
                    file.write(corpus)
                collins_corpus.append(corpus)
            except Exception:
                continue
    return collins_corpus
def adelaide_down(adelaide_files=paths.get('adelaide_path')):
    """Download complete e-books from the University of Adelaide library."""
    adelaide_path = "https://ebooks.adelaide.edu.au/meta/titles/"
    root = "https://ebooks.adelaide.edu.au"

    # One index page per letter of the alphabet.
    letters = list(string.ascii_uppercase)
    pages = []
    for let in letters:
        pages.append(str(adelaide_path + str(let) + ".html"))

    # Collect the relative book links from every index page.
    links = []
    for url in tqdm(pages):
        original_content = prep.get_proxies(url)
        soup = bs(original_content.content, "lxml")
        for link in soup.find_all('a'):
            if re.match(r'/\w/\w+', str(link.get('href'))):
                links.append(str(link.get('href')))
    links = list(set(links))

    # Fetch the single-page ('complete.html') version of every book.
    adelaide_corpus = []
    names = []
    for i in tqdm(links):
        name = i
        i = str(root + i)
        try:
            r_obj = prep.get_proxies(os.path.join(i, 'complete.html'))
            soup1 = bs(r_obj.content, "lxml")
            strings = []
            for link1 in soup1.find_all('p'):
                for a in link1.contents:
                    strings.append(str(a.string))
            adelaide_corpus.append(' '.join(strings))
            names.append(name)
        except Exception:
            continue

    # Write one file per book, keeping each text paired with its own link name.
    for elem in range(len(adelaide_corpus)):
        with open(
                os.path.join(adelaide_files,
                             names[elem].replace("/", '_') + '.txt'),
                'w') as file:
            file.write(adelaide_corpus[elem])
    return adelaide_corpus
def news_down(external_path, internal_path, sections):
    """Download news articles from the given sections of a news site."""
    # Build one index URL per section.
    themes = []
    for i in sections:
        themes.append(str(external_path + str(i).replace(" ", '')))

    # Collect article-looking hrefs from every section page.
    links = []
    for section in tqdm(themes):
        original_content = prep.get_proxies(section)
        soup = bs(original_content.content, "lxml")
        for link in soup.find_all('a'):
            if (re.search(r'\d+', str(link.get('href')))
                    or re.search(r'\.html', str(link.get('href')))
                    or re.search(r'\\', str(link.get('href')))):
                links.append(str(link.get('href')))
    links = list(set(links))

    # Fetch every article, save it, and keep its text.
    news_corpus = []
    for link in tqdm(links):
        if re.match('/', link):
            link = external_path + str(link)
        try:
            content1 = prep.get_proxies(link)
            soup1 = bs(content1.content, "lxml")
            strings = []
            for link1 in soup1.find_all('p'):
                for i in link1.contents:
                    strings.append(str(i.string))
            corpus = ' '.join(strings)
            name = link.split('/')[-1].replace('.html', '') + '.txt'
            with open(os.path.join(internal_path, name), 'w') as file:
                file.write(corpus)
            news_corpus.append(corpus)
        except Exception:
            continue
    return news_corpus
def waccamaw_down(waccamaw_files):
    """Download fiction pieces from Waccamaw Journal."""
    waccamaw_path = 'http://waccamawjournal.com/category/fiction/'

    # The fiction category spans two index pages.
    links = []
    for i in range(2):
        if i == 0:
            path = waccamaw_path
        else:
            path = waccamaw_path + 'page/2/'
        original_content = prep.get_proxies(path)
        soup = bs(original_content.content, "lxml")
        for a in soup.find_all('a', href=True):
            href = str(a['href'])
            if re.search(r'waccamawjournal.com/fiction/[a-z\-]+', href):
                links.append(href)

    # Fetch every story and keep its paragraph text.
    corpus = []
    names = []
    for url in tqdm(links):
        new_content = prep.get_proxies(url)
        soup = bs(new_content.content, "lxml")
        sents = []
        for parag in soup.find_all('p'):
            sents.append(str(parag.string))
        corpus.append(' '.join(sents))
        names.append(url)

    # Write one file per story, named after the URL slug.
    for elem in tqdm(range(len(corpus))):
        names[elem] = names[elem].split('fiction/')[1].replace('-', '_')
        with open(
                os.path.join(waccamaw_files,
                             names[elem].split('/')[0] + '.txt'),
                'w') as file:
            file.write(str(corpus[elem]))
    return corpus
def electric_down(electric_files):
    """Download stories from Electric Literature's Recommended Reading archive."""
    electric_path = 'https://electricliterature.com/recommended-reading-archives-7eb326fa8cf4'
    original_content = prep.get_proxies(electric_path)
    soup = bs(original_content.content, "lxml")

    # Collect every link on the archive page.
    links = []
    for a in soup.find_all('a', href=True):
        links.append(a['href'])
    links = list(set(links))

    # Fetch every link that resolves and keep its paragraph text.
    corpus = []
    urls = []
    for url in tqdm(links):
        try:
            new_content = prep.get_proxies(url)
            soup = bs(new_content.content, "lxml")
            sents = []
            for parag in soup.find_all('p'):
                sents.append(str(parag.string))
            corpus.append(' '.join(sents))
            urls.append(url)
        except Exception:
            continue

    # Write one file per story, named after the URL slug.
    for elem in range(len(corpus)):
        urls[elem] = re.sub(r'\-\d+\.*', '', urls[elem])
        with open(
                os.path.join(
                    electric_files,
                    urls[elem].split('/')[-1].replace('-', '_') + '.txt'),
                'w') as file:
            file.write(str(corpus[elem]))
    return corpus
def oxford_down(paths):
    """Download posts from the Oxford Dictionaries blog."""
    oxford_corpus = []
    for num in range(1, 242):
        if num == 1:
            site = 'https://blog.oxforddictionaries.com'
        else:
            site = 'https://blog.oxforddictionaries.com/page/' + str(num) + '/'
        r_obj = prep.get_proxies(site)
        soup = bs(r_obj.text, 'lxml')

        # Keep only links that look like dated blog posts.
        tags = soup.find_all('a')
        tags = [
            tag.get('href') for tag in tags if re.search(
                r'(https://blog.oxforddictionaries.com/)(\d+/\d+/\d+/\w+)',
                str(tag))
        ]
        for tag in list(set(tags)):
            print(tag)
            r = requests.get(tag).text
            soup1 = bs(r, 'lxml')

            # Collect the post body, skipping inline images.
            corpus = []
            for link0 in soup1.find_all(
                    'div', {'class': 'gen-post-content add-post-content'}):
                for link1 in link0.find_all('p'):
                    for i in link1.contents:
                        if not re.match(r'\<img', str(i.string)):
                            corpus.append(str(i.string))
            corpus = ' '.join(corpus)

            # Build a safe file name from the post title.
            if re.search(r'/', soup1.title.string):
                soup1.title.string = re.sub(r'/', '#', soup1.title.string)
            name = '_'.join(
                soup1.title.string.split(' -')[0].lower().split(' ')) + '.txt'
            with open(os.path.join(paths['oxford_path'], name), 'w') as f:
                f.write(corpus)
            oxford_corpus.append(corpus)
    return oxford_corpus
def resource_down(resource_files):
    """Download short stories from everywritersresource.com."""
    resource_website = 'https://www.everywritersresource.com/shortstories/page/'

    # Collect story links from the paginated archive.
    allUrls = []
    for num in tqdm(range(1, 53)):
        site = resource_website + str(num) + '/'
        r_obj = prep.get_proxies(site)
        soup = bs(r_obj.text, 'lxml')
        tags = soup.find_all('a')
        urls = list(
            set([
                tag.get('href') for tag in tags
                if ('title' in tag.attrs and
                    ('rel' not in tag.attrs or tag.get('rel') == ['bookmark']))
            ]))
        allUrls.append(urls)
    allUrls = list(set([url for lurl in allUrls for url in lurl]))

    # Fetch every story and keep the post-content paragraphs.
    resource_corpus = []
    names = []
    for url in tqdm(allUrls):
        content = requests.get(url).content
        soup = bs(content, 'lxml')
        strings = []
        try:
            for link0 in soup.find_all('div', {'class': 'post-content'}):
                for link1 in link0.find_all('p'):
                    for i in link1.contents:
                        strings.append(str(i.string))
            resource_corpus.append(' '.join(strings))
            names.append(url)
        except Exception:
            continue

    # Write one file per story, named after the URL slug.
    for elem in range(len(resource_corpus)):
        with open(
                os.path.join(resource_files,
                             names[elem].split('/')[-2] + '.txt'),
                'w') as file:
            file.write(resource_corpus[elem])
    return resource_corpus
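# -----------------------------------------------------------------------------
# Minimal usage sketch (illustration only, not part of the original scrapers).
# It assumes the project-level `paths` dict already holds the keys used above
# ('collins_path', 'oxford_path', 'adelaide_path'); every other key referenced
# below is a hypothetical output directory added for this example. `news_down`
# is omitted because its external site and section list are project-specific.
# -----------------------------------------------------------------------------
def download_all(paths):
    """Run every downloader after making sure the target directories exist."""
    for folder in paths.values():
        os.makedirs(folder, exist_ok=True)
    usa_down(paths['usa_path'])                # hypothetical key
    ny_fiction_down(paths['ny_fiction_path'])  # hypothetical key
    collins_down(paths)                        # uses paths['collins_path']
    adelaide_down(paths['adelaide_path'])
    waccamaw_down(paths['waccamaw_path'])      # hypothetical key
    electric_down(paths['electric_path'])      # hypothetical key
    oxford_down(paths)                         # uses paths['oxford_path']
    resource_down(paths['resource_path'])      # hypothetical key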