import os
import re
import string

import requests
from bs4 import BeautifulSoup as bs
from tqdm import tqdm

# `prep` is a project-local helper whose get_proxies(url) performs a (possibly
# proxied) HTTP GET and returns a requests-style response; the module-level
# `paths` dict of output directories used below is likewise assumed to be
# defined elsewhere in the project.
import prep


def usa_down(usa_files):

    usa_path = 'https://americanliterature.com/100-great-short-stories'
    root = 'https://americanliterature.com'

    original_content = prep.get_proxies(usa_path)
    soup = bs(original_content.content, "lxml")
    links = []
    for url in soup.find_all('span'):
        for a in url.find_all('a', href=True):
            links.append(a['href'])

    corpus = []
    for i in tqdm(links):
        more_content = prep.get_proxies(root + str(i))
        soup = bs(more_content.content, "lxml")
        sents = []
        for parag in soup.find_all('p'):
            if parag.string:
                sents.append(str(parag.string))
        corpus.append(' '.join(sents))

    for elem in range(len(corpus)):
        with open(
                os.path.join(
                    usa_files, links[elem].split('/author/')[1].replace(
                        '/short-story/', '__').replace('-', '_') + '.txt'),
                'w') as file:
            file.write(str(corpus[elem]))

    return corpus
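

# `prep.get_proxies` is project-local and not shown in this module. The function
# below is a hedged, hypothetical stand-in for it, assuming it amounts to a
# (possibly proxied) HTTP GET that returns a requests.Response; the header,
# proxy, and timeout values are illustrative assumptions only.
def _get_proxies_sketch(url, proxies=None, timeout=30):
    """Sketch of prep.get_proxies: fetch `url` and return the Response."""
    response = requests.get(url,
                            headers={'User-Agent': 'Mozilla/5.0'},
                            proxies=proxies,
                            timeout=timeout)
    response.raise_for_status()  # surface HTTP errors to the caller
    return response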


def ny_fiction_down(ny_fiction_files):

    ny_fiction_path = 'https://www.newyorker.com/magazine/fiction'
    root = "https://www.newyorker.com"

    links = []
    for i in range(1, 95):
        if i == 1:
            path = ny_fiction_path
        else:
            path = ny_fiction_path + '/page/' + str(i)

        original_content = prep.get_proxies(path)
        soup = bs(original_content.content, "lxml")
        for a in soup.find_all('a', href=True):
            href = str(a['href'])
            if re.search(r'magazine/20\d{2}/\d{2}/\d{2}/\w+', href):
                links.append(href)

    links = list(set(links))

    corpus = []
    names = []
    for url in tqdm(links):
        try:
            url = root + url
            new_content = prep.get_proxies(url)
            soup = bs(new_content.content, "lxml")
            sents = []
            for parag in soup.find_all('p'):
                if parag.string:
                    sents.append(str(parag.string))
            corpus.append(' '.join(sents))
            names.append(url)

        except Exception:
            continue

    for elem in tqdm(range(len(corpus))):
        names[elem] = re.sub(r'\-\d+\.*', '', names[elem])
        with open(
                os.path.join(ny_fiction_files,
                             names[elem].split('/')[-1] + '.txt'),
                'w') as file:
            file.write(str(corpus[elem]))

    return corpus


def collins_down(paths):

    site = "https://www.collinsdictionary.com/word-lovers-blog/new/?pageNo="
    root = "https://www.collinsdictionary.com"

    page = []
    for i in tqdm(range(66)):
        site1 = site + str(i)
        r_obj = prep.get_proxies(site1)
        soup = bs(r_obj.content, 'lxml')
        for a in soup.find_all('a'):
            page.append(a.get('href'))

    links = list(set(page))

    collins_corpus = []
    for link in tqdm(links):
        link = str(link)
        if re.match('/', link) and re.search(r'\?pageNo', link) is None:
            link = root + link
        try:
            r_obj1 = prep.get_proxies(link)
            soup1 = bs(r_obj1.content, "lxml")
            strings = []
            for link1 in soup1.find_all('p'):
                for i in link1.contents:
                    if i.string:
                        strings.append(str(i.string))
            corpus = ' '.join(strings)
            with open(
                    os.path.join(paths['collins_path'],
                                 link.split('/')[-1].replace('.html', '') +
                                 '.txt'), 'w') as file:
                file.write(corpus)

            collins_corpus.append(corpus)
        except Exception:
            continue

    return collins_corpus
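

# Hypothetical usage sketch for collins_down: the 'collins_path' key matches the
# lookup above, but the directory value is an illustrative assumption.
# paths = {'collins_path': 'data/collins'}
# os.makedirs(paths['collins_path'], exist_ok=True)
# collins_texts = collins_down(paths)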


def adelaide_down(adelaide_files=paths.get('adelaide_path')):

    adelaide_path = "https://ebooks.adelaide.edu.au/meta/titles/"
    root = "https://ebooks.adelaide.edu.au"
    letters = list(string.ascii_uppercase)

    pages = []
    for let in letters:
        pages.append(str(adelaide_path + str(let) + ".html"))

    adelaide_corpus = []
    links = []
    for url in tqdm(pages):
        original_content = prep.get_proxies(url)
        soup = bs(original_content.content, "lxml")
        for link in soup.find_all('a'):
            if re.match(r'/\w/\w+', str(link.get('href'))):
                links.append(str(link.get('href')))

    links = list(set(links))

    for i in tqdm(links):
        name = i
        i = str(root + i)
        try:
            r_obj = prep.get_proxies(i.rstrip('/') + '/complete.html')
            soup1 = bs(r_obj.content, "lxml")
            strings = []
            for link1 in soup1.find_all('p'):
                for a in link1.contents:
                    if a.string:
                        strings.append(str(a.string))
            adelaide_corpus.append(' '.join(strings))

        except Exception:
            continue

        # write only the text just downloaded for this link
        with open(
                os.path.join(adelaide_files,
                             str(name).replace("/", '_') + '.txt'),
                'w') as file:
            file.write(adelaide_corpus[-1])

    return adelaide_corpus
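

# Hypothetical usage sketch for adelaide_down: with no argument it writes to the
# directory stored under paths['adelaide_path']; the override below is an
# illustrative assumption.
# adelaide_texts = adelaide_down(adelaide_files='data/adelaide')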


def news_down(external_path, internal_path, sections):

    themes = []
    for i in sections:
        themes.append(str(external_path + str(i).replace(" ", '')))

    links = []
    for section in tqdm(themes):
        original_content = prep.get_proxies(section)
        soup = bs(original_content.content, "lxml")
        for link in soup.find_all('a'):
            if (re.search(r'\d+', str(link.get('href')))
                    or re.search(r'\.html', str(link.get('href')))
                    or re.search(r'\\', str(link.get('href')))):
                links.append(str(link.get('href')))

    links = list(set(links))

    news_corpus = []
    for link in tqdm(links):
        if re.match('/', link):
            link = external_path + str(link)
        try:
            content1 = prep.get_proxies(link)
            soup1 = bs(content1.content, "lxml")
            strings = []
            for link1 in soup1.find_all('p'):
                for i in link1.contents:
                    if i.string:
                        strings.append(str(i.string))
            corpus = ' '.join(strings)
            with open(
                    os.path.join(internal_path,
                                 link.split('/')[-1].replace('.html', '') +
                                 '.txt'), 'w') as file:
                file.write(corpus)
            news_corpus.append(corpus)
        except Exception:
            continue

    return news_corpus
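

# Hypothetical usage sketch for news_down: the news site URL, section names, and
# output directory are illustrative assumptions, not values from the project.
# news_texts = news_down(external_path='https://www.example-news-site.com/',
#                        internal_path='data/news',
#                        sections=['world', 'science', 'technology'])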


def waccamaw_down(waccamaw_files):

    waccamaw_path = 'http://waccamawjournal.com/category/fiction/'

    links = []
    for i in range(2):
        if i == 0:
            path = waccamaw_path
        else:
            path = waccamaw_path + 'page/2/'

        original_content = prep.get_proxies(path)
        soup = bs(original_content.content, "lxml")
        for a in soup.find_all('a', href=True):
            href = str(a['href'])
            if re.search(r'waccamawjournal.com/fiction/[a-z\-]+', href):
                links.append(href)

    corpus = []
    names = []
    for url in tqdm(links):
        new_content = prep.get_proxies(url)
        soup = bs(new_content.content, "lxml")
        sents = []
        for parag in soup.find_all('p'):
            if parag.string:
                sents.append(str(parag.string))
        corpus.append(' '.join(sents))
        names.append(url)

    for elem in tqdm(range(len(corpus))):
        names[elem] = names[elem].split('fiction/')[1].replace('-', '_')
        with open(
                os.path.join(waccamaw_files,
                             names[elem].split('/')[0] + '.txt'), 'w') as file:
            file.write(str(corpus[elem]))

    return corpus


def electric_down(electric_files):

    electric_path = 'https://electricliterature.com/recommended-reading-archives-7eb326fa8cf4'

    original_content = prep.get_proxies(electric_path)
    soup = bs(original_content.content, "lxml")
    links = []
    for a in soup.find_all('a', href=True):
        links.append(a['href'])

    links = list(set(links))

    corpus = []
    urls = []
    for url in tqdm(links):
        try:
            new_content = prep.get_proxies(url)
            soup = bs(new_content.content, "lxml")
            sents = []
            for parag in soup.find_all('p'):
                if parag.string:
                    sents.append(str(parag.string))
            corpus.append(' '.join(sents))
            urls.append(url)

        except Exception:
            continue

    for elem in range(len(corpus)):
        urls[elem] = re.sub(r'\-\d+\.*', '', urls[elem])
        with open(
                os.path.join(
                    electric_files,
                    urls[elem].split('/')[-1].replace('-', '_') + '.txt'),
                'w') as file:
            file.write(str(corpus[elem]))

    return corpus


def oxford_down(paths):

    oxford_corpus = []
    for num in range(1, 242):
        if num == 1:
            site = 'https://blog.oxforddictionaries.com'
        else:
            site = 'https://blog.oxforddictionaries.com/page/' + str(num) + '/'

        r_obj = prep.get_proxies(site)
        soup = bs(r_obj.text, 'lxml')

        tags = soup.find_all('a')
        tags = [
            tag.get('href') for tag in tags if re.search(
                r'(https://blog.oxforddictionaries.com/)(\d+/\d+/\d+/\w+)',
                str(tag))
        ]

        for tag in list(set(tags)):
            print(tag)
            r = requests.get(tag).text
            soup1 = bs(r, 'lxml')

            corpus = []
            for link0 in soup1.find_all(
                    'div', {'class': 'gen-post-content add-post-content'}):
                for link1 in link0.find_all('p'):
                    for i in link1.contents:
                        if i.string and not re.match(r'<img', str(i.string)):
                            corpus.append(str(i.string))

            corpus = ' '.join(corpus)

            # '/' in the page title would break the output path, so replace it
            title = str(soup1.title.string).replace('/', '#')

            with open(
                    os.path.join(paths['oxford_path'],
                                 '_'.join(title.split(' -')[0].lower()
                                          .split(' ')) + '.txt'),
                    'w') as f:
                f.write(corpus)

            oxford_corpus.append(corpus)

    return oxford_corpus


def resource_down(resource_files):

    resource_website = 'https://www.everywritersresource.com/shortstories/page/'

    allUrls = []
    for num in tqdm(range(1, 53)):
        site = resource_website + str(num) + '/'
        r_obj = prep.get_proxies(site)
        soup = bs(r_obj.text, 'lxml')
        tags = soup.find_all('a')
        urls = list(
            set([
                tag.get('href') for tag in tags if ('title' in tag.attrs and (
                    'rel' not in tag.attrs or tag.get('rel') == ['bookmark']))
            ]))
        allUrls.append(urls)

    allUrls = list(set([url for lurl in allUrls for url in lurl]))

    resource_corpus = []
    names = []
    for url in tqdm(allUrls):
        content = requests.get(url).content
        soup = bs(content, 'lxml')
        strings = []
        try:
            for link0 in soup.find_all('div', {'class': 'post-content'}):
                for link1 in link0.find_all('p'):
                    for i in link1.contents:
                        if i.string:
                            strings.append(str(i.string))
            resource_corpus.append(' '.join(strings))
            names.append(url)

        except Exception:
            continue

    for elem in range(len(resource_corpus)):
        with open(
                os.path.join(resource_files,
                             names[elem].split('/')[-2] + '.txt'),
                'w') as file:
            file.write(resource_corpus[elem])

    return resource_corpus
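

# Hypothetical driver sketch tying several downloaders together; every output
# directory below is an illustrative assumption, and each call performs live
# web requests before writing one .txt file per scraped page.
if __name__ == '__main__':
    targets = {
        usa_down: 'data/usa',
        ny_fiction_down: 'data/ny_fiction',
        waccamaw_down: 'data/waccamaw',
        electric_down: 'data/electric',
        resource_down: 'data/resource',
    }
    for downloader, folder in targets.items():
        os.makedirs(folder, exist_ok=True)  # ensure the output folder exists
        downloader(folder)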