Example #1
0
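# Assumed module-level setup for the Wiley snippets (a sketch; none of this appears in the
# original code). The sd module is the project's helper exposing
# handle_journal_name_without_capital(); its import name here is hypothetical, and base_url is
# assumed to be the Wiley Online Library root.
import requests
from bs4 import BeautifulSoup

import spider_utils as sd  # hypothetical module name for the project's helper

base_url = 'https://onlinelibrary.wiley.com'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
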
def get_article_link(journal_name, url):
    # The issue URL ends in .../<volume>/<issue>; strip the trailing newline from the last segment.
    volume_num, issue_num = url.split('/')[-2], url.split('/')[-1].replace('\n', '')

    print(volume_num, issue_num, journal_name)

    response = requests.get(url, headers=headers)

    soup = BeautifulSoup(response.text, "html.parser")

    div_tags = soup.find_all(
        'div', class_='card issue-items-container bulkDownloadWrapper')

    for div_tag in div_tags:
        # Optionally restrict to research sections by checking div_tag.find('h3').string against
        # headings such as 'Articles', 'Original Articles', 'Research Articles',
        # 'SPECIAL ISSUE ARTICLES', or 'HR Science Forum' (filter disabled here).
        issue_items = div_tag.find_all('div', class_='issue-item')

        for issue_item in issue_items:
            article_url = base_url + issue_item.find('a')['href']
            with open("../resources/txt/Wiley/" +
                      sd.handle_journal_name_without_capital(journal_name) +
                      '.txt',
                      "a",
                      encoding='utf-8') as f2:
                f2.write(article_url + ' Volume' + volume_num + "_Issue" +
                         issue_num + '\n')
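
# Usage sketch for get_article_link (journal name and issue-URL file name are hypothetical
# examples; the file is the one written by the Wiley get_volume_link in Example #3):
if __name__ == '__main__':
    with open('human_resource_management.txt', 'r', encoding='utf-8') as f:
        for issue_url in f:
            get_article_link('Human Resource Management', issue_url)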
Example #2
0
def get_article(journal_name, journal_code):

    ind = 0

    txt_path = '../resources/txt/Wiley/' + sd.handle_journal_name_without_capital(
        journal_name) + '.txt'

    with open(txt_path, 'r', encoding='utf-8') as f:
        urls = f.readlines()
        for url in urls:

            # Each line has the form '<article_url> Volume<v>_Issue<i>'.
            article_link, volume_issue = url.split(' ')[0], url.split(' ')[1]
            volume, issue = volume_issue.split('_')[0], volume_issue.split('_')[1].replace('\n', '')

            article_path = '../../../../paper data/Wiley/' + journal_name + '/'

            response = requests.get(article_link, headers=headers)
            if response.status_code == 200:
                response.encoding = 'utf-8'
                file_name = journal_code + str(
                    ind) + '_' + volume + '_' + issue + '.txt'
                with open(article_path + file_name, "w",
                          encoding='utf-8') as ff:
                    ind = ind + 1
                    ff.write(response.text)
                    print(journal_name + ' ' + str(ind) + ' ' + article_link)
                    with open('../resources/log/' + journal_name + '.log',
                              "a",
                              encoding='utf-8') as flog:
                        flog.write(article_link + ' ' + "Article_num: " +
                                   str(ind) + '\n')

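# Usage sketch for get_article (journal name and code are hypothetical examples; the article-link
# file it reads is the one produced by get_article_link in Example #1):
if __name__ == '__main__':
    get_article('Human Resource Management', 'hrm')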
Example #3
0
def get_volume_link(journal_name, url):
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")

    div_tags = soup.find_all('div', class_='cover-image__image hasDetails')

    volume_max, year_max = -1, -1

    for div_tag in div_tags:
        # The cover link href ends in .../<year>/<volume>/<issue>.
        year = div_tag.find('a')['href'].split('/')[-3]
        volume = div_tag.find('a')['href'].split('/')[-2]
        year_max = max(int(year), year_max)
        volume_max = max(int(volume), volume_max)

    journal_code = url.split('/')[-1]

    # Walk back one volume per year (stopping before 1995), probing issues 1-14 of each volume.
    while year_max != 1995:
        for k in range(1, 15):
            volume_url = base_url + '/toc/' + journal_code + '/' + str(
                year_max) + '/' + str(volume_max) + '/' + str(k)

            response = requests.get(volume_url, headers=headers)
            print(volume_url, response.status_code)
            if response.status_code == 200:
                with open(
                        sd.handle_journal_name_without_capital(journal_name) +
                        '.txt',
                        "a",
                        encoding='utf-8') as f1:
                    f1.write(volume_url + '\n')

        year_max = year_max - 1
        volume_max = volume_max - 1
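
# The functions below target SAGE journals rather than Wiley and presumably live in a separate
# module. Assumed module-level setup for them (a sketch; not part of the original code):
# base_url is assumed to be the SAGE journals root, and journal_code maps a journal name to the
# short code used in output file names (the entry shown is a hypothetical example).
base_url = 'https://journals.sagepub.com'
journal_code = {'Some Journal': 'sj'}
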
def get_article_cn(journal_name, article_url, journal_num, volume_info, issue_info):
    headers = {
        'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
            ' Chrome/80.0.3987.162 Safari/537.36'
    }
    try:
        # Download succeeded
        response = requests.get(article_url, headers=headers)
        if response.status_code == 200:
            response.encoding = 'utf-8'

            file_name = journal_code[journal_name] + str(journal_num) + '_' + volume_info + '_' + issue_info + '.html'
            print("CN文件 " + article_url + ' ' + str(journal_num))
            path = "../../../../paper data/SAGE/" + journal_name + '/cn/'
            with open(path + file_name, "w", encoding='utf-8') as f:
                f.write(response.text)
                with open("../resources/log/" + sd.handle_journal_name_without_capital(journal_name) + '.log',
                          "a", encoding='utf-8') as f_log:
                    f_log.write(article_url + ' Article_num:' + str(journal_num) + '\n')
            # Count the article only when the page was actually saved.
            return 1
    # Download failed (non-200 response or connection error): fall through to return 0.
    except requests.ConnectionError as e:
        print("Error", e.args)

    return 0
def get_article(journal_name):
    headers = {
        'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
            ' Chrome/80.0.3987.162 Safari/537.36'
    }

    journal_num = 1
    path = "../resources/txt/" + sd.handle_journal_name_without_capital(journal_name) + ".txt"
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            url, date_info = line.split(' ')[0], line.split(' ')[1]
            volume_info = date_info.split('_')[0]
            issue_info = date_info.split('_')[1].replace('\n', '')
            print(url, volume_info, issue_info)
            try:
                response = requests.get(url, headers=headers)

                soup = BeautifulSoup(response.text, "html.parser")
                a_tags = soup.find_all('a')

                for a_tag in a_tags:
                    is_data_item_exist = 'data-item-name' in a_tag.attrs
                    # The PDF can be downloaded directly
                    if is_data_item_exist and a_tag.attrs['data-item-name'] == 'download-PDF' and a_tag['href'][0] != '#':
                        journal_num = journal_num + get_article_en(journal_name, response.text, url, journal_num,
                                                                   volume_info, issue_info)
                    # Need to use the institutional endpoint to obtain a new URL for the download
                    elif is_data_item_exist and a_tag.attrs['data-item-name'] == 'cnp-link':
                        journal_num = journal_num + get_article_cn(journal_name, a_tag['href'], journal_num, volume_info, issue_info)
            except requests.ConnectionError as e:
                print("Error", e.args)
def get_article_en(journal_name, text, article_url, journal_num, volume_info, issue_info):
    if text != '':
        file_name = journal_code[journal_name] + str(journal_num) + '_' + volume_info + '_' + issue_info + '.html'
        print("EN文件 " + article_url + ' ' + str(journal_num))
        path = "../../../../paper data/SAGE/" + journal_name + "/en/"
        with open(path + file_name, "w", encoding='utf-8') as f:
            f.write(text)
            with open("../resources/log/" + sd.handle_journal_name_without_capital(journal_name) + '.log',
                      "a", encoding='utf-8') as f_log:
                f_log.write(article_url + ' Article_num:' + str(journal_num) + '\n')
            return 1
    else:
        return 0
def get_volume_link(journal_name):

    url = 'https://journals.sagepub.com/loi/etpb?%20-%20201918&expanded=2017&expanded=2016&expanded=2015&expanded=2014&expanded=2013&expanded=2012&expanded=2011%20-%202009&expanded=2008&expanded=2007&expanded=2006&expanded=2005&expanded=2004&expanded=2003&expanded=2002&expanded=2001&expanded=26&expanded=25&expanded=24&expanded=1990%20-%201999&expanded=1998&expanded=1997&expanded=1996&expanded=22&expanded=28'

    headers = {
        'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
            ' Chrome/80.0.3987.162 Safari/537.36'
    }

    ind = 1
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    volume_links = soup.find_all('a', class_='issue-link h6')

    for volume_link in volume_links:
        print(base_url + volume_link['href'])

    for volume_link in volume_links:
        volume_url = base_url + volume_link['href']
        try:
            year = list(volume_link.next_siblings)[1].string.split(' ')[1]
            print(list(volume_link.next_siblings))
        except IndexError as e:
            print("Error", e.args)
            # Skip this link when the year cannot be parsed, so a stale or undefined
            # year is not used below.
            continue

        if int(year) < 1996:
            break
        volume, issue = volume_url.split('/')[-2], volume_url.split('/')[-1]

        try:
            response = requests.get(volume_url, headers=headers)
            soup = BeautifulSoup(response.text, 'html.parser')
            elements = soup.find_all('div', class_='art_title linkable')
            for element in elements:
                if element.previous_sibling.string == 'Articles' or element.previous_sibling.string == 'Article':
                    href = element.find('a')['href']
                    article_url = base_url + href
                    with open("../resources/txt/" + sd.handle_journal_name_without_capital(journal_name) +
                              '.txt', "a", encoding='utf-8') as f:
                        f.write(article_url + ' Volume' + volume + '_Issue' + issue + ' ' + str(ind) + '\n')
                        print("Volume: " + volume + ' Issue: ' + issue + ' ' + str(ind))
                        ind = ind + 1
        except requests.ConnectionError as e:
            print("Error", e.args)