Example #1
File: scrape.py Project: zyhuang/ragno
def get_all_chapter_text(link_data, out_dir, tsleep=1, nproc=3, nretry=10):
    '''Scrape the text of every chapter listed in link_data, in parallel.'''

    os.makedirs(out_dir, exist_ok=True)
    args_list = []
    for x in link_data:
        in_url = x['url']
        out_json = '{}/book{}.chapter{}.json'.format(out_dir, x['book'],
                                                     x['chapter'])
        args_list.append([in_url, out_json, tsleep])

    utils.parallel_call(get_chapter_text,
                        args_list,
                        nproc=nproc,
                        nretry=nretry)
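
Every snippet in this collection fans its work out through utils.parallel_call. The utils module of ragno is not shown here, so the following is only a minimal sketch of what such a helper could look like, assuming it maps a worker over an argument list with a multiprocessing pool and retries failed calls up to nretry times; everything except the call signature is an assumption.

import multiprocessing


def _call_with_retry(func, args, nretry):
    # Assumed retry policy: re-run one worker call until it succeeds
    # or the attempts are exhausted, then re-raise the last error.
    for attempt in range(nretry):
        try:
            return func(*args)
        except Exception:
            if attempt == nretry - 1:
                raise


def parallel_call(func, args_list, nproc=3, nretry=10):
    # Fan args_list out over a process pool; gather one result per
    # argument tuple, in the same order as args_list.
    with multiprocessing.Pool(nproc) as pool:
        async_results = [
            pool.apply_async(_call_with_retry, (func, args, nretry))
            for args in args_list
        ]
        return [r.get() for r in async_results]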
Example #2
def get_all_journal_data(all_journal_dict, out_dir,
                         nproc=3, nretry=10, tsleep=1):
    '''Get data of all journals'''

    os.makedirs(out_dir, exist_ok=True)

    args_list = []
    for url in sorted(all_journal_dict):
        journal_abbrev = re.findall(r'/impact-factor-of-(.*?)\.shtml',
                                    url)[0].lower()
        out_json = '{}/{}.json'.format(out_dir, journal_abbrev)
        args_list.append([url, out_json, tsleep])

    args_list = sorted(args_list, key=lambda x: x[0])
    utils.parallel_call(get_journal_data, args_list, nproc, nretry)
Example #3
def get_all_data(in_oid_list,
                 out_dir,
                 nproc=3,
                 nretry=10,
                 tsleep=1,
                 proxy_list=None):
    '''Scrape data for every oid listed in in_oid_list.'''

    os.makedirs(out_dir, exist_ok=True)

    # load proxies (first tab-separated column), if a proxy file is given
    proxy_set = set()
    if proxy_list and os.path.isfile(proxy_list):
        with open(proxy_list) as fin:
            for line in fin:
                proxy_set.add(line.rstrip().split('\t')[0])

    # load oid list, one oid per line
    oid_list = []
    with open(in_oid_list) as fin:
        for line in fin:
            oid_list.append(line.rstrip())

    args_list = []
    for oid in oid_list:
        oid_name = oid.replace(':', '.').lower()
        out_json = '{}/{}.json.gz'.format(out_dir, oid_name)
        args_list.append([oid, out_json, proxy_set, tsleep])

    return utils.parallel_call(get_data, args_list, nproc, nretry)
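
A hypothetical invocation, with both file names below illustrative rather than from the source; the formats are assumed from how get_all_data parses them (one oid per line, and a tab-separated proxy list with the proxy address in the first column):

# Hypothetical file names; formats assumed from the parsing code above.
data = get_all_data('oids.txt', 'data_out',
                    nproc=4, nretry=5, tsleep=2,
                    proxy_list='proxies.tsv')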
Example #4
File: scrape.py Project: zyhuang/ragno
def get_all_journal_data(all_journal_dict,
                         out_dir,
                         nproc=3,
                         nretry=10,
                         tsleep=1):
    '''Get data of all journals'''

    os.makedirs(out_dir, exist_ok=True)

    args_list = []
    for subject in sorted(all_journal_dict):
        for url in sorted(all_journal_dict[subject]):
            journal_abbrev = re.findall(r'/if/html/(.*?)\.html', url)[0].lower()
            out_json = '{}/{}.json'.format(out_dir, journal_abbrev)
            args_list.append([url, out_json, tsleep])

    args_list = sorted(args_list, key=lambda x: x[0])

    # Crawl in cycles: each worker reports urls it finished ('done') and any
    # new journal urls it discovered ('todo'); repeat until nothing new appears.
    done_set = set()
    cycle = 0
    while True:
        tmp = utils.parallel_call(get_journal_data, args_list, nproc, nretry)
        cycle += 1

        todo_set = set()
        for x in tmp:
            for url in x['done']:
                done_set.add(url)
            for url in x['todo']:
                todo_set.add(url)

        todo_set = todo_set - done_set
        if len(todo_set) == 0:
            break

        args_list = []
        for url in sorted(todo_set):
            journal_abbrev = re.findall(r'/if/html/(.*?)\.html', url)[0].lower()
            out_json = '{}/{}.json'.format(out_dir, journal_abbrev)
            args_list.append([url, out_json, tsleep])

        utils.qprint('after cycle {}, get {} new journals'.format(
            cycle, len(args_list)))
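
The cycle loop implies a return contract for get_journal_data: each worker reports the urls it finished and any newly discovered journal urls. A stub of that shape follows; the body is illustrative, and only the returned keys are inferred from the loop above.

def get_journal_data(url, out_json, tsleep):
    # Illustrative stub: the real worker fetches url, writes the parsed
    # record to out_json, and pauses tsleep seconds between requests.
    # The driver loop above relies only on this return shape:
    return {'done': [url],  # urls fully processed in this call
            'todo': []}     # newly discovered journal urls, if any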
Example #5
def get_all_chapter_links(out_json, tsleep=1, nproc=3, nretry=10):
    '''Collect the links of all chapter pages and write them to out_json.'''

    data = []
    args_list = []
    for i in range(1, 74):
        url = '{}/index.php/?m=bible&template={}'.format(host, i)
        args_list.append([url, tsleep])
    tmp = utils.parallel_call(get_chapter_links,
                              args_list,
                              nproc=nproc,
                              nretry=nretry)
    # each worker returns a list of chapter links; flatten into one list
    for x in tmp:
        data.extend(x)

    utils.write_json(data, out_json)
    return data
Example #6
File: scrape.py Project: zyhuang/ragno
def get_all_chapter_links(out_json, tsleep=1, nproc=3, nretry=10):
    '''Collect the links of all chapter pages and write them to out_json.'''

    data = []
    args_list = []
    for i in range(1, 74):
        url = ('{}/pls/labibbia_new/Bibbia_Utils.elenco_capitoli?'
               'origine=cei2008&idlibroz={}'.format(host, i))
        args_list.append([url, tsleep])
    tmp = utils.parallel_call(get_chapter_links,
                              args_list,
                              nproc=nproc,
                              nretry=nretry)
    # each worker returns a list of chapter links; flatten into one list
    for x in tmp:
        data.extend(x)

    utils.write_json(data, out_json)
    return data
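
Both variants persist the collected links with utils.write_json, which is also not shown in the source; a plausible minimal version, assuming it simply serializes its argument to a JSON file:

import json


def write_json(data, out_json):
    # Assumed behavior: dump data to out_json as UTF-8 JSON.
    with open(out_json, 'w', encoding='utf-8') as fout:
        json.dump(data, fout, ensure_ascii=False, indent=2)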
Example #7
def get_all_journal_links(nproc=3, nretry=10, tsleep=1):
    '''Get urls of all journals'''

    subject_names = [
        'agriculture-and-forestry', 'astronomy', 'biology',
        'chemistry', 'engineering', 'environmental-science', 'geoscience',
        'medicine', 'math', 'management-science', 'physics', 'social-science',
    ]

    subject_list = [
        '{}/{}-journal-impact-factor-list.shtml'.format(host, x)
        for x in subject_names
    ]

    number_list = [
        '{}/list-of-impact-factor-journal_{}.shtml'.format(host, x)
        for x in range(1, 91)
    ]

    alphabet_list = [
        '{}/list-of-impact-factor-journal_{}.shtml'.format(host, x)
        for x in ascii_uppercase
    ]

    url_set = set(subject_list)
    url_set.update(set(number_list))
    url_set.update(set(alphabet_list))

    args_list = []
    for url in sorted(url_set):
        args_list.append([url, tsleep])

    tmp = utils.parallel_call(get_journal_links, args_list, nproc, nretry)

    data = {}
    for x in tmp:
        data.update(x)

    utils.qprint('get urls of {} journals'.format(len(data)))

    return data
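
The flat {url: value} dict returned here is exactly the shape Example #2's get_all_journal_data iterates over. A hypothetical end-to-end run, with the output directory name illustrative:

# Hypothetical pipeline: discover journal urls, then scrape each journal.
all_journal_dict = get_all_journal_links(nproc=4, tsleep=2)
get_all_journal_data(all_journal_dict, 'journals_out', nproc=4, tsleep=2)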
Example #8
File: scrape.py Project: zyhuang/ragno
def get_all_journal_links(nproc=3, nretry=10, tsleep=1):
    '''Get urls of all journals'''

    subject_list = [
        'biology',
        'medicine',
        'agriculture',
        'chemistry',
        'geoscience',
        'astronomy',
        'engineering',
        'management',
        'environmental',
        'math',
        'physics',
        'social',
    ]

    data = {}
    for subject in subject_list:
        # number of listing pages for this subject; the error flag is unused here
        npage, error = get_subject_npage(subject)
        args_list = []
        for page in range(1, npage + 1):
            url = 'http://www.bioxbio.com/if/subject/{}-{}.html'.format(
                subject, page)
            args_list.append([url, tsleep])
        tmp = utils.parallel_call(get_journal_links, args_list, nproc, nretry)
        data[subject] = {}
        for x in tmp:
            data[subject].update(x)

        utils.qprint('get urls of {} journals of subject {}'.format(
            len(data[subject]), subject))

    return data
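
The nested {subject: {url: value}} structure returned here matches what Example #4's get_all_journal_data expects, since that function loops over subjects and then urls. A hypothetical end-to-end run, with the directory name illustrative:

# Hypothetical pipeline: list journals per subject, then scrape each one,
# re-cycling until no new journal urls turn up (see Example #4).
all_journal_dict = get_all_journal_links(nproc=4, tsleep=2)
get_all_journal_data(all_journal_dict, 'bioxbio_out', nproc=4, tsleep=2)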