Example #1
def get_chapter_links(in_url, tsleep):

    data = []
    error = None

    try:
        utils.qprint('get ' + in_url)
        r = requests.get(in_url)
        if not r.ok:
            raise Exception('can not get url ' + in_url)
        time.sleep(tsleep)

        p = (r'<a href="/index.php/\?m=bible&template=(.*?)&chapter=(.*?)"  '
             '>(.*?)</a>')
        for x in re.findall(p, r.text):
            book, chapter, title = x
            url = ('{}/index.php/?m=bible&template={}&chapter={}'.format(
                host, book, chapter))
            out = {
                'book': book,
                'chapter': chapter,
                'title': title,
                'url': url,
            }
            data.append(out)

    except Exception as e:
        error = '*ERROR* ' + str(e)
        data = []
        raise

    return data, error
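
These snippets are excerpts from scrape.py modules in zyhuang/ragno; the module-level imports and globals they rely on are not shown. A rough sketch of what the excerpts assume is given below (host and token are placeholders that each scraper sets for its own site):

import html
import json
import os
import random
import re
import time
from os.path import join, abspath, dirname
from string import ascii_uppercase
from urllib.parse import urljoin

import requests
from lxml import etree

import utils  # project helpers: qprint, load_json, write_json, parallel_call

host = 'http://example.org'  # placeholder: site root used to build absolute URLs
token = ''                   # placeholder: only the API scraper (Example #14) uses it

A minimal call of get_chapter_links, assuming host points at the site's chapter index page:

links, error = get_chapter_links(host + '/index.php/?m=bible', tsleep=1)
if error is None:
    for item in links:
        print(item['book'], item['chapter'], item['title'], item['url'])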
Example #2
File: scrape.py  Project: zyhuang/ragno
def get_chapter_links(in_url, tsleep):

    data = []
    error = None

    try:
        utils.qprint('get ' + in_url)
        r = requests.get(in_url)
        if not r.ok:
            raise Exception('can not get url ' + in_url)
        time.sleep(tsleep)

        book = re.findall('&idlibroz=(.*?)$', in_url)[0]
        p = (r'<a id="ext" href="(.*?)" alt="altri capitoli del '
             'libro">(.*?)</a>')

        for x in re.findall(p, r.text):
            url, chapter = x
            y = re.findall(r'Libro=(.*?)&capitolo=(.*?)&', url)
            title, chapter = y[0]
            url = host + url
            out = {
                'book': book,
                'chapter': chapter,
                'title': title,
                'url': url,
            }
            data.append(out)

    except Exception as e:
        error = '*ERROR* ' + str(e)
        data = []
        raise

    return data, error
Example #3
def get_journal_links(in_url, tsleep):
    '''Get all journal links under input URL
    https://www.scijournal.org/list-of-impact-factor-journal_A.shtml
    '''

    data = {}
    error = None
    try:
        utils.qprint('get ' + in_url)
        r = requests.get(in_url)
        if not r.ok:
            raise Exception('can not get url ' + in_url)
        time.sleep(tsleep)
        text = etree.HTML(r.text)
        for a in text.xpath('//center/h2/a'):
            title = a.text
            url = a.get('href')
            if not url.startswith('impact-factor-of-'):
                continue
            url = urljoin(host, url)
            data[url] = title

    except Exception as e:
        error = '*ERROR* ' + str(e)
        data = {}

    return data, error
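
A usage sketch, using the listing URL from the docstring:

links, error = get_journal_links(
    'https://www.scijournal.org/list-of-impact-factor-journal_A.shtml', tsleep=1)
if error:
    print(error)
else:
    for url, title in sorted(links.items()):
        print(title, url)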
Example #4
File: scrape.py  Project: zyhuang/ragno
def get_journal_links(in_url, tsleep):
    '''Get all journal links under input URL
    http://www.bioxbio.com/if/subject/{subject}-{n}.html
    '''

    data = {}
    error = None
    try:
        utils.qprint('get ' + in_url)
        r = requests.get(in_url)
        if not r.ok:
            raise Exception('can not get url ' + in_url)
        time.sleep(tsleep)
        text = etree.HTML(r.text)
        for a in text.xpath('//tr/td/a'):
            title = a.text
            url = a.get('href')
            url = urljoin(in_url, url)
            data[url] = title

    except Exception as e:
        error = '*ERROR* ' + str(e)
        data = {}
        raise

    return data, error
Example #5
def get_chapter_text(in_url, out_json, tsleep):

    data = {}
    error = None

    if os.path.isfile(out_json):
        return data, error

    try:
        utils.qprint('get ' + in_url)
        r = requests.get(in_url)
        if not r.ok:
            raise Exception('can not get url ' + in_url)
        time.sleep(tsleep)
        text = etree.HTML(r.text)

        data['url'] = in_url

        pattern = (r'<span class="chapter_title"> (.*?) <i class="glyphicon '
                   'glyphicon-chevron-right "> </i> (.*?) <i class="glyphicon '
                   'glyphicon-chevron-right"> </i> (.*?)</span>')
        x = re.findall(pattern, r.text)
        if x:
            data['volume'] = x[0][0]
            data['book'] = x[0][1]
            data['chapter'] = x[0][2]
        else:
            data['volume'] = None
            data['book'] = None
            data['chapter'] = None

        phrase_set = set()
        data['content'] = []
        for x in text.xpath('//div[@id="bible_chapter_content"]/*'):
            t = {'vers': None, 'text': None}
            if x.tag == 'p':
                t = {'vers': x.get('value'), 'text': x.text.split('  ', 1)[-1]}
            else:
                t = {'vers': '', 'text': x.text}
            if t['vers'] is None and t['text'] is None:
                raise Exception(
                    'can not extract content from "{}" (url={})'.format(
                        x.text, in_url))
            # avoid duplicate entry
            phrase = '|{}|{}|'.format(t['vers'], t['text'])
            if phrase not in phrase_set:
                data['content'].append(t)
                phrase_set.add(phrase)

        utils.write_json(data, out_json)
        data = {}

    except Exception as e:
        error = '*ERROR* ' + str(e)
        data = {}
        raise

    return data, error
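
A usage sketch that feeds get_chapter_text the entries returned by get_chapter_links (Example #1); the output paths are hypothetical and their directory is assumed to exist:

for item in links:
    out_json = 'chapters/{}_{}.json'.format(item['book'], item['chapter'])
    _, error = get_chapter_text(item['url'], out_json, tsleep=1)
    if error:
        print(error)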
Example #6
def print_csv_short_list():

    data = utils.load_json('../data/short_list/omim_diseases.json')

    fname_out = 'csv/omim_diseases.csv'
    utils.qprint('writing ' + fname_out)
    fout = open(fname_out, 'w')

    print('oid,name_cn,name_en', file=fout)
    for k,v in sorted(data.items()):
        print('OMIM:'+k, v['name_cn'].replace(',',''),
              v['name_en'].replace(',',''), sep=',', file=fout)
    fout.close()
Example #7
File: scrape.py  Project: zyhuang/ragno
def get_chapter_text(in_url, out_json, tsleep):

    data = {}
    error = None

    # if os.path.isfile(out_json):
    #     return data, error

    try:
        utils.qprint('get ' + in_url)
        r = requests.get(in_url)
        if not r.ok:
            raise Exception('can not get url ' + in_url)
        time.sleep(tsleep)
        text = etree.HTML(r.text)

        data['url'] = in_url
        x = text.xpath('//div[@class="libro"]')
        data['book'] = x[0].text if x else None

        x = text.xpath('//div[@id="capitolo"]')
        data['chapter'] = x[0].text.strip() if x else None

        data['content'] = []
        el = text.xpath('//div[@class="testidellibro"]')[0]
        s = html.unescape(etree.tostring(el).decode('utf-8'))
        for line in s.split('\n'):
            if not line.startswith('<sup>'):
                continue
            # x = re.findall(r'<sup>.*?<a .*?>(.*?)</a></sup>(.*?)$', line)
            x = re.findall(r'<a (.*?)>(.*?)</a></sup>(.*?)$', line)
            if not x:
                continue
            atag, vers, phrase = x[0]
            vers = re.findall(r'name="VER_(.*?)"', atag)[0]
            phrase = re.sub(r'<.*?>', ' ', phrase)
            phrase = re.sub(r'\t', ' ', phrase)
            phrase = re.sub(r' +', ' ', phrase)
            t = {'vers': vers, 'text': phrase.strip()}
            data['content'].append(t)

        utils.write_json(data, out_json)
        data = {}

    except Exception as e:
        error = '*ERROR* ' + str(e)
        data = {}
        raise

    return data, error
Example #8
File: scrape.py  Project: zyhuang/ragno
def get_all_journal_data(all_journal_dict,
                         out_dir,
                         nproc=3,
                         nretry=10,
                         tsleep=1):
    '''Get data of all journals'''

    os.makedirs(out_dir, exist_ok=True)

    args_list = []
    for subject in sorted(all_journal_dict):
        for url in sorted(all_journal_dict[subject]):
            journal_abbrev = re.findall(r'/if/html/(.*?).html', url)[0].lower()
            out_json = '{}/{}.json'.format(out_dir, journal_abbrev)
            args_list.append([url, out_json, tsleep])

    args_list = sorted(args_list, key=lambda x: x[0])
    #     if len(args_list) > 10:
    #         break
    # if len(args_list) > 10:
    #     break

    done_set = set()
    cycle = 0
    while True:
        tmp = utils.parallel_call(get_journal_data, args_list, nproc, nretry)
        cycle += 1

        todo_set = set()
        for x in tmp:
            for url in x['done']:
                done_set.add(url)
            for url in x['todo']:
                todo_set.add(url)

        todo_set = todo_set - done_set
        if len(todo_set) == 0:
            break

        args_list = []
        for url in sorted(todo_set):
            journal_abbrev = re.findall(r'/if/html/(.*?).html', url)[0].lower()
            out_json = '{}/{}.json'.format(out_dir, journal_abbrev)
            args_list.append([url, out_json, tsleep])

        utils.qprint('after cycle {}, get {} new journals'.format(
            cycle, len(args_list)))
Example #9
def get_journal_data(in_url, out_json, tsleep):
    '''Get journal data (name, issn, impact factors) from input URL
    https://www.scijournal.org/impact-factor-of-ACM-SIGPLAN-NOTICES.shtml
    and get URL of next/previous journals
    '''

    data = {}
    error = None

    if os.path.isfile(out_json):
        return data, error

    try:
        utils.qprint('get ' + in_url)
        r = requests.get(in_url)
        if not r.ok:
            raise Exception('can not get url ' + in_url)
        time.sleep(tsleep)
        root = etree.HTML(r.text)

        data = {}
        data['url'] = in_url
        x = root.xpath('//h1')[0].text
        data['title'] = x.replace('Impact Factor','').strip()

        x = re.findall(r'Journal Abbreviation: (.*?)<br>', r.text)
        data['title_abbrev'] = x[0] if x else None

        x = re.findall('Journal ISSN: (.*?)$', r.text, re.MULTILINE)
        data['issn'] = x[0] if x else None

        data['impact'] = {}
        years = ['2016/2017']
        for i in range(2008, 2016):
            years.append(str(i))
        for year in years:
            x = re.findall(r'{} Impact Factor : (.*?)<br>'.format(year), r.text)
            data['impact'][year] = x[0] if x else '-NA-'

        utils.write_json(data, out_json)

    except Exception as e:
        error = '*ERROR* ' + str(e)
        data = {}

    return data, error
Example #10
def get_all_journal_links(nproc=3, nretry=10, tsleep=1):
    '''Get urls of all journals'''

    subject_names = [
        'agriculture-and-forestry', 'astronomy', 'biology',
        'chemistry', 'engineering', 'environmental-science', 'geoscience',
        'medicine', 'math', 'management-science', 'physics', 'social-science',
    ]

    subject_list = [
        '{}/{}-journal-impact-factor-list.shtml'.format(host, x)
        for x in subject_names
    ]

    number_list = [
        '{}/list-of-impact-factor-journal_{}.shtml'.format(host, x)
        for x in range(1,91)
    ]

    alphabet_list = [
        '{}/list-of-impact-factor-journal_{}.shtml'.format(host, x)
        for x in ascii_uppercase
    ]

    url_set = set(subject_list)
    url_set.update(set(number_list))
    url_set.update(set(alphabet_list))

    args_list = []
    for url in sorted(url_set):
        args_list.append([url, tsleep])

    tmp = utils.parallel_call(get_journal_links, args_list, nproc, nretry)

    data = {}
    for x in tmp:
        for k,v in x.items():
            data[k] = v

    utils.qprint('get urls of {} journals'.format(len(data)))

    return data
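
A sketch of how the scijournal.org pieces chain together, feeding each journal URL into get_journal_data (Example #9); the output directory and the abbreviation rule are assumptions:

journal_links = get_all_journal_links(nproc=3, nretry=10, tsleep=1)
os.makedirs('../data/scijournal', exist_ok=True)
for url in sorted(journal_links):
    abbrev = url.split('/')[-1].replace('.shtml', '').lower()
    out_json = '../data/scijournal/{}.json'.format(abbrev)
    _, error = get_journal_data(url, out_json, tsleep=1)
    if error:
        print(error)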
Example #11
def print_csv_full_data():

    fname_out = 'csv/full_list.csv'
    utils.qprint('writing ' + fname_out)
    fout = open(fname_out, 'w')

    print('oid,name_cn,name_en', file=fout)
    in_dir = '../data/full_data'
    for fname in sorted(os.listdir(in_dir)):
        oid_fname = fname.split('.')[1]
        db_type = None
        if fname.startswith('hp.'):
            db_type = 'HP'
        elif fname.startswith('omim.'):
            db_type = 'OMIM'
        fname = in_dir + '/' + fname
        data = utils.load_json(fname, verbose=False)

        oid, name_cn, name_en = None, None, None
        if db_type == 'HP':
            for x in data['results']:
                if x['hpoId'] != 'HP:' + oid_fname:
                    continue
                oid = x['hpoId']
                name_cn = ('{};{}'.format(x['name_cn'], x['definition_cn'])
                           .replace(',',''))
                name_en = ('{};{}'.format(x['name_en'], x['definition_en'])
                           .replace(',',''))

        elif db_type == 'OMIM':
            for x in data['results']:
                if str(x['mimNumber']) != oid_fname:
                    continue
                oid = 'OMIM:' + str(x['mimNumber'])
                name_cn = x['cnTitle'].replace(',','')
                name_en = x['preTitle'].replace(',','')

        if oid:
            print(oid, name_cn, name_en, sep=',', file=fout)

    fout.close()
Example #12
def format_csv(timestamp):

    data_json = '../data/{}.subject.json'.format(timestamp)
    data_dir = '../data/{}'.format(timestamp)
    out_csv = '../data/{}.csv'.format(timestamp)
    data = utils.load_json(data_json, verbose=True)
    url_with_subj = dict()
    for subj in data:
        for url in data[subj]:
            url_with_subj[url] = subj

    utils.qprint('writing ' + out_csv)
    fout = open(out_csv, 'w')
    for fname in os.listdir(data_dir):
        journal = fname.split('/')[-1].replace('.json', '').lower()
        fname = join(data_dir, fname)
        jdata = utils.load_json(fname, verbose=False)
        if jdata['url'] in url_with_subj:
            subj = url_with_subj[jdata['url']]
        else:
            subj = 'unknown'
        print(subj,
              journal,
              jdata['title'].replace(',', ''),
              jdata['impact']['2016/2017']['ifact'],
              jdata['impact']['2015']['ifact'],
              jdata['impact']['2014']['ifact'],
              jdata['impact']['2016/2017']['npub'],
              jdata['impact']['2015']['npub'],
              jdata['impact']['2014']['npub'],
              jdata['title_abbrev'].replace(',', ''),
              jdata['issn'].replace(',', ''),
              jdata['url'],
              sep=',',
              file=fout)

    for subj in sorted(data):
        for url in sorted(data[subj]):
            journal = url.split('/')[-1].replace('.html', '').lower()
            fname = '{}/{}.json'.format(data_dir, journal)
    fout.close()
Example #13
File: scrape.py  Project: zyhuang/ragno
def get_all_journal_links(nproc=3, nretry=10, tsleep=1):
    '''Get urls of all journals'''

    subject_list = [
        'biology',
        'medicine',
        'agriculture',
        'chemistry',
        'geoscience',
        'astronomy',
        'engineering',
        'management',
        'environmental',
        'math',
        'physics',
        'social',
    ]

    data = {}
    for subject in subject_list:
        npage, error = get_subject_npage(subject)
        args_list = []
        for page in range(1, npage + 1):
            url = ('http://www.bioxbio.com/if/subject/{}-{}.html'.format(
                subject, page))
            args_list.append([url, tsleep])
        tmp = utils.parallel_call(get_journal_links, args_list, nproc, nretry)
        data[subject] = {}
        for x in tmp:
            for k, v in x.items():
                data[subject][k] = v

        utils.qprint('get urls of {} journals of subject {}'.format(
            len(data[subject]), subject))

    return data
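
The bioxbio.com scrapers plug together the same way; a driver sketch with a hypothetical output directory:

all_links = get_all_journal_links(nproc=3, nretry=10, tsleep=1)
get_all_journal_data(all_links, '../data/bioxbio', nproc=3, nretry=10, tsleep=1)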
Example #14
def get_data(oid, out_json, proxy_set, tsleep):
    '''input Ontology ID, output in ../data
    '''

    if os.path.isfile(out_json):
        return None, None

    out_dir = abspath(dirname(out_json))
    os.makedirs(out_dir, exist_ok=True)

    proxies = None
    if len(proxy_set):
        proxy = random.choice(list(proxy_set))
        proxies = {'http': proxy}

    headers = {
        'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/39.0.2171.95 Safari/537.36'),
        'Authorization': token,
    }
    timeout_connect = 10
    timeout_read = 30
    timeout = (timeout_connect, timeout_read)

    # need to change
    url = None
    if oid.startswith('HP:'):
        url = host + '/hpo/?search={}&type=0&page=1'.format(oid)
    elif oid.startswith('OMIM:'):
        oid = oid.split(':')[1]
        url = host + '/omim/?search={}&type=1&page=1'.format(oid)
    else:
        raise Exception('input ID is not HP or OMIM')

    data = dict()
    error = None
    try:
        utils.qprint('get ' + url)
        r = requests.get(url,
                         headers=headers,
                         proxies=proxies,
                         timeout=timeout)
        time.sleep(tsleep)
        if not r.ok:
            raise Exception('can not get url: ' + url)

        data = json.loads(r.text)
        if len(data) != 4:
            raise Exception('output json seems incorrect (missing keys)')

        utils.write_json(data, out_json)

    except Exception as e:
        error = '*ERROR* ' + str(e)
        data = dict()
        # print(error)
        # raise

    return None, error
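
A usage sketch, assuming host and token are configured for the target API; the OMIM number is an example and the output name follows the omim.{id}.json convention that print_csv_full_data (Example #11) expects:

_, error = get_data('OMIM:100100', '../data/full_data/omim.100100.json',
                    proxy_set=set(), tsleep=1)
if error:
    print(error)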
Example #15
File: scrape.py  Project: zyhuang/ragno
def get_journal_data(in_url, out_json, tsleep):
    '''Get journal data (name, issn, impact factors) from input URL
    http://www.bioxbio.com/if/html/{journal}.html
    and get URL of next/previous journals
    '''

    data = {'done': [], 'todo': []}
    error = None

    try:
        utils.qprint('get ' + in_url)
        r = requests.get(in_url)
        if not r.ok:
            raise Exception('can not get url ' + in_url)
        time.sleep(tsleep)
        root = etree.HTML(r.text)

        if not os.path.isfile(out_json):

            jdata = {}
            jdata['url'] = in_url
            jdata['title'] = root.xpath('//div/h1')[0].text
            x = re.findall(r'<p>Journal Abbreviation: (.*?)<br>', r.text)
            jdata['title_abbrev'] = x[0] if x else None
            x = re.findall(r'Journal ISSN: (.*?)</p>', r.text)
            jdata['issn'] = x[0] if x else None

            jdata['impact'] = {}
            for tr in root.xpath('//table/tr'):
                td_list = tr.xpath('./td')
                if len(td_list) != 3:
                    continue
                year, ifact, npub = [x.text for x in td_list]
                if year == 'Year':
                    continue
                try:
                    ifact = float(ifact)
                except:
                    ifact = -1
                try:
                    npub = int(npub)
                except:
                    npub = -1
                jdata['impact'][year] = {
                    'ifact': ifact,
                    'npub': npub,
                }

            utils.write_json(jdata, out_json)

        data['done'].append(in_url)

        # get prev and next url
        a = root.xpath('//div[@class="col-md-6 col-sm-12 text-left"]/a')
        data['todo'].append('http://www.bioxbio.com/if/html/' +
                            a[0].get('href'))
        a = root.xpath('//div[@class="col-md-6 col-sm-12 text-right"]/a')
        data['todo'].append('http://www.bioxbio.com/if/html/' +
                            a[0].get('href'))

    except Exception as e:
        error = '*ERROR* ' + str(e)
        data = {'done': [], 'todo': []}

    return data, error
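
A single-journal sketch; the journal page and output path are hypothetical (the output directory is assumed to exist), and the returned done/todo lists are what the crawl cycle in Example #8 consumes:

data, error = get_journal_data('http://www.bioxbio.com/if/html/nature.html',
                               '../data/bioxbio/nature.json', tsleep=1)
if not error:
    print('done:', data['done'])
    print('todo:', data['todo'])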