Example #1
File: fix.py Project: zyhuang/ragno
def fix_it(timestamp, book_chapter, content_err_list, content_fix_list):

    fix_dir = '../data/{}.fix'.format(timestamp)
    os.makedirs(fix_dir, exist_ok=True)
    fin_json = '../data/{}/{}.json'.format(timestamp, book_chapter)
    fold_json = '{}/{}.old.json'.format(fix_dir, book_chapter)
    fnew_json = '{}/{}.new.json'.format(fix_dir, book_chapter)

    data = utils.load_json(fin_json)
    new_content = []
    content_changed = False
    for i, x in enumerate(data['content']):
        has_issue = False
        for y, zz in zip(content_err_list, content_fix_list):
            if (x['vers'] == y['vers'] and x['text'] == y['text']):
                has_issue = True
                for z in zz:
                    new_content.append(z)
                content_changed = True
        if not has_issue:
            new_content.append(x)

    if not content_changed:
        print(':: target problem is not found in {}'.format(fin_json),
              file=sys.stderr,
              flush=True)
        return

    qcmd('cp {} {}'.format(fin_json, fold_json))
    data['content'] = new_content
    utils.write_json(data, fnew_json)
    qcmd('cp {} {}'.format(fnew_json, fin_json))
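A minimal call sketch for fix_it follows. The chapter name and verse records are hypothetical placeholders; only the list-of-lists shape of content_fix_list, which pairs each bad entry with its replacement entries, is taken from the function above.

# Hypothetical usage of fix_it (placeholder values only):
# each entry in content_err_list is matched against data['content'] by 'vers' and 'text',
# and is replaced by the corresponding list of entries in content_fix_list.
content_err_list = [{'vers': '3', 'text': 'garbled verse text'}]
content_fix_list = [[{'vers': '3', 'text': 'corrected verse text'}]]
fix_it('201804140057', 'genesis_1', content_err_list, content_fix_list)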
Example #2
def get_chapter_text(in_url, out_json, tsleep):

    data = {}
    error = None

    if os.path.isfile(out_json):
        return data, error

    try:
        utils.qprint('get ' + in_url)
        r = requests.get(in_url)
        if not r.ok:
            raise Exception('can not get url ' + in_url)
        time.sleep(tsleep)
        text = etree.HTML(r.text)

        data['url'] = in_url

        pattern = (r'<span class="chapter_title"> (.*?) <i class="glyphicon '
                   'glyphicon-chevron-right "> </i> (.*?) <i class="glyphicon '
                   'glyphicon-chevron-right"> </i> (.*?)</span>')
        x = re.findall(pattern, r.text)
        if x:
            data['volume'] = x[0][0]
            data['book'] = x[0][1]
            data['chapter'] = x[0][2]
        else:
            data['volume'] = None
            data['book'] = None
            data['chapter'] = None

        phrase_set = set()
        data['content'] = []
        for x in text.xpath('//div[@id="bible_chapter_content"]/*'):
            t = {'vers': None, 'text': None}
            if x.tag == 'p':
                t = {'vers': x.get('value'), 'text': x.text.split('  ', 1)[-1]}
            else:
                t = {'vers': '', 'text': x.text}
            if t['vers'] is None and t['text'] is None:
                raise Exception(
                    'can not extract content from "{}" (url={})'.format(
                        x.text, in_url))
            # avoid duplicate entry
            phrase = '|{}|{}|'.format(t['vers'], t['text'])
            if phrase not in phrase_set:
                data['content'].append(t)
                phrase_set.add(phrase)

        utils.write_json(data, out_json)
        data = {}

    except Exception as e:
        error = '*ERROR* ' + str(e)
        data = {}
        # re-raise so the failure is not silently swallowed; the return below is skipped
        raise

    return data, error
Example #3
File: scrape.py Project: zyhuang/ragno
def get_chapter_text(in_url, out_json, tsleep):

    data = {}
    error = None

    # if os.path.isfile(out_json):
    #     return data, error

    try:
        utils.qprint('get ' + in_url)
        r = requests.get(in_url)
        if not r.ok:
            raise Exception('can not get url ' + in_url)
        time.sleep(tsleep)
        text = etree.HTML(r.text)

        data['url'] = in_url
        x = text.xpath('//div[@class="libro"]')
        data['book'] = x[0].text if x else None

        x = text.xpath('//div[@id="capitolo"]')
        data['chapter'] = x[0].text.strip() if x else None

        data['content'] = []
        el = text.xpath('//div[@class="testidellibro"]')[0]
        s = html.unescape(etree.tostring(el).decode('utf-8'))
        for line in s.split('\n'):
            if not line.startswith('<sup>'):
                continue
            # x = re.findall(r'<sup>.*?<a .*?>(.*?)</a></sup>(.*?)$', line)
            x = re.findall(r'<a (.*?)>(.*?)</a></sup>(.*?)$', line)
            if not x:
                continue
            atag, vers, phrase = x[0]
            vers = re.findall(r'name="VER_(.*?)"', atag)[0]
            phrase = re.sub(r'<.*?>', ' ', phrase)
            phrase = re.sub(r'\t', ' ', phrase)
            phrase = re.sub(r' +', ' ', phrase)
            t = {'vers': vers, 'text': phrase.strip()}
            data['content'].append(t)

        utils.write_json(data, out_json)
        data = {}

    except Exception as e:
        error = '*ERROR* ' + str(e)
        data = {}
        # re-raise so the failure is not silently swallowed; the return below is skipped
        raise

    return data, error
Example #4
def get_all_chapter_links(out_json, tsleep=1, nproc=3, nretry=10):

    data = []
    args_list = []
    for i in range(1, 74):
        url = '{}/index.php/?m=bible&template={}'.format(host, i)
        args_list.append([url, tsleep])
    tmp = utils.parallel_call(get_chapter_links,
                              args_list,
                              nproc=nproc,
                              nretry=nretry)
    for x in tmp:
        for y in x:
            data.append(y)

    utils.write_json(data, out_json)
    return data
Example #5
def get_journal_data(in_url, out_json, tsleep):
    '''Get journal data (name, issn, impact factors) from input URL
    https://www.scijournal.org/impact-factor-of-ACM-SIGPLAN-NOTICES.shtml
    and get URL of next/previous journals
    '''

    data = {}
    error = None

    if os.path.isfile(out_json):
        return data, error

    try:
        utils.qprint('get ' + in_url)
        r = requests.get(in_url)
        if not r.ok:
            raise Exception('can not get url ' + in_url)
        time.sleep(tsleep)
        root = etree.HTML(r.text)

        data = {}
        data['url'] = in_url
        x = root.xpath('//h1')[0].text
        data['title'] = x.replace('Impact Factor','').strip()

        x = re.findall(r'Journal Abbreviation: (.*?)<br>', r.text)
        data['title_abbrev'] = x[0] if x else None

        x = re.findall('Journal ISSN: (.*?)$', r.text, re.MULTILINE)
        data['issn'] = x[0] if x else None

        data['impact'] = {}
        years = ['2016/2017']
        for i in range(2008, 2016):
            years.append(str(i))
        for year in years:
            x = re.findall(r'{} Impact Factor : (.*?)<br>'.format(year), r.text)
            data['impact'][year] = x[0] if x else '-NA-'

        utils.write_json(data, out_json)

    except Exception as e:
        error = '*ERROR* ' + str(e)
        data = {}

    return data, error
Example #6
File: scrape.py Project: zyhuang/ragno
def get_all_chapter_links(out_json, tsleep=1, nproc=3, nretry=10):

    data = []
    args_list = []
    for i in range(1, 74):
        url = ('{}/pls/labibbia_new/Bibbia_Utils.elenco_capitoli?'
               'origine=cei2008&idlibroz={}'.format(host, i))
        args_list.append([url, tsleep])
    tmp = utils.parallel_call(get_chapter_links,
                              args_list,
                              nproc=nproc,
                              nretry=nretry)
    for x in tmp:
        for y in x:
            data.append(y)

    utils.write_json(data, out_json)
    return data
Example #7
def test():

    # data, error = get_journal_links(
    #     # 'https://www.scijournal.org/list-of-impact-factor-journal_Z.shtml',
    #     'https://www.scijournal.org/agriculture-and-forestry-journal-impact-factor-list.shtml',
    #     0)

    # url = 'https://www.scijournal.org/impact-factor-of-NAT-REV-CANCER.shtml'
    # url = 'https://www.scijournal.org/impact-factor-of-HEALTH-INFORM-J.shtml'
    # data, error = get_journal_data(
    #     url,
    #     # 'https://www.scijournal.org/impact-factor-of-NATURE.shtml',
    #     'foo.json', 0)
    # print(json.dumps(data, sort_keys=True, indent=2))
    # print(len(data))
    # print(error)

    data = get_all_journal_links(nproc=3, nretry=10, tsleep=1)
    utils.write_json(data, 'foo.json')
Example #8
def get_data(oid, out_json, proxy_set, tsleep):
    '''input Ontology ID, output in ../data
    '''

    if os.path.isfile(out_json):
        return None, None

    out_dir = abspath(dirname(out_json))
    os.makedirs(out_dir, exist_ok=True)

    proxies = None
    if len(proxy_set):
        proxy = random.choice(list(proxy_set))
        proxies = {'http': proxy}

    headers = {
        'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/39.0.2171.95 Safari/537.36'),
        'Authorization': token,
    }
    timeout_connect = 10
    timeout_read = 30
    timeout = (timeout_connect, timeout_read)

    # need to change
    url = None
    if oid.startswith('HP:'):
        url = host + '/hpo/?search={}&type=0&page=1'.format(oid)
    elif oid.startswith('OMIM:'):
        oid = oid.split(':')[1]
        url = host + '/omim/?search={}&type=1&page=1'.format(oid)
    else:
        raise ValueError('input ID is not HP or OMIM')

    data = dict()
    error = None
    try:
        utils.qprint('get ' + url)
        r = requests.get(url,
                         headers=headers,
                         proxies=proxies,
                         timeout=timeout)
        time.sleep(tsleep)
        if not r.ok:
            raise Exception('can not get url: ' + url)

        data = json.loads(r.text)
        if len(data) != 4:
            raise Exception('output json seems incorrect (missing keys)')

        utils.write_json(data, out_json)

    except Exception as e:
        error = '*ERROR* ' + str(e)
        data = dict()
        # print(error)
        # raise

    return None, error
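A minimal call sketch for get_data follows. The ontology ID, output path, and proxy address are placeholders, and host and token are assumed to be module-level globals, as the function body implies.

# Hypothetical usage of get_data (placeholder ID, path, and proxy):
proxy_set = {'http://127.0.0.1:8080'}
_, error = get_data('HP:0001250', '../data/HP_0001250.json', proxy_set, tsleep=1)
if error:
    print(error)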
Example #9
File: scrape.py Project: zyhuang/ragno
def get_journal_data(in_url, out_json, tsleep):
    '''Get journal data (name, issn, impact factors) from input URL
    http://www.bioxbio.com/if/html/{journal}.html
    and get URL of next/previous journals
    '''

    data = {'done': [], 'todo': []}
    error = None

    try:
        utils.qprint('get ' + in_url)
        r = requests.get(in_url)
        if not r.ok:
            raise Exception('can not get url ' + in_url)
        time.sleep(tsleep)
        root = etree.HTML(r.text)

        if not os.path.isfile(out_json):

            jdata = {}
            jdata['url'] = in_url
            jdata['title'] = root.xpath('//div/h1')[0].text
            x = re.findall(r'<p>Journal Abbreviation: (.*?)<br>', r.text)
            jdata['title_abbrev'] = x[0] if x else None
            x = re.findall(r'Journal ISSN: (.*?)</p>', r.text)
            jdata['issn'] = x[0] if x else None

            jdata['impact'] = {}
            for tr in root.xpath('//table/tr'):
                td_list = tr.xpath('./td')
                if len(td_list) != 3:
                    continue
                year, ifact, npub = [x.text for x in td_list]
                if year == 'Year':
                    continue
                try:
                    ifact = float(ifact)
                except (TypeError, ValueError):
                    ifact = -1
                try:
                    npub = int(npub)
                except (TypeError, ValueError):
                    npub = -1
                jdata['impact'][year] = {
                    'ifact': ifact,
                    'npub': npub,
                }

            utils.write_json(jdata, out_json)

        data['done'].append(in_url)

        # get prev and next url
        a = root.xpath('//div[@class="col-md-6 col-sm-12 text-left"]/a')
        data['todo'].append('http://www.bioxbio.com/if/html/' +
                            a[0].get('href'))
        a = root.xpath('//div[@class="col-md-6 col-sm-12 text-right"]/a')
        data['todo'].append('http://www.bioxbio.com/if/html/' +
                            a[0].get('href'))

    except Exception as e:
        error = '*ERROR* ' + str(e)
        data = {'done': [], 'todo': []}

    return data, error
Example #10
File: scrape.py Project: zyhuang/ragno
def test():

    data, error = get_journal_data(
        'http://www.bioxbio.com/if/html/Z-PADAGOGIK.html', 'foo.json', 0)
    print(data)
    print(error)


if __name__ == '__main__':

    # timestamp = '201804140057'
    # catch_missing(timestamp)
    # test()

    if len(sys.argv) != 2:
        timestamp = utils.get_timestamp()
    else:
        timestamp = sys.argv[1]

    out_dir = '../data/{}'.format(timestamp)
    subject_json = '../data/{}.subject.json'.format(timestamp)

    data = {}
    if os.path.isfile(subject_json):
        data = utils.load_json(subject_json)
    else:
        data = get_all_journal_links()
        utils.write_json(data, subject_json)

    get_all_journal_data(data, out_dir)
Example #11
    # print(json.dumps(data, sort_keys=True, indent=2))
    # print(len(data))
    # print(error)

    data = get_all_journal_links(nproc=3, nretry=10, tsleep=1)
    utils.write_json(data, 'foo.json')




if __name__ == '__main__':

    # test()

    if len(sys.argv) != 2:
        timestamp = utils.get_timestamp()
    else:
        timestamp = sys.argv[1]

    out_dir = '../data/{}'.format(timestamp)
    url_json = '../data/{}.url.json'.format(timestamp)

    data = {}
    if os.path.isfile(url_json):
        data = utils.load_json(url_json)
    else:
        data = get_all_journal_links()
        utils.write_json(data, url_json)

    get_all_journal_data(data, out_dir)