def down_vol():
    sql_up = "update vol set stat = 1 where vol_url = %s"
    result = []
    while True:
        conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
        cur = conn.cursor()
        sql = "select pub_year,vol,num,vol_url from vol where stat = 0 limit 1000"
        cur.execute(sql)
        rows = cur.fetchall()
        if len(rows) == 0:
            break
        else:
            for pub_year, vol, num, vol_url in rows:
                fdir = detail_path + '\\' + now_time + '\\' + pub_year + '\\' + vol
                if not os.path.exists(fdir):
                    os.makedirs(fdir)
                feature = "highwire-cite-title"
                res = utils.get_html(vol_url,
                                     feature=feature,
                                     proxies=proxy,
                                     timeout=30)
                if res:
                    fname = '%s/%s.html' % (fdir, num)
                    with open(fname, 'w', encoding='utf8') as f:
                        f.write(res.text)
                    utils.printf("下载%s年%s卷%s期成功" % (pub_year, vol, num))
                    result.append((vol_url))
                if utils.parse_results_to_sql(conn, sql_up, result, 50):
                    print("更新%s条成功" % len(result))
                    result.clear()
            utils.parse_results_to_sql(conn, sql_up, result)
            print("更新剩下%s条成功" % len(result))
            result.clear()
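All of these examples flush their SQL writes through a project-local helper, utils.parse_results_to_sql, whose source is not included in this listing. Judging from the call sites, it buffers until the list reaches the given batch size, then runs executemany and commits, returning True so the caller knows to clear the list; called without a batch size it flushes whatever is left. A minimal sketch under those assumptions:

def parse_results_to_sql(conn, sql, results, batch_size=None):
    # Hypothetical reconstruction from the call sites; the real helper may differ.
    if batch_size is not None and len(results) < batch_size:
        return False  # keep buffering
    cur = conn.cursor()
    cur.executemany(sql, results)  # results is a list of parameter tuples
    conn.commit()
    cur.close()
    return True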
Example #2
def get_info_from_journal_list():
    sql = """
        insert ignore into journal_list (j_id,journal_url, journal_name, journal_issn) values(%s,%s,%s,%s)
    """
    result = []
    for root, dirs, files in os.walk(journal_list_path):
        conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
        for file in files:
            file_path = root + '/' + file
            print(file_path)
            with open(file_path, encoding='utf-8') as fp:
                text = fp.read()
            html = Selector(text, 'html')
            journal_url_list = html.xpath(
                "//table[@class='tabel-lab']//tr/td/a/@href").extract()
            journal_names = html.xpath(
                "//table[@class='tabel-lab']//tr/td/a/text()").extract()
            journal_issns = html.xpath(
                "//table[@class='tabel-lab']//tr/td[2]/span/text()").extract()
            for i, item in enumerate(journal_url_list):
                # http://www.sinomed.ac.cn/en/journalSearch.do?method=detail&id=27706&db=journal&dbtype=en
                journal_url = "http://www.sinomed.ac.cn/en/" + item
                j_id = re.findall("detail&id=(.*)&db=journal", item)[0]
                journal_name = journal_names[i]
                journal_issn = journal_issns[i].strip()
                result.append((j_id, journal_url, journal_name, journal_issn))
                if utils.parse_results_to_sql(conn, sql, result, 1000):
                    print("inserted %s rows" % len(result))
                    result.clear()
            utils.parse_results_to_sql(conn, sql, result)
            print("inserted remaining %s rows" % len(result))
            result.clear()
 def run(self):
     conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
     sql_up = "update detail set stat = 1 where url = %s"
     result = []
     while True:
         url = sql_queue.get()
         result.append((url,))
         utils.printf(result)
         utils.parse_results_to_sql(conn, sql_up, result)
         print("updated %s rows" % len(result))
         result.clear()
Example #4
 def down_cover(self):
     now_path = self.coverPath + '\\' + self.now_time
     if not os.path.exists(now_path):
         os.makedirs(now_path)
     sql_up = "update list set cover_stat = 1 where rawid = %s"
     result = []
     while True:
         sql = "select rawid, cover_url, title from list where cover_stat = 0 limit 1000"
         cur = self.conn.cursor()
         cur.execute(sql)
         rows = cur.fetchall()
         if len(rows) == 0:
             break
         else:
             for rawid, cover_url, title in rows:
                 path = now_path + '/%s.jpg' % rawid
                 if os.path.exists(path):
                     result.append((rawid,))
                     print('%s exists' % path)
                 elif 'zhlx.png' in cover_url:
                     result.append((rawid,))
                     print('%s no cover' % path)
                 elif cover_url == 'http://lib.fifedu.com/upload_dir/':
                     result.append((rawid,))
                     print('%s no cover' % path)
                 else:
                     try:
                         res = requests.get(cover_url, headers=self.headers)
                         if res.status_code == 200:
                             path = now_path + '/%s.jpg' % rawid
                             if utils.Img2Jpg(res.content, path):
                                 print('%s -- down cover right' % title)
                                 result.append((rawid,))
                             else:
                                 print('%s -- down cover error' % title)
                         else:
                             print('status_code != 200 ~')
                     except Exception as e:
                         print(e)
                 if utils.parse_results_to_sql(self.conn, sql_up, result, 100):
                     print('updated %s rows' % len(result))
                     result.clear()
             utils.parse_results_to_sql(self.conn, sql_up, result)
             print('updated remaining %s rows' % len(result))
             result.clear()
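down_cover leans on another helper, utils.Img2Jpg, which is also not shown. From the way it is called, it apparently decodes the downloaded bytes and writes them out as a JPEG, returning True on success. A plausible sketch using Pillow (an assumption, not the original code):

import io
from PIL import Image

def Img2Jpg(data, path):
    # Hypothetical: convert raw image bytes to a JPEG file on disk.
    try:
        Image.open(io.BytesIO(data)).convert('RGB').save(path, 'JPEG')
        return True
    except Exception:
        return False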
Example #5
 def parse_detail(self):
     cover_list = []
     cover_now_path = self.coverPath + '\\' + self.now_time
     for root, dirs, files in os.walk(cover_now_path):
         for file in files:
             rawid = file.replace('.jpg','')
             cover_list.append(rawid)
     print(len(cover_list))
     conn = sqlite3.connect("video.db3")
     now_path = self.detailPath + '\\' + self.now_time
     sub_db_id = '203'
     sub_db = 'DMT'
     provider = 'fifeduvideo'
     type_ = '10'
     language = 'ZH'
     country = 'CN'
     date = '1900'
     date_created = '19000000'
     medium = '2'
     sql = "insert into modify_title_info_zt(Lngid, rawid, provider, type, language, country, provider_url, provider_id, cover, batch, title, description, provider_subject, date, date_created, medium) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"
     result = []
     for root, dirs, files in os.walk(now_path):
         for file in files:
             rawid = file.replace('.html','')
             Lngid = utils.GetLngid(sub_db_id, rawid)
             provider_url = provider + '@' + "http://lib.fifedu.com/toVideoPage.do?id=%s" % rawid
             provider_id = provider + '@' + rawid
             if rawid in cover_list:
                 cover = "/smartlib" + "/" + provider + "/" + rawid + ".jpg"
             else:
                 cover = ''
             batch = str(self.now_time) + '00'
             file_path = root + '/' + file
             print(file_path)
             with open(file_path, encoding='utf8') as f:
                 text = f.read()
             html = Selector(text, type='html')
             provider_subject = description = title = ''
             provider_subjects = html.xpath("//div[@class='detailnavbody']/a/text()").extract()[1:-1]
             title = html.xpath("//div[@class='detailnavbody']/a/text()").extract()[-1]
             description = html.xpath("//div[@class='tabvalue']/text()").extract_first('').strip()
             for item in provider_subjects:
                 provider_subject += item + ';'
             # drop site-navigation categories that are not real subjects
             provider_subject = provider_subject.replace('在线课程库;', '').replace('玩转多语种;', '').replace('视听练兵场;', '')
             result.append(
                 (Lngid, rawid, provider, type_, language, country, provider_url, provider_id, cover, batch, title, description, provider_subject, date, date_created, medium)
             )
             # if utils.parse_results_to_sql(conn, sql, result, 100):
             #     print("inserted %s rows" % len(result))
             #     result.clear()
         utils.parse_results_to_sql(conn, sql, result)
         print("inserted remaining %s rows" % len(result))
         result.clear()
 def parse_list(self, message):
     utils.printf('%s: start parsing list pages...' % self.provider)
     conn = utils.init_db('mysql', 'cambridgejournal')
     result = []
     issueresult = []
     stmt = 'insert ignore into article(uid,url,gch) Values(%s,%s,%s)'
     sql = 'insert ignore into issue(url,stat) Values(%s,%s)'
     cnt = 0
     for filename, fullname in utils.file_list(self.list_path):
         with open(fullname, encoding='utf8') as f:
             text = f.read()
         gch = fullname.split('\\')[-2]
         soup = BeautifulSoup(text, 'lxml')
         ulTags = soup.select('ul.details')
         if len(ulTags) == 0:
             utils.logerror(fullname + '\n')
         for ulTag in ulTags:
             try:
                 aTag = ulTag.select_one('li.title > a')
                 if not aTag:
                     aTag = ulTag.select_one('li.title > h5 > a')
                 if aTag:
                     url = aTag.get('href')
                     uid = url.split('/')[-1]
                     result.append((uid, url, gch))
             except:
                 utils.printf(fullname)
                 utils.logerror(fullname)
                 break
         if filename.find('_') < 0:
             pageTag = soup.select_one('ul.pagination')
             if pageTag:
                 pTags = pageTag.select('li > a')
                 for pTag in pTags:
                     if operator.eq(pTag.get_text(), 'Last'):
                         pagenum = int(pTag.get('data-page-number'))
                         for page in range(2, pagenum + 1):
                             uri = '/core/journals/%s/issue/%s?pageNum=%s' % (
                                 gch, filename.replace('.html', ''), page)
                             issueresult.append((uri, 0))
         if utils.parse_results_to_sql(conn, stmt, result, 1000):
             cnt += len(result)
             result.clear()
             utils.printf(cnt)
     utils.parse_results_to_sql(conn, stmt, result)
     utils.parse_results_to_sql(conn, sql, issueresult)
     cnt += len(result)
     utils.printf(cnt)
     utils.printf('issues with more than one page: %s' % len(issueresult))
     conn.close()
     utils.printf('%s: finished parsing list pages...' % self.provider)
     self.senddistributefinish('startdown_detail')
def get_journal_cover():
    sql_up = "update journal_list set stat_cover = 1 where journal_url = %s"
    result = []
    while True:
        conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
        cur = conn.cursor()
        sql = "select j_id, journal_url, journal_name, cover_url from journal_list where stat_cover = 0 limit 1000"
        cur.execute(sql)
        rows = cur.fetchall()
        if len(rows) == 0:
            break
        else:
            for j_id, journal_url, journal_name, cover_url in rows:
                print(journal_name)
                if 'journalNameNull' in cover_url:
                    print("bad cover image")
                    result.append((journal_url,))
                elif cover_url == "":
                    print("no cover image")
                    result.append((journal_url,))
                else:
                    try:
                        res = requests.get(cover_url, headers=Headers, timeout=80)
                        if res.status_code == 200:
                            filename = '%s/%s.jpg' % (cover_path, j_id)
                            srcImg = Image.open(io.BytesIO(res.content))
                            # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter
                            dstImg = srcImg.resize((108, 150), Image.LANCZOS).convert('RGB')
                            dstImg.save(filename, 'JPEG')
                            print('downloaded %s-%s' % (journal_name, j_id))
                            provider = 'sinomedkpjournal'
                            pathtxt = r'E:\down_data_e\sinomed\kp\sinomedkpjournal_cover_20191028.txt'
                            s = provider + '@' + j_id + '★' + '/smartlib/' + provider + '/' + j_id + '.jpg' + '\n'
                            with open(pathtxt, 'a', encoding='utf-8') as f:
                                f.write(s)
                            result.append((journal_url,))
                        else:
                            print("%s-的封面%s status_code !=200" % (journal_name, cover_url))
                    except Exception as e:
                        print(e)
                if utils.parse_results_to_sql(conn, sql_up, result, 100):
                    print("updated %s rows" % len(result))
                    result.clear()
            utils.parse_results_to_sql(conn, sql_up, result)
            print("updated remaining %s rows" % len(result))
            result.clear()
Example #8
def parse_detail():
    result = []
    conn_db3 = sqlite3.connect("zt_template.db3")
    sql_in = """
    insert into modify_title_info_zt (Lngid, rawid, provider, type, language, country, provider_url, provider_id, cover, batch, title, description, provider_subject, date, date_created, creator, medium , publisher) values (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
    """
    # basic metadata
    language = 'ZH'
    country = 'CN'
    type_ = '1'
    provider = '360elibbook'
    sub_db_id = '258'
    batch = now_time + '00'
    medium = "2"
    date = "1900"
    date_created = date + "0000"
    fdir = "%s\%s" % (detail_path,now_time)
    for _,dir_ in utils.file_list(fdir):
        utils.printf(dir_)
        pa = r"E:\\work\\360elib\\detail\\%s\\(.*)\\" % (now_time)
        provider_subject = re.findall(pa,dir_)[0]
        if provider_subject == 'None':
            provider_subject = ""
        with open(dir_,encoding='utf-8')as f:
            text = f.read()
        html = Selector(text,'html')
        rawid = _.replace(".html","")
        Lngid = utils.GetLngid(sub_db_id, rawid)
        provider_url = provider + '@' + "http://www.360elib.com:2100/chinese/web/Details.aspx?id=%s" % (rawid)
        provider_id = provider + '@' + rawid
        title = html.xpath("//span[@id='ctl00_ContentPlaceHolder1_lb_name']/text()").extract_first()
        creator = html.xpath("//span[@id='ctl00_ContentPlaceHolder1_lb_zz']/text()").extract_first("").replace(", ", "").replace(",", "").replace("，", "").replace("、", "")
        publisher = html.xpath("//span[@id='ctl00_ContentPlaceHolder1_lb_cbs']/text()").extract_first("")
        description = html.xpath("//span[@id='ctl00_ContentPlaceHolder1_lb_bookintro']/text()").extract_first("")
        cover_rawid = rawid.lower()
        cover_p = '%s/%s/%s.jpg' % (cover_path,now_time,cover_rawid)
        if os.path.exists(cover_p):
            cover = "/smartlib" + "/" + provider + "/" + cover_rawid + ".jpg"
        else:
            cover = ""
        result.append(
            (Lngid, rawid, provider, type_, language, country, provider_url, provider_id, cover, batch, title, description, provider_subject, date, date_created, creator, medium , publisher)
        )
        if utils.parse_results_to_sql(conn_db3, sql_in, result, 1000):
            utils.printf("inserted %s rows" % len(result))
            result.clear()
    utils.parse_results_to_sql(conn_db3, sql_in, result)
    utils.printf("inserted remaining %s rows" % len(result))
    result.clear()
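Several parsers in this listing build their primary key with utils.GetLngid(sub_db_id, rawid), whose implementation is not shown either. Example #15 below constructs its id by hand as "CQJTU_KINGBOOK_TS_" + rawid, which suggests GetLngid simply joins a sub-database prefix to the raw record id. A rough guess, clearly labeled as such:

def GetLngid(sub_db_id, rawid):
    # Hypothetical: make a record id that is unique across sub-databases.
    # The real helper may normalize case or use a different separator.
    return sub_db_id + rawid.upper()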
Example #9
 def down_detail(self):
     now_path = self.detailPath + '\\' + self.now_time
     if not os.path.exists(now_path):
         os.makedirs(now_path)
     sql_up = "update list set stat = 1 where rawid = %s"
     result = []
     while True:
         sql = "select rawid, url, title from list where stat = 0 limit 1000"
         cur = self.conn.cursor()
         cur.execute(sql)
         rows = cur.fetchall()
         if len(rows) == 0:
             break
         else:
             for rawid, url, title  in rows:
                 print(title)
                 print(url)
                 path = now_path + '/%s.html' % rawid
                 if os.path.exists(path):
                     result.append((rawid,))
                     print('%s exists' % path)
                 else:
                     try:
                         res = requests.get(url,headers=self.headers,proxies=self.proxy)
                         fee = 'detailnavbody'
                         if res.status_code == 200:
                             res.encoding = res.apparent_encoding
                             if res.text.find(fee) > 0:
                                 path = now_path + '/%s.html' % rawid
                                 with open(path, mode='w', encoding='utf-8') as f:
                                     # write the decoded text instead of assuming the raw bytes are utf8
                                     f.write(res.text)
                                 print('%s -- down right' % title)
                                 result.append((rawid,))
                             else:
                                 print("not find fee ~")
                         else:
                             print('status_code != 200 ~')
                     except Exception as e:
                         print(e)
                 if utils.parse_results_to_sql(self.conn, sql_up, result, 100):
                     print('updated %s rows' % len(result))
                     result.clear()
             utils.parse_results_to_sql(self.conn, sql_up, result)
             print('updated remaining %s rows' % len(result))
             result.clear()
def parsel_detail_one():
    conn_1 = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
    conn_2 = sqlite3.connect('mirrorimutmeixingbook_20191218.db3')
    sub_db_id = '243'
    provider = 'mirrorimutmeixingbook'
    type = '1'
    date = '1900'
    date_created = '19000000'
    medium = '2'
    sql_in = "insert into modify_title_info_zt(Lngid, rawid, provider, type, language, country, provider_url, provider_id, batch, title, creator, provider_subject, date, date_created, medium) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"
    result_2 = []
    now_time = time.strftime('%Y%m%d')
    fdir = '%s/%s' % (detail_path, now_time)
    for _, filename in utils.file_list(fdir):
        rawid = _.replace(".html", "")
        with open(filename, encoding='gb18030') as f:
            text = f.read()
        html = Selector(text, "html")
        creator = html.xpath(
            "//table[@style='WORD-BREAK: break-all']//tr/td/text()").extract(
            )[0].replace("作者:", "")
        if creator == "unknow":
            creator = ""
        if "ja" in rawid:
            id_ = rawid.replace('ja', '')
            url = "http://202.207.22.13:100/Soft_Showja.asp?SoftID=%s" % id_
            language = "JA"
            country = "JP"
            Lngid = utils.GetLngid(sub_db_id, rawid)
        else:
            language = "EN"
            country = "US"
            url = "http://202.207.22.13:100/Soft_Show.asp?SoftID=%s" % rawid
            Lngid = utils.GetLngid(sub_db_id, rawid)
        sql = "select title,provider_subject from detail where url = '%s'" % url
        cur = conn_1.cursor()
        cur.execute(sql)
        rows = cur.fetchall()
        title = rows[0][0].replace("\n", ' ')
        provider_subject = rows[0][1].replace("数字图书;", '')
        provider_url = provider + '@' + url
        provider_id = provider + '@' + rawid
        batch = str(now_time) + '00'
        result_2.append((Lngid, rawid, provider, type, language, country,
                         provider_url, provider_id, batch, title, creator,
                         provider_subject, date, date_created, medium))
    utils.parse_results_to_sql(conn_2, sql_in, result_2)
    utils.printf("插入剩下的%s条" % len(result_2))
    result_2.clear()
Example #11
def parse_list():
    conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
    regex_bookid = re.compile(r"bookinfo.aspx\?id=(\d+)")
    stmt = 'insert ignore into book (bookid,stat) values(%s,%s)'
    results = []
    for _, filename in utils.file_list(list_path):
        with open(filename, encoding='gb18030') as f:
            text = f.read()
        bookidlist = regex_bookid.findall(text)
        for bookid in bookidlist:
            results.append((bookid,0))
        if utils.parse_results_to_sql(conn, stmt, results, 1000):
            total = len(results)
            results.clear()
            print('inserted', total, 'rows into the database')
    utils.parse_results_to_sql(conn, stmt, results)
    print('inserted', len(results), 'rows into the database')
def wirte_bigjson():
    conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
    cur = conn.cursor()
    big_json_filepath = big_json_path + '/' + '%s.big_json' % now_time
    sql_up = "update detail set stat = 1 where url = %s"
    result = []
    while True:
        down_date = now_time
        sql = "select url from detail where stat=0 limit 1000 "
        cur.execute(sql)
        rows = cur.fetchall()
        if len(rows) == 0:
            break
        else:
            for url in rows:
                url = url[0]
                feature = "highwire-cite-metadata"
                res = utils.get_html(url,
                                     feature=feature,
                                     proxies=proxy,
                                     timeout=30)
                # res.encoding = 'utf-8'
                if res:
                    html = res.text.strip()
                    sumDict = dict()
                    sumDict['provider_url'] = url
                    sumDict['down_date'] = down_date
                    sumDict['htmlText'] = html
                    with open(big_json_filepath, mode='a',
                              encoding='utf-8') as f:
                        line = json.dumps(sumDict,
                                          ensure_ascii=False).strip() + '\n'
                        f.write(line)
                    utils.printf(url, 'write to big_json')
                    result.append((url,))
                    if os.path.getsize(big_json_filepath) // (1024 * 1024 *
                                                              1024) >= 2:
                        big_json_filepath = big_json_path + '/' + '%s_%s.big_json' % (
                            str(now_time), random.randrange(111, 999))
                        print("文件大小到2G,切换文件名为%s" % big_json_filepath)
                if utils.parse_results_to_sql(conn, sql_up, result, 100):
                    print("updated %s rows" % len(result))
                    result.clear()
            utils.parse_results_to_sql(conn, sql_up, result)
            print("updated remaining %s rows" % len(result))
            result.clear()
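wirte_bigjson and down_vol both fetch pages through utils.get_html(url, feature=..., proxies=..., timeout=...). Its source is not included; from the usage it evidently downloads the page, accepts it only when the feature marker string occurs in the body, and returns the response object (or None on failure). A sketch under those assumptions:

import requests

def get_html(url, feature=None, proxies=None, timeout=30, retries=3):
    # Hypothetical reconstruction: retry a few times and validate the page
    # by checking that the expected marker string is present.
    for _ in range(retries):
        try:
            res = requests.get(url, proxies=proxies, timeout=timeout)
        except requests.RequestException:
            continue
        if res.status_code == 200 and (feature is None or feature in res.text):
            return res
    return None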
Example #13
def parselist():
    base_link = 'http://www.sinomed.ac.cn'
    now_time = datetime.datetime.now().strftime("%Y%m%d")
    file_path = path + '\\' + now_time
    sql = """insert ignore into detail(provider_url,title,author,pub_date,degree,organ,contributor) values(%s,%s,%s,%s,%s,%s,%s)"""
    result = []
    for root, dirs, files in os.walk(file_path):
        for file in files:
            file_name = root + '\\' + file
            # print(file_name)
            with open(file_name, encoding='utf-8') as fp:
                text = fp.read()
            html = Selector(text, 'html')
            div_all = html.xpath("//div[@class='right-wztxt fL']")
            for div in div_all:
                author = pub_date = degree = organ = contributor = ""
                # title
                title = div.xpath(".//h2/span/a/text()").extract_first('')
                count += 1
                # url
                provider_url = base_link + div.xpath(
                    ".//h2/span/a/@href").extract_first('')
                for p in div.xpath(".//p").xpath("string(.)").extract():
                    p = p.replace('\n', '').replace('\t', '').replace(' ', '')
                    # author
                    if p.startswith("研究生姓名:"):
                        author = p.replace("研究生姓名:", '')
                    # publication date
                    if p.startswith("出版时间:"):
                        pub_date = p.replace("出版时间:", '').replace("-", '')
                    # degree conferred
                    if p.startswith("授予学位:"):
                        degree = p.replace("授予学位:", '')
                    # degree-granting institution
                    if p.startswith("授予学位单位:"):
                        organ = p.replace("授予学位单位:", '')
                    # supervisor
                    if p.startswith("导师:"):
                        contributor = p.replace("导师:", '')
                result.append((provider_url, title, author, pub_date, degree,
                               organ, contributor))
        if utils.parse_results_to_sql(conn, sql, result, 100):
            print("插入%s条成功" % len(result))
            result.clear()
    utils.parse_results_to_sql(conn, sql, result)
    print("全部插入结束")
 def process_cover(self, message):
     self.count = self.count + 1
     self.sqlList.append((1, message))
     if self.count % 2 == 1:
         utils.printf('%s: downloaded %s cover images' % (self.provider, self.count))
         conn = utils.init_db('mysql', 'ydylcninfo', 4)
         stmt = 'update book set cover_stat=%s where bookid=%s'
         utils.parse_results_to_sql(conn, stmt, self.sqlList)
         self.sqlList.clear()
         conn.close()
     if self.count % 100 == 0:
         self.refreshproxypool()
     if self.count == self.totalcount:
         conn = utils.init_db('mysql', 'ydylcninfo', 4)
         stmt = 'update book set cover_stat=%s where bookid=%s'
         utils.parse_results_to_sql(conn, stmt, self.sqlList)
         self.sqlList.clear()
         utils.printf('%s: finished downloading covers' % self.provider)
Example #15
    def parse_detail(self):
        super().parse_detail()
        language = "EN"
        type = "1"
        medium = "2"
        provider = "cqjtukingbook"
        country = "US"
        batch = time.strftime('%Y%m%d') + "00"
        stmt = (
            '''insert into modify_title_info_zt(lngid,rawid,title,creator,description,subject,date,date_created,identifier_pisbn,language,country,provider,provider_url,provider_id,type,medium,batch,publisher) 
                VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?);''')
        conn = utils.init_db("sqlite3", self.template_file)
        results = []
        cnt = 0
        for file, fullpath in utils.file_list(self.detail_path):
            with open(fullpath, encoding='utf8') as fp:
                txt = fp.read()
            try:
                title, creator, publishers, date, identifier_pisbn, subject, description = self._parse_detail_one(
                    txt)
            except:
                exMsg = '* ' + traceback.format_exc()
                logerror(fullpath)
                logerror(exMsg)
                continue
            date_created = date + '0000'
            basename, _, ext = file.partition(".")
            rawid = basename
            provider_url = provider + "@http://123.56.143.23/kingbookwaiwen/book/info.aspx?id=" + rawid
            provider_id = provider + "@" + rawid
            lngID = "CQJTU_KINGBOOK_TS_" + rawid
            results.append(
                (lngID, rawid, title, creator, description, subject, date,
                 date_created, identifier_pisbn, language, country, provider,
                 provider_url, provider_id, type, medium, batch, publishers))
            if utils.parse_results_to_sql(conn, stmt, results, 1000):
                cnt += len(results)
                print('%s:%s' % (time.strftime("%Y/%m/%d %X"), cnt))
                results.clear()

        utils.parse_results_to_sql(conn, stmt, results)
        cnt += len(results)
        print('%s:%s' % (time.strftime("%Y/%m/%d %X"), cnt))
        conn.close()
Example #16
 def process_index(self, message):
     self.count = self.count + 1
     self.sqlList.append((1, message))
     if self.count % 20 == 1:
         utils.printf('%s: downloaded %s pages' % (self.provider, self.count))
         conn = utils.init_db('mysql', 'bioonejournal')
         stmt = 'update issuelist set stat=%s where url=%s'
         utils.parse_results_to_sql(conn, stmt, self.sqlList)
         self.sqlList.clear()
         conn.close()
     if self.count % 100 == 0:
         self.refreshproxypool()
     if self.count == self.totalcount:
         conn = utils.init_db('mysql', 'bioonejournal')
         stmt = 'update issuelist set stat=%s where url=%s'
         utils.parse_results_to_sql(conn, stmt, self.sqlList)
         self.sqlList.clear()
         utils.printf('%s:down_index finish' % self.provider)
         self.sendwork('parse_index')
Example #17
def parse_index():
    result = []
    conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
    sql = """
        insert ignore into catalog (g_name,g_link) values (%s,%s)
    """
    fname = "%s/index.html" % (index_path)
    with open(fname, encoding='utf-8') as f:
        text = f.read()
    html = Selector(text,'html')
    g_link_name_list = html.xpath("//div[@class='g-link']/a/text()").extract()
    g_link_list = html.xpath("//div[@class='g-link']/a/@href").extract()
    for num,g_link in enumerate(g_link_list):
        g_name = g_link_name_list[num]
        result.append(
            (g_name,g_link)
        )
    utils.parse_results_to_sql(conn,sql,result)
    utils.printf("插入%s条分类信息" % len(result))
Example #18
 def parse_list(self):
     super().parse_list()
     conn = utils.init_db('mysql', 'cqjtu_kingbook')
     base_url = "http://123.56.143.23/kingbookwaiwen/book/"
     regex_bookid = re.compile(r"info.aspx\?id=(\d+)")
     stmt = 'insert ignore into book (bookid,stat) values(%s,%s)'
     results = []
     for _, filename in utils.file_list(self.list_path):
         with open(filename, encoding='utf8') as f:
             text = f.read()
         bookidlist = regex_bookid.findall(text)
         for bookid in bookidlist:
             results.append((bookid, 0))
         if utils.parse_results_to_sql(conn, stmt, results, 1000):
             total = len(results)
             results.clear()
             print('inserted', total, 'rows into the database')
     utils.parse_results_to_sql(conn, stmt, results)
     print('inserted', len(results), 'rows into the database')
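The directory walks above all go through utils.file_list. From expressions like `for _, filename in utils.file_list(self.list_path)` where the first element is used as a bare file name (e.g. rawid = _.replace('.html', '')) and the second is opened, it is presumably a generator over (filename, fullpath) pairs. A small sketch, assuming exactly that:

import os

def file_list(path):
    # Hypothetical: walk the tree and yield (basename, full path) for each file.
    for root, _dirs, files in os.walk(path):
        for name in files:
            yield name, os.path.join(root, name)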
Example #19
 def parse_html(self, message):
     utils.printf('%s: start parsing the start page...' % self.provider)
     conn = utils.init_db('mysql', 'aiaajournal', 2)
     result = []
     stmt = 'insert ignore into journal(journal_name,url,eissn,cover_url,active) Values(%s,%s,%s,%s,%s)'
     active = 0
     cnt = 0
     for filename, fullname in utils.file_list(self.html_path):
         if filename == 'active.html':
             active = 1
         else:
             active = 0
         with open(fullname, encoding='utf8') as f:
             text = f.read()
         try:
             sel = Selector(text=text)
             for liTag in sel.xpath('//li[@class="search-item clearfix"]'):
                 journal_name = liTag.xpath(
                     './div/h4/a/text()').extract_first().strip()
                 url = liTag.xpath(
                     './div/h4/a/@href').extract_first().replace(
                         'journal', 'loi')
                 eissn = liTag.xpath(
                     './div/div/div/span[@class="meta__eissn"]/text()'
                 ).extract_first().replace('eISSN: ', '').strip()
                 cover_url = liTag.xpath(
                     './div/a/img/@src').extract_first().strip()
                 result.append(
                     (journal_name, url, eissn, cover_url, active))
             utils.printf(len(result))
         except:
             exMsg = '* ' + traceback.format_exc()
             print(exMsg)
             utils.logerror(exMsg)
             utils.logerror(fullname)
             return
     utils.parse_results_to_sql(conn, stmt, result)
     cnt += len(result)
     utils.printf(cnt)
     conn.close()
     utils.printf('%s: finished parsing the start page...' % self.provider)
     self.senddistributefinish('startdown_index')
Example #20
def parse_detail():
    conn = sqlite3.connect('template.db3')
    language = "EN"
    type = "1"
    medium = "2"
    provider = "mirrorbeihuakingbook"
    country = "US"
    sub_db_id = "217"
    now_time = time.strftime('%Y%m%d')
    batch = now_time + "00"
    stmt = (
        '''insert into modify_title_info_zt(lngid,rawid,title,creator,description,date,date_created,identifier_pisbn,language,country,provider,provider_url,provider_id,type,medium,batch,publisher)
            VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?);'''
        )
    results = []
    cnt=0
    dir_path = detail_path + '/' + now_time
    for file, fullpath in utils.file_list(dir_path):
        with open(fullpath, encoding='gb18030') as fp:
            txt = fp.read()
        title, creator, publishers, date, identifier_pisbn, description = _parse_detail_one(txt)
        # print(title, creator, publishers, date, identifier_pisbn, description)
        date_created = date + '0000'
        basename, _, ext = file.partition(".")
        rawid = basename
        provider_url = provider + "@http://10.5.23.18:8079/book/bookinfo.aspx?id=" + rawid
        provider_id = provider + "@" + rawid
        lngID = utils.GetLngid(sub_db_id, rawid)
        results.append(
            (
                lngID, rawid, title, creator, description, date, date_created, identifier_pisbn, language,
                country, provider, provider_url, provider_id, type, medium, batch, publishers
            )
        )
        if utils.parse_results_to_sql(conn, stmt, results, 1000):
            cnt += len(results)
            print('%s:%s' % (time.strftime("%Y/%m/%d %X"), cnt))
            results.clear()
    utils.parse_results_to_sql(conn, stmt, results)
    cnt += len(results)
    print('%s:%s' % (time.strftime("%Y/%m/%d %X"), cnt))
    conn.close()
Example #21
def parse_article_list():
    """
    每次更改时间变量 ,对应下载article_list_2_path文件夹
    """
    sql = """
        insert ignore into article_list(j_id,article_name, article_url) values(%s,%s, %s)
    """
    base_url = "http://www.sinomed.ac.cn"
    result = []
    # shijian = '20191031'
    # file_p = article_list_2_path + '\\' + shijian

    for root, dirs, files in os.walk(article_list_path):
        conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
        for file in files:
            try:
                file_path = root + '/' + file
                print(file_path)
                # j_id = re.findall(r'E:\\down_data_e\\sinomed\\en\\article_list_2\\20191031\\(.*)\\',file_path)[0]
                j_id = re.findall(
                    r'E:\\down_data_e\\sinomed\\en\\article_list\\(.*)\\',
                    file_path)[0]
                with open(file_path, encoding='utf-8') as fp:
                    text = fp.read()
                soup = BeautifulSoup(text, 'lxml')
                article_url_list = soup.select(
                    'div.right-wztxt.fL > h2 > div > a')
                for atag in article_url_list:
                    url = base_url + atag['href']
                    article_name = atag.text
                    result.append((j_id, article_name, url))
                    if utils.parse_results_to_sql(conn, sql, result, 1000):
                        print("inserted %s detail rows" % len(result))
                        result.clear()
                utils.parse_results_to_sql(conn, sql, result)
                print("inserted remaining %s detail rows" % len(result))
                result.clear()
            except Exception as e:
                print(e)
                line = file_path + '\t' + str(e) + '\n'
                with open("log.txt", 'a', encoding='utf-8') as f:
                    f.write(line)
    def process_list(self, message):
        self.count = self.count + 1
        self.sqlList.append(message)

        if self.count % 40 == 1:
            utils.printf('%s: downloaded %s pages' % (self.provider, self.count))
            conn = utils.init_db('mysql', 'ydylcnbook', 4)
            stmt = 'update book set stat=%s where bookid=%s'
            utils.parse_results_to_sql(conn, stmt, self.sqlList)
            self.sqlList.clear()
            conn.close()
        if self.count % 100 == 0:
            self.refreshproxypool()
        if self.count == self.totalcount:
            conn = utils.init_db('mysql', 'ydylcnbook', 4)
            stmt = 'update book set stat=%s where bookid=%s'
            utils.parse_results_to_sql(conn, stmt, self.sqlList)
            self.sqlList.clear()
            utils.printf('downloadlist finish')
            self.senddistributefinish('startdown_cover')
Example #23
    def process_index(self, message):
        self.count = self.count + 1
        self.sqlList.append(message)

        if self.count % 40 == 1:
            utils.printf('%s: downloaded %s pages' % (self.provider, self.count))
            conn = utils.init_db('mysql', 'pishuinfo', 4)
            stmt = 'update video set stat=%s where video_id=%s'
            utils.parse_results_to_sql(conn, stmt, self.sqlList)
            self.sqlList.clear()
            conn.close()
        if self.count % 100 == 0:
            self.refreshproxypool()
        if self.count == self.totalcount:
            conn = utils.init_db('mysql', 'pishuinfo', 4)
            stmt = 'update video set stat=%s where video_id=%s'
            utils.parse_results_to_sql(conn, stmt, self.sqlList)
            self.sqlList.clear()
            utils.printf('downloadindex finish')
            self.sendwork('parse_index')
Example #24
 def parse_html(self, message):
     utils.printf('%s: start parsing the start page...' % self.provider)
     conn = utils.init_db('mysql', 'pishuinfo', 4)
     result = []
     stmt = 'insert ignore into video(video_id,stat) Values(%s,%s)'
     cnt = 0
     for filename, fullname in utils.file_list(self.html_path):
         with open(fullname, encoding='utf8') as f:
             text = f.read()
         sel = Selector(text=text)
         for href in sel.xpath('//*[@id="TitleList"]/div/a/@href'):
             video_id = href.re(r'.*ID=(\d+)&isHost=.*')[0]
             result.append((video_id, 0))
             utils.printf(len(result))
     utils.parse_results_to_sql(conn, stmt, result)
     cnt += len(result)
     utils.printf(cnt)
     conn.close()
     utils.printf('%s: finished parsing the start page...' % self.provider)
     self.senddistributefinish('startdown_index')
def xiugai():
    # "xiugai" = "fix up": copy j_id from journal_list into journal_info, matching on journal_name
    sql_uu = "update journal_info set j_id = %s where journal_name = %s"
    result = []
    conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
    cur = conn.cursor()
    cur.execute("select j_id,journal_name from journal_list")
    rows = cur.fetchall()
    for j_id, journal_name in rows:
        result.append((j_id, journal_name))
        if utils.parse_results_to_sql(conn, sql_uu, result, 1000):
            print("updated %s rows" % len(result))
            result.clear()
    utils.parse_results_to_sql(conn, sql_uu, result)
    print("updated remaining %s rows" % len(result))
    result.clear()
    def process_list(self, message):
        self.count = self.count + 1
        self.sqlList.append(message)

        if self.count % 40 == 1:
            utils.printf('%s: downloaded %s pages' % (self.provider, self.count))
            conn = utils.init_db('mysql', 'hepengineeringjournal', 4)
            stmt = 'update issue set stat=%s where journal_id=%s and issue_id=%s'
            utils.parse_results_to_sql(conn, stmt, self.sqlList)
            self.sqlList.clear()
            conn.close()
        if self.count % 100 == 0:
            self.refreshproxypool()
        if self.count == self.totalcount:
            conn = utils.init_db('mysql', 'hepengineeringjournal', 4)
            stmt = 'update issue set stat=%s where journal_id=%s and issue_id=%s'
            utils.parse_results_to_sql(conn, stmt, self.sqlList)
            self.sqlList.clear()
            utils.printf('downloadlist finish')
            self.sendwork('parse_list')
Example #27
def down_detail():
    result = []
    conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
    cur = conn.cursor()
    sql = "select provider_subject,url from detail where stat = 0 limit 1000"
    sql_up = "update detail set stat = 1 where url = %s"
    while True:
        cur.execute(sql)
        rows = cur.fetchall()
        if len(rows) == 0:
            break
        else:
            for provider_subject,url in rows:
                fdir = "%s/%s/%s" % (detail_path,now_time,provider_subject)
                if not os.path.exists(fdir):
                    os.makedirs(fdir)
                rawid = url.replace("/chinese/web/Details.aspx?id=","")
                fname = "%s/%s.html" % (fdir,rawid)
                if os.path.exists(fname):
                    utils.printf("%s exists" % rawid)
                    result.append((url,))
                    continue
                feature = "ctl00_ContentPlaceHolder1_lb_name"
                all_url = "http://www.360elib.com:2100" + url
                res = utils.get_html(all_url,feature=feature)
                if res:
                    # print(res.apparent_encoding)
                    with open(fname, 'w', encoding='utf-8') as f:
                        f.write(res.content.decode('gb18030'))
                    utils.printf("downloaded %s" % rawid)
                    result.append((url,))
                if utils.parse_results_to_sql(conn, sql_up, result, 1000):
                    utils.printf("updated %s rows" % len(result))
                    result.clear()
            utils.parse_results_to_sql(conn, sql_up, result)
            utils.printf("updated remaining %s rows" % len(result))
            result.clear()
 def parse_html(self, message):
     utils.printf('%s: start parsing the start page...' % self.provider)
     conn = utils.init_db('mysql', 'ydylcnbook', 4)
     result = []
     stmt = 'insert ignore into book(bookid,cover_url) Values(%s,%s)'
     cnt = 0
     for filename, fullname in utils.file_list(self.html_path):
         with open(fullname, encoding='utf8') as f:
             text = f.read()
         sel = Selector(text=text)
         for aTag in sel.xpath('//ul[@class="list-book-1"]/li/a'):
             bookid = aTag.xpath('./@href').extract_first().split('=')[-1]
             cover_url = aTag.xpath('./div/div/img/@src').extract_first()
             result.append((bookid, cover_url))
             utils.printf(len(result))
     utils.parse_results_to_sql(conn, stmt, result)
     cnt += len(result)
     utils.printf(cnt)
     conn.close()
     utils.printf('%s: finished parsing the start page...' % self.provider)
     self.senddistributefinish('startdown_list')
 def parse_html(self, message):
     utils.printf('%s: start parsing the start page...' % self.provider)
     conn = utils.init_db('mysql', 'cambridgejournal')
     result = []
     stmt = 'insert ignore into journal(url,cover_url) Values(%s,%s)'
     cnt = 0
     fname = self.html_path + '/html.html'
     with open(fname, encoding='utf8') as f:
         text = f.read()
     soup = BeautifulSoup(text, 'lxml')
     aTaglist = soup.select('ul.listings > li > div > div > a')
     for aTag in aTaglist:
         url = aTag.get('href')
         cover_url = ''
         result.append((url, cover_url))
     utils.parse_results_to_sql(conn, stmt, result)
     cnt += len(result)
     utils.printf(cnt)
     conn.close()
     utils.printf('%s: finished parsing the start page...' % self.provider)
     self.sendwork('startdown_index')
def parse_index():
    conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
    result = []
    sql_in = "insert ignore into list(provider_subject,url) values (%s,%s)"
    for _, filedir in utils.file_list(index_path):
        with open(filedir, mode='r', encoding='gb18030') as f:
            text = f.read()
        html = Selector(text, 'html')
        big_subject = html.xpath(
            "//table[@class='txt_css']//td[2]/a[2]/text()").extract_first()
        list_urls = html.xpath(
            "//table[@class='title_main']//td[@class='title_maintxt'][1]//a/@href"
        ).extract()
        for i, item in enumerate(list_urls):
            provider_subject = big_subject + ";" + html.xpath(
                "//table[@class='title_main']//td[@class='title_maintxt'][1]//a/@title"
            ).extract()[i]
            url = "http://202.207.22.13:100/" + item
            result.append((provider_subject, url))
        utils.parse_results_to_sql(conn, sql_in, result)
        print('inserted', len(result), 'rows into the database')
        # clear the buffer so already-flushed rows are not re-sent for the next file
        result.clear()