def parse_index(self, message):
    try:
        utils.printf('%s:解析索引页开始...' % self.provider)
        conn = utils.init_db('mysql', 'science')
        result = []
        stmt = 'insert ignore into issue(url,stat) Values(%s,%s)'
        cnt = 0
        for filename, fullname in utils.file_list(self.index_path):
            urlf = '{}.sciencemag.org'.format(filename.split('_')[0])
            with open(fullname, encoding='utf8') as f:
                text = f.read()
            soup = BeautifulSoup(text, 'lxml')
            divTags = soup.find_all(
                'div',
                class_='highwire-cite highwire-cite-highwire-issue highwire-citation-jnl-sci-issue-archive clearfix'
            )
            for divTag in divTags:
                url = urlf + divTag.a.get('href')
                result.append((url, 0))
            if utils.parse_results_to_sql(conn, stmt, result, 1000):
                cnt += len(result)
                result.clear()
                utils.printf(cnt)
        utils.parse_results_to_sql(conn, stmt, result)
        cnt += len(result)
        utils.printf(cnt)
        conn.close()
        utils.printf('%s:解析索引页完成...' % self.provider)
        self.senddistributefinish('startdown_list')
    except:
        exMsg = '* ' + traceback.format_exc()
        print(exMsg)
        utils.logerror(exMsg)
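# A minimal sketch of the batching helper that the parsers in this file rely on.
# It is an assumption about the project's own utils.parse_results_to_sql, inferred
# only from how it is called (it seems to flush `results` via executemany once a
# chunk size is reached and to return a truthy value only when it actually wrote);
# it is not the real implementation.
def parse_results_to_sql_sketch(conn, stmt, results, chunk_size=None):
    """Hypothetical: run executemany(stmt, results); with chunk_size set, only flush
    once len(results) >= chunk_size and report whether a flush happened."""
    if chunk_size is not None and len(results) < chunk_size:
        return False          # nothing written; caller keeps accumulating
    if not results:
        return False
    cur = conn.cursor()
    cur.executemany(stmt, results)
    conn.commit()
    cur.close()
    return True               # caller adds len(results) to its counter and clears the list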
def parse_detal():
    for file, fullpath in utils.file_list(detailpath):
        j_id = file.replace(".html", '')
        with open(fullpath, encoding='utf8') as f:
            text = f.read()
        html = Selector(text, 'html')
        title = html.xpath("//h3/text()").extract_first("")
        title_en = html.xpath("//h4/text()").extract_first("").replace("'", "''")
        div = html.xpath("//div[@class='perinfo']/text()").extract()
        zbdw = dq = issn = cn = shijian = ""
        for item in div:
            if item.startswith("主办单位:"):
                zbdw = item.replace("主办单位:", "")
            if item.startswith("地区:"):
                dq = item.replace("地区:", "")
            if item.startswith("国际刊号:"):
                issn = item.replace("国际刊号:", "")
            if item.startswith("国内刊号:"):
                cn = item.replace('国内刊号:', '')
            if item.startswith("出版周期:"):
                shijian = item.replace("出版周期:", "")
        # utils.printf(title,title_en,zbdw,dq,issn,cn,shijian)
        sql = "update journal set 期刊名称_外文 = '%s' , 主办单位 = '%s' , 地区 = '%s' , 国际刊号 = '%s' , 国内刊号 = '%s' , 出版周期 = '%s' where 期刊id = '%s'" % (
            title_en, zbdw, dq, issn, cn, shijian, j_id)
        curser = db.cursor()
        curser.execute(sql)
        # commit belongs to the connection, not the cursor
        db.commit()
        utils.printf("更新%s信息成功" % title)
def parse_list(self, message):
    utils.printf('%s:解析列表页开始...' % self.provider)
    conn = utils.init_db('mysql', 'ascebook')
    result = []
    stmt = 'insert ignore into book(url,cover_url) Values(%s,%s)'
    cnt = 0
    for filename, fullname in utils.file_list(self.list_path):
        with open(fullname, encoding='utf8') as f:
            text = f.read()
        soup = BeautifulSoup(text, 'lxml')
        divlist = soup.select('#frmSearchResults > div > div.listBody > div > div.leftSide')
        for divTag in divlist:
            url = divTag.a.get('href')
            isbn = url.split('/')[-1]
            cover_url = ''
            if not isbn.startswith('978'):
                continue
            coverTag = divTag.a.select_one('img')
            if coverTag:
                cover_url = coverTag.get('src')
            result.append((url, cover_url))
    utils.parse_results_to_sql(conn, stmt, result)
    cnt += len(result)
    utils.printf(cnt)
    conn.close()
    utils.printf('%s:解析列表页完成...' % self.provider)
    self.senddistributefinish('startdown_detail')
def parse_vol():
    fdir = detail_path + '\\' + now_time
    sql = """
    insert ignore into detail (url) values(%s)
    """
    result = []
    conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
    for _, fname in utils.file_list(fdir):
        x = fname.replace(fdir, '').replace(_, '')
        a = r'\\(.*)\\\d{4}\\\d+\\'
        journal_name = re.findall(a, x)[0]
        utils.printf(journal_name)
        with open(fname, encoding='utf-8') as fp:
            text = fp.read()
        html = Selector(text, 'html')
        url_list = html.xpath(
            "//div[@class='highwire-cite highwire-cite-highwire-article highwire-citation-jnl-jpet-list-complete clearfix']/a[@class='highwire-cite-linked-title']/@href"
        ).extract()
        for i, item in enumerate(url_list):
            base_url = dic_journal[journal_name].replace("/content/by/year", "")
            url = base_url + item
            # one-element tuple so the insert helper receives a proper parameter sequence
            result.append((url,))
    utils.parse_results_to_sql(conn, sql, result)
    print("插入剩下%s条成功" % len(result))
    result.clear()
def parse_list(self, message):
    utils.printf('%s:解析列表页开始...' % self.provider)
    conn = utils.init_db('mysql', 'hepengineeringjournal', 4)
    result = []
    stmt = 'insert ignore into article(article_id,journal_id) Values(%s,%s)'
    cnt = 0
    for filename, fullname in utils.file_list(self.list_path):
        with open(fullname, encoding='utf8') as f:
            text = f.read()
        journal_id = fullname.split('\\')[-2]
        # json.loads() no longer accepts an encoding argument on Python 3.9+
        dicitem = json.loads(text)['resultValue']
        for lanmu in dicitem.keys():
            for fenlei in dicitem[lanmu].keys():
                for dicdetail in dicitem[lanmu][fenlei]:
                    article_id = dicdetail['id']
                    result.append((article_id, journal_id))
        if utils.parse_results_to_sql(conn, stmt, result, 1000):
            cnt += len(result)
            result.clear()
            utils.printf(cnt)
    utils.parse_results_to_sql(conn, stmt, result)
    cnt += len(result)
    utils.printf(cnt)
    conn.close()
    utils.printf('%s:解析列表页完成...' % self.provider)
    self.senddistributefinish('startdown_detail')
    self.sendwork('down_cover')
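# Assumed shape of the list-page JSON that the nested loops above walk (illustrative
# only; the real payload comes from the HEP engineering-journal site): 'resultValue'
# maps a column (lanmu) to categories (fenlei), each holding a list of article
# records whose 'id' is collected as article_id.
_example_list_json = {
    "resultValue": {
        "lanmu_1": {
            "fenlei_1": [
                {"id": "article-0001", "title": "..."},
                {"id": "article-0002", "title": "..."},
            ]
        }
    }
}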
def parse_list(self, message):
    utils.printf('%s:解析列表页开始...' % self.provider)
    conn = utils.init_db('mysql', 'hepjournal', 4)
    result = []
    stmt = 'insert ignore into article(url,journal_id) Values(%s,%s)'
    cnt = 0
    for filename, fullname in utils.file_list(self.list_path):
        with open(fullname, encoding='utf8') as f:
            text = f.read()
        journal_id = filename.split('_')[0]
        sel = Selector(text=text)
        for aTag in sel.xpath('//a[@class="txt_biaoti"]'):
            url = aTag.xpath('./@href').extract_first()
            result.append((url, journal_id))
        if utils.parse_results_to_sql(conn, stmt, result, 1000):
            cnt += len(result)
            result.clear()
            utils.printf(cnt)
    utils.parse_results_to_sql(conn, stmt, result)
    cnt += len(result)
    utils.printf(cnt)
    conn.close()
    utils.printf('%s:解析列表页完成...' % self.provider)
    self.senddistributefinish('startdown_detail')
    self.sendwork('down_cover')
def parse_detail_meta(self, message):
    conn = utils.init_db('mysql', 'aiaajournal', 2)
    cur = conn.cursor()
    cur.execute('select gch,journal_name,journal_name_en,pissn,eissn from journal')
    rows = cur.fetchall()
    for gch, journal_name, journal_name_en, pissn, eissn in rows:
        self.dic[gch] = (journal_name, journal_name_en, pissn, eissn)
    cur.close()
    conn.close()
    self.predb3('base_obj_meta_a_template_qk.db3', 'base_obj_meta_a_qk.aiaajournal')
    self.sqlList.clear()
    stmt = """insert into base_obj_meta_a (author,author_1st,organ,organ_1st,title,title_alt,keyword,pub_year,pub_date,
    vol,num,journal_raw_id,journal_name,journal_name_alt,page_info,begin_page,end_page,subject,is_oa,down_cnt,lngid,
    rawid,product,sub_db,provider,sub_db_id,source_type,provider_url,country,language,batch,down_date,publisher,issn,eissn,abstract,
    abstract_alt,doi,fund,ref_cnt,fulltext_type) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,
    ?,?,?,?,?,?,?,?,?,?,?)"""
    count = 0
    for filename, fullname in utils.file_list(self.detail_path):
        onemessage = self.parse_detail_one(filename, fullname, 'meta')
        if onemessage:
            self.sqlList.append(onemessage)
        if utils.parse_results_to_sql(self.conn, stmt, self.sqlList, 50):
            count += len(self.sqlList)
            utils.printf('%s: 插入 %d 条数据到db3' % (self.provider, count))
            self.sqlList.clear()
    utils.parse_results_to_sql(self.conn, stmt, self.sqlList)
    count += len(self.sqlList)
    utils.printf('%s: 插入 %d 条数据到db3' % (self.provider, count))
    self.conn.close()
    self.conn = None
    utils.msg2weixin('%s: 解析完成,成品文件为%s' % (self.provider, self.template_file))
def parse_indexlist(self, message):
    try:
        utils.printf('%s:解析期索引页开始...' % self.provider)
        conn = utils.init_db('mysql', 'bioonejournal')
        self.sqlList.clear()
        cnt = 0
        cur = conn.cursor()
        path = '%s/%s' % (self.datepath, 'indexlist')
        for filename, fullname in utils.file_list(path):
            with open(fullname, encoding='utf8') as f:
                text = f.read()
            soup = BeautifulSoup(text, 'lxml')
            aTags = soup.find_all('a', class_='IssueByYearInnerText')
            for aTag in aTags:
                url = aTag.get('href').replace('https://bioone.org', '')
                self.sqlList.append(
                    "insert ignore into issuelist(url,year) Values('%s','%s')" % (url, url.split('/')[-1]))
            cnt += len(self.sqlList)
            for sql in self.sqlList:
                cur.execute(sql)
            conn.commit()
            self.sqlList.clear()
            utils.printf(cnt)
        cur.close()
        conn.close()
        utils.printf('%s:解析索引页完成...' % self.provider)
        # self.sendwork('down_cover')
        self.senddistributefinish('startdown_index')
    except:
        exMsg = '* ' + traceback.format_exc()
        print(exMsg)
        utils.logerror(exMsg)
def parse_detail(self, message):
    conn = utils.init_db('mysql', 'hepjournal', 4)
    cur = conn.cursor()
    cur.execute('select journal_id,journal_name,issn,eissn,cnno from journal')
    rows = cur.fetchall()
    for journal_id, journal_name, issn, eissn, cnno in rows:
        self.dic[journal_id] = (journal_name, issn, eissn, cnno)
    cur.close()
    conn.close()
    self.predb3()
    self.sqlList.clear()
    stmt = """insert or ignore into modify_title_info_zt(lngid, rawid, creator, title, volume, issue, page,
    beginpage, endpage, publisher, subject, date, creator_institution, date_created, source, identifier_pissn,
    identifier_eissn, identifier_cnno, description, identifier_doi, language, country, provider, provider_url,
    provider_id, type, medium, batch, gch) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"""
    count = 0
    for filename, fullname in utils.file_list(self.detail_path):
        onemessage = self.parse_detail_one(filename, fullname)
        if onemessage:
            self.sqlList.append(onemessage)
        if utils.parse_results_to_sql(self.conn, stmt, self.sqlList, 50):
            count += len(self.sqlList)
            utils.printf('%s: 插入 %d 条数据到db3' % (self.provider, count))
            self.sqlList.clear()
    utils.parse_results_to_sql(self.conn, stmt, self.sqlList)
    count += len(self.sqlList)
    utils.printf('%s: 插入 %d 条数据到db3' % (self.provider, count))
    self.conn.close()
    self.conn = None
    utils.msg2weixin('%s: 解析完成,成品文件为%s' % (self.provider, self.template_file))
def parse_index(self, message):
    workdir = message
    try:
        utils.printf('%s:解析索引页开始...' % self.provider)
        conn = utils.init_db('mysql', 'apsjournal')
        result = []
        stmt = 'insert ignore into issue(url,year) Values(%s,%s) on DUPLICATE key UPDATE year=%s'
        cnt = 0
        for filename, fullname in utils.file_list(workdir):
            with open(fullname, encoding='utf8') as f:
                text = f.read()
            soup = BeautifulSoup(text, 'lxml')
            liTags = soup.select('div.volume-issue-list > ul > li')
            for liTag in liTags:
                yeartext = liTag.get_text().strip()
                year = re.sub(r'.*?(\d{4}) \(.*?\)', r'\1', yeartext)
                url = 'https://journals.aps.org' + liTag.b.a.get('href')
                result.append((url, year, year))
            if utils.parse_results_to_sql(conn, stmt, result, 1000):
                cnt += len(result)
                result.clear()
                utils.printf(cnt)
        utils.parse_results_to_sql(conn, stmt, result)
        cnt += len(result)
        utils.printf(cnt)
        conn.close()
        utils.printf('%s:解析索引页完成...' % self.provider)
        self.senddistributefinish('startdown_list')
    except:
        exMsg = '* ' + traceback.format_exc()
        print(exMsg)
        utils.logerror(exMsg)
def parse_html(self):
    utils.printf('%s:解析起始页开始...' % self.provider)
    conn = utils.init_db('mysql', 'hepengineeringjournal', 4)
    result = []
    stmt = 'insert ignore into journal(journal_id,journal_name,cover_url) Values(%s,%s,%s)'
    cnt = 0
    for filename, fullname in utils.file_list(self.html_path):
        with open(fullname, encoding='utf8') as f:
            text = f.read()
        try:
            # json.loads() no longer accepts an encoding argument on Python 3.9+
            dic = json.loads(text)
            for dicitem in dic['resultValue']:
                dicitem = json.loads(dicitem)
                gch = dicitem['id']
                name = dicitem['name']
                cover_url = dicitem['volumeImg']
                if cover_url == '':
                    cover_url = dicitem['journalImg']
                print(gch, name, cover_url)
                result.append((gch, name, cover_url))
        except:
            exMsg = '* ' + traceback.format_exc()
            print(exMsg)
            utils.logerror(exMsg)
    utils.parse_results_to_sql(conn, stmt, result)
    cnt += len(result)
    utils.printf(cnt)
    conn.close()
    utils.printf('%s:解析起始页完成...' % self.provider)
    self.senddistributefinish('startdown_index')
def parse_index(self, message):
    try:
        utils.printf('%s:解析索引页开始...' % self.provider)
        conn = utils.init_db('mysql', 'hepengineeringjournal', 4)
        self.sqlList.clear()
        cur = conn.cursor()
        for filename, fullname in utils.file_list(self.index_path):
            with open(fullname, encoding='utf8') as f:
                text = f.read()
            # json.loads() no longer accepts an encoding argument on Python 3.9+
            dic = json.loads(text)
            gch = filename.replace('.json', '')
            dicitem = dic['resultValue']
            issn = dicitem['issnNm']
            cnno = dicitem['cnNm']
            sql = 'update journal set issn="%s",cnno="%s" where journal_id="%s"' % (issn, cnno, gch)
            cur.execute(sql)
        conn.commit()
        cur.close()
        conn.close()
        utils.printf('%s:解析索引页完成...' % self.provider)
        # self.sendwork('down_cover')
        self.senddistributefinish('get_issuelist')
    except:
        exMsg = '* ' + traceback.format_exc()
        print(exMsg)
        utils.logerror(exMsg)
def parse_html(self, message):
    utils.printf('%s:解析起始页开始...' % self.provider)
    conn = utils.init_db('mysql', 'aiaabook', 2)
    result = []
    stmt = 'insert ignore into book(book_name,url,pub_year,cover_url) Values(%s,%s,%s,%s)'
    cnt = 0
    for filename, fullname in utils.file_list(self.html_path):
        with open(fullname, encoding='utf8') as f:
            text = f.read()
        try:
            sel = Selector(text=text)
            for liTag in sel.xpath('//li[@class="search-item clearfix"]'):
                book_name = liTag.xpath('./div/h4/a/text()').extract_first().strip()
                url = liTag.xpath('./div/h4/a/@href').extract_first()
                pub_year = liTag.xpath(
                    './/div[@class="search-item__data-group__field meta__date"]/text()').extract_first()
                cover_url = liTag.xpath('./div/a/img/@src').extract_first().strip()
                result.append((book_name, url, pub_year, cover_url))
            utils.printf(len(result))
        except:
            exMsg = '* ' + traceback.format_exc()
            print(exMsg)
            utils.logerror(exMsg)
            utils.logerror(fullname)
            return
    utils.parse_results_to_sql(conn, stmt, result)
    cnt += len(result)
    utils.printf(cnt)
    conn.close()
    utils.printf('%s:解析起始页完成...' % self.provider)
    self.senddistributefinish('startdown_detail')
def parse_detail(self, message):
    conn = utils.init_db('mysql', 'aiaabook', 2)
    cur = conn.cursor()
    cur.execute('select url,pub_year from book')
    rows = cur.fetchall()
    for url, pub_year in rows:
        doi = '10.2514/' + url.split('/')[-1]
        self.dic[doi] = pub_year
    cur.close()
    conn.close()
    self.predb3()
    self.sqlList.clear()
    stmt = """insert or ignore into modify_title_info_zt(lngid, rawid, creator, title, identifier_pisbn,
    identifier_eisbn, description, publisher, cover, title_series, date, date_created, price, language, country,
    provider, provider_url, identifier_doi, provider_id, type, medium, batch)
    values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"""
    count = 0
    for filename, fullname in utils.file_list(self.detail_path):
        onemessage = self.parse_detail_one(filename, fullname, 'zt')
        # print(onemessage)
        if onemessage:
            self.sqlList.append(onemessage)
        if utils.parse_results_to_sql(self.conn, stmt, self.sqlList, 50):
            count += len(self.sqlList)
            utils.printf('%s: 插入 %d 条数据到db3' % (self.provider, count))
            self.sqlList.clear()
    utils.parse_results_to_sql(self.conn, stmt, self.sqlList)
    count += len(self.sqlList)
    utils.printf('%s: 插入 %d 条数据到db3' % (self.provider, count))
    self.conn.close()
    self.conn = None
    utils.msg2weixin('%s: 解析完成,成品文件为%s' % (self.provider, self.template_file))
def parse_html(self, message):
    utils.printf('%s:解析起始页开始...' % self.provider)
    conn = utils.init_db('mysql', 'bioonejournal')
    result = []
    stmt = 'insert ignore into journal(url,cover_url) Values(%s,%s) on DUPLICATE key UPDATE cover_url=%s'
    cnt = 0
    for filename, fullname in utils.file_list(self.html_path):
        with open(fullname, encoding='utf8') as f:
            text = f.read()
        soup = BeautifulSoup(text, 'lxml')
        aTaglist = soup.select('div.journal.BrowseTitleAll > a')
        for aTag in aTaglist:
            url = aTag.get('href')
            if url == "/journals/":
                continue
            if url.startswith('/ebooks'):
                continue
            cover_url = aTag.img.get('src')
            result.append((url, cover_url, cover_url))
        utils.parse_results_to_sql(conn, stmt, result)
        cnt += len(result)
        result.clear()
        utils.printf(cnt)
    conn.close()
    utils.printf('%s:解析起始页完成...' % self.provider)
    self.senddistributefinish('startdown_indexlist')
def parse_list():
    conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
    result = []
    sql_in = "insert ignore into detail(provider_subject,title,url,add_time,look_time) values (%s,%s,%s,%s,%s)"
    for _, filedir in utils.file_list(list_path):
        # E:\work\美星外文\list\日文图书;随笔\2.html
        utils.printf(filedir)
        regex = r"E:\\work\\美星外文\\list\\(.*?)\\"
        provider_subject = re.findall(regex, filedir)[0]
        with open(filedir, mode='r', encoding='gb18030') as f:
            text = f.read()
        html = Selector(text, 'html')
        list_urls = html.xpath("//tr[@class='tdbg_leftall']/td/strong/a/@href").extract()
        for i, item in enumerate(list_urls):
            title = html.xpath("//tr[@class='tdbg_leftall']/td/strong/a/text()").extract()[i].split(" ")[0]
            url = "http://202.207.22.13:100/" + item
            add_time = html.xpath("//tr[@class='tdbg_leftall']/td[3]/text()").extract()[i]
            look_time = html.xpath("//tr[@class='tdbg_leftall']/td[4]/text()").extract()[i]
            result.append((provider_subject, title, url, add_time, look_time))
    utils.parse_results_to_sql(conn, sql_in, result)
    print('插入', len(result), ' 个结果到数据库成功')
    result.clear()
def parse_year():
    fdir = year_path + '/' + now_time
    sql = """
    insert ignore into vol (journal_name,pub_year,vol,num,vol_url) values(%s,%s,%s,%s,%s)
    """
    result = []
    conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
    for _, fname in utils.file_list(fdir):
        utils.printf(fname)
        journal_name = fname.replace(fdir, "").replace(_, "").replace("\\", "")
        with open(fname, encoding='utf-8') as fp:
            text = fp.read()
        html = Selector(text, 'html')
        vol_urls = html.xpath("//a[@class='hw-issue-meta-data']/@href").extract()
        for i, item in enumerate(vol_urls):
            base_url = dic_journal[journal_name].replace("/content/by/year", "")
            vol_url = base_url + item
            pub_year = _.replace(".html", "")
            vol = html.xpath("//a[@class='hw-issue-meta-data']/span[2]/text()").extract()[i]
            vol = re.findall(r"(\d+)", vol)[0]
            num = html.xpath("//a[@class='hw-issue-meta-data']/span[3]/text()").extract()[i]
            num = re.findall(r"Issue\s+(.*)", num)[0].replace(" ", "_")
            result.append((journal_name, pub_year, vol, num, vol_url))
        if utils.parse_results_to_sql(conn, sql, result, 100):
            print("插入%s条成功" % len(result))
            result.clear()
    utils.parse_results_to_sql(conn, sql, result)
    print("插入剩下%s条成功" % len(result))
    result.clear()
def parse_list():
    conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
    regex_videoid = re.compile(r"/Course/Detail/(\d+)")
    stmt = 'insert ignore into video (rawid,detail_url,cover_url,provider_subject) values(%s,%s,%s,%s)'
    results = []
    for _, filename in utils.file_list(list_path):
        provider_subject = filename.replace('E:\\work\\fzwjt\\list\\', '').replace('\\' + _, '')
        # print(provider_subject)
        with open(filename, encoding='utf8') as f:
            text = f.read()
        html = Selector(text, 'html')
        detail_url_list = html.xpath("//ul[@class='videoUl']/li/a/@href").extract()
        cover_url_list = html.xpath("//ul[@class='videoUl']/li/a/img/@src").extract()
        J_detail_url_list = html.xpath("//ul[@class='videoUl']/li[@class='J']/a/@href").extract()
        for x in J_detail_url_list:
            detail_url_list.remove(x)
        for i, item in enumerate(detail_url_list):
            detail_url = 'http://www.fzwjt.com' + item
            # findall returns a list; keep the captured id itself
            rawid = regex_videoid.findall(item)[0]
            cover_url = cover_url_list[i]
            results.append((rawid, detail_url, cover_url, provider_subject))
        if utils.parse_results_to_sql(conn, stmt, results, 1000):
            total = len(results)
            results.clear()
            print('插入 ', total, ' 个结果到数据库成功')
    utils.parse_results_to_sql(conn, stmt, results)
    print('插入 ', len(results), ' 个结果到数据库成功')
def parse_list(self, message):
    utils.printf('%s:解析列表页开始...' % self.provider)
    conn = utils.init_db('mysql', 'apsjournal')
    result = []
    stmt = 'insert ignore into article(url,vol,issue) Values(%s,%s,%s)'
    cnt = 0
    for filename, fullname in utils.file_list(self.list_path):
        with open(fullname, encoding='utf8') as f:
            text = f.read()
        vol = filename.split('_')[0]
        issue = filename.split('_')[-1].replace('.html', '')
        soup = BeautifulSoup(text, 'lxml')
        aTags = soup.select('div.large-9.columns > h5 > a')
        for aTag in aTags:
            url = aTag.get('href')
            if not url.startswith('/'):
                continue
            url = 'https://journals.aps.org' + url
            result.append((url, vol, issue))
        if utils.parse_results_to_sql(conn, stmt, result, 1000):
            cnt += len(result)
            result.clear()
            utils.printf(cnt)
    utils.parse_results_to_sql(conn, stmt, result)
    cnt += len(result)
    utils.printf(cnt)
    conn.close()
    utils.printf('%s:解析列表页完成...' % self.provider)
    self.senddistributefinish('startdown_detail')
def parse_list(self, message):
    utils.printf('%s:解析起始页开始...' % self.provider)
    conn = utils.init_db('mysql', 'aiaajournal', 2)
    result = []
    stmt = 'insert ignore into article(id,url,vol,stat,failcount) Values(%s,%s,%s,%s,%s)'
    cnt = 0
    for filename, fullname in utils.file_list(self.list_path):
        with open(fullname, encoding='utf8') as f:
            text = f.read()
        sel = Selector(text=text)
        for href in sel.xpath('//h5[@class="issue-item__title"]/a/@href'):
            url = href.extract().replace('/doi/', '/doi/abs/').strip()
            id = fullname.split('\\')[-2] + '_' + url.split('/')[-1]
            vol = filename.split('_')[0]
            print(id, url)
            result.append((id, url, vol, 0, 0))
        if utils.parse_results_to_sql(conn, stmt, result, 200):
            cnt += len(result)
            utils.printf('%s解析%s条数据到数据库' % (self.provider, cnt))
            result.clear()
    cnt += len(result)
    utils.parse_results_to_sql(conn, stmt, result)
    conn.close()
    utils.printf('%s解析%s条数据到数据库' % (self.provider, cnt))
    utils.printf('%s解析列表页完成' % self.provider)
    self.senddistributefinish('startdown_detail')
def parse_index(self, message):
    try:
        utils.printf('%s:解析索引页开始...' % self.provider)
        conn = utils.init_db('mysql', 'aiaajournal', 2)
        result = []
        stmt = 'insert ignore into issue(url,stat) Values(%s,%s)'
        cnt = 0
        for filename, fullname in utils.file_list(self.index_path):
            with open(fullname, encoding='utf8') as f:
                text = f.read()
            sel = Selector(text=text)
            for aTag in sel.xpath('//a[@class="loi__issue__vol"]'):
                url = aTag.xpath('./@href').extract_first()
                if url.endswith('/0/0'):
                    continue
                result.append(('https://arc.aiaa.org' + url, 0))
            if utils.parse_results_to_sql(conn, stmt, result, 200):
                cnt += len(result)
                result.clear()
                utils.printf(cnt)
        utils.parse_results_to_sql(conn, stmt, result)
        cnt += len(result)
        utils.printf(cnt)
        conn.close()
        utils.printf('%s:解析索引页完成...' % self.provider)
        self.senddistributefinish('startdown_list')
    except:
        exMsg = '* ' + traceback.format_exc()
        print(exMsg)
        utils.logerror(exMsg)
def main():
    a = get_args()
    tempdir = os.path.join(a.out_dir, 'a')
    os.makedirs(tempdir, exist_ok=True)

    ptfiles = file_list(a.in_dir, 'pt')
    ptest = torch.load(ptfiles[0])
    if isinstance(ptest, list):
        ptest = ptest[0]
    shape = [*ptest.shape[:3], (ptest.shape[3] - 1) * 2]

    vsteps = int(a.length * 25 / len(ptfiles)) if a.steps is None else a.steps  # 25 fps
    pbar = ProgressBar(vsteps * len(ptfiles))
    for px in range(len(ptfiles)):
        params1 = read_pt(ptfiles[px])
        params2 = read_pt(ptfiles[(px + 1) % len(ptfiles)])

        params, image_f, _ = fft_image(shape, resume=params1)
        image_f = to_valid_rgb(image_f)

        for i in range(vsteps):
            with torch.no_grad():
                img = image_f((params2 - params1) * math.sin(1.5708 * i / vsteps)**2)[0].permute(1, 2, 0)
                img = torch.clip(img * 255, 0, 255).cpu().numpy().astype(np.uint8)
            imsave(os.path.join(tempdir, '%05d.jpg' % (px * vsteps + i)), img)
            if a.verbose is True:
                cvshow(img)
            pbar.upd()

    os.system('ffmpeg -v warning -y -i %s/%%05d.jpg "%s-pts.mp4"' % (tempdir, a.in_dir))
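# The blend factor used above, sin(pi/2 * t)**2, is a smooth ease-in/ease-out ramp:
# it runs from 0 to 1 as t goes 0 -> 1 with zero slope at both ends, so consecutive
# parameter snapshots cross-fade without visible velocity jumps. A standalone
# illustration (the helper name below is illustrative, not part of the script):
import math

def sin2_ease(i: int, steps: int) -> float:
    """Blend weight for frame i of `steps`: 0.0 at the first frame, approaching 1.0 at the last."""
    return math.sin(1.5708 * i / steps) ** 2   # 1.5708 ~= pi/2

# per-frame interpolation: params1 + (params2 - params1) * sin2_ease(i, vsteps)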
def parse_list(self, message):
    utils.printf('%s:解析列表页开始...' % self.provider)
    conn = utils.init_db('mysql', 'cambridgejournal')
    result = []
    issueresult = []
    stmt = 'insert ignore into article(uid,url,gch) Values(%s,%s,%s)'
    sql = 'insert ignore into issue(url,stat) Values(%s,%s)'
    cnt = 0
    for filename, fullname in utils.file_list(self.list_path):
        with open(fullname, encoding='utf8') as f:
            text = f.read()
        gch = fullname.split('\\')[-2]
        soup = BeautifulSoup(text, 'lxml')
        ulTags = soup.select('ul.details')
        if len(ulTags) == 0:
            utils.logerror(fullname + '\n')
        for ulTag in ulTags:
            try:
                aTag = ulTag.select_one('li.title > a')
                if not aTag:
                    aTag = ulTag.select_one('li.title > h5 > a')
                if aTag:
                    url = aTag.get('href')
                    uid = url.split('/')[-1]
                    result.append((uid, url, gch))
            except:
                utils.printf(fullname)
                utils.logerror(fullname)
                break
        if filename.find('_') < 0:
            pageTag = soup.select_one('ul.pagination')
            if pageTag:
                pTags = pageTag.select('li > a')
                for pTag in pTags:
                    if operator.eq(pTag.get_text(), 'Last'):
                        pagenum = int(pTag.get('data-page-number'))
                        for page in range(2, pagenum + 1):
                            uri = '/core/journals/%s/issue/%s?pageNum=%s' % (
                                gch, filename.replace('.html', ''), page)
                            issueresult.append((uri, 0))
        if utils.parse_results_to_sql(conn, stmt, result, 1000):
            cnt += len(result)
            result.clear()
            utils.printf(cnt)
    utils.parse_results_to_sql(conn, stmt, result)
    utils.parse_results_to_sql(conn, sql, issueresult)
    cnt += len(result)
    utils.printf(cnt)
    utils.printf('大于一页的个数为%s' % len(issueresult))
    conn.close()
    utils.printf('%s:解析列表页完成...' % self.provider)
    self.senddistributefinish('startdown_detail')
def parsel_detail_one():
    conn_1 = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
    conn_2 = sqlite3.connect('mirrorimutmeixingbook_20191218.db3')
    sub_db_id = '243'
    provider = 'mirrorimutmeixingbook'
    type = '1'
    date = '1900'
    date_created = '19000000'
    medium = '2'
    sql_in = "insert into modify_title_info_zt(Lngid, rawid, provider, type, language, country, provider_url, provider_id, batch, title, creator, provider_subject, date, date_created, medium) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"
    result_2 = []
    now_time = time.strftime('%Y%m%d')
    fdir = '%s/%s' % (detail_path, now_time)
    for _, filename in utils.file_list(fdir):
        rawid = _.replace(".html", "")
        with open(filename, encoding='gb18030') as f:
            text = f.read()
        html = Selector(text, "html")
        creator = html.xpath("//table[@style='WORD-BREAK: break-all']//tr/td/text()").extract()[0].replace("作者:", "")
        if creator == "unknow":
            creator = ""
        if "ja" in rawid:
            id_ = rawid.replace('ja', '')
            url = "http://202.207.22.13:100/Soft_Showja.asp?SoftID=%s" % id_
            language = "JA"
            country = "JP"
            Lngid = utils.GetLngid(sub_db_id, rawid)
        else:
            language = "EN"
            country = "US"
            url = "http://202.207.22.13:100/Soft_Show.asp?SoftID=%s" % rawid
            Lngid = utils.GetLngid(sub_db_id, rawid)
        sql = "select title,provider_subject from detail where url = '%s'" % url
        cur = conn_1.cursor()
        cur.execute(sql)
        rows = cur.fetchall()
        title = rows[0][0].replace("\n", ' ')
        provider_subject = rows[0][1].replace("数字图书;", '')
        provider_url = provider + '@' + url
        provider_id = provider + '@' + rawid
        batch = str(now_time) + '00'
        result_2.append((Lngid, rawid, provider, type, language, country, provider_url, provider_id, batch, title,
                         creator, provider_subject, date, date_created, medium))
    utils.parse_results_to_sql(conn_2, sql_in, result_2)
    utils.printf("插入剩下的%s条" % len(result_2))
    result_2.clear()
def parse_detail():
    result = []
    conn_db3 = sqlite3.connect("zt_template.db3")
    sql_in = """
    insert into modify_title_info_zt (Lngid, rawid, provider, type, language, country, provider_url, provider_id,
    cover, batch, title, description, provider_subject, date, date_created, creator, medium, publisher)
    values (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
    """
    # basic metadata shared by every record
    language = 'ZH'
    country = 'CN'
    type_ = '1'
    provider = '360elibbook'
    sub_db_id = '258'
    batch = now_time + '00'
    medium = "2"
    date = "1900"
    date_created = date + "0000"
    fdir = "%s\\%s" % (detail_path, now_time)
    for _, dir_ in utils.file_list(fdir):
        utils.printf(dir_)
        pa = r"E:\\work\\360elib\\detail\\%s\\(.*)\\" % (now_time)
        provider_subject = re.findall(pa, dir_)[0]
        if provider_subject == 'None':
            provider_subject = ""
        with open(dir_, encoding='utf-8') as f:
            text = f.read()
        html = Selector(text, 'html')
        rawid = _.replace(".html", "")
        Lngid = utils.GetLngid(sub_db_id, rawid)
        provider_url = provider + '@' + "http://www.360elib.com:2100/chinese/web/Details.aspx?id=%s" % (rawid)
        provider_id = provider + '@' + rawid
        title = html.xpath("//span[@id='ctl00_ContentPlaceHolder1_lb_name']/text()").extract_first()
        creator = html.xpath("//span[@id='ctl00_ContentPlaceHolder1_lb_zz']/text()").extract_first("").replace(
            ", ", "").replace(",", "").replace(",", "").replace("、", "")
        publisher = html.xpath("//span[@id='ctl00_ContentPlaceHolder1_lb_cbs']/text()").extract_first("")
        description = html.xpath("//span[@id='ctl00_ContentPlaceHolder1_lb_bookintro']/text()").extract_first("")
        cover_rawid = rawid.lower()
        cover_p = '%s/%s/%s.jpg' % (cover_path, now_time, cover_rawid)
        if os.path.exists(cover_p):
            cover = "/smartlib" + "/" + provider + "/" + cover_rawid + ".jpg"
        else:
            cover = ""
        result.append((Lngid, rawid, provider, type_, language, country, provider_url, provider_id, cover, batch,
                       title, description, provider_subject, date, date_created, creator, medium, publisher))
        if utils.parse_results_to_sql(conn_db3, sql_in, result, 1000):
            utils.printf("插入%s条" % len(result))
            result.clear()
    utils.parse_results_to_sql(conn_db3, sql_in, result)
    utils.printf("插入剩下的%s条" % len(result))
    result.clear()
def getpageinfo(self):
    if not self.list_path:
        self.initpath()
    for filename, fullname in utils.file_list(self.list_path):
        with open(fullname, encoding='utf8') as f:
            text = f.read()
        volume = filename.replace('.html', '').split('_')[0]
        issue = filename.replace('.html', '').split('_')[-1]
        sel = Selector(text=text)
        for divTag in sel.xpath('//div[@class="issue-item"]'):
            doi = '10.2514/' + divTag.xpath('./h5/a/@href').extract_first().split('/')[-1]
            page = divTag.xpath('./div[@class="issue-item__pages"]/text()').extract_first().replace('–', '-')
            self.dic[doi] = (page, volume, issue)
def parse_list():
    conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
    regex_bookid = re.compile(r"bookinfo.aspx\?id=(\d+)")
    stmt = 'insert ignore into book (bookid,stat) values(%s,%s)'
    results = []
    for _, filename in utils.file_list(list_path):
        with open(filename, encoding='gb18030') as f:
            text = f.read()
        bookidlist = regex_bookid.findall(text)
        for bookid in bookidlist:
            results.append((bookid, 0))
        if utils.parse_results_to_sql(conn, stmt, results, 1000):
            total = len(results)
            results.clear()
            print('插入 ', total, ' 个结果到数据库成功')
    utils.parse_results_to_sql(conn, stmt, results)
    print('插入 ', len(results), ' 个结果到数据库成功')
def parse_detail(self):
    super().parse_detail()
    language = "EN"
    type = "1"
    medium = "2"
    provider = "cqjtukingbook"
    country = "US"
    batch = time.strftime('%Y%m%d') + "00"
    stmt = ('''insert into modify_title_info_zt(lngid,rawid,title,creator,description,subject,date,date_created,
            identifier_pisbn,language,country,provider,provider_url,provider_id,type,medium,batch,publisher)
            VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?);''')
    conn = utils.init_db("sqlite3", self.template_file)
    results = []
    cnt = 0
    for file, fullpath in utils.file_list(self.detail_path):
        with open(fullpath, encoding='utf8') as fp:
            txt = fp.read()
        try:
            title, creator, publishers, date, identifier_pisbn, subject, description = self._parse_detail_one(txt)
        except:
            exMsg = '* ' + traceback.format_exc()
            logerror(fullpath)
            logerror(exMsg)
            continue
        date_created = date + '0000'
        basename, _, ext = file.partition(".")
        rawid = basename
        provider_url = provider + "@http://123.56.143.23/kingbookwaiwen/book/info.aspx?id=" + rawid
        provider_id = provider + "@" + rawid
        lngID = "CQJTU_KINGBOOK_TS_" + rawid
        results.append((lngID, rawid, title, creator, description, subject, date, date_created, identifier_pisbn,
                        language, country, provider, provider_url, provider_id, type, medium, batch, publishers))
        if utils.parse_results_to_sql(conn, stmt, results, 1000):
            cnt += len(results)
            print('%s:%s' % (time.strftime("%Y/%m/%d %X"), cnt))
            results.clear()
    utils.parse_results_to_sql(conn, stmt, results)
    cnt += len(results)
    print('%s:%s' % (time.strftime("%Y/%m/%d %X"), cnt))
    conn.close()
def generate_interpolations(
    pt_dir: str,
    out_video_dir: str,
    out_video_name: str,
    scene_len: int = 180,
    fps: int = 25,
    size: List[int] = [720, 1280],
    decay: float = 1.,
    colors: float = 1.,
):
    pt_file_list = file_list(pt_dir, 'pt')
    num_frames = scene_len * fps
    for pt_idx in range(len(pt_file_list)):
        params1 = read_pt(pt_file_list[pt_idx])
        params2 = read_pt(pt_file_list[(pt_idx + 1) % len(pt_file_list)])

        _params, image_f = fft_image(
            [1, 3, *size],
            resume=params1,
            sd=1.,
            decay_power=decay,
        )
        image_f = to_valid_rgb(image_f, colors=colors)

        for frame_idx in range(num_frames):
            with torch.no_grad():
                img = image_f(
                    (params2 - params1) * math.sin(1.5708 * frame_idx / num_frames)**2)[0].permute(1, 2, 0)
                img = torch.clip(img * 255, 0, 255).cpu().numpy().astype(np.uint8)
            imsave(os.path.join(out_video_dir, '%05d.jpg' % (pt_idx * num_frames + frame_idx)), img)

    os.system('ffmpeg -v warning -y -i %s\\%%05d.jpg "%s.mp4"' % (out_video_dir, out_video_name))
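# Hypothetical invocation of generate_interpolations (paths and names below are
# examples, not taken from the project): render each consecutive pair of .pt
# parameter snapshots as a scene, writing frames to out_video_dir and letting
# ffmpeg stitch them into a single mp4.
if __name__ == '__main__':
    generate_interpolations(
        pt_dir='checkpoints',          # directory containing *.pt parameter snapshots
        out_video_dir='frames',        # must already exist; frames are written here as %05d.jpg
        out_video_name='interp_demo',  # produces interp_demo.mp4
        scene_len=30,                  # shorter scenes for a quick test run
        fps=25,
    )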
def parse_list(self):
    super().parse_list()
    conn = utils.init_db('mysql', 'cqjtu_kingbook')
    base_url = "http://123.56.143.23/kingbookwaiwen/book/"
    regex_bookid = re.compile(r"info.aspx\?id=(\d+)")
    stmt = 'insert ignore into book (bookid,stat) values(%s,%s)'
    results = []
    for _, filename in utils.file_list(self.list_path):
        with open(filename, encoding='utf8') as f:
            text = f.read()
        bookidlist = regex_bookid.findall(text)
        for bookid in bookidlist:
            results.append((bookid, 0))
        if utils.parse_results_to_sql(conn, stmt, results, 1000):
            total = len(results)
            results.clear()
            print('插入 ', total, ' 个结果到数据库成功')
    utils.parse_results_to_sql(conn, stmt, results)
    print('插入 ', len(results), ' 个结果到数据库成功')