Example #1
 def parse_index(self, message):
     try:
         utils.printf('%s:解析索引页开始...' % self.provider)
         conn = utils.init_db('mysql', 'science')
         result = []
         stmt = 'insert ignore into issue(url,stat) Values(%s,%s)'
         cnt = 0
         for filename, fullname in utils.file_list(self.index_path):
             urlf = '{}.sciencemag.org'.format(filename.split('_')[0])
             with open(fullname, encoding='utf8') as f:
                 text = f.read()
             soup = BeautifulSoup(text, 'lxml')
             divTags = soup.find_all(
                 'div',
                 class_=
                 'highwire-cite highwire-cite-highwire-issue highwire-citation-jnl-sci-issue-archive clearfix'
             )
             for divTag in divTags:
                 url = urlf + divTag.a.get('href')
                 result.append((url, 0))
             if utils.parse_results_to_sql(conn, stmt, result, 1000):
                 cnt += len(result)
                 result.clear()
                 utils.printf(cnt)
         utils.parse_results_to_sql(conn, stmt, result)
         cnt += len(result)
         utils.printf(cnt)
         conn.close()
         utils.printf('%s:解析索引页完成...' % self.provider)
         self.senddistributefinish('startdown_list')
     except:
         exMsg = '* ' + traceback.format_exc()
         print(exMsg)
         utils.logerror(exMsg)
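Nearly every example below iterates files with `utils.file_list` and flushes rows with `utils.parse_results_to_sql`, neither of which is shown. The following is only a sketch of what those helpers are assumed to do, reconstructed from the call sites (the real `utils` module may differ; the two-argument `file_list(dir, ext)` used in Examples #22 and #29 comes from a different project):

import os

def file_list(root):
    # Assumed behaviour: walk root and yield (filename, full_path) pairs.
    for dirpath, _dirs, names in os.walk(root):
        for name in names:
            yield name, os.path.join(dirpath, name)

def parse_results_to_sql(conn, stmt, rows, batch_size=None):
    # Assumed behaviour: bulk-insert rows via executemany and commit.
    # With batch_size set, do nothing until at least batch_size rows are
    # queued, and return True only when a flush actually happened, which is
    # how the examples decide when to clear their result lists.
    if batch_size is not None and len(rows) < batch_size:
        return False
    cur = conn.cursor()
    cur.executemany(stmt, rows)
    conn.commit()
    cur.close()
    return True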
Example #2
def parse_detail():
    for file, fullpath in utils.file_list(detailpath):
        j_id = file.replace(".html", '')
        with open(fullpath, encoding='utf8') as f:
            text = f.read()
        html = Selector(text, 'html')
        title = html.xpath("//h3/text()").extract_first("")
        title_en = html.xpath("//h4/text()").extract_first("").replace(
            "'", "''")
        div = html.xpath("//div[@class='perinfo']/text()").extract()
        zbdw = dq = issn = cn = shijian = ""
        for item in div:
            if item.startswith("主办单位:"):
                zbdw = item.replace("主办单位:", "")
            if item.startswith("地区:"):
                dq = item.replace("地区:", "")
            if item.startswith("国际刊号:"):
                issn = item.replace("国际刊号:", "")
            if item.startswith("国内刊号:"):
                cn = item.replace('国内刊号:', '')
            if item.startswith("出版周期:"):
                shijian = item.replace("出版周期:", "")
        # utils.printf(title,title_en,zbdw,dq,issn,cn,shijian)
        sql = "update journal set 期刊名称_外文 = '%s' , 主办单位 = '%s' , 地区 = '%s' , 国际刊号 = '%s' , 国内刊号 = '%s' , 出版周期 = '%s' where 期刊id = '%s'" % (
            title_en, zbdw, dq, issn, cn, shijian, j_id)
        cursor = db.cursor()
        cursor.execute(sql)
        # commit on the connection; pymysql cursors have no commit()
        db.commit()
        cursor.close()
        utils.printf("更新%s信息成功" % title)
Example #3
 def parse_list(self, message):
     utils.printf('%s:解析列表页开始...' % self.provider)
     conn = utils.init_db('mysql', 'ascebook')
     result = []
     stmt = 'insert ignore into book(url,cover_url) Values(%s,%s)'
     cnt = 0
     for filename, fullname in utils.file_list(self.list_path):
         with open(fullname, encoding='utf8') as f:
             text = f.read()
         soup = BeautifulSoup(text, 'lxml')
         divlist = soup.select(
             '#frmSearchResults > div > div.listBody > div > div.leftSide')
         for divTag in divlist:
             url = divTag.a.get('href')
             isbn = url.split('/')[-1]
             cover_url = ''
             if not isbn.startswith('978'):
                 continue
             coverTag = divTag.a.select_one('img')
             if coverTag:
                 cover_url = coverTag.get('src')
             result.append((url, cover_url))
     utils.parse_results_to_sql(conn, stmt, result)
     cnt += len(result)
     utils.printf(cnt)
     conn.close()
     utils.printf('%s:解析列表页完成...' % self.provider)
     self.senddistributefinish('startdown_detail')
Example #4
def parse_vol():
    fdir = detail_path + '\\' + now_time
    sql = """
        insert ignore into detail (url) values(%s)
    """
    result = []
    conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
    for _, fname in utils.file_list(fdir):
        x = fname.replace(fdir, '').replace(_, '')
        a = r'\\(.*)\\\d{4}\\\d+\\'
        journal_name = re.findall(a, x)[0]
        utils.printf(journal_name)
        with open(fname, encoding='utf-8') as fp:
            text = fp.read()
        html = Selector(text, 'html')
        url_list = html.xpath(
            "//div[@class='highwire-cite highwire-cite-highwire-article highwire-citation-jnl-jpet-list-complete clearfix']/a[@class='highwire-cite-linked-title']/@href"
        ).extract()
        for i, item in enumerate(url_list):
            base_url = dic_journal[journal_name].replace(
                "/content/by/year", "")
            url = base_url + item
            result.append((url,))  # one-element tuple to match the single %s placeholder
        utils.parse_results_to_sql(conn, sql, result)
        print("插入剩下%s条成功" % len(result))
        result.clear()
Example #5
 def parse_list(self, message):
     utils.printf('%s:解析列表页开始...' % self.provider)
     conn = utils.init_db('mysql', 'hepengineeringjournal', 4)
     result = []
     stmt = 'insert ignore into article(article_id,journal_id) Values(%s,%s)'
     cnt = 0
     for filename, fullname in utils.file_list(self.list_path):
         with open(fullname, encoding='utf8') as f:
             text = f.read()
         journal_id = fullname.split('\\')[-2]
         dicitem = json.loads(text)['resultValue']  # json.loads() dropped its encoding argument in Python 3.9
         for lanmu in dicitem.keys():
             for fenlei in dicitem[lanmu].keys():
                 for dicdetail in dicitem[lanmu][fenlei]:
                     article_id = dicdetail['id']
                     result.append((article_id, journal_id))
         if utils.parse_results_to_sql(conn, stmt, result, 1000):
             cnt += len(result)
             result.clear()
             utils.printf(cnt)
     utils.parse_results_to_sql(conn, stmt, result)
     cnt += len(result)
     utils.printf(cnt)
     conn.close()
     utils.printf('%s:解析列表页完成...' % self.provider)
     self.senddistributefinish('startdown_detail')
     self.sendwork('down_cover')
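The triple loop in Example #5 implies that `resultValue` is a two-level mapping of section name to category name to a list of article records, roughly shaped like the illustrative (not provider-documented) literal below:

resultValue = {
    "栏目A": {                   # lanmu: a section/column of the journal
        "分类1": [               # fenlei: a category inside that section
            {"id": "10001"},     # dicdetail: one article record; 'id' becomes article_id
            {"id": "10002"},
        ],
    },
}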
Example #6
 def parse_list(self, message):
     utils.printf('%s:解析列表页开始...' % self.provider)
     conn = utils.init_db('mysql', 'hepjournal', 4)
     result = []
     stmt = 'insert ignore into article(url,journal_id) Values(%s,%s)'
     cnt = 0
     for filename, fullname in utils.file_list(self.list_path):
         with open(fullname, encoding='utf8') as f:
             text = f.read()
         journal_id = filename.split('_')[0]
         sel = Selector(text=text)
         for aTag in sel.xpath('//a[@class="txt_biaoti"]'):
             url = aTag.xpath('./@href').extract_first()
             result.append((url, journal_id))
         if utils.parse_results_to_sql(conn, stmt, result, 1000):
             cnt += len(result)
             result.clear()
             utils.printf(cnt)
     utils.parse_results_to_sql(conn, stmt, result)
     cnt += len(result)
     utils.printf(cnt)
     conn.close()
     utils.printf('%s:解析列表页完成...' % self.provider)
     self.senddistributefinish('startdown_detail')
     self.sendwork('down_cover')
Example #7
 def parse_detail_meta(self, message):
     conn = utils.init_db('mysql', 'aiaajournal', 2)
     cur = conn.cursor()
     cur.execute(
         'select gch,journal_name,journal_name_en,pissn,eissn from journal')
     rows = cur.fetchall()
     for gch, journal_name, journal_name_en, pissn, eissn in rows:
         self.dic[gch] = (journal_name, journal_name_en, pissn, eissn)
     cur.close()
     conn.close()
     self.predb3('base_obj_meta_a_template_qk.db3',
                 'base_obj_meta_a_qk.aiaajournal')
     self.sqlList.clear()
     stmt = """insert into base_obj_meta_a (author,author_1st,organ,organ_1st,title,title_alt,keyword,pub_year,pub_date,
     vol,num,journal_raw_id,journal_name,journal_name_alt,page_info,begin_page,end_page,subject,is_oa,down_cnt,lngid,
     rawid,product,sub_db,
     provider,sub_db_id,source_type,provider_url,country,language,batch,down_date,publisher,issn,eissn,abstract,
     abstract_alt,doi,fund,ref_cnt,fulltext_type) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,
     ?,?,?,?,?,?,?,?,?,?,?)"""
     count = 0
     for filename, fullname in utils.file_list(self.detail_path):
         onemessage = self.parse_detail_one(filename, fullname, 'meta')
         if onemessage:
             self.sqlList.append(onemessage)
         if utils.parse_results_to_sql(self.conn, stmt, self.sqlList, 50):
             count += len(self.sqlList)
             utils.printf('%s: 插入 %d 条数据到db3' % (self.provider, count))
             self.sqlList.clear()
     utils.parse_results_to_sql(self.conn, stmt, self.sqlList)
     count += len(self.sqlList)
     utils.printf('%s: 插入 %d 条数据到db3' % (self.provider, count))
     self.conn.close()
     self.conn = None
     utils.msg2weixin('%s: 解析完成,成品文件为%s' %
                      (self.provider, self.template_file))
Example #8
 def parse_indexlist(self, message):
     try:
         utils.printf('%s:解析期索引页开始...' % self.provider)
         conn = utils.init_db('mysql', 'bioonejournal')
         self.sqlList.clear()
         cnt = 0
         cur = conn.cursor()
         path = '%s/%s' % (self.datepath, 'indexlist')
         for filename, fullname in utils.file_list(path):
             with open(fullname, encoding='utf8') as f:
                 text = f.read()
             soup = BeautifulSoup(text, 'lxml')
             aTags = soup.find_all('a', class_='IssueByYearInnerText')
             for aTag in aTags:
                 url = aTag.get('href').replace('https://bioone.org', '')
                 self.sqlList.append(
                     "insert ignore into issuelist(url,year) Values('%s','%s')"
                     % (url, url.split('/')[-1]))
             cnt += len(self.sqlList)
             for sql in self.sqlList:
                 cur.execute(sql)
             conn.commit()
             self.sqlList.clear()
             utils.printf(cnt)
         cur.close()
         conn.close()
         utils.printf('%s:解析索引页完成...' % self.provider)
         # self.sendwork('down_cover')
         self.senddistributefinish('startdown_index')
     except:
         exMsg = '* ' + traceback.format_exc()
         print(exMsg)
         utils.logerror(exMsg)
Example #9
 def parse_detail(self, message):
     conn = utils.init_db('mysql', 'hepjournal', 4)
     cur = conn.cursor()
     cur.execute(
         'select journal_id,journal_name,issn,eissn,cnno from journal')
     rows = cur.fetchall()
     for journal_id, journal_name, issn, eissn, cnno in rows:
         self.dic[journal_id] = (journal_name, issn, eissn, cnno)
     cur.close()
     conn.close()
     self.predb3()
     self.sqlList.clear()
     stmt = """insert or ignore into modify_title_info_zt(lngid, rawid, creator, title, volume, issue, page, beginpage,
     endpage, publisher, subject, date,creator_institution, date_created, source, identifier_pissn, identifier_eissn,
     identifier_cnno, description, identifier_doi, language, country, provider, provider_url, provider_id, type, medium,
     batch, gch)values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"""
     count = 0
     for filename, fullname in utils.file_list(self.detail_path):
         onemessage = self.parse_detail_one(filename, fullname)
         if onemessage:
             self.sqlList.append(onemessage)
         if utils.parse_results_to_sql(self.conn, stmt, self.sqlList, 50):
             count += len(self.sqlList)
             utils.printf('%s: 插入 %d 条数据到db3' % (self.provider, count))
             self.sqlList.clear()
     utils.parse_results_to_sql(self.conn, stmt, self.sqlList)
     count += len(self.sqlList)
     utils.printf('%s: 插入 %d 条数据到db3' % (self.provider, count))
     self.conn.close()
     self.conn = None
     utils.msg2weixin('%s: 解析完成,成品文件为%s' %
                      (self.provider, self.template_file))
Example #10
 def parse_index(self, message):
     workdir = message
     try:
         utils.printf('%s:解析索引页开始...' % self.provider)
         conn = utils.init_db('mysql', 'apsjournal')
         result = []
         stmt = 'insert ignore into issue(url,year) Values(%s,%s) on DUPLICATE key UPDATE year=%s'
         cnt = 0
         for filename, fullname in utils.file_list(workdir):
             with open(fullname, encoding='utf8') as f:
                 text = f.read()
             soup = BeautifulSoup(text, 'lxml')
             liTags = soup.select('div.volume-issue-list > ul > li')
             for liTag in liTags:
                 yeartext = liTag.get_text().strip()
                 year = re.sub(r'.*?(\d{4}) \(.*?\)', r'\1', yeartext)
                 url = 'https://journals.aps.org' + liTag.b.a.get('href')
                 result.append((url, year, year))
             if utils.parse_results_to_sql(conn, stmt, result, 1000):
                 cnt += len(result)
                 result.clear()
                 utils.printf(cnt)
         utils.parse_results_to_sql(conn, stmt, result)
         cnt += len(result)
         utils.printf(cnt)
         conn.close()
         utils.printf('%s:解析索引页完成...' % self.provider)
         self.senddistributefinish('startdown_list')
     except:
         exMsg = '* ' + traceback.format_exc()
         print(exMsg)
         utils.logerror(exMsg)
Example #11
    def parse_html(self):
        utils.printf('%s:解析起始页开始...' % self.provider)
        conn = utils.init_db('mysql', 'hepengineeringjournal', 4)
        result = []
        stmt = 'insert ignore into journal(journal_id,journal_name,cover_url) Values(%s,%s,%s)'
        cnt = 0
        for filename, fullname in utils.file_list(self.html_path):
            with open(fullname, encoding='utf8') as f:
                text = f.read()
            try:
                dic = json.loads(text)
                for dicitem in dic['resultValue']:
                    dicitem = json.loads(dicitem)
                    gch = dicitem['id']
                    name = dicitem['name']
                    cover_url = dicitem['volumeImg']
                    if cover_url == '':
                        cover_url = dicitem['journalImg']
                    print(gch, name, cover_url)
                    result.append((gch, name, cover_url))
            except:
                exMsg = '* ' + traceback.format_exc()
                print(exMsg)
                utils.logerror(exMsg)

        utils.parse_results_to_sql(conn, stmt, result)
        cnt += len(result)
        utils.printf(cnt)
        conn.close()
        utils.printf('%s:解析起始页完成...' % self.provider)
        self.senddistributefinish('startdown_index')
Example #12
    def parse_index(self, message):
        try:
            utils.printf('%s:解析索引页开始...' % self.provider)
            conn = utils.init_db('mysql', 'hepengineeringjournal', 4)
            self.sqlList.clear()
            cur = conn.cursor()
            for filename, fullname in utils.file_list(self.index_path):
                with open(fullname, encoding='utf8') as f:
                    text = f.read()
                dic = json.loads(text)
                gch = filename.replace('.json', '')
                dicitem = dic['resultValue']
                issn = dicitem['issnNm']
                cnno = dicitem['cnNm']
                sql = 'update journal set issn="%s",cnno="%s" where journal_id="%s"' % (
                    issn, cnno, gch)
                cur.execute(sql)
                conn.commit()

            cur.close()
            conn.close()
            utils.printf('%s:解析索引页完成...' % self.provider)
            # self.sendwork('down_cover')
            self.senddistributefinish('get_issuelist')
        except:
            exMsg = '* ' + traceback.format_exc()
            print(exMsg)
            utils.logerror(exMsg)
Example #13
 def parse_html(self, message):
     utils.printf('%s:解析起始页开始...' % self.provider)
     conn = utils.init_db('mysql', 'aiaabook', 2)
     result = []
     stmt = 'insert ignore into book(book_name,url,pub_year,cover_url) Values(%s,%s,%s,%s)'
     cnt = 0
     for filename, fullname in utils.file_list(self.html_path):
         with open(fullname, encoding='utf8') as f:
             text = f.read()
         try:
             sel = Selector(text=text)
             for liTag in sel.xpath('//li[@class="search-item clearfix"]'):
                 book_name = liTag.xpath(
                     './div/h4/a/text()').extract_first().strip()
                 url = liTag.xpath('./div/h4/a/@href').extract_first()
                 pub_year = liTag.xpath(
                     './/div[@class="search-item__data-group__field meta__date"]/text()'
                 ).extract_first()
                 cover_url = liTag.xpath(
                     './div/a/img/@src').extract_first().strip()
                 result.append((book_name, url, pub_year, cover_url))
             utils.printf(len(result))
         except:
             exMsg = '* ' + traceback.format_exc()
             print(exMsg)
             utils.logerror(exMsg)
             utils.logerror(fullname)
             return
     utils.parse_results_to_sql(conn, stmt, result)
     cnt += len(result)
     utils.printf(cnt)
     conn.close()
     utils.printf('%s:解析起始页完成...' % self.provider)
     self.senddistributefinish('startdown_detail')
Example #14
 def parse_detail(self, message):
     conn = utils.init_db('mysql', 'aiaabook', 2)
     cur = conn.cursor()
     cur.execute('select url,pub_year from book')
     rows = cur.fetchall()
     for url, pub_year in rows:
         doi = '10.2514/' + url.split('/')[-1]
         self.dic[doi] = (pub_year)
     cur.close()
     conn.close()
     self.predb3()
     self.sqlList.clear()
     stmt = """insert or ignore into modify_title_info_zt(lngid, rawid, creator, title, identifier_pisbn,
      identifier_eisbn, description, publisher,cover,title_series,
      date,date_created, price, language, country, provider, provider_url, identifier_doi, provider_id,
     type,medium, batch) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
     """
     count = 0
     for filename, fullname in utils.file_list(self.detail_path):
         onemessage = self.parse_detail_one(filename, fullname, 'zt')
         # print(onemessage)
         if onemessage:
             self.sqlList.append(onemessage)
         if utils.parse_results_to_sql(self.conn, stmt, self.sqlList, 50):
             count += len(self.sqlList)
             utils.printf('%s: 插入 %d 条数据到db3' % (self.provider, count))
             self.sqlList.clear()
     utils.parse_results_to_sql(self.conn, stmt, self.sqlList)
     count += len(self.sqlList)
     utils.printf('%s: 插入 %d 条数据到db3' % (self.provider, count))
     self.conn.close()
     self.conn = None
     utils.msg2weixin('%s: 解析完成,成品文件为%s' %
                      (self.provider, self.template_file))
Example #15
 def parse_html(self, message):
     utils.printf('%s:解析起始页开始...' % self.provider)
     conn = utils.init_db('mysql', 'bioonejournal')
     result = []
     stmt = 'insert ignore into journal(url,cover_url) Values(%s,%s) on DUPLICATE key UPDATE cover_url=%s'
     cnt = 0
     for filename, fullname in utils.file_list(self.html_path):
         with open(fullname, encoding='utf8') as f:
             text = f.read()
         soup = BeautifulSoup(text, 'lxml')
         aTaglist = soup.select('div.journal.BrowseTitleAll > a')
         for aTag in aTaglist:
             url = aTag.get('href')
             if url == "/journals/":
                 continue
             if url.startswith('/ebooks'):
                 continue
             cover_url = aTag.img.get('src')
             result.append((url, cover_url, cover_url))
         utils.parse_results_to_sql(conn, stmt, result)
         cnt += len(result)
         result.clear()
         utils.printf(cnt)
     conn.close()
     utils.printf('%s:解析起始页完成...' % self.provider)
     self.senddistributefinish('startdown_indexlist')
Example #16
def parse_list():
    conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
    result = []
    sql_in = "insert ignore into detail(provider_subject,title,url,add_time,look_time) values (%s,%s,%s,%s,%s)"
    for _, filedir in utils.file_list(list_path):
        # E:\work\美星外文\list\日文图书;随笔\2.html
        utils.printf(filedir)
        regex = r"E:\\work\\美星外文\\list\\(.*?)\\"
        provider_subject = re.findall(regex, filedir)[0]
        with open(filedir, mode='r', encoding='gb18030') as f:
            text = f.read()
        html = Selector(text, 'html')
        list_urls = html.xpath(
            "//tr[@class='tdbg_leftall']/td/strong/a/@href").extract()
        for i, item in enumerate(list_urls):
            title = html.xpath("//tr[@class='tdbg_leftall']/td/strong/a/text()"
                               ).extract()[i].split("  ")[0]
            url = "http://202.207.22.13:100/" + item
            add_time = html.xpath(
                "//tr[@class='tdbg_leftall']/td[3]/text()").extract()[i]
            look_time = html.xpath(
                "//tr[@class='tdbg_leftall']/td[4]/text()").extract()[i]
            result.append((provider_subject, title, url, add_time, look_time))
        utils.parse_results_to_sql(conn, sql_in, result)
        print('插入', len(result), ' 个结果到数据库成功')
        result.clear()
Example #17
def parse_year():
    fdir = year_path + '/' + now_time
    sql = """
        insert ignore into vol (journal_name,pub_year,vol,num,vol_url) values(%s,%s,%s,%s,%s)
    """
    result = []
    conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
    for _, fname in utils.file_list(fdir):
        utils.printf(fname)
        journal_name = fname.replace(fdir, "").replace(_, "").replace("\\", "")
        with open(fname, encoding='utf-8') as fp:
            text = fp.read()
        html = Selector(text, 'html')
        vol_urls = html.xpath(
            "//a[@class='hw-issue-meta-data']/@href").extract()
        for i, item in enumerate(vol_urls):
            base_url = dic_journal[journal_name].replace(
                "/content/by/year", "")
            vol_url = base_url + item
            pub_year = _.replace(".html", "")
            vol = html.xpath(
                "//a[@class='hw-issue-meta-data']/span[2]/text()").extract()[i]
            vol = re.findall(r"(\d+)", vol)[0]
            num = html.xpath(
                "//a[@class='hw-issue-meta-data']/span[3]/text()").extract()[i]
            num = re.findall(r"Issue\s+(.*)", num)[0].replace(" ", "_")
            result.append((journal_name, pub_year, vol, num, vol_url))
        if utils.parse_results_to_sql(conn, sql, result, 100):
            print("插入%s条成功" % len(result))
            result.clear()
    utils.parse_results_to_sql(conn, sql, result)
    print("插入剩下%s条成功" % len(result))
    result.clear()
Example #18
def parse_list():
    conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
    regex_videoid = re.compile(r"/Course/Detail/(\d+)")
    stmt = 'insert ignore into video (rawid,detail_url,cover_url,provider_subject) values(%s,%s,%s,%s)'
    results = []
    for _, filename in utils.file_list(list_path):
        provider_subject = filename.replace('E:\\work\\fzwjt\\list\\',
                                            '').replace('\\' + _, '')
        # print(provider_subject)
        with open(filename, encoding='utf8') as f:
            text = f.read()
        html = Selector(text, 'html')
        detail_url_list = html.xpath(
            "//ul[@class='videoUl']/li/a/@href").extract()
        cover_url_list = html.xpath(
            "//ul[@class='videoUl']/li/a/img/@src").extract()
        J_detail_url_list = html.xpath(
            "//ul[@class='videoUl']/li[@class='J']/a/@href").extract()
        for x in J_detail_url_list:
            detail_url_list.remove(x)
        for i, item in enumerate(detail_url_list):
            detail_url = 'http://www.fzwjt.com' + item
            rawid = regex_videoid.findall(item)[0]  # findall returns a list; keep the captured id
            cover_url = cover_url_list[i]
            results.append((rawid, detail_url, cover_url, provider_subject))
        if utils.parse_results_to_sql(conn, stmt, results, 1000):
            total = len(results)
            results.clear()
            print('插入 ', total, ' 个结果到数据库成功')
    utils.parse_results_to_sql(conn, stmt, results)
    print('插入 ', len(results), ' 个结果到数据库成功')
Example #19
 def parse_list(self, message):
     utils.printf('%s:解析列表页开始...' % self.provider)
     conn = utils.init_db('mysql', 'apsjournal')
     result = []
     stmt = 'insert ignore into article(url,vol,issue) Values(%s,%s,%s)'
     cnt = 0
     for filename, fullname in utils.file_list(self.list_path):
         with open(fullname, encoding='utf8') as f:
             text = f.read()
         vol = filename.split('_')[0]
         issue = filename.split('_')[-1].replace('.html', '')
         soup = BeautifulSoup(text, 'lxml')
         aTags = soup.select('div.large-9.columns > h5 > a')
         for aTag in aTags:
             url = aTag.get('href')
             if not url.startswith('/'):
                 continue
             url = 'https://journals.aps.org' + url
             result.append((url, vol, issue))
         if utils.parse_results_to_sql(conn, stmt, result, 1000):
             cnt += len(result)
             result.clear()
             utils.printf(cnt)
     utils.parse_results_to_sql(conn, stmt, result)
     cnt += len(result)
     utils.printf(cnt)
     conn.close()
     utils.printf('%s:解析列表页完成...' % self.provider)
     self.senddistributefinish('startdown_detail')
Example #20
 def parse_list(self, message):
     utils.printf('%s:解析起始页开始...' % self.provider)
     conn = utils.init_db('mysql', 'aiaajournal', 2)
     result = []
     stmt = 'insert ignore into article(id,url,vol,stat,failcount) Values(%s,%s,%s,%s,%s)'
     cnt = 0
     for filename, fullname in utils.file_list(self.list_path):
         with open(fullname, encoding='utf8') as f:
             text = f.read()
         sel = Selector(text=text)
         for href in sel.xpath('//h5[@class="issue-item__title"]/a/@href'):
             url = href.extract().replace('/doi/', '/doi/abs/').strip()
             id = fullname.split('\\')[-2] + '_' + url.split('/')[-1]
             vol = filename.split('_')[0]
             print(id, url)
             result.append((id, url, vol, 0, 0))
         if utils.parse_results_to_sql(conn, stmt, result, 200):
             cnt += len(result)
             utils.printf('%s解析%s条数据到数据库' % (self.provider, cnt))
             result.clear()
     cnt += len(result)
     utils.parse_results_to_sql(conn, stmt, result)
     utils.printf('%s解析%s条数据到数据库' % (self.provider, cnt))
     utils.printf('%s解析列表页完成' % self.provider)
     self.senddistributefinish('startdown_detail')
Example #21
 def parse_index(self, message):
     try:
         utils.printf('%s:解析索引页开始...' % self.provider)
         conn = utils.init_db('mysql', 'aiaajournal', 2)
         result = []
         stmt = 'insert ignore into issue(url,stat) Values(%s,%s)'
         cnt = 0
         for filename, fullname in utils.file_list(self.index_path):
             with open(fullname, encoding='utf8') as f:
                 text = f.read()
             sel = Selector(text=text)
             for aTag in sel.xpath('//a[@class="loi__issue__vol"]'):
                 url = aTag.xpath('./@href').extract_first()
                 if url.endswith('/0/0'):
                     continue
                 result.append(('https://arc.aiaa.org' + url, 0))
             if utils.parse_results_to_sql(conn, stmt, result, 200):
                 cnt += len(result)
                 result.clear()
                 utils.printf(cnt)
         utils.parse_results_to_sql(conn, stmt, result)
         cnt += len(result)
         utils.printf(cnt)
         conn.close()
         utils.printf('%s:解析索引页完成...' % self.provider)
         self.senddistributefinish('startdown_list')
     except:
         exMsg = '* ' + traceback.format_exc()
         print(exMsg)
         utils.logerror(exMsg)
Example #22
def main():
    a = get_args()
    tempdir = os.path.join(a.out_dir, 'a')
    os.makedirs(tempdir, exist_ok=True)

    ptfiles = file_list(a.in_dir, 'pt')

    ptest = torch.load(ptfiles[0])
    if isinstance(ptest, list): ptest = ptest[0]
    shape = [*ptest.shape[:3], (ptest.shape[3] - 1) * 2]

    vsteps = int(a.length * 25 /
                 len(ptfiles)) if a.steps is None else a.steps  # 25 fps
    pbar = ProgressBar(vsteps * len(ptfiles))
    for px in range(len(ptfiles)):
        params1 = read_pt(ptfiles[px])
        params2 = read_pt(ptfiles[(px + 1) % len(ptfiles)])

        params, image_f, _ = fft_image(shape, resume=params1)
        image_f = to_valid_rgb(image_f)

        for i in range(vsteps):
            with torch.no_grad():
                img = image_f(
                    (params2 - params1) *
                    math.sin(1.5708 * i / vsteps)**2)[0].permute(1, 2, 0)
                img = torch.clip(img * 255, 0,
                                 255).cpu().numpy().astype(np.uint8)
            imsave(os.path.join(tempdir, '%05d.jpg' % (px * vsteps + i)), img)
            if a.verbose is True: cvshow(img)
            pbar.upd()

    os.system('ffmpeg -v warning -y -i %s/\\%%05d.jpg "%s-pts.mp4"' %
              (tempdir, a.in_dir))
Example #23
 def parse_list(self, message):
     utils.printf('%s:解析列表页开始...' % self.provider)
     conn = utils.init_db('mysql', 'cambridgejournal')
     result = []
     issueresult = []
     stmt = 'insert ignore into article(uid,url,gch) Values(%s,%s,%s)'
     sql = 'insert ignore into issue(url,stat) Values(%s,%s)'
     cnt = 0
     for filename, fullname in utils.file_list(self.list_path):
         with open(fullname, encoding='utf8') as f:
             text = f.read()
         gch = fullname.split('\\')[-2]
         soup = BeautifulSoup(text, 'lxml')
         ulTags = soup.select('ul.details')
         if len(ulTags) == 0:
             utils.logerror(fullname + '\n')
         for ulTag in ulTags:
             try:
                 aTag = ulTag.select_one('li.title > a')
                 if not aTag:
                     aTag = ulTag.select_one('li.title > h5 > a')
                 if aTag:
                     url = aTag.get('href')
                     uid = url.split('/')[-1]
                     result.append((uid, url, gch))
             except:
                 utils.printf(fullname)
                 utils.logerror(fullname)
                 break
         if filename.find('_') < 0:
             pageTag = soup.select_one('ul.pagination')
             if pageTag:
                 pTags = pageTag.select('li > a')
                 for pTag in pTags:
                     if operator.eq(pTag.get_text(), 'Last'):
                         pagenum = int(pTag.get('data-page-number'))
                         for page in range(2, pagenum + 1):
                             uri = '/core/journals/%s/issue/%s?pageNum=%s' % (
                                 gch, filename.replace('.html', ''), page)
                             issueresult.append((uri, 0))
         if utils.parse_results_to_sql(conn, stmt, result, 1000):
             cnt += len(result)
             result.clear()
             utils.printf(cnt)
     utils.parse_results_to_sql(conn, stmt, result)
     utils.parse_results_to_sql(conn, sql, issueresult)
     cnt += len(result)
     utils.printf(cnt)
     utils.printf('大于一页的个数为%s' % len(issueresult))
     conn.close()
     utils.printf('%s:解析列表页完成...' % self.provider)
     self.senddistributefinish('startdown_detail')
Example #24
def parsel_detail_one():
    conn_1 = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
    conn_2 = sqlite3.connect('mirrorimutmeixingbook_20191218.db3')
    sub_db_id = '243'
    provider = 'mirrorimutmeixingbook'
    type = '1'
    date = '1900'
    date_created = '19000000'
    medium = '2'
    sql_in = "insert into modify_title_info_zt(Lngid, rawid, provider, type, language, country, provider_url, provider_id, batch, title, creator, provider_subject, date, date_created, medium) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"
    result_2 = []
    now_time = time.strftime('%Y%m%d')
    fdir = '%s/%s' % (detail_path, now_time)
    for _, filename in utils.file_list(fdir):
        rawid = _.replace(".html", "")
        with open(filename, encoding='gb18030') as f:
            text = f.read()
        html = Selector(text, "html")
        creator = html.xpath(
            "//table[@style='WORD-BREAK: break-all']//tr/td/text()").extract(
            )[0].replace("作者:", "")
        if creator == "unknow":
            creator = ""
        if "ja" in rawid:
            id_ = rawid.replace('ja', '')
            url = "http://202.207.22.13:100/Soft_Showja.asp?SoftID=%s" % id_
            language = "JA"
            country = "JP"
            Lngid = utils.GetLngid(sub_db_id, rawid)
        else:
            language = "EN"
            country = "US"
            url = "http://202.207.22.13:100/Soft_Show.asp?SoftID=%s" % rawid
            Lngid = utils.GetLngid(sub_db_id, rawid)
        sql = "select title,provider_subject from detail where url = '%s'" % url
        cur = conn_1.cursor()
        cur.execute(sql)
        rows = cur.fetchall()
        title = rows[0][0].replace("\n", ' ')
        provider_subject = rows[0][1].replace("数字图书;", '')
        provider_url = provider + '@' + url
        provider_id = provider + '@' + rawid
        batch = str(now_time) + '00'
        result_2.append((Lngid, rawid, provider, type, language, country,
                         provider_url, provider_id, batch, title, creator,
                         provider_subject, date, date_created, medium))
    utils.parse_results_to_sql(conn_2, sql_in, result_2)
    utils.printf("插入剩下的%s条" % len(result_2))
    result_2.clear()
Example #25
def parse_detail():
    result = []
    conn_db3 = sqlite3.connect("zt_template.db3")
    sql_in = """
    insert into modify_title_info_zt (Lngid, rawid, provider, type, language, country, provider_url, provider_id, cover, batch, title, description, provider_subject, date, date_created, creator, medium , publisher) values (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
    """
    # Basic information shared by every record
    language = 'ZH'
    country = 'CN'
    type_ = '1'
    provider = '360elibbook'
    sub_db_id = '258'
    batch = now_time + '00'
    medium = "2"
    date = "1900"
    date_created = date + "0000"
    fdir = "%s\%s" % (detail_path,now_time)
    for _,dir_ in utils.file_list(fdir):
        utils.printf(dir_)
        pa = r"E:\\work\\360elib\\detail\\%s\\(.*)\\" % (now_time)
        provider_subject = re.findall(pa,dir_)[0]
        if provider_subject == 'None':
            provider_subject = ""
        with open(dir_,encoding='utf-8')as f:
            text = f.read()
        html = Selector(text,'html')
        rawid = _.replace(".html","")
        Lngid = utils.GetLngid(sub_db_id, rawid)
        provider_url = provider + '@' + "http://www.360elib.com:2100/chinese/web/Details.aspx?id=%s" % (rawid)
        provider_id = provider + '@' + rawid
        title = html.xpath("//span[@id='ctl00_ContentPlaceHolder1_lb_name']/text()").extract_first()
        creator = html.xpath("//span[@id='ctl00_ContentPlaceHolder1_lb_zz']/text()").extract_first("").replace(", ","").replace(",","").replace(",","").replace("、","")
        publisher = html.xpath("//span[@id='ctl00_ContentPlaceHolder1_lb_cbs']/text()").extract_first("")
        description = html.xpath("//span[@id='ctl00_ContentPlaceHolder1_lb_bookintro']/text()").extract_first("")
        cover_rawid = rawid.lower()
        cover_p = '%s/%s/%s.jpg' % (cover_path,now_time,cover_rawid)
        if os.path.exists(cover_p):
            cover = "/smartlib" + "/" + provider + "/" + cover_rawid + ".jpg"
        else:
            cover = ""
        result.append(
            (Lngid, rawid, provider, type_, language, country, provider_url, provider_id, cover, batch, title, description, provider_subject, date, date_created, creator, medium , publisher)
        )
        if utils.parse_results_to_sql(conn_db3, sql_in, result, 1000):
            utils.printf("插入%s条" % len(result))
            result.clear()
    utils.parse_results_to_sql(conn_db3, sql_in, result)
    utils.printf("插入剩下得%s条" % len(result))
    result.clear()
Example #26
 def getpageinfo(self):
     if not self.list_path:
         self.initpath()
     for filename, fullname in utils.file_list(self.list_path):
         with open(fullname, encoding='utf8') as f:
             text = f.read()
         volume = filename.replace('.html', '').split('_')[0]
         issue = filename.replace('.html', '').split('_')[-1]
         sel = Selector(text=text)
         for divTag in sel.xpath('//div[@class="issue-item"]'):
             doi = '10.2514/' + divTag.xpath(
                 './h5/a/@href').extract_first().split('/')[-1]
             page = divTag.xpath('./div[@class="issue-item__pages"]/text()'
                                 ).extract_first().replace('–', '-')
             self.dic[doi] = (page, volume, issue)
Example #27
def parse_list():
    conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
    regex_bookid = re.compile(r"bookinfo.aspx\?id=(\d+)")
    stmt = 'insert ignore into book (bookid,stat) values(%s,%s)'
    results = []
    for _, filename in utils.file_list(list_path):
        with open(filename, encoding='gb18030') as f:
            text = f.read()
        bookidlist = regex_bookid.findall(text)
        for bookid in bookidlist:
            results.append((bookid,0))
        if utils.parse_results_to_sql(conn, stmt, results, 1000):
            total = len(results)
            results.clear()
            print('插入 ', total, ' 个结果到数据库成功')
    utils.parse_results_to_sql(conn, stmt, results)
    print('插入 ', len(results), ' 个结果到数据库成功')
Example #28
    def parse_detail(self):
        super().parse_detail()
        language = "EN"
        type = "1"
        medium = "2"
        provider = "cqjtukingbook"
        country = "US"
        batch = time.strftime('%Y%m%d') + "00"
        stmt = (
            '''insert into modify_title_info_zt(lngid,rawid,title,creator,description,subject,date,date_created,identifier_pisbn,language,country,provider,provider_url,provider_id,type,medium,batch,publisher) 
                VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?);''')
        conn = utils.init_db("sqlite3", self.template_file)
        results = []
        cnt = 0
        for file, fullpath in utils.file_list(self.detail_path):
            with open(fullpath, encoding='utf8') as fp:
                txt = fp.read()
            try:
                title, creator, publishers, date, identifier_pisbn, subject, description = self._parse_detail_one(
                    txt)
            except:
                exMsg = '* ' + traceback.format_exc()
                logerror(fullpath)
                logerror(exMsg)
                continue
            date_created = date + '0000'
            basename, _, ext = file.partition(".")
            rawid = basename
            provider_url = provider + "@http://123.56.143.23/kingbookwaiwen/book/info.aspx?id=" + rawid
            provider_id = provider + "@" + rawid
            lngID = "CQJTU_KINGBOOK_TS_" + rawid
            results.append(
                (lngID, rawid, title, creator, description, subject, date,
                 date_created, identifier_pisbn, language, country, provider,
                 provider_url, provider_id, type, medium, batch, publishers))
            if utils.parse_results_to_sql(conn, stmt, results, 1000):
                cnt += len(results)
                print('%s:%s' % (time.strftime("%Y/%m/%d %X"), cnt))
                results.clear()

        utils.parse_results_to_sql(conn, stmt, results)
        cnt += len(results)
        print('%s:%s' % (time.strftime("%Y/%m/%d %X"), cnt))
        conn.close()
Example #29
def generate_interpolations(
    pt_dir: str,
    out_video_dir: str,
    out_video_name: str,
    scene_len: int = 180,
    fps: int = 25,
    size: List[int] = [720, 1280],
    decay: float = 1.,
    colors: float = 1.,
):
    pt_file_list = file_list(pt_dir, 'pt')
    num_frames = scene_len * fps

    for pt_idx in range(len(pt_file_list)):
        params1 = read_pt(pt_file_list[pt_idx])
        params2 = read_pt(pt_file_list[(pt_idx + 1) % len(pt_file_list)])

        _params, image_f = fft_image(
            [1, 3, *size],
            resume=params1,
            sd=1.,
            decay_power=decay,
        )
        image_f = to_valid_rgb(
            image_f,
            colors=colors,
        )

        for frame_idx in range(num_frames):
            with torch.no_grad():
                img = image_f(
                    (params2 - params1) *
                    math.sin(1.5708 * frame_idx / num_frames)**2)[0].permute(
                        1, 2, 0)
                img = torch.clip(img * 255, 0,
                                 255).cpu().numpy().astype(np.uint8)

            imsave(
                os.path.join(out_video_dir,
                             '%05d.jpg' % (pt_idx * num_frames + frame_idx)),
                img)

    os.system('ffmpeg -v warning -y -i %s\\%%05d.jpg "%s.mp4"' %
              (out_video_dir, out_video_name))
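A minimal call of `generate_interpolations` might look like the sketch below; the paths and scene length are placeholders rather than values from the original project, and `out_video_dir` must already exist because the function only writes frames into it:

generate_interpolations(
    pt_dir='checkpoints',      # hypothetical folder of saved .pt parameter snapshots
    out_video_dir='frames',    # hypothetical folder that receives the rendered JPEG frames
    out_video_name='morph',    # ffmpeg then assembles morph.mp4 from those frames
    scene_len=30,              # 30 seconds per checkpoint pair at the default 25 fps
)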
Example #30
 def parse_list(self):
     super().parse_list()
     conn = utils.init_db('mysql', 'cqjtu_kingbook')
     base_url = "http://123.56.143.23/kingbookwaiwen/book/"
     regex_bookid = re.compile(r"info.aspx\?id=(\d+)")
     stmt = 'insert ignore into book (bookid,stat) values(%s,%s)'
     results = []
     for _, filename in utils.file_list(self.list_path):
         with open(filename, encoding='utf8') as f:
             text = f.read()
         bookidlist = regex_bookid.findall(text)
         for bookid in bookidlist:
             results.append((bookid, 0))
         if utils.parse_results_to_sql(conn, stmt, results, 1000):
             total = len(results)
             results.clear()
             print('插入 ', total, ' 个结果到数据库成功')
     utils.parse_results_to_sql(conn, stmt, results)
     print('插入 ', len(results), ' 个结果到数据库成功')