Example #1
from argparse import ArgumentParser

def init_config():
    global args
    init_db()  # init_db() is defined elsewhere in the original project
    arg = ArgumentParser(description='CoreMail Upload Vul')
    arg.add_argument("-u", "--url", help="Target URL; Example: http://ip:port")
    arg.add_argument("-f", "--file", help="File of target URLs; Example: url.txt")
    args = arg.parse_args()
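The parsed options land in the module-level args. A typical invocation matching the help strings would look like the lines below (the script name is a placeholder, not taken from the example):

python coremail_poc.py -u http://192.168.1.10:8000
python coremail_poc.py -f url.txt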
Example #2
def main():
    while True:
        start()
        decision = input('Enter your choice: ')
        if not decision.isdigit():
            print('Wrong input')
            continue
        decision = int(decision)
        if not decision:
            print('See you next time')
            break
        elif decision == 1:
            show_all()
        elif decision == 2:
            write_new_record()
        elif decision == 3:
            edit_record()
        elif decision == 4:
            remove_record()
        elif decision == 5:
            search_by_id()
        elif decision == 6:
            filter_by_keyword()
        elif decision == 9:
            init_db()
Example #3
 def process_list(self, message):
     self.count = self.count + 1
     sql = "update issue set stat=1 where url='{}'".format(message)
     self.sqlList.append(sql)
     if self.count % 40 == 1:
         utils.printf('%s:下载成功 %s 页' % (self.provider, self.count))
         conn = utils.init_db('mysql', 'aiaajournal', 2)
         cur = conn.cursor()
         for sql in self.sqlList:
             cur.execute(sql)
         conn.commit()
         conn.close()
         self.sqlList.clear()
     if self.count % 100 == 0:
         self.refreshproxypool()
     if self.count == self.totalcount:
         conn = utils.init_db('mysql', 'aiaajournal', 2)
         cur = conn.cursor()
         for sql in self.sqlList:
             cur.execute(sql)
         conn.commit()
         conn.close()
         self.sqlList.clear()
         utils.printf('downloadlist finish')
         self.sendwork('parse_list')
 def startdown_detail(self, message):
     if not self.detail_path:
         self.initpath()
     self.sqlList.clear()
     self.refreshproxypool()
     self.count = 0
     conn = utils.init_db('mysql', 'hepengineeringjournal', 4)
     cur = conn.cursor()
     cur.execute(
         'select article_id,journal_id from article where stat=0 and failcount<3'
     )
     rows = cur.fetchall()
     self.totalcount = len(rows)
     if self.totalcount == 0:
         utils.printf('%s:下载详情页完成' % self.provider)
         # self.sendwork('parse_detail_meta')
         self.sendwork('parse_detail')
         # self.sendwork('down_cover')
         return
     messagelist = []
     for article_id, journal_id in rows:
         fdir = '%s/%s' % (self.detail_path, journal_id)
         if not os.path.exists(fdir):
             os.makedirs(fdir)
         messagelist.append((article_id, journal_id))
         if len(messagelist) == 30:
             blist = messagelist.copy()
             self.sendwork('down_detail', blist)
             # utils.printf('a'+len(messagelist))
             # utils.printf(messagelist)
             messagelist.clear()
     if len(messagelist) > 0:
         self.sendwork('down_detail', messagelist)
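Note: the helper utils.init_db itself is not shown in any of these examples. Below is a minimal sketch of what a call such as utils.init_db('mysql', 'aiaajournal', 2) might map to, assuming a PyMySQL backend; the credentials are placeholders and the meaning of the third argument (guessed here as a worker/pool hint) is not visible in the snippets:

import pymysql

def init_db(dbtype, dbname, workers=1):
    # Sketch only: the real connection parameters live in each project's own utils module.
    if dbtype != 'mysql':
        raise ValueError('only the mysql branch is sketched here')
    return pymysql.connect(host='127.0.0.1', user='root', password='password',
                           database=dbname, charset='utf8mb4')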
Example #5
 def parse_list(self, message):
     utils.printf('%s:解析列表页开始...' % self.provider)
     conn = utils.init_db('mysql', 'apsjournal')
     result = []
     stmt = 'insert ignore into article(url,vol,issue) Values(%s,%s,%s)'
     cnt = 0
     for filename, fullname in utils.file_list(self.list_path):
         with open(fullname, encoding='utf8') as f:
             text = f.read()
         vol = filename.split('_')[0]
         issue = filename.split('_')[-1].replace('.html', '')
         soup = BeautifulSoup(text, 'lxml')
         aTags = soup.select('div.large-9.columns > h5 > a')
         for aTag in aTags:
             url = aTag.get('href')
             if not url.startswith('/'):
                 continue
             url = 'https://journals.aps.org' + url
             result.append((url, vol, issue))
         if utils.parse_results_to_sql(conn, stmt, result, 1000):
             cnt += len(result)
             result.clear()
             utils.printf(cnt)
     utils.parse_results_to_sql(conn, stmt, result)
     cnt += len(result)
     utils.printf(cnt)
     conn.close()
     utils.printf('%s:解析列表页完成...' % self.provider)
     self.senddistributefinish('startdown_detail')
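utils.parse_results_to_sql is used throughout these examples in two ways: with a chunk size inside the loop (flush once enough rows have accumulated, returning True so the caller can clear its buffer) and without one at the end (flush whatever is left). A hedged sketch of that inferred behaviour, not the actual helper:

def parse_results_to_sql(conn, stmt, rows, chunk=None):
    # Flush `rows` via executemany once `chunk` rows have piled up, or
    # unconditionally when no chunk is given; report whether a flush happened.
    if not rows or (chunk is not None and len(rows) < chunk):
        return False
    cur = conn.cursor()
    cur.executemany(stmt, rows)  # stmt uses the driver's placeholder style (%s or ?)
    conn.commit()
    cur.close()
    return True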
Example #6
 def parse_html(self, message):
     utils.printf('%s:解析起始页开始...' % self.provider)
     conn = utils.init_db('mysql', 'bioonejournal')
     result = []
     stmt = 'insert ignore into journal(url,cover_url) Values(%s,%s) on DUPLICATE key UPDATE cover_url=%s'
     cnt = 0
     for filename, fullname in utils.file_list(self.html_path):
         with open(fullname, encoding='utf8') as f:
             text = f.read()
         soup = BeautifulSoup(text, 'lxml')
         aTaglist = soup.select('div.journal.BrowseTitleAll > a')
         for aTag in aTaglist:
             url = aTag.get('href')
             if url == "/journals/":
                 continue
             if url.startswith('/ebooks'):
                 continue
             cover_url = aTag.img.get('src')
             result.append((url, cover_url, cover_url))
         utils.parse_results_to_sql(conn, stmt, result)
         cnt += len(result)
         result.clear()
         utils.printf(cnt)
     conn.close()
     utils.printf('%s:解析起始页完成...' % self.provider)
     self.senddistributefinish('startdown_indexlist')
Example #7
 def parse_detail(self, message):
     conn = utils.init_db('mysql', 'aiaabook', 2)
     cur = conn.cursor()
     cur.execute('select url,pub_year from book')
     rows = cur.fetchall()
     for url, pub_year in rows:
         doi = '10.2514/' + url.split('/')[-1]
     self.dic[doi] = pub_year
     cur.close()
     conn.close()
     self.predb3()
     self.sqlList.clear()
     stmt = """insert or ignore into modify_title_info_zt(lngid, rawid, creator, title, identifier_pisbn,
      identifier_eisbn, description, publisher,cover,title_series,
      date,date_created, price, language, country, provider, provider_url, identifier_doi, provider_id,
     type,medium, batch) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
     """
     count = 0
     for filename, fullname in utils.file_list(self.detail_path):
         onemessage = self.parse_detail_one(filename, fullname, 'zt')
         # print(onemessage)
         if onemessage:
             self.sqlList.append(onemessage)
         if utils.parse_results_to_sql(self.conn, stmt, self.sqlList, 50):
             count += len(self.sqlList)
             utils.printf('%s: 插入 %d 条数据到db3' % (self.provider, count))
             self.sqlList.clear()
     utils.parse_results_to_sql(self.conn, stmt, self.sqlList)
     count += len(self.sqlList)
     utils.printf('%s: 插入 %d 条数据到db3' % (self.provider, count))
     self.conn.close()
     self.conn = None
     utils.msg2weixin('%s: 解析完成,成品文件为%s' %
                      (self.provider, self.template_file))
Example #8
 def parse_indexlist(self, message):
     try:
         utils.printf('%s:解析期索引页开始...' % self.provider)
         conn = utils.init_db('mysql', 'bioonejournal')
         self.sqlList.clear()
         cnt = 0
         cur = conn.cursor()
         path = '%s/%s' % (self.datepath, 'indexlist')
         for filename, fullname in utils.file_list(path):
             with open(fullname, encoding='utf8') as f:
                 text = f.read()
             soup = BeautifulSoup(text, 'lxml')
             aTags = soup.find_all('a', class_='IssueByYearInnerText')
             for aTag in aTags:
                 url = aTag.get('href').replace('https://bioone.org', '')
                 self.sqlList.append(
                     "insert ignore into issuelist(url,year) Values('%s','%s')"
                     % (url, url.split('/')[-1]))
             cnt += len(self.sqlList)
             for sql in self.sqlList:
                 cur.execute(sql)
             conn.commit()
             self.sqlList.clear()
             utils.printf(cnt)
         cur.close()
         conn.close()
         utils.printf('%s:解析索引页完成...' % self.provider)
         # self.sendwork('down_cover')
         self.senddistributefinish('startdown_index')
     except:
         exMsg = '* ' + traceback.format_exc()
         print(exMsg)
         utils.logerror(exMsg)
Example #9
 def parse_list(self, message):
     utils.printf('%s:解析列表页开始...' % self.provider)
     conn = utils.init_db('mysql', 'ascebook')
     result = []
     stmt = 'insert ignore into book(url,cover_url) Values(%s,%s)'
     cnt = 0
     for filename, fullname in utils.file_list(self.list_path):
         with open(fullname, encoding='utf8') as f:
             text = f.read()
         soup = BeautifulSoup(text, 'lxml')
         divlist = soup.select(
             '#frmSearchResults > div > div.listBody > div > div.leftSide')
         for divTag in divlist:
             url = divTag.a.get('href')
             isbn = url.split('/')[-1]
             cover_url = ''
             if not isbn.startswith('978'):
                 continue
             coverTag = divTag.a.select_one('img')
             if coverTag:
                 cover_url = coverTag.get('src')
             result.append((url, cover_url))
     utils.parse_results_to_sql(conn, stmt, result)
     cnt += len(result)
     utils.printf(cnt)
     conn.close()
     utils.printf('%s:解析列表页完成...' % self.provider)
     self.senddistributefinish('startdown_detail')
Example #10
 def startdown_list(self, message):
     utils.printf('%s:开始下载列表页...' % self.provider)
     if not self.list_path:
         self.initpath()
     self.refreshproxypool()
     self.count = 0
     conn = utils.init_db('mysql', 'science')
     cur = conn.cursor()
     cur.execute('select url,stat from issue where stat=0')
     rows = cur.fetchall()
     self.totalcount = len(rows)
     if self.totalcount == 0:
         if len(os.listdir(self.index_path)) == 0:
             utils.logerror('%s:没有新的issue不需要更新' % self.provider)
             utils.msg2weixin('%s:没有新的issue不需要更新' % self.provider)
         else:
             self.sendwork('parse_list')
     for url, _ in rows:
         fdir = self.list_path + '/' + url.split('.')[0]
         if not os.path.exists(fdir):
             os.makedirs(fdir)
         fname = fdir + '/' + url.split('/')[-2] + '_' + url.split(
             '/')[-1] + '.html'
         url = 'http://' + url
         self.sendwork('down_list', (url, fname))
Example #11
 def parse_html(self, message):
     utils.printf('%s:解析起始页开始...' % self.provider)
     conn = utils.init_db('mysql', 'aiaabook', 2)
     result = []
     stmt = 'insert ignore into book(book_name,url,pub_year,cover_url) Values(%s,%s,%s,%s)'
     cnt = 0
     for filename, fullname in utils.file_list(self.html_path):
         with open(fullname, encoding='utf8') as f:
             text = f.read()
         try:
             sel = Selector(text=text)
             for liTag in sel.xpath('//li[@class="search-item clearfix"]'):
                 book_name = liTag.xpath(
                     './div/h4/a/text()').extract_first().strip()
                 url = liTag.xpath('./div/h4/a/@href').extract_first()
                 pub_year = liTag.xpath(
                     './/div[@class="search-item__data-group__field meta__date"]/text()'
                 ).extract_first()
                 cover_url = liTag.xpath(
                     './div/a/img/@src').extract_first().strip()
                 result.append((book_name, url, pub_year, cover_url))
             utils.printf(len(result))
         except:
             exMsg = '* ' + traceback.format_exc()
             print(exMsg)
             utils.logerror(exMsg)
             utils.logerror(fullname)
             return
     utils.parse_results_to_sql(conn, stmt, result)
     cnt += len(result)
     utils.printf(cnt)
     conn.close()
     utils.printf('%s:解析起始页完成...' % self.provider)
     self.senddistributefinish('startdown_detail')
Example #12
 def down_detail(self):
     utils.printf("下载详情页开始...")
     super().down_detail()
     conn = utils.init_db('mysql', 'cqjtu_kingbook')
     cur = conn.cursor()
     while True:
         cur.execute(
             'select bookid,stat from book where stat=0 limit 10000')
         rows = cur.fetchall()
         conn.commit()
         if len(rows) == 0:
             break
         for bookid, _ in rows:
             print(bookid)
             url = 'http://123.56.143.23/kingbookwaiwen/book/info.aspx?id={}'.format(
                 bookid)
             dirname = '%s/%s' % (self.detail_path, bookid[:3])
             if not os.path.exists(dirname):
                 os.makedirs(dirname)
             filename = '%s/%s.html' % (dirname, bookid)
             if os.path.exists(filename):
                 sql = 'update book set stat=1 where bookid="{}"'.format(
                     bookid)
                 cur.execute(sql)
                 conn.commit()
                 continue
             resp = utils.get_html(url, proxies=self.proxy)
             if not resp:
                 continue
             with open(filename, mode='w', encoding='utf8') as f:
                 f.write(resp.content.decode())
             sql = 'update book set stat=1 where bookid="{}"'.format(bookid)
             cur.execute(sql)
             conn.commit()
             utils.printf("下载", bookid, "成功...")
Example #13
 def parse_index(self, message):
     workdir = message
     try:
         utils.printf('%s:解析索引页开始...' % self.provider)
         conn = utils.init_db('mysql', 'apsjournal')
         result = []
         stmt = 'insert ignore into issue(url,year) Values(%s,%s) on DUPLICATE key UPDATE year=%s'
         cnt = 0
         for filename, fullname in utils.file_list(workdir):
             with open(fullname, encoding='utf8') as f:
                 text = f.read()
             soup = BeautifulSoup(text, 'lxml')
             liTags = soup.select('div.volume-issue-list > ul > li')
             for liTag in liTags:
                 yeartext = liTag.get_text().strip()
                 year = re.sub(r'.*?(\d{4}) \(.*?\)', r'\1', yeartext)
                 url = 'https://journals.aps.org' + liTag.b.a.get('href')
                 result.append((url, year, year))
             if utils.parse_results_to_sql(conn, stmt, result, 1000):
                 cnt += len(result)
                 result.clear()
                 utils.printf(cnt)
         utils.parse_results_to_sql(conn, stmt, result)
         cnt += len(result)
         utils.printf(cnt)
         conn.close()
         utils.printf('%s:解析索引页完成...' % self.provider)
         self.senddistributefinish('startdown_list')
     except:
         exMsg = '* ' + traceback.format_exc()
         print(exMsg)
         utils.logerror(exMsg)
 def get_issuelist(self, message):
     utils.printf('%s:开始获取期列表页...' % self.provider)
     if not self.list_path:
         self.initpath()
     self.refreshproxypool()
     self.sqlList.clear()
     self.count = 0
     conn = utils.init_db('mysql', 'hepengineeringjournal', 4)
     cur = conn.cursor()
     cur.execute('select journal_id,journal_name from journal')
     rows = cur.fetchall()
     utils.printf(rows)
     for journal_id, name in rows:
         text = None
         while True:
             url = 'http://www.engineering.org.cn/default/journal/CurrentIssue/AllVolumeId?journalId=%s' % journal_id
             utils.printf(url)
             resp = self.gethtml(url, '"success":true', None)
             if resp:
                 text = resp.content.decode('utf8')
                 break
         dic = json.loads(text, encoding='utf-8')
         index = 1
         for issue_id in dic['resultValue']:
             sql = 'insert into issue(journal_id,issue_id,issue_index) Values(%s,%s,%s) on DUPLICATE key UPDATE issue_index=%s' % (
                 journal_id, issue_id, index, index)
             cur.execute(sql)
             index += 1
         conn.commit()
         utils.printf('%s:插入%s期' % (self.provider, index))
     conn.close()
     self.senddistributefinish('startdown_list')
 def parse_list(self, message):
     utils.printf('%s:解析列表页开始...' % self.provider)
     conn = utils.init_db('mysql', 'hepengineeringjournal', 4)
     result = []
     stmt = 'insert ignore into article(article_id,journal_id) Values(%s,%s)'
     cnt = 0
     for filename, fullname in utils.file_list(self.list_path):
         with open(fullname, encoding='utf8') as f:
             text = f.read()
         journal_id = fullname.split('\\')[-2]
         dicitem = json.loads(text, encoding='utf-8')['resultValue']
         for lanmu in dicitem.keys():
             for fenlei in dicitem[lanmu].keys():
                 for dicdetail in dicitem[lanmu][fenlei]:
                     article_id = dicdetail['id']
                     result.append((article_id, journal_id))
         if utils.parse_results_to_sql(conn, stmt, result, 1000):
             cnt += len(result)
             result.clear()
             utils.printf(cnt)
     utils.parse_results_to_sql(conn, stmt, result)
     cnt += len(result)
     utils.printf(cnt)
     conn.close()
     utils.printf('%s:解析列表页完成...' % self.provider)
     self.senddistributefinish('startdown_detail')
     self.sendwork('down_cover')
 def parse_index(self, message):
     try:
         utils.printf('%s:解析索引页开始...' % self.provider)
         conn = utils.init_db('mysql', 'hepengineeringjournal', 4)
         self.sqlList.clear()
         cur = conn.cursor()
         for filename, fullname in utils.file_list(self.index_path):
             with open(fullname, encoding='utf8') as f:
                 text = f.read()
             dic = json.loads(text, encoding='utf-8')
             gch = filename.replace('.json', '')
             dicitem = dic['resultValue']
             issn = dicitem['issnNm']
             cnno = dicitem['cnNm']
             sql = 'update journal set issn="%s",cnno="%s" where journal_id="%s"' % (
                 issn, cnno, gch)
             cur.execute(sql)
             conn.commit()

         cur.close()
         conn.close()
         utils.printf('%s:解析索引页完成...' % self.provider)
         # self.sendwork('down_cover')
         self.senddistributefinish('get_issuelist')
     except:
         exMsg = '* ' + traceback.format_exc()
         print(exMsg)
         utils.logerror(exMsg)
 def startdown_list(self, message):
     utils.printf('%s:开始下载列表页...' % self.provider)
     if not self.list_path:
         self.initpath()
     self.sqlList.clear()
     self.refreshproxypool()
     self.count = 0
     conn = utils.init_db('mysql', 'cambridgejournal')
     cur = conn.cursor()
     cur.execute('select url,stat from issue where stat=0')
     rows = cur.fetchall()
     self.totalcount = len(rows)
     if self.totalcount == 0:
         if len(os.listdir(self.list_path)) == 0:
             utils.logerror('%s:没有新的issue不需要更新' % self.provider)
         else:
             # self.sendwork('down_cover')
             self.sendwork('parse_list')
             return
     self.refreshsession()
     for url, _ in rows:
         fdir = self.list_path + '/' + url.split('/')[-3]
         if not os.path.exists(fdir):
             os.makedirs(fdir)
         flast = url.split('/')[-1]
         if flast.find('?pageNum=') > 0:
             flast = flast.split('?')[0] + '_' + flast.split('=')[-1]
         fname = fdir + '/' + flast + '.html'
         self.sendwork('down_list', (url, fname))
Example #18
 def parse_list(self, message):
     utils.printf('%s:解析列表页开始...' % self.provider)
     conn = utils.init_db('mysql', 'hepjournal', 4)
     result = []
     stmt = 'insert ignore into article(url,journal_id) Values(%s,%s)'
     cnt = 0
     for filename, fullname in utils.file_list(self.list_path):
         with open(fullname, encoding='utf8') as f:
             text = f.read()
         journal_id = filename.split('_')[0]
         sel = Selector(text=text)
         for aTag in sel.xpath('//a[@class="txt_biaoti"]'):
             url = aTag.xpath('./@href').extract_first()
             result.append((url, journal_id))
         if utils.parse_results_to_sql(conn, stmt, result, 1000):
             cnt += len(result)
             result.clear()
             utils.printf(cnt)
     utils.parse_results_to_sql(conn, stmt, result)
     cnt += len(result)
     utils.printf(cnt)
     conn.close()
     utils.printf('%s:解析列表页完成...' % self.provider)
     self.senddistributefinish('startdown_detail')
     self.sendwork('down_cover')
Example #19
 def startdown_list(self, message):
     utils.printf('%s:开始下载列表页...' % self.provider)
     if not self.list_path:
         self.initpath()
     self.refreshproxypool()
     self.sqlList.clear()
     self.count = 0
     conn = utils.init_db('mysql', 'hepjournal', 4)
     cur = conn.cursor()
     cur.execute('select url,journal_id from issue where stat=0')
     rows = cur.fetchall()
     self.totalcount = len(rows)
     if self.totalcount == 0:
         if len(os.listdir(self.list_path)) == 0:
             utils.logerror('%s:没有新的issue不需要更新' % self.provider)
         else:
             # self.sendwork('down_cover')
             self.sendwork('parse_list')
     for url, journal_id in rows:
         fdir = self.list_path + '/' + journal_id
         if not os.path.exists(fdir):
             os.makedirs(fdir)
         fname = fdir + '/' + journal_id + '_' + url.split(
             '/')[-2] + '_' + url.split('/')[-1].replace('.shtml', '.html')
         self.sendwork('down_list', (url, fname))
Example #20
 def parse_index(self, message):
     try:
         utils.printf('%s:解析索引页开始...' % self.provider)
         conn = utils.init_db('mysql', 'aiaajournal', 2)
         result = []
         stmt = 'insert ignore into issue(url,stat) Values(%s,%s)'
         cnt = 0
         for filename, fullname in utils.file_list(self.index_path):
             with open(fullname, encoding='utf8') as f:
                 text = f.read()
             sel = Selector(text=text)
             for aTag in sel.xpath('//a[@class="loi__issue__vol"]'):
                 url = aTag.xpath('./@href').extract_first()
                 if url.endswith('/0/0'):
                     continue
                 result.append(('https://arc.aiaa.org' + url, 0))
             if utils.parse_results_to_sql(conn, stmt, result, 200):
                 cnt += len(result)
                 result.clear()
                 utils.printf(cnt)
         utils.parse_results_to_sql(conn, stmt, result)
         cnt += len(result)
         utils.printf(cnt)
         conn.close()
         utils.printf('%s:解析索引页完成...' % self.provider)
         self.senddistributefinish('startdown_list')
     except:
         exMsg = '* ' + traceback.format_exc()
         print(exMsg)
         utils.logerror(exMsg)
Example #21
 def parse_list(self, message):
     utils.printf('%s:解析起始页开始...' % self.provider)
     conn = utils.init_db('mysql', 'aiaajournal', 2)
     result = []
     stmt = 'insert ignore into article(id,url,vol,stat,failcount) Values(%s,%s,%s,%s,%s)'
     cnt = 0
     for filename, fullname in utils.file_list(self.list_path):
         with open(fullname, encoding='utf8') as f:
             text = f.read()
         sel = Selector(text=text)
         for href in sel.xpath('//h5[@class="issue-item__title"]/a/@href'):
             url = href.extract().replace('/doi/', '/doi/abs/').strip()
             id = fullname.split('\\')[-2] + '_' + url.split('/')[-1]
             vol = filename.split('_')[0]
             print(id, url)
             result.append((id, url, vol, 0, 0))
         if utils.parse_results_to_sql(conn, stmt, result, 200):
             cnt += len(result)
             utils.printf('%s解析%s条数据到数据库' % (self.provider, cnt))
             result.clear()
     cnt += len(result)
     utils.parse_results_to_sql(conn, stmt, result)
     utils.printf('%s解析%s条数据到数据库' % (self.provider, cnt))
     utils.printf('%s解析列表页完成' % self.provider)
     self.senddistributefinish('startdown_detail')
Example #22
 def run(self):
     conn = utils.init_db('mysql', 'aipjournal')
     cur = conn.cursor()
     sql = "select url,stat from issue where stat=0 limit 1000;"
     time_last = time.time()
     cnt = 0
     while True:
         if url_queue.empty():
             cur.execute(sql)
             rows = cur.fetchall()
             conn.commit()
             if rows:
                 for row in rows:
                     url_queue.put(row)
             elif sql_queue.empty():
                 break
         time_now = time.time()
         if (sql_queue.qsize() > 100) or (time_now - time_last > 60):
             num = sql_queue.qsize()
             while num > 0:
                 url, flag = sql_queue.get()
                 cur.execute(
                     "update issue set stat={} where url='{}'".format(
                         flag, url))
                 cnt += 1
                 num -= 1
             conn.commit()
             utils.printf('succeeded:%d' % cnt)
             time_last = time.time()
         time.sleep(1)
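run() assumes two module-level queues shared with downloader threads: workers take (url, stat) rows from url_queue and push (url, new_stat) results onto sql_queue for this loop to commit in batches. A bare sketch of that wiring, with the download step left as a comment since it is not part of the example:

import queue

url_queue = queue.Queue()
sql_queue = queue.Queue()

def worker():
    # Hypothetical downloader: fetch the issue page for each queued URL,
    # then report the new stat value back for run() to write to MySQL.
    while True:
        url, _ = url_queue.get()
        # ... download url here ...
        sql_queue.put((url, 1))
        url_queue.task_done()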
Example #23
 def parse_detail_meta(self, message):
     conn = utils.init_db('mysql', 'aiaajournal', 2)
     cur = conn.cursor()
     cur.execute(
         'select gch,journal_name,journal_name_en,pissn,eissn from journal')
     rows = cur.fetchall()
     for gch, journal_name, journal_name_en, pissn, eissn in rows:
         self.dic[gch] = (journal_name, journal_name_en, pissn, eissn)
     cur.close()
     conn.close()
     self.predb3('base_obj_meta_a_template_qk.db3',
                 'base_obj_meta_a_qk.aiaajournal')
     self.sqlList.clear()
     stmt = """insert into base_obj_meta_a (author,author_1st,organ,organ_1st,title,title_alt,keyword,pub_year,pub_date,
     vol,num,journal_raw_id,journal_name,journal_name_alt,page_info,begin_page,end_page,subject,is_oa,down_cnt,lngid,
     rawid,product,sub_db,
     provider,sub_db_id,source_type,provider_url,country,language,batch,down_date,publisher,issn,eissn,abstract,
     abstract_alt,doi,fund,ref_cnt,fulltext_type) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,
     ?,?,?,?,?,?,?,?,?,?,?)"""
     count = 0
     for filename, fullname in utils.file_list(self.detail_path):
         onemessage = self.parse_detail_one(filename, fullname, 'meta')
         if onemessage:
             self.sqlList.append(onemessage)
         if utils.parse_results_to_sql(self.conn, stmt, self.sqlList, 50):
             count += len(self.sqlList)
             utils.printf('%s: 插入 %d 条数据到db3' % (self.provider, count))
             self.sqlList.clear()
     utils.parse_results_to_sql(self.conn, stmt, self.sqlList)
     count += len(self.sqlList)
     utils.printf('%s: 插入 %d 条数据到db3' % (self.provider, count))
     self.conn.close()
     self.conn = None
     utils.msg2weixin('%s: 解析完成,成品文件为%s' %
                      (self.provider, self.template_file))
Example #24
 def startdown_list(self, message):
     utils.printf('%s:开始下载列表页...' % self.provider)
     if not self.list_path:
         self.initpath()
     self.refreshproxypool()
     self.sqlList.clear()
     self.count = 0
     conn = utils.init_db('mysql', 'aiaajournal', 2)
     cur = conn.cursor()
     cur.execute('select url,stat from issue where stat=0')
     rows = cur.fetchall()
     self.totalcount = len(rows)
     if self.totalcount == 0:
         if len(os.listdir(self.list_path)) == 0:
             utils.logerror('%s:没有新的issue不需要更新' % self.provider)
         else:
             self.sendwork('parse_list')
     for url, _ in rows:
         urlsp = url.split('/')
         base_name = '%s_%s.html' % (urlsp[-2], urlsp[-1])
         fdir = '%s/%s' % (self.list_path, urlsp[-3])
         fname = '%s/%s' % (fdir, base_name)
         if not os.path.exists(fdir):
             os.makedirs(fdir)
         self.sendwork('down_list', (url, fname))
Example #25
 def parse_detail(self, message):
     conn = utils.init_db('mysql', 'hepjournal', 4)
     cur = conn.cursor()
     cur.execute(
         'select journal_id,journal_name,issn,eissn,cnno from journal')
     rows = cur.fetchall()
     for journal_id, journal_name, issn, eissn, cnno in rows:
         self.dic[journal_id] = (journal_name, issn, eissn, cnno)
     cur.close()
     conn.close()
     self.predb3()
     self.sqlList.clear()
     stmt = """insert or ignore into modify_title_info_zt(lngid, rawid, creator, title, volume, issue, page, beginpage,
     endpage, publisher, subject, date,creator_institution, date_created, source, identifier_pissn, identifier_eissn,
     identifier_cnno, description, identifier_doi, language, country, provider, provider_url, provider_id, type, medium,
     batch, gch)values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"""
     count = 0
     for filename, fullname in utils.file_list(self.detail_path):
         onemessage = self.parse_detail_one(filename, fullname)
         if onemessage:
             self.sqlList.append(onemessage)
         if utils.parse_results_to_sql(self.conn, stmt, self.sqlList, 50):
             count += len(self.sqlList)
             utils.printf('%s: 插入 %d 条数据到db3' % (self.provider, count))
             self.sqlList.clear()
     utils.parse_results_to_sql(self.conn, stmt, self.sqlList)
     count += len(self.sqlList)
     utils.printf('%s: 插入 %d 条数据到db3' % (self.provider, count))
     self.conn.close()
     self.conn = None
     utils.msg2weixin('%s: 解析完成,成品文件为%s' %
                      (self.provider, self.template_file))
 def parse_html(self):
     utils.printf('%s:解析起始页开始...' % self.provider)
     conn = utils.init_db('mysql', 'hepengineeringjournal', 4)
     result = []
     stmt = 'insert ignore into journal(journal_id,journal_name,cover_url) Values(%s,%s,%s)'
     cnt = 0
     for filename, fullname in utils.file_list(self.html_path):
         with open(fullname, encoding='utf8') as f:
             text = f.read()
         try:
             dic = json.loads(text, encoding='utf8')
             for dicitem in dic['resultValue']:
                 dicitem = json.loads(dicitem, encoding='utf8')
                 gch = dicitem['id']
                 name = dicitem['name']
                 cover_url = dicitem['volumeImg']
                 if cover_url == '':
                     cover_url = dicitem['journalImg']
                 print(gch, name, cover_url)
                 result.append((gch, name, cover_url))
         except:
             exMsg = '* ' + traceback.format_exc()
             print(exMsg)
             utils.logerror(exMsg)

     utils.parse_results_to_sql(conn, stmt, result)
     cnt += len(result)
     utils.printf(cnt)
     conn.close()
     utils.printf('%s:解析起始页完成...' % self.provider)
     self.senddistributefinish('startdown_index')
Example #27
 def parse_index(self, message):
     try:
         utils.printf('%s:解析索引页开始...' % self.provider)
         conn = utils.init_db('mysql', 'science')
         result = []
         stmt = 'insert ignore into issue(url,stat) Values(%s,%s)'
         cnt = 0
         for filename, fullname in utils.file_list(self.index_path):
             urlf = '{}.sciencemag.org'.format(filename.split('_')[0])
             with open(fullname, encoding='utf8') as f:
                 text = f.read()
             soup = BeautifulSoup(text, 'lxml')
             divTags = soup.find_all(
                 'div',
                 class_=
                 'highwire-cite highwire-cite-highwire-issue highwire-citation-jnl-sci-issue-archive clearfix'
             )
             for divTag in divTags:
                 url = urlf + divTag.a.get('href')
                 result.append((url, 0))
             if utils.parse_results_to_sql(conn, stmt, result, 1000):
                 cnt += len(result)
                 result.clear()
                 utils.printf(cnt)
         utils.parse_results_to_sql(conn, stmt, result)
         cnt += len(result)
         utils.printf(cnt)
         conn.close()
         utils.printf('%s:解析索引页完成...' % self.provider)
         self.senddistributefinish('startdown_list')
     except:
         exMsg = '* ' + traceback.format_exc()
         print(exMsg)
         utils.logerror(exMsg)
Example #28
 def startdown_list(self, message):
     utils.printf('%s:开始下载列表页...' % self.provider)
     if not self.list_path:
         self.initpath()
     self.refreshproxypool()
     self.count = 0
     conn = utils.init_db('mysql', 'apsjournal')
     cur = conn.cursor()
     current_year = time.strftime('%Y')
     cur.execute(
         "select url,stat from issue where stat=0 or year=%s or year=%s" %
         (current_year, int(current_year) - 1))
     rows = cur.fetchall()
     self.totalcount = len(rows)
     if self.totalcount == 0:
         if len(os.listdir(self.list_path)) == 0:
             utils.logerror('%s:没有新的issue不需要更新' % self.provider)
         else:
             self.sendwork('parse_list')
     for url, _ in rows:
         fdir = self.list_path + '/' + url.split('/')[-4]
         if not os.path.exists(fdir):
             os.makedirs(fdir)
         fname = fdir + '/' + url.split('/')[-2] + '_' + url.split(
             '/')[-1] + '.html'
         self.sendwork('down_list', (url, fname))
 def down_cover(self, message):
     utils.printf('开始下载图片')
     if not self.cover_path:
         self.initpath()
     self.refreshproxypool()
     conn = utils.init_db('mysql', 'hepengineeringjournal', 4)
     cur = conn.cursor()
     cur.execute(
         "select journal_id,cover_url from journal where cover_url!=''")
     rows = cur.fetchall()
     HEADER = {
         'User-Agent':
         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
     }
     for journal_id, cover_url in rows:
         filename = self.cover_path + '/' + journal_id + '.jpg'
         if os.path.exists(filename):
             continue
         while True:
             try:
                 proxy = self.getproxy()
                 proxies = {'http': proxy, 'https': proxy}
                 resp = requests.get(cover_url,
                                     headers=HEADER,
                                     timeout=20,
                                     proxies=proxies)
                 # resp = requests.get(cover_url, headers=HEADER, timeout=20)
             except:
                 utils.printf(filename)
                 continue
             if utils.Img2Jpg(resp.content, filename):
                 utils.printf('下载图片%s成功' % filename)
                 break
     self.sendwork('mapcover')
Example #30
def init_config():
    parser = argparse.ArgumentParser()     
    logging.basicConfig(level=logging.DEBUG, format='%(asctime)s [%(module)10s] [%(levelname)5s] %(message)s')

    load   = {}
    config_file = "config.json"
    if os.path.isfile(config_file):
        with open(config_file) as data:
            load.update(json.load(data))

    parser.add_argument("-a", "--auth_service", help="Auth Service ('ptc' or 'google')", default="ptc")
    parser.add_argument("-u", "--username", help="Username")
    parser.add_argument("-p", "--password", help="Password")
    parser.add_argument("-l", "--location", help="Location")
    parser.add_argument("-r", "--radius", help="area circle radius", type=int)
    parser.add_argument("-w", "--width", help="area square width", type=int)
    parser.add_argument("-f", "--dbfile", help="DB filename", default='db.sqlite')
    parser.add_argument("--level", help="cell level used for tiling", default=13, type=int)
    parser.add_argument("-t", "--delay", help="rpc request interval", default=10, type=int)
    parser.add_argument("-d", "--debug", help="Debug Mode", action='store_true', default=0)
    parser.add_argument("-n", "--test", help="Beta algorithm", action='store_true', default=0)        
    config = parser.parse_args()

    for key in config.__dict__:
        if key in load and config.__dict__[key] is None:
            config.__dict__[key] = load[key]

    if config.auth_service not in ['ptc', 'google']:
        log.error("Invalid Auth service specified! ('ptc' or 'google')")
        return None

    if config.debug:
        logging.getLogger("requests").setLevel(logging.DEBUG)
        logging.getLogger("pgoapi").setLevel(logging.DEBUG)
        logging.getLogger("rpc_api").setLevel(logging.DEBUG)
    else:
        logging.getLogger("requests").setLevel(logging.WARNING)
        logging.getLogger("pgoapi").setLevel(logging.WARNING)
        logging.getLogger("rpc_api").setLevel(logging.WARNING)
   
    dbversion = check_db(config.dbfile)     
    if dbversion != VERSION:
        log.error('Database version mismatch! Expected {}, got {}...'.format(VERSION,dbversion))
        return
    
    if config.location:
        from utils import get_pos_by_name
        lat, lng, alt = get_pos_by_name(config.location); del alt
        if config.radius:
            cells = cover_circle(lat, lng, config.radius, config.level)
        elif config.width:
            cells = cover_square(lat, lng, config.width, config.level)
        else: log.error('Area size not given!'); return
        log.info('Added %d cells to scan queue.' % init_db(cells, config.dbfile))
        del cells, lat, lng
    
    return config
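Because the merge loop above only copies a value from config.json when the parsed argument is still None, only the options without non-None defaults (username, password, location, radius, width) can actually come from the file; keys must match the argparse dest names. A throwaway snippet that writes such a file with placeholder values:

import json

sample = {
    "username": "trainer",        # placeholder values; keys mirror the argparse dests
    "password": "secret",
    "location": "Central Park, New York",
    "radius": 1000,
}
with open("config.json", "w") as f:
    json.dump(sample, f, indent=4)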
Example #31
def gettagarts(tag):
    db = bottle.local.db
    ret = []
    if tag in db:
        tl = db[tag]
    else:
        return ret
    for a in tl:
        art = db[a]
        ret.insert(0, Article(art['title'], art['content'], art['time'], art['tags']))
    return ret
    
if __name__ == "__main__":  
    import utils
    db = utils.init_db().connect()
    db.clear()
    db['tags'] = ['engadege', 'weiphone']
    
    title = '''外媒评出2010年十大最令人难忘产品'''
    content = '''<div id="read_content">

                 <P align=center><IMG border=0 src="http://resource.weiphone.com/resource/h003/h57/img201012041143530.jpg"><!--威锋网www.weiphone.com版权所有61.174.61.178 --></p>

<P>  公关和市场传播公司Schneider Associates评出了2009至2010年最令人难忘的十大产品,其中苹果iPad、微软Windows 7、摩托罗拉Droid智能手机和三星3D电视上榜,高科技产品在该榜单中占据重要地位。<BR>  <BR><STRONG>第一名:苹果iPad</STRONG><!--威锋网www.weiphone.com版权所有61.174.61.178 --></p>

<P>  今年,iPad的销量已经超过了MacBook笔记本电脑。分析师预计,第四季度这款设备的销量可达630万部。<BR>  <BR><STRONG>第二名:微软Windows 7操作系统<BR></STRONG>  <BR>  微软最新操作系统——Windows 7支持触摸屏,不同设备间的分享功能更强,文件和程序的访问速度更快。在开发过程中,微软大量听取了消费者的意见,而消费者也非常喜欢这款产品。<BR>  <BR><STRONG>第三名:玛氏糖果公司的Pretzel M&amp;Ms<BR></STRONG>  <BR>  这家糖果巨头满足了节俭消费者对廉价甜食的需求。实际上,它只不过是一颗夹心巧克力糖,这表明有时候好的想法其实很简单。<BR>  <BR><STRONG>第四名:摩托罗拉Droid手机<BR></STRONG>  <BR>  摩托罗拉Droid手机的宣传口号是“多任务运行”、“500万像素摄像头”和“黑暗环境中拍照”等,皆为<a href=http://iphone.weiphone.com>iPhone</a>的软肋。而这种广告营销策略在对抗<a href=http://iphone.weiphone.com>iPhone</a>这一强大对手时十分有效。<BR>  <BR><STRONG>第五名:麦当劳水果冰沙McCafé Real Fruit Smoothies<BR></STRONG>  <BR>  麦当劳效仿Jamba Juice推出水果冰沙,价格却要便宜得多,受到了消费者的欢迎。<BR>  <BR><STRONG>第六名:苹果iPod Nano<BR></STRONG>  <BR>  Nano 6的体积更小,并具备了多点触摸功能。<BR>  <BR><STRONG>第七名:星巴克Via速溶咖啡<BR></STRONG>  <BR>  当星巴克发布Via时,外界批评声不断。但仅仅10个月,这种饮品的销售额已超过1亿美元。<BR>  <BR><STRONG>第八名:三星3D电视<BR></STRONG>  <BR>  《阿凡达》激发了3D热潮,三星迅速作出反应,于今年早些时候发布了全球第一款3D液晶电视。<BR>  <BR><STRONG>第九名:哈吉斯牛仔裤尿布<BR></STRONG>  <BR>  这款限量版牛仔裤尿布的广告语是:“你所见过的最酷拉裤子方式”。<BR>  <BR><STRONG>第十名:Kleenex纸巾<BR></STRONG>  <BR>  这款纸巾由100%的原生纤维制成,可以回收利用。<!--威锋网www.weiphone.com版权所有61.174.61.178 --></p>                   <br />

         <span>  </span> 

Example #32
def initdb_command():
    """Creates the database tables."""
    utils.init_db()
    print('Initialized the database.')
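A function with this name and docstring is usually registered as a Flask CLI command; the wiring below is an assumption for illustration (the app object and decorator do not appear in the example itself):

import utils
from flask import Flask

app = Flask(__name__)

@app.cli.command('initdb')
def initdb_command():
    """Creates the database tables."""
    utils.init_db()
    print('Initialized the database.')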
Example #33
def init_db():
    """Creates the database tables."""
    utils.init_db()
    return 'Initialized the database.'
Example #34
 def __init__(self):
     self.sqlcon, self.sqlcursor = utils.init_db()
     self.history = None
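Unlike most of the snippets above, this utils.init_db evidently returns a (connection, cursor) pair rather than a single connection. A hypothetical sqlite3-flavoured version with that shape (the database path is a placeholder):

import sqlite3

def init_db(path='history.db'):
    conn = sqlite3.connect(path)
    return conn, conn.cursor()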
Example #35
from bottle import route, run, view, debug, static_file
import models, utils

@route('/')
@view('index')
def index():
    return dict(articles = models.getarts(), tags = models.gettags())

@route('/tags/:tag')
@view('index')
def tag(tag):
    return dict(articles = models.gettagarts(tag), tags = models.gettags(), curtag = tag)

@route('/static/:path#.+#')
def server_static(path):
    return static_file(path, root='static')

@route('/download/:filename')
def download(filename):
    return static_file(filename, root='download', download=filename)

debug(True)
utils.init_db('data.db')

run(server='flup', reloader=True) 
Example #36
def _fetchall():
    global last_line_result
    global err_level
    
    print(' == fetching line')
    try:
        evt_info,predict=_fetch_line()
    except Exception as e:
        if last_line_result:
            err_level+=3
            log('error','档线获取失败,使用上次结果')
            evt_info,predict=last_line_result
        else:
            raise
    else:
        last_line_result=evt_info,predict
    
    eventid=evt_info['id']
    score_parser=parse_score_meta(eventid)

    if not os.path.exists('db/%d.db'%eventid): # init db
        print(' -> new event: event #%d %s'%(eventid,evt_info['title']))
        print(' -> creating database and writing event info...')
        init_db(eventid)
        with sqlite3.connect('events.db') as db:
            db.execute(
                'insert or replace into events (id, title, begin, end, last_update, score_parser) '
                'values (?,?,?,?,null,(select score_parser from events where id=?))',
                [eventid,evt_info['title'],int(evt_info['begin'].timestamp()),int(evt_info['end'].timestamp()),eventid]
            )
    if datetime.datetime.now()-datetime.timedelta(minutes=3)>evt_info['end']:
        log('debug','活动 #%d 结束,爬虫停止抓取'%eventid)
        push('[SYSTEM]\n活动 #%d 结束\n#2300 : %d pt\n#11500 : %d pt\n#23000 : %d pt'%(
            eventid,predict['2300']['current'],predict['11500']['current'],predict['23000']['current']))
        raise SystemExit('活动结束')

    with sqlite3.connect('db/%d.db'%eventid) as db:
        db.execute('insert into line (time, t1pre, t1cur, t2pre, t2cur, t3pre, t3cur) values (?,?,?,?,?,?,?)', [
            int(datetime.datetime.now().timestamp()),
            predict['2300']['predict'], predict['2300']['current'],
            predict['11500']['predict'], predict['11500']['current'],
            predict['23000']['predict'], predict['23000']['current'],
        ])
        for ind,uid,name in follows:
            print(' == fetching score of #%d %s at place %d'%(uid,name,ind))
            details=_fetch_user_rank(ind,uid,eventid)

            if last_user_score[ind] is not None:
                last_lv, last_score, last_rank=last_user_score[ind]

                if details['score']!=last_score:
                    score_delta=details['score']-last_score
                    log('info','关注者 %s 分数变更:%d pt + %d pt → %d pt%s'%\
                        (name,last_score,score_delta,details['score'],score_parser(score_delta,' (%s)')))
                    if last_score and details['score']>0:
                        push('%s\n%s获得了 %d pt\n→ %d pt (#%d)'%\
                            (name,score_parser(score_delta,'进行了 %s\n'),score_delta,details['score'],details['rank']))

                if details['level']!=last_lv:
                    log('info','关注者 %s 等级变更:lv %d → lv %d'%(name,last_lv,details['level']))
                    if last_user_score[ind][0]>0 and details['level']>0:
                        push('%s\n升级到了 lv. %d'%(name,details['level']))

                if line_num(details['rank'])!=line_num(last_rank):
                    better_line=min(line_num(last_rank),line_num(details['rank']))
                    log('info','关注者 %s 档位变更:L%d → L%d (#%d)'%\
                        (name,line_num(last_rank),line_num(details['rank']),details['rank']))
                    if line_num(last_rank)>0 and line_num(details['rank'])>0:
                        push('%s\n%s了 %d 档\n当前排名:#%d'%\
                             (name,'离开' if better_line==line_num(last_rank) else '进入',better_line,details['rank']))

            last_user_score[ind]=(details['level'],details['score'],details['rank'])

            db.execute(
                'insert into follow%d (time,level,score,rank) values (?,?,?,?)'%ind,
                [int(datetime.datetime.now().timestamp()), details['level'], details['score'], details['rank']]
            )

    return eventid
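_fetchall expects init_db(eventid) to have created db/<eventid>.db with a line table and one follow<N> table per watched player. A hedged reconstruction of that schema from the INSERT statements above (the set of follow slots is an assumption; the real init_db may differ):

import sqlite3

def init_db(eventid, follow_slots=range(3)):
    # Reconstructed from the inserts in _fetchall, not the original helper.
    with sqlite3.connect('db/%d.db' % eventid) as db:
        db.execute('create table if not exists line ('
                   'time integer, t1pre integer, t1cur integer, '
                   't2pre integer, t2cur integer, t3pre integer, t3cur integer)')
        for ind in follow_slots:
            db.execute('create table if not exists follow%d ('
                       'time integer, level integer, score integer, rank integer)' % ind)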
Example #37
 def load_db(self):
     self.sqlcon, self.sqlcursor = utils.init_db()