def insert_extract_info(vol, image_url, image_name, image_author, image_cita,
                        article_title, article_author, article_content,
                        qa_q_title, qa_q_content, qa_a_title, qa_a_content):
    """Insert one extracted journal issue into TJournalExtractInfo.

    The parameters mirror the table columns one-to-one.

    Returns:
        True on success, False if the insert failed (the error is logged).
    """
    logger = get_logger()
    sql = ('insert into TJournalExtractInfo('
           'vol, image_url, image_name, image_author, image_cita, '
           'article_title, article_author, article_content, '
           'qa_q_title, qa_q_content, qa_a_title, qa_a_content) '
           'values(?,?,?,?,?,?,?,?,?,?,?,?)')
    conn = get_db_connect()
    res = True
    try:
        # Lazy %-style logging args, consistent with insert_journal_src,
        # instead of eagerly building the message with str.format and then
        # re-normalizing its whitespace via ' '.join(msg.split()).
        logger.info('sql: %s\t vol:%s', sql, vol)
        params = (vol, image_url, image_name, image_author, image_cita,
                  article_title, article_author, article_content,
                  qa_q_title, qa_q_content, qa_a_title, qa_a_content)
        conn.execute(sql, params)
        conn.commit()
    except Exception as ex:
        logger.error(ex)
        res = False
    finally:
        # Always release the connection, even on failure.
        conn.close()
    return res
def get_one_journal(num):
    """Fetch the raw HTML of journal issue *num*.

    Builds the issue URL from BASE_URL, attaches the shared request
    headers via fill_request, and returns whatever get_html yields
    (presumably the page body, or None on failure -- see get_html).
    """
    logger = get_logger()
    url = '{}vol.{}'.format(BASE_URL, num)
    logger.info('url:%s', url)
    req = fill_request(urllib.request.Request(url))
    return get_html(req)
def get_bytes(req):
    """Read the full response body for *req*.

    Args:
        req: a urllib.request.Request (or URL string) to fetch.

    Returns:
        The raw response bytes, or None if the request failed or
        timed out (the error is logged).
    """
    logger = get_logger()
    try:
        # 'with' guarantees the HTTP response is closed even if read()
        # raises -- the original leaked the response object.
        with urllib.request.urlopen(req, timeout=15) as response:
            return response.read()
    except Exception as ex:
        logger.error(ex)
        return None
def download_image(url, path):
    """Download the image at *url* and write it to *path*.

    Returns:
        True on success, False if the image bytes could not be fetched.
    """
    logger = get_logger()
    logger.info('get image url:%s', url)
    request = urllib.request.Request(url)
    # Renamed from 'bytes': the original shadowed the builtin type.
    data = get_bytes(request)
    if data is None:
        logger.error('get image error')
        return False
    # 'with' guarantees the file handle is closed even if write() raises;
    # the original leaked it on a write failure.
    with open(path, mode='bw') as f:
        f.write(data)
    return True
def insert_journal_src(vol, html):
    """Store the raw HTML of issue *vol* in TJournalSrc.

    Returns:
        True on success, False if the insert failed (the error is logged).
    """
    logger = get_logger()
    sql = 'insert into TJournalSrc(vol, html) values(?,?)'
    conn = get_db_connect()
    try:
        logger.info('sql: %s\t vol:%s', sql, vol)
        conn.execute(sql, (vol, html))
        conn.commit()
    except Exception as ex:
        logger.error(ex)
        return False
    else:
        return True
    finally:
        # Runs before either return above, so the connection is always closed.
        conn.close()
# NOTE(review): this line of the source began with an orphaned duplicate of
# get_one_journal's body (ending in a bare top-level `return html`, which is
# a SyntaxError at module scope). It was removed as a copy-paste artifact;
# the real get_one_journal is defined above.


def fill_request(request):
    """Attach every header in the module-level HEADERS dict to *request*.

    Returns the same request object for call chaining.
    """
    for k in HEADERS:
        request.add_header(k, HEADERS[k])
    return request


if __name__ == '__main__':
    logger = get_logger()
    # Issues appear to be numbered daily starting 2012-10-08; the +2-day
    # offset presumably aligns the day count with the newest volume number
    # -- TODO confirm against the site's numbering.
    start_day = datetime.date(2012, 10, 8)
    today = datetime.date.today()
    vol = today - start_day + datetime.timedelta(days=2)
    vol = vol.days
    logger.info('range from %d to %d', 1, vol)
    for i in range(1, vol):
        logger.info('begin get vol %d', i)
        html = get_one_journal(i)
        if html is None:
            logger.error('get html error')
        else:
            insert_journal_src(i, html)
        # Throttle requests to be polite to the server.
        time.sleep(5)