예제 #1
0
def insert_extract_info(
        vol,
        image_url, image_name, image_author, image_cita,
        article_title, article_author, article_content,
        qa_q_title, qa_q_content, qa_a_title, qa_a_content):
    """Insert one row of extracted journal data into TJournalExtractInfo.

    The arguments map one-to-one, in order, onto the columns named in the
    INSERT statement. Before executing, the statement and ``vol`` are
    logged with all whitespace runs collapsed to single spaces.

    Returns:
        True when both the execute and the commit complete; False when any
        ``Exception`` is raised along the way (the exception is logged via
        ``logger.error``). The connection obtained from
        ``get_db_connect()`` is closed in either case.
    """
    log = get_logger()
    statement = '''
    insert into TJournalExtractInfo(
        vol,
        image_url, image_name, image_author, image_cita,
        article_title, article_author, article_content,
        qa_q_title, qa_q_content, qa_a_title, qa_a_content
    ) 
    values(?,?,?,?,?,?,?,?,?,?,?,?)'''
    connection = get_db_connect()
    try:
        # Collapse the multi-line SQL so the log entry fits on one line.
        message = 'sql: {} \t vol:{}'.format(statement, vol)
        log.info(' '.join(message.split()))
        row = (
            vol,
            image_url, image_name, image_author, image_cita,
            article_title, article_author, article_content,
            qa_q_title, qa_q_content, qa_a_title, qa_a_content,
        )
        connection.execute(statement, row)
        connection.commit()
    except Exception as err:
        log.error(err)
        return False
    else:
        return True
    finally:
        # Runs on both the success and the failure path.
        connection.close()
예제 #2
0
def get_one_journal(num):
    """Fetch the page for journal volume ``num``.

    The URL is ``BASE_URL`` followed by ``'vol.'`` and ``str(num)``; it is
    logged, wrapped in a ``urllib.request.Request``, passed through
    ``fill_request`` and then handed to ``get_html``.

    Returns:
        Whatever ``get_html`` returns for the prepared request.
    """
    log = get_logger()
    suffix = 'vol.' + str(num)
    target = BASE_URL + suffix
    log.info('url:%s', target)
    prepared = fill_request(urllib.request.Request(target))
    return get_html(prepared)
예제 #3
0
def get_bytes(req):
    """Open ``req`` and return the raw response body.

    Args:
        req: A URL string or ``urllib.request.Request`` accepted by
            ``urllib.request.urlopen``.

    Returns:
        The response body as ``bytes``, or ``None`` if opening or reading
        raised any ``Exception`` (the exception is logged via
        ``logger.error``).
    """
    logger = get_logger()
    try:
        # The context manager closes the response (and its underlying
        # connection) even when read() fails; previously it was never
        # closed explicitly.
        with urllib.request.urlopen(req, timeout=15) as response:
            return response.read()
    except Exception as ex:
        logger.error(ex)
        return None
예제 #4
0
def download_image(url, path):
    """Download ``url`` and write the body to the file at ``path``.

    Args:
        url: Address of the image to fetch.
        path: Destination file path; opened in binary write mode, so an
            existing file is overwritten.

    Returns:
        True once the content has been written; False when ``get_bytes``
        returned ``None`` (nothing is written in that case).

    Raises:
        OSError: If the destination file cannot be opened or written.
    """
    logger = get_logger()
    logger.info('get image url:%s', url)
    request = urllib.request.Request(url)
    # Named `content` rather than `bytes` to avoid shadowing the builtin.
    content = get_bytes(request)
    if content is None:
        logger.error('get image error')
        return False
    # `with` guarantees the handle is closed even if write() raises.
    with open(path, mode='bw') as image_file:
        image_file.write(content)
    return True
예제 #5
0
def insert_journal_src(vol, html):
    """Insert a ``(vol, html)`` row into TJournalSrc.

    The statement and ``vol`` are logged before execution.

    Returns:
        True when the execute and commit both complete; False when any
        ``Exception`` is raised (it is logged via ``logger.error``). The
        connection obtained from ``get_db_connect()`` is closed in either
        case.
    """
    log = get_logger()
    statement = 'insert into TJournalSrc(vol, html) values(?,?)'
    connection = get_db_connect()
    try:
        log.info('sql: %s\t vol:%s', statement, vol)
        connection.execute(statement, (vol, html))
        connection.commit()
    except Exception as err:
        log.error(err)
        return False
    else:
        return True
    finally:
        # Runs on both the success and the failure path.
        connection.close()
예제 #6
0
    logger = get_logger()
    url = BASE_URL + 'vol.' + str(num)
    logger.info('url:%s', url)
    request = urllib.request.Request(url)
    request = fill_request(request)
    html = get_html(request)
    return html


def fill_request(request):
    """Add every entry of ``HEADERS`` to ``request`` as a header.

    The request is modified in place via ``add_header`` and the same
    object is returned.
    """
    for header_name, header_value in HEADERS.items():
        request.add_header(header_name, header_value)
    return request


if __name__ == '__main__':
    # Script entry point: fetch every volume from 1 up to (but excluding)
    # `vol` and store each page that was retrieved.
    logger = get_logger()
    start_day = datetime.date(2012, 10, 8)
    today = datetime.date.today()
    # Days elapsed since start_day plus two; used as the exclusive upper
    # bound of the range below.
    vol = (today - start_day + datetime.timedelta(days=2)).days
    logger.info('range from %d to %d', 1, vol)
    for i in range(1, vol):
        logger.info('begin get vol %d', i)
        html = get_one_journal(i)
        if html is not None:
            insert_journal_src(i, html)
        else:
            logger.error('get html error')
        # Pause between requests regardless of the outcome.
        time.sleep(5)