示例#1
0
def crawler_and_save(serial_number, save=False):
    """Fetch one football-news page, parse it and store the result in MySQL.

    The page URL is ``football_news_url.format(serial_number)``. The outcome
    (written / page missing / database failure) is reported with ``print``.

    Args:
        serial_number: Value substituted into ``football_news_url`` (and into
            ``local_file_path`` when *save* is true) to identify the page.
        save: When true, the raw text of a successfully fetched page is also
            written to ``local_file_path.format(serial_number)`` as UTF-8.

    Returns:
        None.

    Raises:
        Any exception raised while performing the HTTP request or while
        writing the local file propagates to the caller. Parse failures and
        database failures are reported via ``print`` and not raised.
    """
    response = _fetch_page(serial_number)

    # Fail: a non-200 answer is not a news page, so do not save or parse it.
    if response.status_code != 200:
        print('serial_number={},Error: 当前页面不存在!'.format(serial_number))
        return

    # Save the raw page text locally when requested.
    if save:
        with open(local_file_path.format(serial_number), 'w', encoding='UTF-8') as file:
            file.write(response.text)

    # Analyse
    news = _parse_news(response, serial_number)
    if news is None:
        print('serial_number={},Error: 当前页面不存在!'.format(serial_number))
        return

    # Store
    if _persist_news(news):
        print('serial_number={},写入完毕!'.format(serial_number))
    else:
        print('serial_number={},Error: 向MySQL写入数据失败!'.format(serial_number))


def _fetch_page(serial_number):
    """Request the news page for *serial_number* while holding ``lock``.

    The lock and the HTTP session are released in ``finally`` blocks so that
    a failing request cannot leave the lock held or the session open.
    """
    lock.acquire()  # TODO (kept from the original): revisit whether the lock is needed
    try:
        http_session = HTMLSession()
        try:
            return http_session.get(url=football_news_url.format(serial_number))
        finally:
            http_session.close()
    finally:
        lock.release()


def _parse_news(response, serial_number):
    """Build a ``FootballNews`` from the page, or return None if parsing fails."""
    news = FootballNews(id=str(uuid.uuid4()).upper(), serial_number=serial_number, news_type=0)
    try:
        area = response.html.find('.new-area')[0]
        spans = area.find('span')
        news.news_type = news_type_dictionary.get(spans[0].text.strip())
        try:
            # The first six characters of the second span are skipped before
            # the remainder is parsed with ``datetime_format``.
            news.create_time = datetime.datetime.strptime(spans[1].text[6:].strip(), datetime_format)
        except (IndexError, ValueError):
            # Missing or unparsable timestamp: fall back to the current time.
            news.create_time = datetime.datetime.now()
        news.title = area.find('.new-title')[0].text.strip()
        news.content = ''.join(
            p.text.strip() for p in area.find('.new-content')[0].find('p')
        )
        news.tags = ','.join(
            a.text.strip() for a in area.find('.new-tags')[0].find('a')
        )
    except Exception:
        # Deliberate best effort: any page that lacks the expected structure
        # is reported by the caller as a non-existent page.
        return None
    return news


def _persist_news(news):
    """Insert *news* into MySQL; return True on success and False on failure."""
    try:
        # Initialise the database connection.
        # NOTE(review): the connection URL is hard-coded here; consider moving
        # it to configuration.
        engine = create_engine('mysql://*****:*****@47.94.84.81:3306/test_mysql')
        try:
            # Create the session factory bound to the engine, then a session.
            db_session = sessionmaker(bind=engine)()
            try:
                db_session.add(news)
                db_session.commit()
            finally:
                # Always close the session, including when the commit fails.
                db_session.close()
        finally:
            # A fresh engine is created per call, so release its pool as well.
            engine.dispose()
    except Exception:
        return False
    return True