Example #1
import re

import pymysql
import requests
from lxml import etree


def the_paper(url, params, headers):
    conn = pymysql.connect(
        host='0.0.0.0',
        port=3306,
        user='******',
        password='******',
        db='data_news',
        charset='utf8'
    )
    cur = conn.cursor()

    insert_sql = 'insert into `the_paper`(`id`, `weburl`, `title`, `contents`, `keyword`) VALUES (%s, %s, %s, %s, %s);'
    select_sql = 'select `weburl` from `the_paper`;'

    cur.execute(select_sql)
    result = cur.fetchall()
    inserted_url_list = [r[0] for r in result]

    s = requests.session()
    r = s.get(url=url, headers=headers, params=params)
    content = r.content.decode('utf-8', errors='ignore')
    html = etree.HTML(content)

    title_list = html.xpath('//div[@class="news_li"]//h2//a//text()')
    href_list = html.xpath('//div[@class="news_li"]//h2//a//@href')
    description_list = html.xpath('//div[@class="news_li"]//p//text()')

    for title, href, description in zip(title_list, href_list, description_list):
        href = 'http://www.thepaper.cn/' + href
        if href in inserted_url_list:
            print('Already inserted!')
        else:
            try:
                q = s.get(href, headers=headers)
                text_content = q.content.decode('utf-8', errors='ignore')
                text_html = etree.HTML(text_content)
                main_content = re.findall(r'<div class="news_txt" data-size="standard">.*?<script>',
                                          text_content,
                                          re.M | re.S)
                # Note: str.rstrip('<script>') strips a character *set*, not the
                # literal suffix, so remove the trailing <script> tag explicitly.
                main = ''.join(main_content)
                if main.endswith('<script>'):
                    main = main[:-len('<script>')]
                main = re.sub(r'\n', '', main)
                if main:
                    try:
                        raw_keywords = str(text_html.xpath('//div[@class="news_keyword"]//text()')[0])
                        # lstrip('关键词 >> ') would strip a character set; strip the prefix instead.
                        raw_keywords = re.sub(r'^关键词\s*>>\s*', '', raw_keywords)
                        keywords = ','.join(raw_keywords.split(' '))
                    except Exception:
                        keywords = ''
                    random_id = create_id()
                    sql_params = (random_id, href, title, main, keywords)
                    try:
                        cur.execute(insert_sql, sql_params)
                        conn.commit()
                        print('Insert succeeded')
                    except Exception as e:
                        print(e)
                        conn.rollback()
            except Exception:
                pass
    conn.close()
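
A hypothetical call to the function above; the listing URL, the empty params, and the User-Agent are illustrative assumptions rather than values from the original project:

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
}
the_paper('https://www.thepaper.cn/', params={}, headers=headers)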
Example #2
import re

import pymysql
import requests
from lxml import etree


def netease_data_news():
    conn = pymysql.connect(host='0.0.0.0',
                           port=3306,
                           user='******',
                           password='******',
                           db='data_news',
                           charset='utf8')

    cur = conn.cursor()

    insert_sql = 'insert into `netease_data_news`(`id`, `title`, `weburl`, `createtime`, `contents`, `keyword`, `comment`) VALUES (%s, %s, %s, %s, %s, %s, %s);'

    select_sql = 'select `weburl` from `netease_data_news`;'

    cur.execute(select_sql)
    result = cur.fetchall()

    url_list = [r[0] for r in result]

    headers = {
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
    }
    s = requests.session()
    r = s.get('http://data.163.com/special/datablog/', headers=headers)
    content = r.content.decode('gbk', errors='ignore')
    html = etree.HTML(content)
    script = html.xpath('//script[contains(text(), "keyword")]//text()')[0]
    news_list = re.findall(
        r'{\s.*"url.*\s.*title.*\s.*img.*\s.*time.*\s.*comment.*\s.*"keyword":.*',
        script, re.M)[:10]

    for new in news_list:
        # Each match is a JS object literal; normalise the trailing "}," before evaluating it.
        new = eval(new.rstrip(',').rstrip('}') + '}')
        url = new['url']
        if url in url_list:
            print('This record is already in the database')
        else:
            r = s.get(url, headers=headers)
            content = r.content.decode('gbk', errors='ignore')
            main = re.findall(r'<p.*?/p>', content, re.M | re.S)
            if main:
                # Drop the boilerplate paragraphs around the article body and join the rest.
                article = ''.join(main[2:-5])
                random_id = create_id()
                params = (random_id, new['title'], url, new['time'], article,
                          new['keyword'], new['comment'])
                try:
                    cur.execute(insert_sql, params)
                    conn.commit()
                    print('Insert succeeded')
                except Exception as e:
                    print(e)
                    conn.rollback()
    conn.close()
Example #3
import json
import re

import pymysql
import requests


def qc_spider(url, params, header):
    conn = pymysql.connect(host='0.0.0.0',
                           port=3306,
                           user='******',
                           password='******',
                           db='data_news',
                           charset='utf8')
    cur = conn.cursor()

    insert_sql = 'insert into `qc_news`(`id`, `weburl`, `title`, `contents`, `createtime`) VALUES (%s, %s, %s, %s, %s);'

    select_sql = 'select `weburl` from `qc_news`;'
    cur.execute(select_sql)
    results = cur.fetchall()
    inserted_url_list = [r[0] for r in results]

    s = requests.session()
    content = s.get(url=url, headers=header, params=params).content.decode(
        'utf-8', errors='ignore').lstrip('(').rstrip(')')
    # json.loads() no longer accepts an ``encoding`` argument; ``content`` is already str.
    json_content = json.loads(content)
    article_list = json_content['data']['list']
    for article in article_list:
        title = article['Title']
        pub_time = article['PubTime']
        link_url = article['LinkUrl']
        if link_url in inserted_url_list:
            print('Already in the database!')
        else:
            article_content = s.get(url=link_url,
                                    headers=header,
                                    params=params).content.decode(
                                        'utf-8', errors='ignore')
            main_content = ''.join(
                re.findall(r'<div class="article">.*?</div>', article_content,
                           re.S | re.M))
            if main_content:
                random_id = create_id()
                sql_params = (random_id, link_url, title, main_content,
                              pub_time)
                try:
                    cur.execute(insert_sql, sql_params)
                    conn.commit()
                    print('Insert succeeded')
                except Exception as e:
                    print(e)
                    conn.rollback()
    conn.close()
Example #4
import json
import random

# SlipModel and create_id() are provided elsewhere in the project.


def create(event, context):
    slip_data = json.loads(event['body'])

    new_slip = SlipModel()
    new_slip.id = create_id()
    if 'number' in slip_data:
        new_slip.number = slip_data['number']
    else:  # If no number provided just generate one
        new_slip.number = random.randint(1, 100)

    # write the slip to the database
    new_slip.save()

    # create a response
    return {'statusCode': 201, 'body': json.dumps(dict(new_slip))}
Example #5
import json
import logging

# BoatModel and create_id() are provided elsewhere in the project.


def create(event, context):
    boat_data = json.loads(event['body'])
    if 'name' not in boat_data:
        logging.error('Validation Failed')
        return {
            'statusCode': 422,
            'body':
            json.dumps({'error_message': 'Couldn\'t create the new boat.'})
        }

    if not boat_data['name']:
        logging.error('Validation Failed - boat name was empty. %s', boat_data)
        return {
            'statusCode': 422,
            'body': json.dumps(
                {'error_message': 'Couldn\'t create the boat. Name was empty'})
        }

    new_boat = BoatModel()
    new_boat.id = create_id()
    if 'name' in boat_data:
        new_boat.name = boat_data['name']
    if 'type' in boat_data:
        new_boat.type = boat_data['type']
    if 'length' in boat_data:
        new_boat.length = boat_data['length']
    if 'at_sea' in boat_data:
        new_boat.at_sea = boat_data['at_sea']

    # write the new boat to the database
    new_boat.save()

    # create a response
    return {'statusCode': 201, 'body': json.dumps(dict(new_boat))}
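
Both handlers above use the AWS-Lambda-style (event, context) signature and read their payload from event['body']. A minimal local-invocation sketch for the boat handler; the event shape and field values are illustrative assumptions:

import json

sample_event = {
    'body': json.dumps({'name': 'Sea Witch', 'type': 'sloop', 'length': 28, 'at_sea': False})
}
response = create(sample_event, context=None)
print(response['statusCode'])  # expect 201 when BoatModel.save() succeeds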
Example #6
                try:
                    # Assumed reconstruction of the truncated opening of this excerpt:
                    # the description is read via a hypothetical XPath helper on ``spider``.
                    new_des = spider.find_by_xpath(
                        new_html, '//div[@class="qrcode-box"]/p/a//text()')[0]
                except Exception:
                    new_des = ''

                # Strip newlines and spaces from the scraped timestamp.
                new_time = spider.sub_by_re(
                    r'\n', '',
                    new_html.xpath('//p[@class="news-time"]//text()')[0])
                new_time = spider.sub_by_re(r' ', '', new_time)

                main_content = spider.get_content(
                    url=new_url,
                    headers={
                        'User-Agent':
                        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
                    },
                    params={},
                    proxies={},
                    cookies={})
                article_content = ''.join(
                    spider.find_by_re(
                        r'<div class="article_content">.*?</div>',
                        main_content, re.S | re.M))
                random_id = create_id()
                insert_sql = 'insert into `nbd_data_news`(`id`, `weburl`, `title`, `createtime`, `contents`, `description`) values (%s, %s, %s, %s, %s, %s);'
                params = (random_id, new_url, new_title, new_time,
                          article_content, new_des)
                spider.save_into_table(insert_sql, params)
                time.sleep(2)
        print('Waiting for updates....')
        time.sleep(86400)
Example #7
import json
import re
from urllib.parse import unquote

import pymysql
import requests


def sohu_news():
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
    }

    conn = pymysql.connect(host='0.0.0.0',
                           port=3306,
                           user='******',
                           password='******',
                           db='data_news',
                           charset='utf8')

    cur = conn.cursor()

    insert_sql = 'insert into `sohu_news`(`id`, `weburl`, `title`, `contents`) VALUES (%s, %s, %s, %s);'

    select_sql = 'select `weburl` from `sohu_news`;'
    cur.execute(select_sql)
    result = cur.fetchall()

    exist_url_list = [r[0] for r in result]

    s = requests.session()
    url = 'http://mp.sohu.com/apiV2/profile/newsListAjax'
    # for i in range(1, 29):
    params = {
        'xpt': 'NzJCMERBNUNDN0NEODJBOTkwMTZFMkM2NkU3REM3QjBAcXEuc29odS5jb20=',
        'pageNumber': 1,
        'pageSize': 10,
    }
    # requests' ``.json()`` already returns a parsed dict; persist it with json.dump()
    # instead of writing the dict object directly to the file.
    data = s.get(url=url, headers=headers, params=params).json()
    with open('data.json', 'w') as f:
        json.dump(data, f, ensure_ascii=False)
    for article in data['data']:
        article_url = 'http:' + article['url']
        if article_url in exist_url_list:
            print('Already in the database!')
        else:
            article_title = unquote(article['title'])
            r = s.get(url=article_url, headers=headers)
            article_content = r.content.decode('utf-8', errors='ignore')
            main_content = re.findall(
                r'<article class="article" id="mp-editor">.*?</article>',
                article_content, re.M | re.S)
            main = ''.join(main_content)
            random_id = create_id()
            sql_params = (random_id, article_url, article_title, main)
            print(main)
            try:
                cur.execute(insert_sql, sql_params)
                conn.commit()
                print("插入成功")
            except Exception as e:
                print(e)
                conn.rollback()
    conn.close()
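
Every example above calls create_id() to build the primary key for the row (or item) it stores; the helper itself is defined elsewhere in each project. A minimal sketch of what such a helper could look like, assuming a random unique string ID is acceptable:

import uuid


def create_id():
    # 32-character hex string, unique with overwhelming probability.
    return uuid.uuid4().hex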