import json
import logging
import random
import re
import time
from urllib.parse import unquote

import pymysql
import requests
from lxml import etree


def the_paper(url, params, headers):
    conn = pymysql.connect(host='0.0.0.0', port=3306, user='******',
                           password='******', db='data_news', charset='utf8')
    cur = conn.cursor()
    insert_sql = ('insert into `the_paper`(`id`, `weburl`, `title`, `contents`, `keyword`) '
                  'VALUES (%s, %s, %s, %s, %s);')
    select_sql = 'select `weburl` from `the_paper`;'
    # Fetch the URLs already stored so the same article is not inserted twice.
    cur.execute(select_sql)
    result = cur.fetchall()
    inserted_url_list = [r[0] for r in result]
    s = requests.session()
    r = s.get(url=url, headers=headers, params=params)
    content = r.content.decode('utf-8', errors='ignore')
    html = etree.HTML(content)
    title_list = html.xpath('//div[@class="news_li"]//h2//a//text()')
    href_list = html.xpath('//div[@class="news_li"]//h2//a//@href')
    description_list = html.xpath('//div[@class="news_li"]//p//text()')
    for title, href, description in zip(title_list, href_list, description_list):
        href = 'http://www.thepaper.cn/' + href
        if href in inserted_url_list:
            print('Already inserted, skipping.')
        else:
            try:
                q = s.get(href, headers=headers)
                text_content = q.content.decode('utf-8', errors='ignore')
                text_html = etree.HTML(text_content)
                # Grab the article body up to (but not including) the trailing <script> tag.
                main_content = re.findall(
                    r'<div class="news_txt" data-size="standard">.*?(?=<script>)',
                    text_content, re.M | re.S)
                main = re.sub(r'\n', '', ''.join(main_content))
                if main == '':
                    pass
                else:
                    try:
                        keywords = ','.join(
                            str(text_html.xpath('//div[@class="news_keyword"]//text()')[0])
                            .lstrip('关键词 >> ').split(' '))
                    except Exception:
                        keywords = ''
                    random_id = create_id()
                    sql_params = (random_id, href, title, main, keywords)
                    try:
                        cur.execute(insert_sql, sql_params)
                        conn.commit()
                        print('Inserted successfully.')
                    except Exception as e:
                        print(e)
                        conn.rollback()
            except Exception:
                pass
    conn.close()
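# Every function below calls create_id() for primary keys, but the helper is not defined
# in this section. A minimal sketch of what it could look like, assuming a UUID-based
# string id is acceptable (this is an assumption, not the project's original helper):
def create_id():
    """Return a random, collision-resistant string id for a new row."""
    import uuid
    return uuid.uuid4().hex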
def netease_data_news():
    conn = pymysql.connect(host='0.0.0.0', port=3306, user='******',
                           password='******', db='data_news', charset='utf8')
    cur = conn.cursor()
    insert_sql = ('insert into `netease_data_news`(`id`, `title`, `weburl`, `createtime`, `contents`, `keyword`, `comment`) '
                  'VALUES (%s, %s, %s, %s, %s, %s, %s);')
    select_sql = 'select `weburl` from `netease_data_news`;'
    cur.execute(select_sql)
    result = cur.fetchall()
    url_list = [r[0] for r in result]
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
    }
    s = requests.session()
    r = s.get('http://data.163.com/special/datablog/', headers=headers)
    content = r.content.decode('gbk', errors='ignore')
    html = etree.HTML(content)
    # The article metadata lives in an inline <script> block; take the ten newest entries.
    script = html.xpath('//script[contains(text(), "keyword")]//text()')[0]
    news_list = re.findall(
        r'{\s.*"url.*\s.*title.*\s.*img.*\s.*time.*\s.*comment.*\s.*"keyword":.*',
        script, re.M)[:10]
    for new in news_list:
        # Each match is almost a dict literal; close the brace and evaluate it.
        new = eval(new.rstrip(',').rstrip('}') + '}')
        url = new['url']
        if url in url_list:
            print('Already in the database, skipping.')
        else:
            r = s.get(url, headers=headers)
            content = r.content.decode('gbk', errors='ignore')
            main = re.findall(r'<p.*?/p>', content, re.M | re.S)
            if len(main) == 0:
                pass
            else:
                # Drop the boilerplate paragraphs at the top and bottom of the page.
                article = ''.join(main[2:-5])
                random_id = create_id()
                params = (random_id, new['title'], url, new['time'], article,
                          new['keyword'], new['comment'])
                try:
                    cur.execute(insert_sql, params)
                    conn.commit()
                    print('Inserted successfully.')
                except Exception as e:
                    print(e)
                    conn.rollback()
    conn.close()
def qc_spider(url, params, header):
    conn = pymysql.connect(host='0.0.0.0', port=3306, user='******',
                           password='******', db='data_news', charset='utf8')
    cur = conn.cursor()
    insert_sql = ('insert into `qc_news`(`id`, `weburl`, `title`, `contents`, `createtime`) '
                  'VALUES (%s, %s, %s, %s, %s);')
    select_sql = 'select `weburl` from `qc_news`;'
    cur.execute(select_sql)
    results = cur.fetchall()
    inserted_url_list = [r[0] for r in results]
    s = requests.session()
    # The endpoint returns JSONP; strip the wrapping parentheses before parsing.
    content = s.get(url=url, headers=header, params=params).content.decode(
        'utf-8', errors='ignore').lstrip('(').rstrip(')')
    json_content = json.loads(content)
    article_list = json_content['data']['list']
    for article in article_list:
        title = article['Title']
        pub_time = article['PubTime']
        link_url = article['LinkUrl']
        if link_url in inserted_url_list:
            print('Already in the database, skipping.')
        else:
            article_content = s.get(url=link_url, headers=header, params=params).content.decode(
                'utf-8', errors='ignore')
            main_content = ''.join(
                re.findall(r'<div class="article">.*?</div>', article_content, re.S | re.M))
            if main_content == '':
                pass
            else:
                random_id = create_id()
                sql_params = (random_id, link_url, title, main_content, pub_time)
                try:
                    cur.execute(insert_sql, sql_params)
                    conn.commit()
                    print('Inserted successfully.')
                except Exception as e:
                    print(e)
                    conn.rollback()
    conn.close()
def create(event, context):
    slip_data = json.loads(event['body'])
    new_slip = SlipModel()
    new_slip.id = create_id()
    if 'number' in slip_data:
        new_slip.number = slip_data['number']
    else:
        # If no number is provided, just generate one.
        new_slip.number = random.randint(1, 100)
    # Write the slip to the database.
    new_slip.save()
    # Create a response.
    return {'statusCode': 201, 'body': json.dumps(dict(new_slip))}
def create(event, context):
    boat_data = json.loads(event['body'])
    if 'name' not in boat_data:
        logging.error('Validation Failed')
        return {
            'statusCode': 422,
            'body': json.dumps({'error_message': 'Couldn\'t create the new boat.'})
        }
    if not boat_data['name']:
        logging.error('Validation Failed - boat name was empty. %s', boat_data)
        return {
            'statusCode': 422,
            'body': json.dumps({'error_message': 'Couldn\'t create the boat. Name was empty'})
        }
    new_boat = BoatModel()
    new_boat.id = create_id()
    if 'name' in boat_data:
        new_boat.name = boat_data['name']
    if 'type' in boat_data:
        new_boat.type = boat_data['type']
    if 'length' in boat_data:
        new_boat.length = boat_data['length']
    if 'at_sea' in boat_data:
        new_boat.at_sea = boat_data['at_sea']
    # Write the new boat to the database.
    new_boat.save()
    # Create a response.
    return {'statusCode': 201, 'body': json.dumps(dict(new_boat))}
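# A hedged local smoke test for the boat handler above, assuming BoatModel is defined
# elsewhere in this project and is already pointed at a writable table. The event mirrors
# the API Gateway proxy shape the handler expects ('body' is a JSON string).
if __name__ == '__main__':
    sample_event = {'body': json.dumps({'name': 'Santa Maria', 'type': 'caravel', 'length': 62})}
    response = create(sample_event, None)
    print(response['statusCode'])  # 201 on success, 422 if 'name' is missing or empty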
                new_html, '//div[@class="qrcode-box"]/p/a//text()')[0]
        except Exception:
            new_des = ''
        # Strip newlines and spaces from the publish time.
        new_time = spider.sub_by_re(r'\n', '', new_html.xpath('//p[@class="news-time"]//text()')[0])
        new_time = spider.sub_by_re(r' ', '', new_time)
        main_content = spider.get_content(
            url=new_url,
            headers={
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
            },
            params={}, proxies={}, cookies={})
        article_content = ''.join(spider.find_by_re(
            r'<div class="article_content">.*?</div>', main_content, re.S | re.M))
        random_id = create_id()
        insert_sql = ('insert into `nbd_data_news`(`id`, `weburl`, `title`, `createtime`, `contents`, `description`) '
                      'values (%s, %s, %s, %s, %s, %s);')
        params = (random_id, new_url, new_title, new_time, article_content, new_des)
        spider.save_into_table(insert_sql, params)
        time.sleep(2)
    print('Waiting for the next update...')
    time.sleep(86400)
def sohu_news():
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
    }
    conn = pymysql.connect(host='0.0.0.0', port=3306, user='******',
                           password='******', db='data_news', charset='utf8')
    cur = conn.cursor()
    insert_sql = 'insert into `sohu_news`(`id`, `weburl`, `title`, `contents`) VALUES (%s, %s, %s, %s);'
    select_sql = 'select `weburl` from `sohu_news`;'
    cur.execute(select_sql)
    result = cur.fetchall()
    exist_url_list = [r[0] for r in result]
    s = requests.session()
    url = 'http://mp.sohu.com/apiV2/profile/newsListAjax'
    # for i in range(1, 29):
    params = {
        'xpt': 'NzJCMERBNUNDN0NEODJBOTkwMTZFMkM2NkU3REM3QjBAcXEuc29odS5jb20=',
        'pageNumber': 1,
        'pageSize': 10,
    }
    content = s.get(url=url, headers=headers, params=params).json()
    # Cache the raw listing locally, then read it back.
    with open('data.json', 'w') as f:
        json.dump(content, f)
    with open('data.json', 'r') as f:
        data = json.load(f)
    for article in data['data']:
        article_url = 'http:' + article['url']
        if article_url in exist_url_list:
            print('Already in the database, skipping.')
        else:
            article_title = unquote(article['title'])
            r = s.get(url=article_url, headers=headers)
            article_content = r.content.decode('utf-8', errors='ignore')
            main = ''.join(re.findall(
                r'<article class="article" id="mp-editor">.*?</article>',
                article_content, re.M | re.S))
            random_id = create_id()
            sql_params = (random_id, article_url, article_title, main)
            print(main)
            try:
                cur.execute(insert_sql, sql_params)
                conn.commit()
                print('Inserted successfully.')
            except Exception as e:
                print(e)
                conn.rollback()
    conn.close()
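# A minimal sketch of how these crawlers might be scheduled together, mirroring the
# once-a-day sleep used in the nbd loop above. The url/params/headers arguments for
# the_paper and qc_spider are configured elsewhere and are not shown in this section,
# so those calls are left commented out as hypothetical placeholders.
def run_all_crawlers():
    while True:
        netease_data_news()
        sohu_news()
        # the_paper(list_url, list_params, default_headers)   # hypothetical arguments
        # qc_spider(api_url, api_params, default_headers)     # hypothetical arguments
        print('Waiting for the next update...')
        time.sleep(86400)  # crawl once a day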