class NewsCounter:
    def __init__(self, sleep_time=60):
        self.counter_thread = threading.Thread(target=self.run, name='Counter')
        self.recent_data = None
        self.sleep_time = sleep_time
        self._init_counter()

    def _init_counter(self):
        self.engine_obj = DatabaseEngine('xiaoyi')
        self.engine = self.engine_obj.get_engine()
        self.session = self.engine_obj.get_session()

    def start(self):
        self.counter_thread.start()

    def join(self):
        self.counter_thread.join()

    def run(self):
        print 'News Counter starts to run...'
        counter_birth_time = time.time()
        while time.time() - counter_birth_time < SPIDER_LIVE_TIME:
            self.recent_data = pd.read_sql(SQL_GET_MACRO_NEWS_TOP, self.engine)
            self.recent_data_group = self.recent_data.groupby('unique_flag', sort=False)
            self.news_repeat = self.recent_data_group.count()
            self.news_repeat = self.news_repeat.reset_index()
            self.news_repeat.rename(columns={'news_source': 'repeat_times'}, inplace=True)
            self.news_repeat.drop_duplicates(subset=['unique_flag'], inplace=True)
            # write
            self.news_repeat.to_sql('news_repeat', self.engine, index=False,
                                    if_exists='replace',
                                    dtype={'unique_flag': VARCHAR(256),
                                           'repeat_times': INT})
            self.session.execute(SQL_MERGE_MACRO_NEWS_REPEAT_TIMES)
            self.session.commit()
            time.sleep(self.sleep_time)
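
# Usage sketch (not part of the original module): run the counter for the
# configured lifetime. It assumes threading, time and pandas are imported at
# the top of this file, and that DatabaseEngine, SPIDER_LIVE_TIME,
# SQL_GET_MACRO_NEWS_TOP and SQL_MERGE_MACRO_NEWS_REPEAT_TIMES come from the
# project's own modules.
if __name__ == '__main__':
    counter = NewsCounter(sleep_time=60)  # recount news repeats every 60 seconds
    counter.start()                       # runs NewsCounter.run in the 'Counter' thread
    counter.join()                        # blocks until SPIDER_LIVE_TIME has elapsed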
import pandas as pd

from database_engine import DatabaseEngine
from SQLs import (
    SQL_GET_SW_INDEX_CLOSE, SQL_GET_FUND_HOLD_STOCK,
    SQL_GET_SW_INDEX_STOCK_COMPONENTS, SQL_GET_FUND_INFER_INDUSTRY,
    SQL_GET_FUND_BASIC, SQL_GET_FUND_SCORE, SQL_GET_FUND_NET_VALUE,
    SQL_GET_TRADE_CALENDAR, SQL_GET_FUND_INFER_INDUSTRY_CURRENT,
    SQL_GET_FUNDS_NET_VALUE, SQL_GET_FUND_MANAGER, SQL_GET_MANAGER_FUND,
    SQL_GET_STOCK_BASICS, SQL_GET_STOCK_DAILY_DATA1, SQL_GET_STOCK_DAILY_DATA2,
    SQL_GET_STOCK_MIN_CLOSE, SQL_GET_STOCK_FEATURES, SQL_GET_FUNDS_DAILY_RET,
    SQL_GET_ALL_FUNDS_DAILY_RET, SQL_GET_INDEX_DAILY, SQL_GET_RISK_FREE_RATE,
    SQL_GET_STOCK_ADJFACTOR)
from consts import SW_INDUSTRY_FIRST_CODE

engine_ld_obj = DatabaseEngine('ld')
engine_ld = engine_ld_obj.get_engine()

engine_xiaoyi_obj = DatabaseEngine('xiaoyi')
engine_xiaoyi = engine_xiaoyi_obj.get_engine()
session_xiaoyi = engine_xiaoyi_obj.get_session()

engine_wind_obj = DatabaseEngine('wind')
engine_wind = engine_wind_obj.get_engine()

engine_gb_obj = DatabaseEngine('geniusbar')
engine_gb = engine_gb_obj.get_engine()
session_gb = engine_gb_obj.get_session()

engine_xiaoyi40_obj = DatabaseEngine('xiaoyi40')
engine_xiaoyi40 = engine_xiaoyi40_obj.get_engine()
session_xiaoyi40 = engine_xiaoyi40_obj.get_session()
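
# Usage sketch (assumption, not in the original file): the SQL_GET_* constants
# are query strings, so a frame can be pulled with pandas.read_sql against the
# matching engine. Whether a given query expects bind parameters is not visible
# in this excerpt, so the calls below are illustrative only.
sw_index_close = pd.read_sql(SQL_GET_SW_INDEX_CLOSE, engine_ld)
fund_basic = pd.read_sql(SQL_GET_FUND_BASIC, engine_xiaoyi)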
class NewsSpider(object):
    def __init__(self, spider_name, source_name, title_url, lock, update_seconds=30):
        '''
        News spider factory.

        To build a concrete spider:
            1. Implement parse_title to generate self.additions.
            2. Implement parse_content to add the article content to self.additions.

        Parameters
        ----------
        spider_name
            spider name, in English
        source_name
            name of the news source, unicode
        title_url
            URL of the headline (title) page
        lock
            lock shared across threads
        update_seconds
            int, base update interval in seconds
        if_header
            int, 0 or 1, whether this is a headline spider (attribute, defaults to 0)
        '''
        self.spider_name = spider_name
        self.source_name = source_name
        self.title_url = title_url
        self.lock = lock
        self.update_seconds = update_seconds
        self.if_header = 0

    def init_spider(self, if_connect=True):
        self.header_generator = header_generator
        if if_connect:
            self.engine_obj = DatabaseEngine('xiaoyi')
            self.engine = self.engine_obj.get_engine()
            self.session = self.engine_obj.get_session()
        self.additions = pd.DataFrame()
        self.spider_thread = threading.Thread(target=self.spider_data, name=self.spider_name)
        # Marker of the most recent update, used for incremental crawling;
        # it can be a timestamp, a title, etc.
        self.last_flag = None
        print 'Spider %s initialized successfully' % self.spider_name

    def spider_data(self):
        print 'Spider %s starts to run...' % self.spider_name
        spider_birth_time = time.time()
        while time.time() - spider_birth_time < SPIDER_LIVE_TIME:
            # Fetch the headline page
            try:
                status_code = self._get_titles_response()
                if status_code != 200:
                    continue
                # ----- Parse the headline page and build the incremental additions -----
                # additions.columns = [title, href, news_time, content, news_source]
                # Make sure additions has all of the columns above; use None where
                # there is no content.
                try:
                    self._parse_titles_response()
                except Exception as e:
                    print 'Spider %s failed to parse TITLE' % self.spider_name
                    print 'Error:'
                    print e
                    print '*' * 40
                    self._have_a_short_rest()
                    continue
                # ------------------------------------------------------------------
                if len(self.additions) == 0:
                    self._have_a_rest()
                    continue
                for idx, row in self.additions.iterrows():
                    href = row['href']
                    status_code = self._get_content_response(href)
                    if status_code != 200:
                        self._have_a_short_rest()
                        continue
                    try:
                        self._parse_content_response(idx)
                    except Exception as e:
                        print 'Spider %s failed to parse content' % self.spider_name
                        print 'The href is %s' % href
                        print 'Error:'
                        print e
                        print '*' * 40
                self._add_unique_flag()
                self._check_additions()
                if len(self.additions) == 0:
                    self._have_a_rest()
                    continue
                self._add_header_flag()
                self._write_into_db()
                self._have_a_rest()
            except Exception as e:
                print '[%s]Error in %s' % (dt.datetime.today().strftime('%Y-%m-%d %H:%M:%S'),
                                           self.spider_name)
                print 'Error: %s' % e
                break

    def _get_titles_response(self):
        '''
        Fetch the headline-page response.
        '''
        self.titles_response = requests.get(self.title_url, headers=self.header_generator())
        return self.titles_response.status_code

    def _get_content_response(self, href):
        '''
        Fetch the response for a specific news article.
        '''
        try:
            self.content_response = requests.get(href, headers=self.header_generator())
            return self.content_response.status_code
        except Exception as e:
            print 'Spider {sp} failed to get content from {web} because: {error}'.format(
                sp=self.spider_name, web=href, error=str(e))
            return 0

    def _parse_titles_response(self):
        '''
        Parse the headline-page response and build additions (the incremental
        content): title, link, time (optional) and source.
        '''
        pass

    def _parse_content_response(self, idx):
        '''
        Parse the article-page response.
        '''
        pass

    def _check_additions(self):
        '''
        Validate self.additions before writing to the database.
        '''
        # Drop records that duplicate an existing title
        self.additions = self.additions.sort_values('news_time', ascending=False)
        self.additions = self.additions.drop_duplicates(subset='unique_flag')
        # Drop records with empty content
        self.additions = self.additions.dropna(subset=['content'])
        # Drop records whose content is too short
        self.additions['content_len'] = self.additions['content'].apply(lambda x: len(x))
        self.additions = self.additions.loc[self.additions['content_len'] > 3]
        self.additions = self.additions.drop('content_len', axis=1)

    def _add_unique_flag(self):
        '''
        Add a uniqueness flag to each news item, used for deduplication.
        '''
        punctuation_list = [u'?', u'.', u',', u'!', u':', u'"', u' ', u'…', u'%', u'<', u'>',
                            u'?', u'。', u',', u'!', u':', u'”', u'“', u'、', u'《',
                            u'\u3000', u'》']

        def delete_punctuations(title):
            new_title = title
            for punc in punctuation_list:
                new_title = new_title.replace(punc, '')
            return new_title

        self.additions['unique_flag'] = self.additions['title'].apply(delete_punctuations)

    def _add_header_flag(self):
        '''
        Add a field marking whether the item is a headline.
        '''
        self.additions.loc[:, 'if_header'] = self.if_header

    def _write_into_db(self):
        '''
        Write to the database.
        '''
        self.lock.acquire()  # prevent other processes from writing to the database at the same time
        # ---------------------
        # 1. Stage the additions into news_macro_source, then merge.
        self.additions.to_sql('news_macro_source', self.engine,
                              index=False, if_exists='replace',
                              dtype={'title': VARCHAR(256),
                                     'news_time': VARCHAR(64),
                                     'href': VARCHAR(256),
                                     'content': NVARCHAR(),
                                     'update_datetime': DATETIME,
                                     'news_source': VARCHAR(32),
                                     'unique_flag': VARCHAR(256),
                                     'if_header': INT})
        try:
            if not self.if_header:
                self.session.execute(SQL_MERGE_MACRO_NEWS)
                self.session.commit()
            else:
                self.session.execute(SQL_MERGE_MACRO_NEWS_FOR_HEADER)
                self.session.commit()
                self.session.execute(SQL_MERGE_MACRO_NEWS_FOR_ALL)
                self.session.commit()
        except pymssql.OperationalError:
            print 'Error of %s: Writing into database failed' % self.spider_name
        # ---------------------
        self.lock.release()  # release the lock

    def _have_a_rest(self):
        time.sleep(self.update_seconds + random.randint(0, 60))

    def _have_a_short_rest(self):
        time.sleep(3)

    def start(self):
        self.spider_thread.start()

    def join(self):
        self.spider_thread.join()

    def test_spider(self, if_write=False):
        self.header_generator = header_generator
        self.last_flag = None
        self.additions = pd.DataFrame()
        status_code = self._get_titles_response()
        print 'TITLE RESPONSE STATUS: %s' % (str(status_code))
        # ----- Parse the headline page and build the incremental additions -----
        # additions.columns = [title, href, news_time, content, news_source]
        # Make sure additions has all of the columns above; use None where
        # there is no content.
        self._parse_titles_response()
        # ------------------------------------------------------------------
        for idx, row in self.additions.iterrows():
            href = row['href']
            status_code = self._get_content_response(href)
            print 'CONTENT RESPONSE STATUS: %s' % (str(status_code))
            self._parse_content_response(idx)
        if if_write:
            self.engine_obj = DatabaseEngine('xiaoyi')
            self.engine = self.engine_obj.get_engine()
            self.session = self.engine_obj.get_session()
            self.additions.to_sql('news_macro_source', self.engine,
                                  index=False, if_exists='replace',
                                  dtype={'title': VARCHAR(256),
                                         'news_time': VARCHAR(64),
                                         'href': VARCHAR(256),
                                         'content': NVARCHAR(),
                                         'update_datetime': DATETIME,
                                         'news_source': VARCHAR(32)})
            self.session.execute(SQL_MERGE_MACRO_NEWS)
            self.session.commit()
        self.last_flag = None
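
# Sketch of a concrete spider built from the factory above. The URL, CSS
# selectors and source name below are hypothetical, not taken from the original
# code; a real subclass only needs to fill self.additions in
# _parse_titles_response and write the article text back in
# _parse_content_response.
class ExampleNewsSpider(NewsSpider):

    def _parse_titles_response(self):
        soup = BeautifulSoup(self.titles_response.content, 'html.parser')
        records = []
        for tag in soup.find_all('a', class_='news-title'):  # hypothetical selector
            records.append({'title': tag.text.strip(),
                            'href': tag.attrs['href'],
                            'news_time': None,   # filled later if the article page has it
                            'content': None,
                            'news_source': self.source_name})
        self.additions = pd.DataFrame(records, columns=['title', 'href', 'news_time',
                                                        'content', 'news_source'])

    def _parse_content_response(self, idx):
        soup = BeautifulSoup(self.content_response.content, 'html.parser')
        main_text = soup.find('div', class_='article-body')  # hypothetical selector
        self.additions.loc[idx, 'content'] = main_text.text
        self.additions.loc[idx, 'update_datetime'] = dt.datetime.today()

# Possible wiring (also illustrative):
# lock = threading.Lock()
# spider = ExampleNewsSpider('example', u'示例来源', 'http://example.com/news', lock)
# spider.init_spider()
# spider.start()
# spider.join()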
punctuation_list = ['\xa0'.decode('latin1'), u'?', u'.', u',', u'!', u':', u'"', u' ', u'…',
                    u'%', u'<', u'>', u'?', u'。', u',', u'!', u':', u'”', u'“', u'、',
                    u'《', u'》']

def delete_punctuations(title):
    new_title = title
    for punc in punctuation_list:
        new_title = new_title.replace(punc, '')
    return new_title

data['unique_flag'] = data['title'].apply(delete_punctuations)
data = data[['title', 'unique_flag']]

engine_obj = DatabaseEngine('xiaoyi')
engine = engine_obj.get_engine()
session = engine_obj.get_session()

data.to_sql('news_macro_source1', engine,
            index=False, if_exists='replace',
            dtype={'title': VARCHAR(256),
                   'unique_flag': VARCHAR(256)})

tmp_sql = '''
MERGE INTO news_macro as T
USING news_macro_source1 as S
ON T.title = S.title
WHEN MATCHED THEN
    UPDATE SET unique_flag = S.unique_flag;
'''
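
# The snippet above only defines the MERGE statement. A likely follow-up
# (assumption, mirroring the session.execute / session.commit pattern used
# elsewhere in this repo) is to run it against the xiaoyi session:
session.execute(tmp_sql)
session.commit()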
# -*- coding: utf-8 -*-
"""
Created on Wed Feb 13 10:23:56 2019

@author: ldh
"""

# api.py

from __future__ import print_function

from database_engine import DatabaseEngine

engine_guosen_obj = DatabaseEngine('guosen')
engine_guosen = engine_guosen_obj.get_engine()
session_guosen = engine_guosen_obj.get_session()

def index_daily(index_code, start_date, end_date):
    '''
    Daily index quotes.

    Parameters
    ----------
    index_code
        index code
    start_date
        start date
    end_date
        end date

    Returns
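
# Hypothetical call (the function body and its Returns section are cut off
# above, so the return format is not shown here; the index code and date range
# below are made up for illustration):
# data = index_daily('000300.SH', '2019-01-01', '2019-02-13')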
def collect_data_from_ths():
    engine_obj = DatabaseEngine('xiaoyi')
    engine = engine_obj.get_engine()
    session = engine_obj.get_session()

    url_ths = 'http://news.10jqka.com.cn/cjzx_list/index_1.shtml'
    last_news_time = None
    print 'THS collector starts RUNNING...'

    while True:
        # Fetch the headline page
        response = requests.get(url_ths, headers=header_generator())
        if response.status_code != 200:
            time.sleep(retry_seconds + random.randint(0, 30))
            continue

        # Parse the headline page
        soup = BeautifulSoup(response.content, 'html.parser')
        content = soup.find_all(class_='content-1200')
        content = content[1]
        content = content.find_all(class_='module-l fl')
        content = content[0]
        content = content.find_all(class_='list-con')
        content = content[0]
        content_titles = content.find_all(class_='arc-title')

        titles = []
        news_time = []
        hrefs = []
        for each in content_titles:
            tmp = each.find('a')
            titles.append(tmp.attrs['title'])
            news_time.append(each.find('span').text)
            hrefs.append(tmp.attrs['href'])
        news_time_adj = map(ths_news_time_convertor, news_time)

        # Check whether there are new articles to write
        news_df = pd.DataFrame([titles, news_time_adj, hrefs],
                               index=['title', 'news_time', 'href']).T
        news_df['news_source'] = u'同花顺'
        if last_news_time is None:
            last_news_time = max(news_time_adj)
            additions = news_df
        else:
            latest_news_time = max(news_time_adj)
            if latest_news_time <= last_news_time:
                time.sleep(update_seconds + random.randint(0, 30))
                continue  # nothing new
            else:
                # Determine the incremental additions
                additions = news_df.loc[news_df['news_time'] > last_news_time]
                additions['content'] = None
                last_news_time = latest_news_time

        # Incremental crawl
        for idx, row in additions.iterrows():
            link = row['href']
            link_response = requests.get(link, headers=header_generator())
            if link_response.status_code == 404:
                continue
            tmp_soup = BeautifulSoup(link_response.content, 'html.parser')
            main_text = tmp_soup.find_all(class_='main-text atc-content')
            main_text = main_text[0]
            main_text_list = main_text.find_all('p')
            main_text_content = [each.text for each in main_text_list][1:-1]
            tmp_content = '\n'.join(main_text_content)
            last_stop_point = tmp_content.rfind(u'。')
            filtered_content = tmp_content[:last_stop_point + 1]
            additions.loc[idx, 'content'] = filtered_content

        # Write the crawled content to the database incrementally
        additions['update_datetime'] = dt.datetime.today()
        lock.acquire()  # prevent other processes from writing to the database at the same time
        additions.to_sql('news_macro_source', engine,
                         index=False, if_exists='replace',
                         dtype={'title': VARCHAR(256),
                                'news_time': VARCHAR(64),
                                'href': VARCHAR(256),
                                'content': TEXT,
                                'update_datetime': DATETIME,
                                'news_source': VARCHAR(32)})
        session.execute(SQL_MERGE_MACRO_NEWS)
        session.commit()
        lock.release()

        # Sleep before the next poll
        time.sleep(update_seconds + random.randint(0, 30))
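
# ths_news_time_convertor is referenced above but not defined in this excerpt.
# A minimal sketch, assuming the THS list page shows 'HH:MM' times for the
# current day (the real page format may differ):
def ths_news_time_convertor(raw_time):
    today = dt.datetime.today().strftime('%Y-%m-%d')
    return '%s %s' % (today, raw_time.strip())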
def collect_data_from_jrj():
    '''
    JRJ (金融界) data collector.
    '''
    engine_obj = DatabaseEngine('xiaoyi')
    engine = engine_obj.get_engine()
    session = engine_obj.get_session()

    url_jrj = 'http://finance.jrj.com.cn/list/guoneicj.shtml'
    last_news_title = None
    print 'JRJ collector starts RUNNING...'

    while True:
        # Fetch the headline page
        response = requests.get(url_jrj, headers=header_generator())
        if response.status_code != 200:
            time.sleep(retry_seconds + random.randint(0, 30))
            continue
        soup = BeautifulSoup(response.content, 'html.parser')
        content = soup.find('ul', class_='list2')

        # Parse the headline page
        titles = []
        hrefs = []
        content_list = content.find_all('li')
        for tag in content_list:
            if 'class' in tag.attrs.keys():
                continue
            tag1 = tag.find('a')
            titles.append(tag1.text)
            hrefs.append(tag1.attrs['href'])
        news_df = pd.DataFrame([titles, hrefs], index=['title', 'href']).T
        news_df['news_source'] = u'金融界'

        # Check for new articles (based on title)
        if last_news_title is None:
            additions = news_df
        else:
            title_list = news_df['title'].tolist()
            try:
                last_title_idx = title_list.index(last_news_title)
                if last_title_idx == 0:
                    time.sleep(update_seconds + random.randint(0, 30))
                    continue
                additions = news_df.loc[:last_title_idx]
            except ValueError:  # the last seen title is no longer on the page
                additions = news_df
        last_news_title = additions['title'].iloc[0]
        additions['content'] = None
        additions['news_time'] = None

        # Incremental crawl
        for idx, row in additions.iterrows():
            link = row['href']
            link_response = requests.get(link, headers=header_generator())
            if link_response.status_code == 404:
                continue
            tmp_soup = BeautifulSoup(link_response.content, 'html.parser')
            # Extract the article text
            main_text = tmp_soup.find('div', class_='texttit_m1')
            filtered_content = main_text.text
            last_stop_point = filtered_content.rfind(u'。')
            filtered_content = filtered_content[:last_stop_point + 1]
            additions.loc[idx, 'content'] = filtered_content
            # Extract the publication time
            news_time = tmp_soup.find(text=re.compile(r'\d{4}-\d{2}-\d{2}'))[2:]
            additions.loc[idx, 'news_time'] = news_time[:-3]

        # Write the crawled content to the database incrementally
        additions['update_datetime'] = dt.datetime.today()
        lock.acquire()  # prevent other processes from writing to the database at the same time
        additions.to_sql('news_macro_source', engine,
                         index=False, if_exists='replace',
                         dtype={'title': VARCHAR(256),
                                'news_time': VARCHAR(64),
                                'href': VARCHAR(256),
                                'content': TEXT,
                                'update_datetime': DATETIME,
                                'news_source': VARCHAR(32)})
        session.execute(SQL_MERGE_MACRO_NEWS)
        session.commit()
        lock.release()

        # Sleep before the next poll
        time.sleep(update_seconds + random.randint(0, 30))
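
# The collectors in this module share a few module-level names that are not
# shown in this excerpt (lock, update_seconds, retry_seconds). A plausible
# setup, stated purely as an assumption:
import threading

lock = threading.Lock()   # serializes writes to news_macro_source
update_seconds = 60       # base polling interval between headline-page fetches
retry_seconds = 10        # wait before retrying a failed request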
def collect_data_from_cnfol():
    '''
    CNFOL (中金在线) data collector.
    '''
    engine_obj = DatabaseEngine('xiaoyi')
    engine = engine_obj.get_engine()
    session = engine_obj.get_session()

    url_cnfol = 'http://news.cnfol.com/guoneicaijing/'
    last_news_time = None
    print 'CNFOL collector starts RUNNING...'

    while True:
        # Fetch the headline page
        response = requests.get(url_cnfol, headers=header_generator())
        if response.status_code != 200:
            time.sleep(retry_seconds + random.randint(0, 30))
            continue
        soup = BeautifulSoup(response.content, 'html.parser')
        content = soup.find('ul', class_='TList')

        # Parse the headline page
        titles = []
        hrefs = []
        news_time = []
        content_list = content.find_all('li')
        for tag in content_list:
            tag1 = tag.find('a')
            titles.append(tag1.attrs['title'])
            hrefs.append(tag1.attrs['href'])
            news_time.append(cnfol_news_time_convertor(tag.find('span').text))
        news_df = pd.DataFrame([titles, news_time, hrefs],
                               index=['title', 'news_time', 'href']).T
        news_df['news_source'] = u'中金在线'

        # Check whether there are new articles to write
        if last_news_time is None:
            last_news_time = max(news_time)
            additions = news_df
        else:
            latest_news_time = max(news_time)
            if latest_news_time <= last_news_time:
                time.sleep(update_seconds + random.randint(0, 30))
                continue  # nothing new
            else:
                # Determine the incremental additions
                additions = news_df.loc[news_df['news_time'] > last_news_time]
                additions['content'] = None
                last_news_time = latest_news_time

        # Incremental crawl
        for idx, row in additions.iterrows():
            link = row['href']
            link_response = requests.get(link, headers=header_generator())
            if link_response.status_code == 404:
                continue
            tmp_soup = BeautifulSoup(link_response.content, 'html.parser')
            main_text = tmp_soup.find('div', class_='Article')
            filtered_content = main_text.text
            additions.loc[idx, 'content'] = filtered_content

        # Write the crawled content to the database incrementally
        additions['update_datetime'] = dt.datetime.today()
        lock.acquire()  # prevent other processes from writing to the database at the same time
        additions.to_sql('news_macro_source', engine,
                         index=False, if_exists='replace',
                         dtype={'title': VARCHAR(256),
                                'news_time': VARCHAR(64),
                                'href': VARCHAR(256),
                                'content': TEXT,
                                'update_datetime': DATETIME,
                                'news_source': VARCHAR(32)})
        session.execute(SQL_MERGE_MACRO_NEWS)
        session.commit()
        lock.release()

        # Sleep before the next poll
        time.sleep(update_seconds + random.randint(0, 30))
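
# header_generator is used by every collector above but is not shown in this
# excerpt. A minimal sketch, assuming it just rotates User-Agent strings (the
# real implementation may set more headers):
import random

USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/605.1.15 '
    '(KHTML, like Gecko) Version/12.0.2 Safari/605.1.15',
]

def header_generator():
    return {'User-Agent': random.choice(USER_AGENTS)}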