Example #1
def get_rate(conn):
    """
    Fetch the latest USD exchange rate and update it in the database.
    :param conn: database connection
    :return:
    """
    now = datetime.datetime.now()
    current_time = now.strftime("%Y-%m-%d %H:%M:%S")
    # round-trip through strftime/strptime to truncate to whole seconds
    stamp_current_time = datetime.datetime.strptime(current_time,
                                                    '%Y-%m-%d %H:%M:%S')
    c = CurrencyRates()
    c.get_rates('USD')  # fetch the latest USD rates (result unused here)
    stock_price = c.get_rate('USD', 'CNY')  # USD -> CNY rate
    stock_time = current_time  # already a formatted string
    stock_name = '人民币汇率'  # must match the WHERE clause below
    stock_applies = None
    spider_data = stamp_current_time
    # update the stored USD/CNY rate
    sql_update = """
    UPDATE public.stock_code
    SET stock_time = %s, spider_data = %s, stock_name = %s, stock_price = %s, stock_applies = %s
    WHERE stock_name = '人民币汇率'
    """
    sql_params = [
        stock_time, spider_data, stock_name, stock_price, stock_applies
    ]
    logger.debug(sql_update)
    logger.debug(sql_params)
    execute_sql(conn, sql_update, sql_params)
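Every example on this page calls the helpers execute_sql and execute_select, which are defined elsewhere in the project. A minimal sketch of what they presumably look like, assuming a psycopg2 connection (the %s placeholders and the public. schema suggest PostgreSQL):

def execute_sql(conn, sql, params=None):
    """Hypothetical helper: run a modifying statement and commit it."""
    with conn.cursor() as cur:
        cur.execute(sql, params)
    conn.commit()

def execute_select(conn, sql, params=None):
    """Hypothetical helper: run a query and return all rows as tuples."""
    with conn.cursor() as cur:
        cur.execute(sql, params)
        return cur.fetchall()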
Example #2
def main():
    now = datetime.datetime.now()
    current_time = now.strftime("%Y-%m-%d %H:%M:%S")
    one_day = now - datetime.timedelta(days=1)
    one_day_ago = one_day.strftime('%Y-%m-%d %H:%M:%S')[:16]  # 'YYYY-MM-DD HH:MM'
    history_day = (now - datetime.timedelta(days=10)).strftime('%Y-%m-%d %H:%M:%S')[:16]
    conn = None
    try:
        conn = get_conn()
        with conn:
            sql_max_date = """
                SELECT max(CASE WHEN news_source='新浪财经' THEN news_date END),
                    max(CASE WHEN news_source='华尔街见闻' THEN news_date END)
                FROM news_cj
            """
            res = execute_select(conn, sql_max_date)
            max_date_sina = res[0][0] if res[0][0] else one_day_ago
            max_date_news = res[0][1] if res[0][1] else one_day_ago
            # drop stale rows, plus each source's newest row so it can be
            # re-crawled without creating duplicates
            sql_delete = """
                DELETE FROM news_cj
                WHERE news_date <= %s
                    OR (news_source='华尔街见闻' AND news_date=%s)
                    OR (news_source='新浪财经' AND news_date=%s)
            """
            execute_sql(conn, sql_delete, (history_day, max_date_news, max_date_sina))
            get_news(conn, max_date_news, current_time)
            get_sina_news(conn, max_date_sina, current_time)
    except Exception as e:
        logger.error(str(e))
    finally:
        if conn:
            conn.close()
            sys.exit()  # terminate the process once the connection is closed
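get_conn is likewise project-level. Assuming PostgreSQL via psycopg2, consistent with the rest of the examples, it presumably reduces to a thin connection factory (host and credentials below are placeholders):

import psycopg2

def get_conn():
    # Hypothetical connection factory; substitute real credentials.
    return psycopg2.connect(host='localhost', dbname='news',
                            user='spider', password='secret')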
Example #3
def get_code1(conn):
    """
    使用常规存储方式存储大盘指数行情数据获取 可获得上证指数、深证指数
    :return:
    """
    df_index = ts.get_index()  # tushare: latest quotes for the major market indices
    sql_market = """
    INSERT INTO middle_news_market(
        code, name, change_market, open_market, preclose, close_market, high, low, volume, amount
        ) 
    VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """
    for i in range(len(df_index)):  # iterate over every row instead of assuming 25
        code = df_index['code'][i]
        name = df_index['name'][i]
        change_market = str('%.2f' % df_index['change'][i])
        open_market = str('%.4f' % df_index['open'][i])
        preclose = str('%.4f' % df_index['preclose'][i])
        close = str('%.4f' % df_index['close'][i])
        high = str('%.4f' % df_index['high'][i])
        low = str('%.4f' % df_index['low'][i])
        volume = str(df_index['volume'][i])
        amount = str('%.4f' % df_index['amount'][i])
        sql_params = [
            code, name, change_market, open_market, preclose, close, high, low,
            volume, amount
        ]
        logger.debug(sql_market)
        logger.debug(sql_params)
        # store in the database
        execute_sql(conn, sql_market, sql_params)
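A sturdier way to drive the same insert, sketched under the assumption that ts.get_index() keeps returning the columns referenced above, is to let the DataFrame itself set the loop:

# Iterate over whatever rows the DataFrame actually holds.
for _, row in df_index.iterrows():
    sql_params = [
        row['code'], row['name'], '%.2f' % row['change'], '%.4f' % row['open'],
        '%.4f' % row['preclose'], '%.4f' % row['close'], '%.4f' % row['high'],
        '%.4f' % row['low'], str(row['volume']), '%.4f' % row['amount'],
    ]
    execute_sql(conn, sql_market, sql_params)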
Example #4
def get_news(conn, max_date, current_time):
    """
    华尔街见闻抓取
    :param conn:
    :param max_date: 数据库中最新新闻的日期
    :param current_time: 当前时间
    :return:
    """
    func_name = "采集华尔街见闻"
    logger.debug('start %s ' % func_name)
    spider_data = datetime.datetime.strptime(current_time, '%Y-%m-%d %H:%M:%S')
    driver = None
    xvfb = None  # track separately so it is stopped even if the driver fails to start
    try:
        xvfb = Xvfb(width=1280, height=720)
        xvfb.start()
        # despite the variable name, Firefox expects the geckodriver path here
        driver = webdriver.Firefox(executable_path=chromedriver_path)
        driver.get('https://wallstreetcn.com/live/global')
        # scroll the page down in steps: window.scrollBy(0, scrollStep)
        js = 'window.scrollBy(0,3000)'
        driver.execute_script(js)
        time.sleep(5)
        js = 'window.scrollBy(0,60000)'
        driver.execute_script(js)
        time.sleep(5)
        pages = driver.page_source
        soup = BeautifulSoup(pages, 'html.parser')

        soup1 = soup.find('div', class_='livenews-main')
        content = soup1.find_all('div', class_='live-item')
        news_source = '华尔街见闻'  # must match the source name used in main()'s queries
        news_type = '宏观'  # the "macro" category
        last_news_time = '23:59'  # sentinel later than any real item time
        d_date = datetime.datetime.strptime(current_time, '%Y-%m-%d %H:%M:%S')
        for cont in content:
            news_time = cont.find('time', attrs={'class': 'live-item_created'}).get_text()
            news = cont.find('div', attrs={'class': 'live-item_main'}).find('div', attrs={'class': 'live-item_html'})
            if news is None:
                return  # an item without a text body ends the scrape
            news = news.get_text().strip().replace('//', '')
            if last_news_time < news_time:
                # items run newest-first, so a jump to a later time means
                # the feed crossed into the previous day
                d_date = d_date - datetime.timedelta(days=1)
            s_date = d_date.strftime("%Y-%m-%d")
            over_time = s_date + ' ' + news_time
            if max_date > over_time:
                break  # everything from here on is already stored
            sql_params = [over_time, spider_data, news_source, news_type, news]
            logger.debug(sql_cj)
            logger.debug(sql_params)
            execute_sql(conn, sql_cj, sql_params)
            last_news_time = news_time
        logger.debug('end %s ' % func_name)
    except Exception as e:
        msg = func_name + ' failed: ' + str(e)
        logger.error(msg)
    finally:
        if driver:
            driver.quit()
        if xvfb:
            xvfb.stop()
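sql_cj is a module-level statement that never appears in these snippets. Judging from the parameter order [over_time, spider_data, news_source, news_type, news], it is presumably an insert of this shape (only news_date and news_source are confirmed by the queries in Example #2; the other column names are guesses):

# Hypothetical INSERT reconstructed from the parameter order above.
sql_cj = """
    INSERT INTO news_cj(news_date, spider_data, news_source, news_type, news)
    VALUES(%s, %s, %s, %s, %s)
"""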
Example #5
def del_his_info(conn, his_dtm):
    """
    考虑到数据量及新闻的时效性,这里对与日期在两个月前的新闻进行删除
    :param conn:
    :param his_dtm: 当前日期倒减60天
    :return:
    """
    del_fun_nm = " 删除历史数据 "
    logger.debug('start %s ' % del_fun_nm)
    sql_del_params = (his_dtm, )
    sql_del = " DELETE FROM infos.rslt_url WHERE pub_dtm < %s "
    execute_sql(conn, sql_del, sql_del_params)
    logger.debug('end %s ' % del_fun_nm)
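A typical call, building his_dtm the same way the other examples format their timestamps:

his_dtm = (datetime.datetime.now()
           - datetime.timedelta(days=60)).strftime('%Y-%m-%d %H:%M:%S')
del_his_info(conn, his_dtm)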
Example #6
def remv_repeat_info(conn, sql_params):
    """
    判断是否有重复再进行插入
    :param conn:
    :param sql_params:
    :return:
    """
    sql_rept = " SELECT url_nm FROM infos.rslt_url WHERE url_nm=%s "
    rept_re = execute_select(conn, sql_rept, (sql_params[0], ))
    if len(rept_re) == 0:
        logger.debug('insert SQL: %s' % sql_info)
        logger.debug('parameters: %s' % sql_params)
        execute_sql(conn, sql_info, sql_params)
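The select-then-insert pattern is racy when two crawlers run at once. On PostgreSQL the dedup can be pushed into the insert itself, assuming a unique index on url_nm (the table definition is not shown, so this is an assumption). pub_dtm appears in Example #5; the title column here is purely illustrative:

# Hypothetical single-statement variant; requires UNIQUE(url_nm) on infos.rslt_url.
sql_upsert = """
    INSERT INTO infos.rslt_url(url_nm, pub_dtm, title)
    VALUES(%s, %s, %s)
    ON CONFLICT (url_nm) DO NOTHING
"""
execute_sql(conn, sql_upsert, sql_params[:3])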
Example #7
def spider_Item(self, item, spider):
    try:
        with conn:
            sql_repeat = """
                SELECT * FROM public.db_movie WHERE user_name=%s
            """
            print('pipelines')
            # query parameters must be a sequence, not a bare value
            res = execute_select(conn, sql_repeat, (item['user_name'],))
            if not res:  # no row for this user yet
                sql_insert = """
                    INSERT INTO public.db_movie(user_name, comment_time, film_critics)
                    VALUES(%s, %s, %s)
                """
                # unpack the item into positional parameters
                execute_sql(conn, sql_insert,
                            [item['user_name'], item['comment_time'],
                             item['film_critics']])
                print('row inserted')
        return item  # pipelines conventionally hand the item back
    finally:
        if conn:
            conn.close()
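psycopg2 also accepts named %(key)s placeholders, which lets a mapping such as the item be passed straight through instead of being unpacked by hand; a sketch assuming the same column names:

# Named placeholders pull values from a mapping by key.
sql_insert = """
    INSERT INTO public.db_movie(user_name, comment_time, film_critics)
    VALUES(%(user_name)s, %(comment_time)s, %(film_critics)s)
"""
execute_sql(conn, sql_insert, dict(item))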
Example #8
def get_sina_news(conn, max_date, current_time):
    """
    爬取新浪财经突发live板块新闻
    :param conn:
    :param max_date: 数据库中最新新闻的日期
    :param current_time: 当前时间
    :return:
    """
    func_name = "采集新浪财经新闻"
    logger.debug('start %s ' % func_name)
    spider_data = datetime.datetime.strptime(current_time, '%Y-%m-%d %H:%M:%S')
    driver = None
    xvfb = None  # track separately so it is stopped even if the driver fails to start
    try:
        xvfb = Xvfb(width=1280, height=720)
        xvfb.start()
        # despite the variable name, Firefox expects the geckodriver path here
        driver = webdriver.Firefox(executable_path=chromedriver_path)
        for num in range(1, 2):
            # url = 'http://live.sina.com.cn/zt/app_zt/f/v/finance/globalnews1/?page=' + str(num)
            url = 'http://finance.sina.com.cn/7x24/'
            driver.get(url)
            # scroll the page down in steps: window.scrollBy(0, scrollStep)
            js = 'window.scrollBy(0,3000)'
            driver.execute_script(js)
            time.sleep(5)
            js = 'window.scrollBy(0,5000)'
            driver.execute_script(js)
            time.sleep(5)
            pages = driver.page_source
            xml = etree.HTML(pages)
            # each bd_i entry's data-time attribute carries the item's date (YYYYMMDD)
            time_list = xml.xpath('//*[@class="bd_c0"]/div[@class="bd_list"]/div[contains(@class, "bd_i")]/@data-time')
            soup = BeautifulSoup(pages, 'html.parser')
            save_file(soup.encode('utf-8'))
            soup1 = soup.find('div', id='liveList01')
            content = soup1.select('.bd_i')
            news_source = '新浪财经'  # must match the source name used in main()'s queries
            for i in range(len(time_list)):
                time_stamp = time_list[i]
                data = content[i]
                over_time_1 = data.find('p', attrs={'class': 'bd_i_time_c'}).get_text()
                # data-time gives YYYYMMDD, the visible text gives HH:MM:SS
                over_time = time_stamp + over_time_1
                over_time_d = datetime.datetime.strptime(over_time, "%Y%m%d%H:%M:%S")
                over_time = datetime.datetime.strftime(over_time_d, "%Y-%m-%d %H:%M:%S")
                if max_date <= over_time:
                    # data_type = data.find('p', attrs={'class': 'bd_i_tags'}).get_text().strip().replace("\n", "")
                    # news_type = data_type.replace(' ', '')
                    news_type = ''
                    try:
                        message = data.find('p', attrs={'class': 'bd_i_txt_c'}).get_text()
                        mes = re.sub(r"http(.*)", '', message)  # strip trailing links
                        news = re.sub(r'\s$', '', mes)
                    except Exception as e:
                        logger.error(e)
                        continue  # skip this item; news is undefined on failure
                    sql_params = [over_time, spider_data, news_source, news_type, news]
                    logger.debug(sql_cj)
                    logger.debug(sql_params)
                    execute_sql(conn, sql_cj, sql_params)
                else:
                    return  # reached items already stored
        logger.debug('end %s ' % func_name)
        sys.exit()  # terminate the process after a full pass
    except Exception as e:
        msg = func_name + ' failed: ' + str(e)
        logger.error(msg)
    finally:
        if driver:
            driver.quit()
        if xvfb:
            xvfb.stop()
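Both scrapers compare timestamps as plain strings (max_date > over_time, max_date <= over_time). That works only because every value is zero-padded "%Y-%m-%d %H:%M:%S", for which lexicographic order matches chronological order:

# Zero-padded ISO-style timestamps sort the same as the instants they encode.
assert '2024-01-09 23:59:59' < '2024-01-10 00:00:00'
assert max('2024-01-09 23:00', '2024-01-10 01:00') == '2024-01-10 01:00'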
Example #9
def sina(conn, ips):
    logger.debug('collecting Sina Weibo comments... sina')
    ip = random.choice(ips)  # pick a proxy IP
    uid = '4193705642468999'  # id of the target post
    url = 'https://m.weibo.cn/single/rcList?format=cards&id=' + uid + '&type=comment&hot=0&page={}'
    i = 200  # resume from page 201
    comment_num = 1  # running comment counter
    try:
        for i in range(i + 1, 67000):
            ip = random.choice(ips)
            proxies = {'http': ip}
            headers = {
                "Accept": "application/json, text/javascript, */*; q=0.01",
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "zh-CN,zh;q=0.9",
                "Connection": "keep-alive",
                "Cookie": "你的cookie",
                "Host": "m.weibo.cn",
                "Referer": "https://m.weibo.cn/status/" + uid,
                "User-Agent": random.choice(ua_list),
                "X-Requested-With": "XMLHttpRequest",
            }
            logger.debug(proxies)
            try:
                logger.debug(url.format(i))
                res = requests.get(url=url.format(i),
                                   headers=headers,
                                   proxies=proxies)
                if res.status_code == 200:
                    # parse the JSON body only after confirming the request succeeded
                    r = res.json()
                    content = r[0]['card_group']
                    logger.debug('scraping comment page %s' % i)
                    for j in range(0, len(content)):
                        logger.debug('comment #%s' % comment_num)
                        hot_data = content[j]
                        comment_id = hot_data['user']['id']  # user id
                        user_name = hot_data['user']['screen_name']  # screen name
                        created_at = hot_data['created_at']  # comment time
                        # comment text with HTML tags, reply prefixes and emoji stripped
                        comment = re.sub(
                            '<.*?>|回复<.*?>:|[\U00010000-\U0010ffff]|[\uD800-\uDBFF][\uDC00-\uDFFF]',
                            '', hot_data['text'])
                        like_counts = hot_data['like_counts']  # like count
                        sql_params = [
                            comment_id, user_name, created_at, comment,
                            like_counts
                        ]
                        logger.debug(sql_params)
                        execute_sql(conn, sql, sql_params)
                        comment_num += 1
                    time.sleep(random.randint(2, 5))
            except requests.exceptions.ConnectionError:
                logger.debug('ConnectionError')
                if not ips:
                    logger.debug('no usable proxy IPs left')
                    sys.exit()
                # drop the dead proxy IP
                if ip in ips:
                    ips.remove(ip)
    except Exception as e:
        logger.error(e)
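sql is again a module-level statement. From the parameter order [comment_id, user_name, created_at, comment, like_counts] it is presumably an insert of this shape (table and column names are guesses):

# Hypothetical INSERT matching the parameter order above; names are illustrative.
sql = """
    INSERT INTO weibo_comment(comment_id, user_name, created_at, comment, like_counts)
    VALUES(%s, %s, %s, %s, %s)
"""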