Пример #1
0
def main():
    """Refresh the ``news_cj`` table for both news sources.

    Steps, all on one connection inside a single ``with conn:`` scope:

    1. Read the newest stored ``news_date`` for each source, falling back
       to "24 hours ago" (minute resolution) when a source has no rows.
    2. Delete rows whose ``news_date`` is at or before "10 days ago", plus
       the rows sitting exactly on each source's newest timestamp
       (presumably so the re-fetch below does not duplicate them -- confirm
       against ``get_news`` / ``get_sina_news``).
    3. Call ``get_news`` and ``get_sina_news`` with that newest timestamp
       and the current time.

    Any ``Exception`` is logged with its traceback and swallowed. The
    connection is always closed. If a connection was obtained, the process
    then terminates through ``sys.exit()``; if ``get_conn`` failed, the
    function simply returns.
    """
    now = datetime.datetime.now()
    current_time = now.strftime("%Y-%m-%d %H:%M:%S")
    # Cut-off timestamps are kept at minute resolution ("YYYY-MM-DD HH:MM").
    minute_fmt = "%Y-%m-%d %H:%M"
    one_day_ago = (now - datetime.timedelta(days=1)).strftime(minute_fmt)
    history_day = (now - datetime.timedelta(days=10)).strftime(minute_fmt)
    conn = None
    try:
        conn = get_conn()
        # NOTE(review): ``with conn`` looks like a DB-API/psycopg2-style
        # transaction scope that does not close the connection -- hence the
        # explicit close() in ``finally``. Confirm against get_conn().
        with conn:
            sql_max_date = """
                SELECT max(CASE WHEN news_source='新浪财经' THEN news_date END),
                    max(CASE WHEN news_source='华尔街见闻' THEN news_date END)
                FROM news_cj
            """
            res = execute_select(conn, sql_max_date)
            # Column 0 is the first source in the query, column 1 the second;
            # an empty/NULL maximum falls back to one day ago.
            max_date_sina = res[0][0] or one_day_ago
            max_date_news = res[0][1] or one_day_ago
            sql_delete = """
                DELETE FROM news_cj
                WHERE news_date <= %s  
                    OR (news_source='华尔街见闻' AND news_date=%s) OR (news_source='新浪财经' AND news_date=%s)
            """
            execute_sql(conn, sql_delete, (history_day, max_date_news, max_date_sina))
            get_news(conn, max_date_news, current_time)
            get_sina_news(conn, max_date_sina, current_time)
    except Exception as e:
        # Same message text as before, but keep the exception type and
        # traceback so failures can actually be diagnosed.
        logger.exception("%s", e)
    finally:
        if conn:
            conn.close()
    # Exit only after cleanup, and outside ``finally``: raising SystemExit
    # inside ``finally`` would replace any exception still propagating
    # (e.g. KeyboardInterrupt) with a silent, successful-looking exit.
    if conn:
        sys.exit()
Пример #2
0
def main():
    """Run ``get_code`` and then ``get_rate`` on a fresh connection.

    ``get_code`` runs first, before any connection is opened. The
    connection returned by ``get_conn`` is used inside a ``with`` scope for
    ``get_rate`` and is closed afterwards whether or not ``get_rate``
    raised. Exceptions are not caught here; they propagate to the caller.
    """
    get_code()
    db = get_conn()
    try:
        with db:
            get_rate(db)
    finally:
        # Close explicitly: the ``with`` scope alone is not relied on to
        # release the connection.
        if db:
            db.close()
Пример #3
0
def main():
    """Gather proxy IPs and hand them to ``sina`` on one connection.

    ``get_ip`` is called six times and its results are concatenated into a
    single list, which is then passed to ``sina`` together with the open
    connection. The connection is closed afterwards even if something
    raised. Exceptions are not caught here.
    """
    conn = get_conn()
    try:
        with conn:
            ips = []
            # Fetch fresh proxy IPs once per 1000 steps of a 6000-step
            # schedule (six fetches in total); each fetch can return the
            # latest 100 proxy IPs.
            # NOTE(review): ``sina`` runs once, after all fetches. If it was
            # meant to run inside the schedule, that is a separate change.
            for _ in range(0, 6000, 1000):
                ips.extend(get_ip())
            sina(conn, ips)
    finally:
        if conn:
            conn.close()
Пример #4
0
def deal():
    """Purge old data, load the search inputs and run the keyword search.

    Relies on a module-level ``now`` datetime (it is not defined in this
    function). The flow is:

    1. ``remv_his_log()`` is called before any connection is opened.
    2. Two cut-off timestamps are derived from ``now``: 20 days back
       (``last_mon``) and 60 days back (``str_his_dtm``).
    3. Inside one ``with conn:`` scope: ``del_his_info`` is called with the
       60-day cut-off, then keywords, site URLs and Baijiahao numbers are
       read from the ``infos`` schema and passed to ``key_word_srch``
       together with the 20-day cut-off.

    Any ``Exception`` raised after step 2 is logged with its traceback and
    swallowed; the connection is always closed.
    """
    remv_his_log()
    ts_fmt = "%Y-%m-%d %H:%M:%S"
    # Despite its name, ``last_mon`` is 20 days back, not a calendar month.
    last_mon = (now - datetime.timedelta(days=20)).strftime(ts_fmt)
    str_his_dtm = (now - datetime.timedelta(days=60)).strftime(ts_fmt)
    # Lazy %-args: the message is only formatted if DEBUG is enabled.
    logger.debug('last_mon:%s, str_his_dtm:%s ', last_mon, str_his_dtm)
    conn = None
    try:
        conn = get_conn()
        with conn:
            del_his_info(conn, str_his_dtm)

            # Look up the keywords (first column of each row).
            sql_key_word = " SELECT word FROM infos.hot_keyword "
            key_word_list = [row[0] for row in execute_select(conn, sql_key_word)]

            # Look up the site URLs; ``url_nm`` is selected but not used here.
            sql_url_link = " SELECT url, url_nm FROM infos.url_link "
            url_list = [row[0] for row in execute_select(conn, sql_url_link)]

            # Look up the related Baijiahao numbers.
            # NOTE(review): unlike the two lists above, whole result rows are
            # kept here rather than ``row[0]``. Left as-is because
            # ``key_word_srch`` may depend on it -- confirm and unify.
            sql_bj_num = " SELECT bj_num FROM infos.url_link WHERE bj_num>'' "
            bj_num_list = list(execute_select(conn, sql_bj_num))

            key_word_srch(conn, last_mon, key_word_list, url_list, bj_num_list)
    except Exception as e:
        # Same message text as before, plus exception type and traceback;
        # str(e) alone can be empty and hides where the failure happened.
        logger.exception("%s", e)
    finally:
        if conn:
            conn.close()
Пример #5
0
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import logging.config

from config import logger_path
from common.pgutils import get_conn, execute_select, execute_sql

# Configure the logging system from the config file located at ``logger_path``
# (imported from ``config``), then grab the logger registered as "root".
logging.config.fileConfig(logger_path)
logger = logging.getLogger("root")
# Connection created once, at import time, and shared module-wide;
# ``crawlerDouban.spider_Item`` uses it through ``with conn:``.
# NOTE(review): this makes importing the module depend on ``get_conn()``
# succeeding, and nothing visible here closes the connection -- confirm.
conn = get_conn()


class DoubanQrsPipeline(object):
    """Pass-through item pipeline: items leave exactly as they arrived."""

    def process_item(self, item, spider):
        """Return ``item`` unchanged.

        :param item: the item handed to the pipeline.
        :param spider: accepted for the pipeline interface; not used.
        :return: the very same ``item`` object.
        """
        return item


class crawlerDouban(object):
    def spider_Item(self, item, spider):
        try:
            with conn:
                sql_repeat = """
                    select * from public.db_movie where user_name=%s
                """
                print('piplines')
                res = execute_select(conn, sql_repeat, item['user_name'])
                if not res[0]: