Пример #1
0
def spider_basic(user_id):
    '''
    Crawl the basic live-room data of one Taobao Live broadcaster and store it.

    The room page of user_id is fetched to extract the current liveId, then
    the goods list (get_goods_list) and the broadcaster info (get_info) are
    fetched, merged into one dict, completed with crawl metadata and inserted
    into LIVE_BASIC_TABLE.

    user_id: broadcaster account id; it is substituted into the request URLs
        and stored under the "zhubo_id" key.

    Returns None. When the room page contains no liveId the problem is logged
    and nothing is inserted, so one unparsable page does not raise into the
    caller.
    '''
    url_info = "https://taobaolive.taobao.com/api/broadcaster_info/1.0?accountId={}".format(
        user_id)
    url_goods_list = "https://taobaolive.taobao.com/api/item_list/1.0?type=0&liveId="
    url_live = "https://taobaolive.taobao.com/room/index.htm?userId={}".format(
        user_id)

    # The Referer is the room page itself, i.e. the same URL as url_live.
    SESSION.headers.update({
        "User-Agent": random.choice(USER_AGENTS),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Connection': 'keep-alive',
        'Accept-Encoding': 'gzip, deflate',
        "Referer": url_live,
    })

    response_text = session_get(url_live).text

    # re.search returns None when the pattern is absent; calling .group() on
    # it would raise an AttributeError that says nothing about the cause.
    live_id_match = re.search(r'liveId":(\d+)', response_text)
    if live_id_match is None:
        logging.error("No liveId found on %s, skip user %s", url_live, user_id)
        return
    live_id = live_id_match.group(1)

    goods_json_dict = get_goods_list(url_goods_list, live_id)
    info_json_dict = get_info(url_info)

    # Keys from info_json_dict win over equal keys from goods_json_dict.
    live_basic_info = dict(goods_json_dict, **info_json_dict)
    live_basic_info["zhubo_id"] = user_id
    live_basic_info["live_id"] = live_id
    # The page reports liveId 0 when the broadcaster is not live right now.
    live_basic_info["is_live"] = 0 if live_id == "0" else 1
    live_basic_info["crawl_time"] = datetime.now()
    live_basic_info["live_url"] = url_live

    # Create the MYSQL helper only once there is a record to store, and log
    # success only after the insert has actually gone through.
    MYSQL().insert_into_table(live_basic_info, LIVE_BASIC_TABLE)
    logging.info("Spider one item into {}".format(LIVE_BASIC_TABLE))
Пример #2
0
def insert_to_db(results, which_table):
    '''
    Insert every item of results into which_table, one item at a time.

    results: iterable of items handed to MYSQL.insert_into_table
    which_table: name of the destination table

    A failure on one item is logged (first the item, then the exception)
    and the loop carries on with the next item. Nothing is returned.
    '''
    db = MYSQL()
    for item in results:
        try:
            db.insert_into_table(item, which_table)
            logging.info("spider one item into {}".format(which_table))
        except Exception as error:
            logging.error(str(item))
            logging.error(error)
Пример #3
0
class TASK_OBJECT(object):
    '''
    One crawl step: select rows from a table, run a module over one field of
    every row in a thread pool, and optionally store what the module returns.

    This class has 8 parameters.
    from_table: the table the rows are selected from
    from_table_condition: condition passed to select_from_table together
        with from_table
    need_to_update: when truthy, every selected row is replaced by the value
        get_update_state(row) returns before it is used
    which_module: the callable that is mapped over the extracted values
    into_table: the table the module's results are inserted into
    need_to_return: when truthy, the module's results are collected and
        written to into_table; otherwise they are discarded
    which_need_in_row: key (or index) used to take the module's argument out
        of each row
    update_into_table: when truthy, results are written with
        insert_into_table_with_replace instead of insert_into_table
    '''
    def __init__(self, from_table, from_table_condition, need_to_update,
                 which_module, into_table, need_to_return, which_need_in_row,
                 update_into_table):
        super(TASK_OBJECT, self).__init__()

        self.MYSQL_CONN = MYSQL()
        self.from_table = from_table
        self.from_table_condition = from_table_condition
        self.need_to_update = need_to_update
        self.which_module = which_module
        self.into_table = into_table
        self.need_to_return = need_to_return
        self.which_need_in_row = which_need_in_row
        self.update_into_table = update_into_table

    def get_rows(self):
        '''
        Yield the rows selected from from_table, skipping falsy ones.

        With need_to_update set, each row is first replaced by the result of
        get_update_state(row); a falsy result drops the row.
        '''
        rows = self.MYSQL_CONN.select_from_table(self.from_table,
                                                 self.from_table_condition)

        for row in rows:
            if self.need_to_update:
                row = get_update_state(row)
            if row:
                yield row

    def multiprocess_task(self, new_list):
        '''
        Map which_module over new_list in a pool of THREAD_NUM threads.

        Returns the list of results when need_to_return is truthy, otherwise
        None. An exception raised by a worker propagates to the caller.
        '''
        pool = ThreadPool(THREAD_NUM)
        try:
            results = pool.map(self.which_module, new_list)
        finally:
            # Release the worker threads even when a task raised; without
            # this every failing batch would leave a live pool behind.
            pool.close()
            pool.join()
        if self.need_to_return:
            return results
        return None

    def insert_to_db(self, results):
        '''
        Write every item of results to into_table.

        A failing item is logged (the item, then the exception) and the
        remaining items are still processed.
        '''
        # The write mode does not change between items, so choose it once.
        if self.update_into_table:
            insert = self.MYSQL_CONN.insert_into_table_with_replace
        else:
            insert = self.MYSQL_CONN.insert_into_table

        for each_result in results:
            try:
                insert(each_result, self.into_table)
            except Exception as e:
                logging.error(str(each_result))
                logging.error(e)

    def _run_batch(self, batch):
        '''Run which_module over one batch and store the results if wanted.'''
        results = self.multiprocess_task(batch)
        if self.need_to_return:
            self.insert_to_db(results)

    def task_main(self):
        '''
        Process all rows from get_rows() in batches of THREAD_NUM values.

        The value at which_need_in_row is taken from every row; a batch is
        run as soon as it holds THREAD_NUM values and the remainder is run
        once the rows are exhausted.
        '''
        batch = []

        for row in self.get_rows():
            batch.append(row[self.which_need_in_row])
            if len(batch) >= THREAD_NUM:
                self._run_batch(batch)
                batch = []

        # Fewer than THREAD_NUM values may be left over at the end.
        if batch:
            self._run_batch(batch)