Example #1
    def process_item(self, item, spider):
        self.id = getattr(spider, "aid")
        self.userinfo = UserInfo.from_json(self.r.get(self.id))
        self.table_name = self.userinfo['table_name']
        # resp = requests.get("http://localhost:9998/crawling?append=%s" % self.id)
        # self.flag = resp.text
        # print('the flag:', self.flag)
        # print('type:',type(self.flag))
        # if self.flag:
        #     self.flag = str(self.flag,encoding='utf-8')
        self.mid = self.userinfo['if_store_data']
        self.flag = str(self.mid)
        print("My pipe flag is:", self.flag)

        if 'yes' not in self.flag:
            print('The user has not provided input yet')
            self.preitem.append(item)

        if 'yes' in self.flag:
            if self.preitem is not None:
                self.storeData()
                self.preitem = None

            final_list = []
            data = dict(item)
            v_list_length = len(list(data.values())[0])
            for i in range(v_list_length):
                final_list.append({})
            keys = ','.join(data.keys())
            values = ','.join(['%s'] * len(data))

            for i in range(v_list_length):
                for key in data.keys():
                    final_list[i][key] = data[key][i]
            print("final_list22222", final_list)
            sql = 'insert into %s(%s) values(%s)' % (self.table_name, keys,
                                                     values)
            # params = []
            # values = []
            # for k, v in item.fields.items():
            #     params.append('%s')
            #     value = item._values[k]
            #     values.append(value)
            # sql = 'insert into %s(%s) values(%s)' % (self.curType, ','.join(item.fields.keys()), ','.join(params))

            try:
                for sub_dict in final_list:
                    self.cursor.execute(sql, tuple(sub_dict.values()))
            except Exception as error:
                print('sql error .................................')
                print('error:', error)
            return item
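The pipeline in Example #1 reshapes an item whose field values are parallel lists into one dict per row, then inserts each row with a separate parameterized execute. The following is only a sketch of the same idea, assuming a plain DB-API cursor such as the pymysql cursor from Example #2; the function name and the data argument are illustrative, not part of the original code:

def insert_parallel_lists(cursor, table_name, data):
    """data maps column name -> list of values; every list has the same length."""
    columns = list(data.keys())
    # zip the parallel lists into row tuples: {'a': [1, 2], 'b': [3, 4]} -> [(1, 3), (2, 4)]
    rows = list(zip(*(data[c] for c in columns)))
    sql = 'insert into %s(%s) values(%s)' % (
        table_name, ','.join(columns), ','.join(['%s'] * len(columns)))
    # one parameterized statement per row, equivalent to the execute loop above
    cursor.executemany(sql, rows)

As in the original, the table and column names are interpolated with %, not parameterized, so they must come from trusted configuration.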
Example #2
    def __init__(self, id='', **kwargs):
        super(conSpider, self).__init__(**kwargs)
        self.conn = pymysql.connect(host=HOST,
                                    user=USER,
                                    passwd=PASSWD,
                                    db=DB,
                                    charset='utf8',
                                    port=PORT)
        self.cursor = self.conn.cursor()
        self.aid = id
        print("aid:", self.aid)
        self.r = redis.StrictRedis(host=RHOST, port=RPORT)
        self.userinfo = UserInfo.from_json(self.r.get(self.aid))
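Every example treats UserInfo as a dict-like state object that round-trips through Redis as JSON (UserInfo.from_json(...), userinfo['table_name'], userinfo.to_json()). The class itself is not shown in these examples; the following is only a sketch of what such a wrapper could look like, not the project's actual implementation:

import json

class UserInfo:
    """Dict-backed crawl state that serializes to and from JSON for Redis."""

    def __init__(self, data=None):
        self._data = dict(data or {})

    def __getitem__(self, key):
        return self._data[key]

    def __setitem__(self, key, value):
        self._data[key] = value

    def to_json(self):
        return json.dumps(self._data)

    @classmethod
    def from_json(cls, raw):
        # redis-py returns bytes; decode before parsing
        if isinstance(raw, bytes):
            raw = raw.decode('utf-8')
        return cls(json.loads(raw))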
Example #3
    def close(spider, reason):
        logging.info('spider closed')
        userinfo = UserInfo.from_json(spider.r.get(spider.aid))
        # closespider takes time to shut down, so distinguish a natural close from a forced close
        if not spider.click_button_flag:
            userinfo['spider_state'] = 'nature_close'
            userinfo['error_msg'] = 'The spider has been closed'
        else:
            userinfo['spider_state'] = 'close'
        # userinfo['spider_state'] = 'close'

        spider.r.set(spider.aid, userinfo.to_json())
        logging.info('aha,the spider closed')
Example #4
    def __init__(self, id='', **kwargs):
        super(ToolSpider, self).__init__(**kwargs)
        self.conn = pymysql.connect(host=HOST,
                                    user=USER,
                                    passwd=PASSWD,
                                    db=DB,
                                    charset='utf8',
                                    port=PORT)
        self.r = redis.StrictRedis(host=RHOST, port=RPORT)
        self.cursor = self.conn.cursor()
        self.aid = id
        self.click_button_flag = False
        self.bloom = BloomFilter(max_elements=100000, error_rate=0.05)
        self.userinfo = UserInfo.from_json(self.r.get(self.aid))
        self._getWebsitesInDB()
Example #5
    def process_item(self, item, spider):
        self.id = getattr(spider, "aid")
        self.userinfo = UserInfo.from_json(self.r.get(self.id))
        print('userinfo in redis pipeline:', self.userinfo)
        logging.debug('pipelines: start to store items in Redis')
        data = dict(item)
        # if not self.key:
        #     for k in data.keys():
        #         k = k + self.id
        #         self.key.append(k)

        for k, v in data.items():
            data_stored = self.userinfo['crawling_result']
            data_ = data_stored.get(k, [])
            data_.append(v)
            data_stored[k] = data_
        self.r.set(self.id, self.userinfo.to_json())
        return item
Example #6
def group(r, id):
    """
    合成redis数据为[[,],]
    :return:
    """
    # id = str(random.randint(0, 10000))
    # print(id, type(id))
    if r.get(id):
        userinfo = UserInfo.from_json(r.get(id))
    else:
        # return None,None,None
        return None, None, 'in progress'
    error_msg = userinfo['error_msg']
    # print('userinfo_in_group:', userinfo['crawling_result'])
    # print('userinfo_in_group:',userinfo['if_store_data'])
    key = []  # column names, indexed by field name
    th = None
    get_th = "select COLUMN_NAME from information_schema.COLUMNS where table_name = '{}'".format(userinfo['table_name'])
    connect = con()
    with connect:
        cur = connect.cursor()
        cur.execute(get_th)
        th = cur.fetchall()
    for seg in th:
        key.append(seg[0])
    totalNo = 0
    keyExists = np.zeros((len(key), 1))
    for i in np.arange(len(key)):
        t = userinfo['crawling_result'].get(key[i], None)
        if t:
            keyExists[i] = 1
            totalNo += 1
    if totalNo == 0:
        return None, None, error_msg
    else:
        index = np.nonzero(keyExists)[0]
        key_defined = np.array(key)[index]
        # print('key_defined:',key_defined)
        # dis = [list(map(lambda x: x.decode('utf-8'), userinfo['crawling_result'].get(k))) for k in key_defined]
        # dis = [userinfo['crawling_result'].get(k) for k in key_defined]
        dis = [list(map(lambda x: str(x), userinfo['crawling_result'].get(k))) for k in key_defined]
        return key_defined.tolist(), dis, error_msg
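group() returns the column names that actually occur in the crawl result, the values as one list per column, and an error message. To get the [[,],] row shape the docstring mentions, the column lists can be zipped together; a hypothetical caller (r and id come from the surrounding application, as in Example #8):

keys, columns, error_msg = group(r, id)
if keys is None:
    print(error_msg)
else:
    rows = list(zip(*columns))  # transpose the column lists into row tuples
    for row in rows:
        print(dict(zip(keys, row)))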
Example #7
    def process_item(self, item, spider):
        logging.info('helloworld1')
        self.id = getattr(spider, "aid")
        logging.info('helloworld2')
        logging.info('the id in pipeline:{}'.format(self.id))
        logging.info('the userinfo in pipeline is :%s' %
                     len(self.r.get(self.id)))
        self.userinfo = UserInfo.from_json(self.r.get(self.id))
        self.table_name = self.userinfo['table_name']
        self.mid = self.userinfo['if_store_data']
        self.flag = str(self.mid)
        logging.info('the flag in pipelines:%s' % self.flag)
        if 'yes' not in self.flag:
            logging.debug('The user has not clicked the save button yet')
            self.preitem.append(item)
        if 'yes' in self.flag:
            if self.preitem is not None:
                logging.debug('first restore data from redis to database')
                self.storeData()
                self.preitem = None
            logging.debug('pipelines: start to process items')
            params = []
            values = []
            for k, v in item.fields.items():
                params.append('%s')
                value = item._values[k]
                values.append(value)

            sql = 'insert into %s(%s) values(%s)' % (self.table_name, ','.join(
                item.fields.keys()), ','.join(params))

            try:
                # print('pipeline_tuple:%s'%tuple(values))
                # self.cursor.execute(sql,(','.join(values)))
                print(values)
                self.cursor.execute(sql, tuple(values))
            except Exception as error:
                print('sql error .................................')
                print('error:', error)
            return item
Example #8
def crawling():
    # r = redis.Redis(host=RHOST, port=RPORT)
    # whether to store the results
    is_save = request.form.get('save')
    # user id (a random number)
    id = request.form.get('append')
    userinfo = UserInfo.from_json(r.get(id))

    if is_save:
        userinfo['if_store_data'] = is_save
        r.set(id, userinfo.to_json())
    if not is_save:
        is_save = userinfo['if_store_data']
    logging.info('User choice on whether to store the crawl results: %s' % is_save)
    if is_save:
        if "sw" in str(is_save):
            logging.info(is_save)

            userinfo['crawling_result'] = {}
            userinfo['spider_state'] = 'open'
            userinfo['if_store_data'] = 'start'

            # userinfo['crawling_result'] = {}
            # if userinfo['spider_state'] == 'close':
            #     userinfo['if_store_data'] = 'start'
            # else:userinfo['if_store_data'] = 'nosw'
            # wait for the spider to close and call the close() method
            # time.sleep(10)
            # userinfo['spider_state'] = 'open'

            userinfo['error_msg'] = 'Crawling in progress'
            r.set(id, userinfo.to_json())
            time.sleep(2)
            logging.info('userinfo in crawling:%s'% UserInfo.from_json(r.get(id))['crawling_result'])
            po = Pool(10)
            logging.info('before')
            logging.info('after:%s'%UserInfo.from_json(r.get(id))['spider_state'])
            time.sleep(5)
            po.apply_async(scrapyprocess, (id, "selenium",))

        if 'no' == str(is_save):
            userinfo['crawling_result'] = {}
            userinfo['spider_state'] = 'close'
            userinfo['if_store_data'] = 'no'
            userinfo['error_msg'] = 'The spider has been closed'
            r.set(id, userinfo.to_json())

    # handles the case where the spider has finished but the save button was not clicked in time
    userinfo = UserInfo.from_json(r.get(id))
    print('userinfo_in_crawling:', userinfo)
    if userinfo['spider_state'] == 'nature_close' and 'yes' in str(is_save):
        logging.debug('closed in views')
        storeData(userinfo)
        logging.info('Delete the cache after the data has been saved')
        logging.info('the spider actually closed')
        r.delete(id)
    if userinfo['spider_state'] == 'nature_close' and str(is_save) == 'no':
        logging.info('Delete the cache after the user clicked the "stop and discard" button')
        logging.info('the spider actually closed (2)')
        r.delete(id)
    return redirect(url_for('get_results', id=id))
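The view above and the pipelines repeat the same Redis round trip: get the JSON blob, wrap it in UserInfo, mutate a few fields, and write it back. A small helper capturing that pattern; the name and signature are illustrative, not part of the original project:

def update_userinfo(r, user_id, **changes):
    """Read-modify-write the UserInfo blob stored under user_id in Redis."""
    userinfo = UserInfo.from_json(r.get(user_id))
    for field, value in changes.items():
        userinfo[field] = value
    r.set(user_id, userinfo.to_json())
    return userinfo

# e.g. update_userinfo(r, id, spider_state='close', error_msg='The spider has been closed')

Like the original code, this read-modify-write is not atomic, so concurrent writers (the Flask view and the Scrapy pipeline) can overwrite each other's updates.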
Example #9
def scrapy():
    form = SubForm()
    default_form = DefaultForm()
    if not (form.validate_on_submit() or default_form.validate_on_submit()):
        return redirect(url_for('root'))

    result = 0
    default_crawl_flag = 0

    way = request.form.get("cus")
    table_name = request.form.get('table_name')
    select = request.form.get("option")
    page_num = request.form.get("page")
    selenium_page = request.form.get("selenium_num")
    url_list_xpath = request.form.get("list_xpath")
    append_idt = request.form.get("append")
    input_xpath = request.form.get("input_xpath")
    website = form.website.data

    # mode: 1. Tool (quick three-layer mode)  2. content (slow two-layer mode)  3. selenium (slow three-layer mode)
    model = "Tool"
    if select == "two":
        model = "content"

    if way == "nodefault":
        fields_xpath_list = []
        for i in range(33):
            tb = "tb" + str(i)
            fields_xpath_list.append(request.form.get(tb))
        for list in fields_xpath_list:
            result = result or list
        if not result:
            return "未填写class或id属性"
    else:
        default_website = request.form.get('website_url')
        website = default_website
        default_crawl_flag = 1

    user = UserInfo()

    user['user_id'] = append_idt
    user['website'] = website
    user['table_name'] = table_name
    user['default_crawl_flag'] = default_crawl_flag
    user['two_or_three_mode'] = model
    user['fields_xpath_list'] = fields_xpath_list
    # page number input, used only for two-layer crawling
    user['page_num'] = page_num
    # whether to store the data
    user['if_store_data'] = 'start'
    user['error_msg'] = 'Crawling in progress'
    user['url_list_xpath'] = url_list_xpath
    user['page_num_xpath_list'] = input_xpath
    # number of pages after switching from the default fast crawl mode to slow crawling in three-layer mode
    user['selenium_page'] = selenium_page
    # stores the crawl results
    user['crawling_result'] = {}
    # the default spider state
    user['spider_state'] = 'open'
    # r = redis.Redis(host=RHOST, port=RPORT)
    r.set(user['user_id'], user.to_json())
    print('user:', user)
    po.apply_async(scrapyprocess, (user['user_id'], user['two_or_three_mode'],))
    return render_template("middle.html", append_idt=append_idt)