示例#1
0
 def buckets2db(self, bucketsize=100, spider_name=''):
     """Write every sufficiently full bucket to its SQL table.

     For each table whose bucket holds at least ``bucketsize`` items, every
     item is projected onto the table's configured columns (a missing field
     takes the configured column default and every value is passed through
     ``str``), tagged with ``keyid``, ``bizdate``, ``ctime`` and ``spider``,
     and the batch is appended via ``DataFrame.to_sql``.  The bucket is
     emptied only after the write succeeds; on failure the error is logged
     and the bucket is left untouched.

     :param bucketsize: minimum number of buffered items that triggers a write
     :param spider_name: value stored in the ``spider`` column
     :return: None
     """
     for tablename, items in self.buckets_map.items():
         if len(items) < bucketsize:
             continue  # bucket has not reached the threshold yet

         cols, col_default, col_type = self.table_cols_map.get(tablename)
         rows = []
         for item in items:
             row = {'keyid': rowkey()}
             row.update(
                 (field, str(item.get(field, col_default.get(field))))
                 for field in cols)
             # Bookkeeping columns that do not come from the scraped item.
             row.update(
                 bizdate=self.bizdate,
                 ctime=time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                 spider=spider_name,
             )
             rows.append(row)

         try:
             frame = pd.DataFrame(rows)
             frame.to_sql(tablename,
                          con=self.engine,
                          index=False,
                          if_exists='append',
                          dtype=col_type)
             logger.info(f"入库成功 <= 表名:{tablename} 记录数:{len(items)}")
             items.clear()  # drop the batch only once it has been stored
         except Exception as e:
             logger.error(f"入库失败 <= 表名:{tablename} 错误原因:{e}")
示例#2
0
 def buckets2db(self, bucketsize=100, spider_name=''):
     """Write every sufficiently full bucket to its SQL table.

     For each table whose bucket holds at least ``bucketsize`` items, every
     item is copied as-is into a row that also carries ``keyid``,
     ``bizdate``, ``ctime`` and ``spider``.  ``self.get_struct`` is called
     with the first row and ``self.col_type``; its two results are used as
     the ``dtype`` mapping for ``to_sql`` and as the DataFrame column list.
     The bucket is emptied only after the write succeeds; on failure the
     error is logged and the bucket is left untouched.

     :param bucketsize: minimum number of buffered items that triggers a write
     :param spider_name: value stored in the ``spider`` column
     :return: None
     """
     for tablename, items in self.buckets_map.items():
         if len(items) < bucketsize:
             continue  # bucket has not reached the threshold yet

         rows = []
         for item in items:
             row = {'keyid': rowkey()}
             row.update(item.items())
             # Bookkeeping columns that do not come from the scraped item.
             row.update(
                 bizdate=self.bizdate,
                 ctime=time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                 spider=spider_name,
             )
             rows.append(row)

         try:
             dtypedict, columns = self.get_struct(rows[0], self.col_type)
             frame = pd.DataFrame(rows, columns=columns)
             frame.to_sql(tablename,
                          con=self.engine,
                          index=False,
                          if_exists='append',
                          dtype=dtypedict)
             logger.info(f"入库成功 <= 表名:{tablename} 记录数:{len(items)}")
             items.clear()  # drop the batch only once it has been stored
         except Exception as e:
             logger.error(f"入库失败 <= 表名:{tablename} 错误原因:{e}")
示例#3
0
 def buckets2db(self, bucketsize=100, spider_name=''):
     """Write every sufficiently full bucket through a batched table write.

     For each table whose bucket holds at least ``bucketsize`` items, a
     connection is obtained from ``self.get_connect()``, every item is
     turned into one ``put`` on a batch of that table (each key prefixed
     with ``cf:``, plus ``cf:bizdate``, ``cf:ctime`` and ``cf:spider``), and
     the batch is sent.  The bucket is emptied only after ``send`` succeeds;
     a failed ``send`` is logged and the bucket is left untouched.

     The connection is closed in all cases, including when building the
     batch raises (such an exception still propagates to the caller).

     :param bucketsize: minimum number of buffered items that triggers a write
     :param spider_name: value stored in the ``cf:spider`` column
     :return: None
     """
     for tablename, items in self.buckets_map.items():
         if len(items) < bucketsize:
             continue  # bucket has not reached the threshold yet

         connection = self.get_connect()
         try:
             # Everything that uses the connection sits inside this
             # try/finally so that an error while building the batch can no
             # longer leave the connection open.
             bat = connection.table(tablename).batch()
             for item in items:
                 keyid = rowkey()
                 values = {'cf:' + key: value for key, value in item.items()}
                 # Bookkeeping columns that do not come from the scraped item.
                 values['cf:bizdate'] = self.bizdate
                 values['cf:ctime'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                                    time.localtime())
                 values['cf:spider'] = spider_name
                 bat.put(keyid, values)  # queue the row in the batch

             try:
                 bat.send()  # flush the whole batch
                 logger.info(f"入库成功 <= 表名:{tablename} 记录数:{len(items)}")
                 items.clear()  # drop the batch only once it has been stored
             except Exception as e:
                 logger.error(f"入库失败 <= 表名:{tablename} 错误原因:{e}")
         finally:
             connection.close()
示例#4
0
 def buckets2db(self, bucketsize=100, spider_name=''):
     """Insert every sufficiently full bucket into ``self.mongodb``.

     For each table whose bucket holds at least ``bucketsize`` items, every
     item is copied as-is into a document that also carries ``_id``,
     ``bizdate``, ``ctime`` and ``spider``, and the documents are inserted
     with ``insert_many`` into ``self.mongodb[tablename]``.  The bucket is
     emptied only after the insert succeeds; on failure the error is logged
     and the bucket is left untouched.

     :param bucketsize: minimum number of buffered items that triggers a write
     :param spider_name: value stored in the ``spider`` field
     :return: None
     """
     for tablename, items in self.buckets_map.items():
         if len(items) < bucketsize:
             continue  # bucket has not reached the threshold yet

         # Key order: _id, the item's own fields, then the bookkeeping fields.
         documents = [
             {
                 '_id': rowkey(),
                 **dict(item.items()),
                 'bizdate': self.bizdate,
                 'ctime': time.strftime("%Y-%m-%d %H:%M:%S",
                                        time.localtime()),
                 'spider': spider_name,
             }
             for item in items
         ]

         try:
             self.mongodb[tablename].insert_many(documents)
             logger.info(f"入库成功 <= 表名:{tablename} 记录数:{len(items)}")
             items.clear()  # drop the batch only once it has been stored
         except Exception as e:
             logger.error(f"入库失败 <= 表名:{tablename} 错误原因:{e}")
示例#5
0
 def buckets2db(self, bucketsize=None):
     """Write every sufficiently full bucket through a batched table write.

     For each table whose bucket holds at least ``bucketsize`` items, a
     connection is obtained from ``self.get_connect()`` and every item is
     projected onto the table's configured columns (a missing field takes
     the configured column default, every value is passed through ``str``
     and stored under ``cf:<field>``), tagged with ``cf:bizdate``,
     ``cf:ctime`` and ``cf:spider``, and queued on a batch that is then
     sent.  The bucket is emptied only after ``send`` succeeds; a failed
     ``send`` is logged and the bucket is left untouched.

     The connection is closed in all cases, including when building the
     batch raises (such an exception still propagates to the caller).

     :param bucketsize: minimum number of buffered items that triggers a
         write; ``None`` means use ``self.bucketsize``
     :return: None
     """
     if bucketsize is None:
         bucketsize = self.bucketsize
     for tablename, items in self.buckets_map.items():
         if len(items) < bucketsize:
             continue  # bucket has not reached the threshold yet

         cols, col_default = self.table_cols_map.get(tablename)
         connection = self.get_connect()
         try:
             # Everything that uses the connection sits inside this
             # try/finally so that an error while building the batch can no
             # longer leave the connection open.
             bat = connection.table(tablename).batch()
             for item in items:
                 keyid = rowkey()
                 values = {
                     'cf:' + field: str(item.get(field,
                                                 col_default.get(field)))
                     for field in cols
                 }
                 # Bookkeeping columns that do not come from the scraped item.
                 values['cf:bizdate'] = self.bizdate
                 values['cf:ctime'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                                    time.localtime())
                 values['cf:spider'] = self.name
                 bat.put(keyid, values)  # queue the row in the batch

             try:
                 bat.send()  # flush the whole batch
                 logger.info(f"入库成功 <= 表名:{tablename} 记录数:{len(items)}")
                 items.clear()  # drop the batch only once it has been stored
             except Exception as e:
                 logger.error(f"入库失败 <= 表名:{tablename} 错误原因:{e}")
         finally:
             connection.close()
示例#6
0
 def buckets2db(self, bucketsize=None):
     """Insert every sufficiently full bucket into ``self.mongodb``.

     For each table whose bucket holds at least the threshold number of
     items, every item is projected onto the table's configured columns (a
     missing field takes the configured column default and every value is
     passed through ``str``), tagged with ``_id``, ``bizdate``, ``ctime``
     and ``spider``, and the documents are inserted with ``insert_many``
     into ``self.mongodb[tablename]``.  The bucket is emptied only after
     the insert succeeds; on failure the error is logged and the bucket is
     left untouched.

     :param bucketsize: minimum number of buffered items that triggers a
         write; ``None`` means use ``self.bucketsize``
     :return: None
     """
     threshold = self.bucketsize if bucketsize is None else bucketsize
     for tablename, items in self.buckets_map.items():
         if len(items) < threshold:
             continue  # bucket has not reached the threshold yet

         cols, col_default = self.table_cols_map.get(tablename)
         documents = []
         for item in items:
             document = {'_id': rowkey()}
             document.update(
                 (field, str(item.get(field, col_default.get(field))))
                 for field in cols)
             # Bookkeeping fields that do not come from the scraped item.
             document.update(
                 bizdate=self.bizdate,
                 ctime=time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                 spider=self.name,
             )
             documents.append(document)

         try:
             self.mongodb[tablename].insert_many(documents)
             logger.info(f"入库成功 <= 表名:{tablename} 记录数:{len(items)}")
             items.clear()  # drop the batch only once it has been stored
         except Exception as e:
             logger.error(f"入库失败 <= 表名:{tablename} 错误原因:{e}")
示例#7
0
    def buckets2db(self, bucketsize=None):
        """Append every sufficiently full bucket to a delimited text file.

        For each table whose bucket holds at least the threshold number of
        items, every item is projected onto the table's configured columns
        (a missing field takes the configured column default; values are
        stringified and stripped of the delimiter and of newlines), tagged
        with ``keyid``, ``bizdate``, ``ctime`` and ``spider``, and joined
        into one line.  Lines are appended to
        ``<self.dir>/<self.name>/<tablename>.<self.type>``.  When
        ``self.writeheader`` is set and that file is missing or empty, a
        header line built from the first row's keys is written first.

        If the batch append fails, each line is retried on its own and
        lines that still fail are logged and dropped.  The bucket is always
        emptied afterwards.

        :param bucketsize: minimum number of buffered items that triggers a
            write; ``None`` means use ``self.bucketsize``
        :return: None
        """
        threshold = self.bucketsize if bucketsize is None else bucketsize
        for tablename, items in self.buckets_map.items():
            if len(items) < threshold:
                continue  # bucket has not reached the threshold yet

            cols, col_default = self.table_cols_map.get(tablename)
            header = ''
            lines = []
            for item in items:
                row = {'keyid': rowkey()}
                for field in cols:
                    raw = item.get(field, col_default.get(field))
                    # Strip characters that would break the line format.
                    row[field] = str(raw).replace(self.delimiter,
                                                  '').replace('\n', '')
                # Bookkeeping columns that do not come from the scraped item.
                row['bizdate'] = self.bizdate
                row['ctime'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                             time.localtime())
                row['spider'] = self.name
                if not header:
                    header = self.delimiter.join(row.keys())
                lines.append(self.delimiter.join(row.values()))

            folder = f"{self.dir}/{self.name}"
            os.makedirs(folder, exist_ok=True)
            filename = f"{folder}/{tablename}.{self.type}"

            needs_header = self.writeheader and (
                not os.path.exists(filename)
                or os.path.getsize(filename) == 0)
            if needs_header:
                with open(filename, 'w', encoding=self.encoding) as f:
                    f.write(header + '\n')

            try:
                with open(filename, 'a', encoding=self.encoding) as f:
                    f.write('\n'.join(lines) + '\n')
                logger.info(f"保存成功 <= 文件名:{filename} 记录数:{len(items)}")
            except Exception as e:
                logger.error(f"保存失败 <= 文件名:{filename} 错误原因:{e}")
                logger.warning(
                    f"重新保存 <= 文件名:{tablename} 当前批次保存异常, 自动切换成逐行保存...")
                # Retry line by line so one bad line does not sink the batch.
                for line in lines:
                    try:
                        with open(filename, 'a',
                                  encoding=self.encoding) as f:
                            f.write(line + '\n')
                        logger.info(f"保存成功 <= 文件名:{tablename} 记录数:1")
                    except Exception as e:
                        logger.error(f"丢弃 <= 表名:{tablename} 丢弃原因:{e}")
            finally:
                items.clear()  # the bucket is emptied whatever the outcome
示例#8
0
    def buckets2db(self, bucketsize=None):
        """Bulk-index every sufficiently full bucket through ``self.ES``.

        For each table whose bucket holds at least the threshold number of
        items, every item is projected onto the table's configured columns
        (a missing field takes the configured column default), tagged with
        ``bizdate``, ``ctime`` and ``spider``, and wrapped in an ``index``
        action whose ``_index`` is ``self.name``, whose ``_type`` is the
        table name and whose ``_id`` comes from ``rowkey()``.  The actions
        are submitted with ``helpers.bulk``.

        If the bulk call fails, each action is retried on its own and
        actions that still fail are logged and dropped.  The bucket is
        always emptied afterwards.

        :param bucketsize: minimum number of buffered items that triggers a
            write; ``None`` means use ``self.bucketsize``
        :return: None
        """
        threshold = self.bucketsize if bucketsize is None else bucketsize
        for tablename, items in self.buckets_map.items():
            if len(items) < threshold:
                continue  # bucket has not reached the threshold yet

            cols, col_default = self.table_cols_map.get(tablename)
            actions = []
            for item in items:
                source = {
                    field: item.get(field, col_default.get(field))
                    for field in cols
                }
                # Bookkeeping fields that do not come from the scraped item.
                source['bizdate'] = self.bizdate
                source['ctime'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                                time.localtime())
                source['spider'] = self.name
                actions.append({
                    '_op_type': 'index',  # one of index/update/create/delete
                    '_index': self.name,
                    '_id': rowkey(),
                    '_type': tablename,
                    '_source': source,
                })

            try:
                helpers.bulk(self.ES, actions=actions)
                logger.info(
                    f"入库成功 <= 索引:{self.name}, 类型:{tablename} 记录数:{len(items)}"
                )
            except Exception as e:
                logger.error(
                    f"入库失败 <= 索引:{self.name}, 类型:{tablename} 错误原因:{e}")
                logger.warning(
                    f"重新入库 <= 表名:{tablename} 当前批次入库异常, 自动切换成逐行入库...")
                # Retry one by one so one bad action does not sink the batch.
                for action in actions:
                    try:
                        helpers.bulk(self.ES, actions=[action])
                        logger.info(
                            f"入库成功 <= 索引:{self.name}, 类型:{tablename} 记录数:1"
                        )
                    except Exception as e:
                        logger.error(
                            f"丢弃 <= 索引:{self.name}, 类型:{tablename} 丢弃原因:{e}"
                        )
            finally:
                items.clear()  # the bucket is emptied whatever the outcome
示例#9
0
    def buckets2db(self, bucketsize=None):
        """Write every sufficiently full bucket to its SQL table.

        For each table whose bucket holds at least the threshold number of
        items, every item is projected onto the table's configured columns
        (a missing field takes the configured column default and every
        value is passed through ``str``), tagged with ``keyid``,
        ``bizdate``, ``ctime`` and ``spider``, and the batch is appended via
        ``DataFrame.to_sql``.

        If the batch write fails, each row is retried on its own and rows
        that still fail are logged and dropped.  The bucket is always
        emptied afterwards.

        :param bucketsize: minimum number of buffered items that triggers a
            write; ``None`` means use ``self.bucketsize``
        :return: None
        """

        def _append(records, table, dtype):
            # Append ``records`` (a list of row dicts) to ``table``.
            frame = pd.DataFrame(records)
            frame.to_sql(table,
                         con=self.engine,
                         index=False,
                         if_exists='append',
                         dtype=dtype)

        threshold = self.bucketsize if bucketsize is None else bucketsize
        for tablename, items in self.buckets_map.items():
            if len(items) < threshold:
                continue  # bucket has not reached the threshold yet

            cols, col_default, col_type = self.table_cols_map.get(tablename)
            rows = []
            for item in items:
                row = {'keyid': rowkey()}
                row.update(
                    (field, str(item.get(field, col_default.get(field))))
                    for field in cols)
                # Bookkeeping columns that do not come from the scraped item.
                row.update(
                    bizdate=self.bizdate,
                    ctime=time.strftime("%Y-%m-%d %H:%M:%S",
                                        time.localtime()),
                    spider=self.name,
                )
                rows.append(row)

            try:
                _append(rows, tablename, col_type)
                logger.info(f"入库成功 <= 表名:{tablename} 记录数:{len(items)}")
            except Exception as e:
                logger.error(f"入库失败 <= 表名:{tablename} 错误原因:{e}")
                logger.warning(
                    f"重新入库 <= 表名:{tablename} 当前批次入库异常, 自动切换成逐行入库...")
                # Retry row by row so one bad row does not sink the batch.
                for row in rows:
                    try:
                        _append([row], tablename, col_type)
                        logger.info(f"入库成功 <= 表名:{tablename} 记录数:1")
                    except Exception as e:
                        logger.error(f"丢弃 <= 表名:{tablename} 丢弃原因:{e}")
            finally:
                items.clear()  # the bucket is emptied whatever the outcome
示例#10
0
    def buckets2db(self, bucketsize=None):
        """Append every sufficiently full bucket to a file via ``self.client``.

        For each table whose bucket holds at least the threshold number of
        items, every item is projected onto the table's configured columns
        (a missing field takes the configured column default; values are
        stringified and stripped of the delimiter and of newlines), tagged
        with ``keyid``, ``bizdate``, ``ctime`` and ``spider``, and joined
        into one line.  Each table has its own folder
        ``<self.dir>/<tablename>``; lines are appended to ``data.txt``
        inside it, which is first created empty when ``self.client.status``
        reports nothing for it.  The bucket is emptied only after the append
        succeeds; on failure the error is logged and the bucket is left
        untouched.

        :param bucketsize: minimum number of buffered items that triggers a
            write; ``None`` means use ``self.bucketsize``
        :return: None
        """
        threshold = self.bucketsize if bucketsize is None else bucketsize
        for tablename, items in self.buckets_map.items():
            if len(items) < threshold:
                continue  # bucket has not reached the threshold yet

            cols, col_default = self.table_cols_map.get(tablename)
            lines = []
            for item in items:
                row = {'keyid': rowkey()}
                for field in cols:
                    raw = item.get(field, col_default.get(field))
                    # Strip characters that would break the line format.
                    row[field] = str(raw).replace(self.delimiter,
                                                  '').replace('\n', '')
                # Bookkeeping columns that do not come from the scraped item.
                row['bizdate'] = self.bizdate
                row['ctime'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                             time.localtime())
                row['spider'] = self.name
                lines.append(self.delimiter.join(row.values()))

            # One folder per table.
            folder = f"{self.dir}/{tablename}"
            self.client.makedirs(folder)

            filename = f"{folder}/data.txt"
            if not self.client.status(filename, strict=False):
                # Create the file empty so the append below has a target.
                self.client.write(filename,
                                  data='',
                                  overwrite=True,
                                  encoding=self.encoding)

            try:
                payload = '\n'.join(lines) + '\n'
                self.client.write(filename,
                                  data=payload,
                                  overwrite=False,
                                  append=True,
                                  encoding=self.encoding)
                logger.info(f"保存成功 <= 文件名:{filename} 记录数:{len(items)}")
                items.clear()  # drop the batch only once it has been stored
            except Exception as e:
                logger.error(f"保存失败 <= 文件名:{filename} 错误原因:{e}")