예제 #1
0
class BiliaprioriPipeline(object):
    """Scrapy item pipeline that appends each item's tag info to a file on HDFS."""

    def __init__(self):
        # The client endpoint is hard-coded; ``Client`` is defined outside
        # this block (presumably an HDFS/WebHDFS client -- confirm).
        self.client = Client("http://fantome:50070")

    def process_item(self, item, spider):
        """Append ``item['tagInfo']`` as one line to the HDFS tag file.

        :param item: scraped item; ``item['tagInfo']`` must be a string
        :param spider: spider that produced the item (unused)
        :return: the same item, so that later pipelines keep receiving it
        """
        print("Get Tag", "  tag:", item['tagInfo'])

        # Keep appending to the same file instead of replacing it.
        self.client.write('/bili_3-7day/tagInfo.txt',
                          item['tagInfo'] + "\n",
                          overwrite=False,
                          append=True,
                          encoding="utf-8")
        # A pipeline stage must hand the item on; without this the next
        # stage would receive None.
        return item
예제 #2
0
class HDFSUtil:
    """Thin convenience wrapper around an HDFS ``Client`` instance."""

    def __init__(self, url):
        """
        :param url: address handed straight to ``Client``
        """
        self._client = Client(url)

    def make_dir(self, hdfs_path):
        """
        Create a directory on HDFS.

        Per the original note, nested (multi-level) directories are created
        recursively.
        :param hdfs_path: directory to create
        :return: None
        """
        self._client.makedirs(hdfs_path)

    def delete_hdfs_file(self, hdfs_path):
        """
        Delete a file on HDFS.

        Per the original note, a directory can only be deleted this way when
        it is empty.
        :param hdfs_path: path to delete
        :return: None
        """
        self._client.delete(hdfs_path)

    def delete_hdfs_dir(self, hdfs_dir):
        """
        Delete a file or directory on HDFS.

        A non-empty directory is emptied depth-first before it is removed.
        :param hdfs_dir: path to delete
        :return: None
        """
        dir_list = self.hdfs_dir_list(hdfs_dir)
        if not dir_list:
            # Either a plain file (listing failed -> None) or an empty dir.
            print('Delete File: {0}'.format(hdfs_dir))
            self._client.delete(hdfs_dir)
            return

        # Strip a trailing '/' so children are not addressed as 'dir//child'.
        parent = hdfs_dir.rstrip('/')
        for file_name in dir_list:
            self.delete_hdfs_dir(parent + '/' + file_name)
        print('Delete Dir: {0}'.format(hdfs_dir))
        self._client.delete(hdfs_dir)

    def upload_to_hdfs(self, local_path, hdfs_path):
        """
        Upload a local file or directory to HDFS.

        Per the original note, a missing target directory is created
        automatically.
        :param local_path: source path on the local file system
        :param hdfs_path: destination path on HDFS
        :return: None
        """
        self._client.upload(hdfs_path, local_path, cleanup=True)

    def download_from_hdfs(self, hdfs_path, local_path):
        """
        Download a file or directory from HDFS, replacing any local copy.
        :param hdfs_path: source path on HDFS
        :param local_path: destination path on the local file system
        :return: None
        """
        self._client.download(hdfs_path, local_path, overwrite=True)

    def write_to_hdfs(self, hdfs_path, data, overwrite=False, append=True):
        """
        Write ``data`` to a file on HDFS.

        Append:    overwrite=False, append=True  => default
        Overwrite: overwrite=True,  append=False

        ``overwrite`` and ``append`` are mutually exclusive. When the target
        does not exist yet there is nothing to append to, so the file is
        created instead.
        :param hdfs_path: target file on HDFS
        :param data: content to write
        :param overwrite: Boolean, replace an existing file
        :param append: Boolean, append to an existing file
        :return: None
        :raises ValueError: if both ``overwrite`` and ``append`` are true
        """
        if overwrite and append:
            raise ValueError('overwrite and append are mutually exclusive')
        if not self._client.content(hdfs_path, strict=False):
            print('File Not exist in HDFS')
            # Appending to a missing file fails; create it instead.
            append = False
        self._client.write(hdfs_path, data, overwrite=overwrite, append=append)

    def move_or_rename(self, hdfs_src_path, hdfs_dst_path):
        """
        Move or rename a file on HDFS.
        :param hdfs_src_path: current path
        :param hdfs_dst_path: new path
        :return: None
        """
        self._client.rename(hdfs_src_path, hdfs_dst_path)

    def hdfs_dir_list(self, hdfs_path):
        """
        List the entries directly under a directory.

        When the listing raises ``HdfsError`` (per the original note: when
        ``hdfs_path`` is not a directory) the error is swallowed and None is
        returned.
        :param hdfs_path: directory to list
        :return: List[filename] or None
        """
        try:
            return self._client.list(hdfs_path, status=False)
        except HdfsError:
            return None
예제 #3
0
class HdfsPipeline(object):
    """Buffer scraped items per table and append them to text files on HDFS.

    Items are grouped into in-memory buckets keyed by ``item.tablename``. A
    bucket is written to ``<HDFS_FOLDER>/<tablename>/data.txt`` once it holds
    at least ``BUCKETSIZE`` items; whatever is left is written when the
    spider closes.
    """

    def __init__(self, **kwargs):
        """
        :param kwargs: settings mapping; the keys read here are BUCKETSIZE,
            HDFS_URLS, HDFS_FOLDER, HDFS_DELIMITER, HDFS_ENCODING, HIVE_HOST,
            HIVE_PORT, HIVE_DBNAME and HIVE_AUTO_CREATE
        """
        def setting(name, default):
            # Treat both "missing" and "explicitly None" as "use the default".
            value = kwargs.get(name)
            return default if value is None else value

        self.table_cols_map = {}  # column layout per table: {table: (cols, col_default)}
        # Business date; ``bizdate`` is a module-level name defined outside
        # this class (original note: the date the crawler was started).
        self.bizdate = bizdate
        self.buckets_map = {}  # buckets: {table: items}
        # Same default as buckets2db(); None would break the size comparison.
        self.bucketsize = setting('BUCKETSIZE', 100)
        self.client = Client(kwargs.get('HDFS_URLS'))
        self.dir = kwargs.get('HDFS_FOLDER')  # base folder on HDFS
        # Column separator; defaults to Hive's default field delimiter (Ctrl-A).
        self.delimiter = setting('HDFS_DELIMITER', '\001')
        self.encoding = setting('HDFS_ENCODING', 'utf-8')  # file encoding
        self.hive_host = kwargs.get('HIVE_HOST')
        self.hive_port = kwargs.get('HIVE_PORT')
        self.hive_dbname = kwargs.get('HIVE_DBNAME')  # database name
        # Whether Hive tables are created automatically; off by default.
        self.hive_auto_create = kwargs.get('HIVE_AUTO_CREATE', False)
        self.client.makedirs(self.dir)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's settings, passed as keyword arguments."""
        settings = crawler.settings
        return cls(**settings)

    def process_item(self, item, spider):
        """
        Put the item into its table's bucket, then flush any full bucket.

        :param item: object exposing ``tablename``, ``fields`` and ``get``
        :param spider: spider that produced the item; its ``name`` is stored
        :return: the item, unchanged
        """
        tablename = item.tablename
        if tablename in self.buckets_map:
            self.buckets_map[tablename].append(item)
        else:
            # First item of this table: record column order and defaults.
            fields = item.fields
            col_default = {name: meta.get('default', '')
                           for name, meta in fields.items()}
            cols = sorted(fields, key=lambda name: fields[name].get('idx', 1))
            self.table_cols_map.setdefault(tablename, (cols, col_default))
            self.buckets_map.setdefault(tablename, [item])
            if self.hive_auto_create:
                self.checktable(tablename, cols)  # create the table
        self.buckets2db(bucketsize=self.bucketsize,
                        spider_name=spider.name)  # flush buckets that are full
        return item

    def close_spider(self, spider):
        """
        Flush whatever is still buffered when the spider finishes.

        :param spider: spider being closed
        :return: None
        """
        # A threshold of 1 flushes every non-empty bucket.
        self.buckets2db(bucketsize=1, spider_name=spider.name)

    def checktable(self, tbname, cols):
        """
        Create the Hive table for ``tbname`` if it does not exist yet.

        Every column is declared as ``string``; ``keyid`` is prepended and
        ``bizdate``, ``ctime`` and ``spider`` are appended to ``cols``.
        :param tbname: table name
        :param cols: business column names, already ordered
        :return: None
        """
        hive = CtrlHive(self.hive_host, self.hive_port, self.hive_dbname)
        cols = ['keyid'] + cols + ['bizdate', 'ctime', 'spider']
        create_sql = f"create table if not exists {tbname}({' string,'.join(cols)} string)"
        hive.execute(create_sql)
        logger.info(f"表创建成功 <= 表名:{tbname}")

    def _format_row(self, item, cols, col_default, spider_name):
        """Serialise one item into a single delimiter-joined line (no newline)."""
        row = {'keyid': rowkey()}
        for field in cols:
            value = item.get(field, col_default.get(field))
            # Strip characters that would break the column/row layout.
            row[field] = str(value).replace(self.delimiter,
                                            '').replace('\n', '')
        row['bizdate'] = self.bizdate  # bookkeeping (non-business) columns
        row['ctime'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        row['spider'] = spider_name
        # str() guards against a non-string keyid/bizdate breaking the join.
        return self.delimiter.join(str(value) for value in row.values())

    def buckets2db(self, bucketsize=100, spider_name=''):
        """
        Write every bucket holding at least ``bucketsize`` items to HDFS.

        A bucket is emptied only after its append succeeded; on failure the
        error is logged and the items stay buffered.
        :param bucketsize: minimum number of items that triggers a flush
        :param spider_name: spider name stored with each row
        :return: None
        """
        for tablename, items in self.buckets_map.items():
            if len(items) < bucketsize:
                continue

            cols, col_default = self.table_cols_map.get(tablename)
            new_items = [
                self._format_row(item, cols, col_default, spider_name)
                for item in items
            ]

            # Each table gets its own folder.
            folder = f"{self.dir}/{tablename}"
            self.client.makedirs(folder)

            filename = f"{folder}/data.txt"
            info = self.client.status(filename, strict=False)
            if not info:
                # Create an empty file first so the append below has a target.
                self.client.write(filename,
                                  data='',
                                  overwrite=True,
                                  encoding=self.encoding)

            try:
                content = '\n'.join(new_items) + '\n'
                self.client.write(filename,
                                  data=content,
                                  overwrite=False,
                                  append=True,
                                  encoding=self.encoding)
                logger.info(f"保存成功 <= 文件名:{filename} 记录数:{len(items)}")
                items.clear()  # empty the bucket
            except Exception as e:
                logger.error(f"保存失败 <= 文件名:{filename} 错误原因:{e}")
예제 #4
0
# Walk-through of basic client operations against paths under /tmp/way.
file_dir = '/tmp/way'
file_name = '/tmp/way/test.txt'
file_name2 = '/tmp/way/test123.txt'
loacl_file_name = 'test.txt'  # (sic) local path used by download/upload below

# ``Client`` and ``HDFS_ClIENT`` are defined outside this snippet.
client = Client(HDFS_ClIENT)

# Create the directory
client.makedirs(file_dir)

# Fetch information about the target path
# NOTE(review): strict=False presumably yields None instead of raising when
# the path is missing -- confirm against the client documentation.
info = client.status(file_name, strict=False)
print(info)

# Write to the file (overwrite)
client.write(file_name, data="hello hdfs !", overwrite=True)

# Write to the file (append)
client.write(file_name, data="hello way !", overwrite=False, append=True)

# Read the file content
with client.read(file_name, encoding='utf-8') as f:
    print(f.read())

# Download the file to the local path
client.download(file_name, loacl_file_name, overwrite=True)

# Upload the local file to a new remote path (file_name + '111')
client.upload(file_name + '111', loacl_file_name, cleanup=True)

# Delete the file