import logging
import time

from hdfs import Client

# rowkey(), CtrlHive and bizdate are assumed to be provided elsewhere in the project.
logger = logging.getLogger(__name__)


class HdfsPipeline(object):

    def __init__(self, **kwargs):
        self.table_cols_map = {}   # column order per table: {table: (cols, col_default)}
        self.bizdate = bizdate     # business date: the date the spider was started
        self.buckets_map = {}      # buckets: {table: items}
        self.bucketsize = kwargs.get('BUCKETSIZE')
        self.client = Client(kwargs.get('HDFS_URLS'))
        self.dir = kwargs.get('HDFS_FOLDER')            # root folder on HDFS
        self.delimiter = kwargs.get('HDFS_DELIMITER')   # column delimiter, defaults to Hive's default delimiter
        self.encoding = kwargs.get('HDFS_ENCODING')     # file encoding, default 'utf-8'
        self.hive_host = kwargs.get('HIVE_HOST')
        self.hive_port = kwargs.get('HIVE_PORT')
        self.hive_dbname = kwargs.get('HIVE_DBNAME')    # Hive database name
        self.hive_auto_create = kwargs.get('HIVE_AUTO_CREATE', False)  # auto-create the Hive table, default False
        self.client.makedirs(self.dir)

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        return cls(**settings)

    def process_item(self, item, spider):
        """
        :param item:
        :param spider:
        :return: route each item into its table's bucket and flush full buckets
        """
        if item.tablename in self.buckets_map:
            self.buckets_map[item.tablename].append(item)
        else:
            cols, col_default = [], {}
            for field, value in item.fields.items():
                cols.append(field)
                col_default[field] = item.fields[field].get('default', '')
            cols.sort(key=lambda x: item.fields[x].get('idx', 1))
            self.table_cols_map.setdefault(item.tablename, (cols, col_default))  # record column order and defaults
            self.buckets_map.setdefault(item.tablename, [item])
            if self.hive_auto_create:
                self.checktable(item.tablename, cols)  # create the Hive table
        self.buckets2db(bucketsize=self.bucketsize, spider_name=spider.name)  # flush buckets that reached the threshold
        return item

    def close_spider(self, spider):
        """
        :param spider:
        :return: flush whatever is left in the buckets when the spider closes
        """
        self.buckets2db(bucketsize=1, spider_name=spider.name)

    def checktable(self, tbname, cols):
        """
        :return: create the Hive table if it does not exist
        """
        hive = CtrlHive(self.hive_host, self.hive_port, self.hive_dbname)
        cols = ['keyid'] + cols + ['bizdate', 'ctime', 'spider']
        create_sql = f"create table if not exists {tbname}({' string,'.join(cols)} string)"
        hive.execute(create_sql)
        logger.info(f"Table created <= table: {tbname}")

    def buckets2db(self, bucketsize=100, spider_name=''):
        """
        :param bucketsize: bucket size threshold
        :param spider_name: spider name
        :return: write out and clear every bucket that holds at least `bucketsize` items
        """
        for tablename, items in self.buckets_map.items():
            if len(items) >= bucketsize:
                new_items = []
                cols, col_default = self.table_cols_map.get(tablename)
                for item in items:
                    keyid = rowkey()
                    new_item = {'keyid': keyid}
                    for field in cols:
                        value = item.get(field, col_default.get(field))
                        # strip the delimiter and newlines so a value cannot break the row layout
                        new_item[field] = str(value).replace(self.delimiter, '').replace('\n', '')
                    new_item['bizdate'] = self.bizdate  # add the non-business (audit) columns
                    new_item['ctime'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    new_item['spider'] = spider_name
                    value = self.delimiter.join(new_item.values())
                    new_items.append(value)

                # each table gets its own folder
                folder = f"{self.dir}/{tablename}"
                self.client.makedirs(folder)
                filename = f"{folder}/data.txt"
                info = self.client.status(filename, strict=False)
                if not info:
                    # create an empty file first so the append below always has a target
                    self.client.write(filename, data='', overwrite=True, encoding=self.encoding)

                try:
                    content = '\n'.join(new_items) + '\n'
                    self.client.write(filename, data=content, overwrite=False, append=True, encoding=self.encoding)
                    logger.info(f"Saved <= file: {filename} records: {len(items)}")
                    items.clear()  # empty the bucket
                except Exception as e:
                    logger.error(f"Save failed <= file: {filename} reason: {e}")
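# A minimal sketch of how such a pipeline could be wired into a Scrapy project.
# Only the setting names are dictated by HdfsPipeline.__init__ and from_crawler;
# the pipeline path, addresses and values below are placeholder assumptions.

# settings.py
ITEM_PIPELINES = {'myproject.pipelines.HdfsPipeline': 300}   # module path is an assumption

BUCKETSIZE = 100                       # flush a table's bucket once it holds 100 items
HDFS_URLS = 'http://namenode1:50070;http://namenode2:50070'  # placeholder NameNode addresses
HDFS_FOLDER = '/user/spider/data'      # one sub-folder per table is created under this root
HDFS_DELIMITER = '\001'                # Hive's default field delimiter
HDFS_ENCODING = 'utf-8'
HIVE_HOST = 'hive-server'
HIVE_PORT = 10000
HIVE_DBNAME = 'crawl'
HIVE_AUTO_CREATE = True                # let checktable() issue CREATE TABLE IF NOT EXISTS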
import os

import jieba
from hdfs import Client


class ChatBotModel(object):

    def __init__(self, hadoop_url, hdfs_index_file, local_index_file, corpus_dir,
                 unk_answer='', max_answer_len=1024):
        self.hadoop_url = hadoop_url
        self.hdfs_index_file = hdfs_index_file
        self.local_index_file = local_index_file
        self.corpus_dir = corpus_dir
        self.max_answer_len = max_answer_len
        self.unk_answer = unk_answer
        self.client = None
        self.inverted_index = {}

    def build_connection(self):
        self.client = Client(self.hadoop_url)

    def fetch_index_file(self):
        # pull the inverted-index file from HDFS to the local filesystem
        self.client.download(hdfs_path=self.hdfs_index_file,
                             local_path=self.local_index_file,
                             overwrite=True)

    def load_inverted_index(self):
        # each line: word \t file:offset:score \t file:offset:score ...
        with open(self.local_index_file, 'r', encoding='utf-8') as f:
            for line in f:
                word, *postings = line.strip().split('\t')
                for posting in postings:
                    file_name, query_id, score = posting.split(':')
                    self.inverted_index.setdefault(word, []).append(
                        [file_name, int(query_id), float(score)])

    def prepare(self):
        self.build_connection()
        self.fetch_index_file()
        self.load_inverted_index()

    def read_corpus_answer(self, file_name, query_id):
        # read at most max_answer_len bytes starting at the answer's offset
        file_path = os.path.join(self.corpus_dir, file_name)
        file_status = self.client.status(file_path)
        if file_status['length'] <= query_id:
            return None
        with self.client.read(hdfs_path=file_path, offset=query_id,
                              length=self.max_answer_len, encoding='utf-8') as f:
            answer = f.read().strip().split('\n')[0]
        return answer

    def predict_answer(self, query):
        words = jieba.lcut_for_search(query)
        scores = {}
        for word in words:
            if word not in self.inverted_index:
                continue
            for file_name, query_id, score in self.inverted_index[word]:
                key = (file_name, query_id)  # accumulate scores per candidate answer
                scores[key] = scores.get(key, 0.0) + score
        if len(scores) == 0:
            return self.unk_answer
        (best_file_name, best_query_id), best_score = max(scores.items(), key=lambda x: x[1])
        best_answer = self.read_corpus_answer(best_file_name, best_query_id)
        if best_answer is None:
            best_answer = self.unk_answer
        return best_answer
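# Usage sketch for ChatBotModel (not part of the original class): the NameNode URL,
# HDFS paths and fallback answer are placeholder assumptions; only the constructor
# arguments and the prepare()/predict_answer() calls come from the code above.
bot = ChatBotModel(
    hadoop_url='http://namenode:50070',
    hdfs_index_file='/chatbot/inverted_index.txt',
    local_index_file='./inverted_index.txt',
    corpus_dir='/chatbot/corpus',
    unk_answer='Sorry, I do not know the answer to that.',
)
bot.prepare()  # connect to HDFS, download the index file, load it into memory
print(bot.predict_answer('How do I upload a file to HDFS?'))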
# The surrounding module is assumed to import: os, re, time, chardet, Client (from hdfs),
# settings (django.conf), Response and IsAuthenticated (rest_framework), typ, tools,
# DatasourceSerializer, DataSource, UserLevel, HDFS_HOST and LIMIT_FILE_SIZE.
def post(self, request, *args, **kwargs):
    import uuid
    permission_classes = (IsAuthenticated,)  # note: only takes effect as a class attribute, not inside post()
    start_time = time.time()
    file_serializer = DatasourceSerializer(data=request.data)
    if file_serializer.is_valid():
        path = file_serializer.validated_data['file_name']
        user = request.user.id
        # size of the uploaded file, in MB
        filesize = round(path.size / 1024 / 1024, 2)
        # total size of the user's existing files
        mydata_id = DataSource.objects.filter(user_id=user)
        myData_size = 0
        for i in mydata_id:
            try:
                x = i.fileSize.replace("KB", '')
                myData_size += float(x)
            except:
                continue
        myData_size = round(myData_size / 1024, 2)  # in MB
        # size the user would occupy after this upload
        now_userDataSize = filesize + myData_size
        # the user's upload quota
        user_dataSize_old = UserLevel.objects.get(user_id=user).up_load
        if now_userDataSize > user_dataSize_old:
            return Response({'msg': 'Your storage quota is exceeded, please clean up some data and try again', 'status': False})
        else:
            try:
                is_header = file_serializer.validated_data['is_header']
                separator = file_serializer.validated_data['column_delimiter']
            except:
                # SQL dump files carry no header row, so fall back to defaults
                is_header = ''
                separator = '\n'
            last = (str(path).lower()).split('.')[-1].upper()
            if last == 'CSV' or last == 'TXT' or last == 'SQL':
                if path.size > LIMIT_FILE_SIZE:
                    # large files go to HDFS under a generated name
                    format_name = uuid.uuid1()
                    file_serializer.validated_data['format_filename'] = format_name
                    file_serializer.save()
                    client = Client(HDFS_HOST)
                    file_path = os.path.join(settings.MEDIA_ROOT, str(path))
                    with open(file_path, 'rb') as f1:  # detect the file encoding
                        data_type = chardet.detect(f1.readline())['encoding']
                    if data_type is None:
                        return Response({'msg': 'Invalid data format', 'status': False})
                    os.renames(file_path, os.path.join(settings.MEDIA_ROOT, str(format_name)))
                    client.upload("/datahoop", os.path.join(settings.MEDIA_ROOT, str(format_name)), n_threads=4)
                    os.remove(os.path.join(settings.MEDIA_ROOT, str(format_name)))
                    try:
                        with client.read('/datahoop/' + str(format_name), encoding=data_type) as reader:
                            filesize = client.status('/datahoop/' + str(format_name))['length'] / 1024
                            filesize = str(round(filesize, 2)) + 'KB'
                            reader = reader.readlines()
                    except:
                        return Response({'msg': 'Failed to read data', 'status': False})
                    column_delimiter = separator
                    if is_header == 1:
                        title = reader[0].split(column_delimiter)
                        json = {}
                        for i in reader[0].split(column_delimiter):
                            json[i.replace('\r\n', '')] = [typ.StringType, True]
                    else:
                        total = len(reader[0].split(column_delimiter))
                        title = []
                        for i in range(total):
                            title.append('_C' + str(i))
                        json = {}
                        for i in title:
                            json[i] = [typ.StringType, True]
                    column_num = len(reader[0].split(column_delimiter))
                    row_num = len(reader)
                    DataSource.objects.filter(format_filename=format_name).update(
                        user_id=user, title=title[:20], fileSize=filesize, where='hdfs',
                        row_num=row_num, column_num=column_num)
                    over_time = time.time()
                    print('User <%s> uploaded file <%s> in <%s> seconds' % (user, format_name, over_time - start_time))
                    return Response({'msg': 'Data stored successfully', 'status': True})
                else:
                    # small files are parsed locally and stored in MongoDB
                    global object_id
                    filePath = os.path.join(settings.MEDIA_ROOT, str(path))
                    file_serializer.save()
                    filesize = str(round(path.size / 1024, 2)) + 'KB'
                    if last == 'XLS' or last == 'XLSX':
                        pass
                    elif last == 'TXT':
                        object_id = tools.save_mongo_txt(filePath, user, is_header, separator, str(path))
                        if object_id != 'none':
                            file_serializer.validated_data['obj_id'] = object_id
                            file_serializer.validated_data['file_name'] = str(path)
                            file_serializer.save()
                        else:
                            DataSource.objects.filter(file_name=str(path), user=1).delete()
                            os.remove(os.path.join(settings.MEDIA_ROOT, str(path)))
                            return Response({'msg': 'Invalid data format', 'status': False})
                    elif last == 'CSV':
                        object_id = tools.save_mongo_csv(filePath, user, is_header, separator, str(path))
                        if object_id != 'none':
                            file_serializer.validated_data['obj_id'] = object_id
                            file_serializer.validated_data['file_name'] = str(path)
                            file_serializer.save()
                        else:
                            DataSource.objects.filter(file_name=str(path), user=1).delete()
                            os.remove(os.path.join(settings.MEDIA_ROOT, str(path)))
                            return Response({'msg': 'Invalid data format', 'status': False})
                    elif last == 'SQL':
                        try:
                            object_id = tools.save_mongo_sql(filePath, user)
                            file_serializer.validated_data['obj_id'] = object_id
                            file_serializer.validated_data['file_name'] = str(path)
                            file_serializer.save()
                        except Exception as e:
                            DataSource.objects.filter(file_name=str(path), user=1).delete()
                            os.remove(os.path.join(settings.MEDIA_ROOT, str(path)))
                            return Response({'msg': 'Invalid data format', 'status': False})
                    with open(filePath, 'rb') as f1:  # detect the file encoding
                        data_type = chardet.detect(f1.readline())['encoding']
                    with open(filePath, encoding=data_type, errors='ignore') as reader:  # read the file with the detected encoding
                        reader = reader.readlines()
                    if is_header == 1:
                        title = reader[0].split(separator)
                        json = {}
                        for i in reader[0].split(separator):
                            json[i.replace('\r\n', '')] = [typ.StringType, True]
                        column_num = len(reader[0].split(separator))
                    else:
                        if last != 'SQL':
                            total = len(reader[0].split(separator))
                            title = []
                            for i in range(total):
                                title.append('_C' + str(i))
                            json = {}
                            for i in title:
                                json[i] = [typ.StringType, True]
                            column_num = len(reader[0].split(separator))
                        else:
                            # for SQL dumps, derive the column count from the first statement's column list
                            total = re.findall(r'[^()]+', reader[0])[1].split(',')
                            title = []
                            for i in range(len(total)):
                                title.append('_C' + str(i))
                            json = {}
                            for i in title:
                                json[i] = [typ.StringType, True]
                            column_num = len(total)
                    row_num = len(reader)
                    DataSource.objects.filter(obj_id=object_id).update(
                        user_id=user, title=title[:20], fileSize=filesize, where='mongodb',
                        row_num=row_num, column_num=column_num)
                    os.remove(os.path.join(settings.MEDIA_ROOT, str(path)))
                    over_time = time.time()
                    print('User <%s> uploaded file <%s> in <%s> seconds' % (user, path, over_time - start_time))
                    return Response({'msg': 'Data stored successfully', 'status': True})
            else:
                return Response({'msg': 'This file type is not supported yet', 'status': False})
    else:
        return Response({'msg': 'Not a valid data file', 'status': False})
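# Hedged client-side sketch of calling the upload view above with requests: the endpoint
# URL and auth token are assumptions, while the multipart field names (file_name,
# is_header, column_delimiter) mirror what the view reads from DatasourceSerializer.validated_data.
import requests

with open('sales.csv', 'rb') as f:
    resp = requests.post(
        'http://localhost:8000/api/datasource/',          # hypothetical route
        headers={'Authorization': 'Token <your-token>'},   # the view expects an authenticated user
        data={'is_header': 1, 'column_delimiter': ','},
        files={'file_name': f},
    )
print(resp.json())  # e.g. {'msg': 'Data stored successfully', 'status': True}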
def is_hdfs_directory(hdfs_client: Client, path: str) -> bool:
    """Return True if the HDFS path exists and is a directory."""
    return hdfs_client.status(path)["type"] == "DIRECTORY"
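# Variant sketch (an addition, not part of the original helper): Client.status raises
# hdfs.HdfsError when the path does not exist, so strict=False can be used to report
# a missing path as None instead of an exception.
from typing import Optional

from hdfs import Client

def hdfs_path_type(hdfs_client: Client, path: str) -> Optional[str]:
    """Return 'DIRECTORY', 'FILE', or None if the path does not exist."""
    info = hdfs_client.status(path, strict=False)  # strict=False -> None for missing paths
    return info["type"] if info else None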
from hdfs import Client

HDFS_CLIENT = "http://172.16.122.21:50070;http://172.16.122.24:50070"
file_dir = '/tmp/way'
file_name = '/tmp/way/test.txt'
file_name2 = '/tmp/way/test123.txt'
local_file_name = 'test.txt'

client = Client(HDFS_CLIENT)

# create a directory
client.makedirs(file_dir)

# fetch the status of a path (strict=False returns None instead of raising if it is missing)
info = client.status(file_name, strict=False)
print(info)

# write a file (overwrite)
client.write(file_name, data="hello hdfs !", overwrite=True)

# write a file (append)
client.write(file_name, data="hello way !", overwrite=False, append=True)

# read the file content
with client.read(file_name, encoding='utf-8') as f:
    print(f.read())

# download the file
client.download(file_name, local_file_name, overwrite=True)
from hdfs import Client, HdfsError

# WebHDFS (the login helper) and HdfsLibraryError are assumed to be defined
# elsewhere in this Robot Framework library.


class RF_HDFS(object):

    def __init__(self):
        self.client = None
        self.directory = None

    def connect_and_login(self, **kwargs):
        import requests

        host = None
        port = None
        kdc = None
        user = None
        password = None
        root = None
        timeout = None
        proxy = None

        if 'host' in kwargs:
            host = kwargs['host']
        if 'port' in kwargs:
            port = kwargs['port']
        if 'kdc' in kwargs:
            kdc = kwargs['kdc']
        if 'user' in kwargs:
            user = kwargs['user']
        if 'password' in kwargs:
            password = kwargs['password']
        if 'root' in kwargs:
            root = kwargs['root']
        if 'proxy' in kwargs:
            proxy = kwargs['proxy']
        if 'timeout' in kwargs:
            timeout = kwargs['timeout']

        self.session = requests.Session()
        adapter = requests.adapters.HTTPAdapter(pool_maxsize=0)
        self.session.mount('http://', adapter)
        self.session.mount('https://', adapter)
        self.session.headers.update({'Connection': 'Keep-Alive'})

        self.connectionStatus = False
        try:
            timeout = int(timeout)
            url = "http://" + host + ":" + str(port)
            hdfsLogin = WebHDFS(url, kdc)
            cookieStr = hdfsLogin.authenticate(user, password)
            if cookieStr is not None:
                cookieList = cookieStr.split('=', 1)
                cookieDict = {cookieList[0]: cookieList[1]}
                requests.utils.add_dict_to_cookiejar(self.session.cookies, cookieDict)
            self.client = Client(url, root=root, proxy=proxy, timeout=timeout, session=self.session)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        self.connectionStatus = True
        return self.client

    def checkConnectionStatus(self):
        return self.connectionStatus

    def list_dir(self, directory):
        output = []
        try:
            if directory is not None:
                output = self.client.list(directory, status=True)
            else:
                output = self.client.list(self.client.root, status=True)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def list_names(self, directory):
        output = []
        try:
            if directory is not None:
                output = self.client.list(directory, status=False)
            else:
                output = self.client.list(self.client.root, status=False)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def upload(self, remote_path, local_path, overwrite=False, permission=None):
        output = None
        try:
            output = self.client.upload(remote_path, local_path, overwrite, permission=permission)
        except HdfsError as hdfsError:
            # For some reason this exception includes the entire stack trace after
            # the error message, so split on '\n' and only return the first line.
            error = str(hdfsError).splitlines()[0]
            raise HdfsLibraryError(error)
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def download(self, remote_path, local_path, overwrite=False):
        output = None
        try:
            output = self.client.download(remote_path, local_path, overwrite)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def mkdir(self, directory, permission):
        try:
            # no return value
            self.client.makedirs(directory, permission=permission)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def rmdir(self, directory):
        try:
            # no return value
            if self.client.delete(directory, recursive=True) == False:
                raise HdfsLibraryError("Directory does not exist: %r" % directory)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def rename(self, src_file, dst_file):
        try:
            # no return value
            self.client.rename(src_file, dst_file)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def delete(self, file):
        try:
            # no return value
            if self.client.delete(file) == False:
                raise HdfsLibraryError("File does not exist: %r" % file)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def set_time(self, file, mod_time):
        try:
            # no return value
            self.client.set_times(file, -1, mod_time)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def set_owner(self, file, owner, group):
        try:
            # no return value
            self.client.set_owner(file, owner=owner, group=group)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def set_permission(self, file, permission):
        try:
            # no return value
            self.client.set_permission(file, permission=permission)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def set_acl(self, file, aclspec):
        try:
            # no return value
            self.client.set_acl(file, aclspec=aclspec)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def status(self, path):
        output = ''
        try:
            output = self.client.status(path)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def checksum(self, path):
        output = ''
        try:
            output = self.client.checksum(path)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def close(self):
        self.session.close()
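# Usage sketch for RF_HDFS (an illustration, not from the original library): host, port,
# kdc, credentials and paths are placeholders, and it presumes the WebHDFS login helper
# accepts them; only the method names and signatures come from the class above.
rf = RF_HDFS()
rf.connect_and_login(host='namenode.example.com', port=50070, kdc='kdc.example.com',
                     user='hdfs', password='secret', root='/', timeout=30)

rf.mkdir('/tmp/rf_demo', permission='755')
rf.upload('/tmp/rf_demo/report.txt', 'report.txt')   # (remote_path, local_path)
print(rf.list_names('/tmp/rf_demo'))
print(rf.status('/tmp/rf_demo/report.txt'))
rf.delete('/tmp/rf_demo/report.txt')
rf.rmdir('/tmp/rf_demo')
rf.close()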