def txtSave(self, inputFile, outputFile):
    """Walk a two-level directory tree and pickle its contents as a Bunch.

    :param inputFile: root directory; each sub-directory is one category
    :param outputFile: path of the pickle file to write (binary)
    """
    try:
        # NOTE: logged at error level like the rest of this file, although
        # it is an informational message.
        Logger().get_log().error('将文件信息转换为bunch对象')
        catelist = os.listdir(inputFile)
        bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
        bunch.target_name.extend(catelist)  # store the category names
        for eachDir in catelist:
            print(eachDir, inputFile)
            # BUGFIX: the original concatenated r"\\" (a raw string holding
            # TWO literal backslashes); os.path.join builds a correct,
            # portable path instead.
            eachPath = os.path.join(inputFile, eachDir)
            for eachFile in os.listdir(eachPath):  # every file of the category
                fullName = os.path.join(eachPath, eachFile)  # full file path
                bunch.label.append(eachDir)        # category label
                bunch.filenames.append(fullName)   # remember the file path
                bunch.contents.append(TXTFile(fullName).read().strip())  # raw text
        # pickle requires a binary file handle
        with open(outputFile, 'wb') as file_obj:
            pickle.dump(bunch, file_obj)
    except Exception as ex:
        Logger('error').get_log().error(ex)
def normalReader(self, line_format='user item rating', sep=','):
    """Build a surprise Reader and load the ratings file at self.config['path'].

    :param line_format: column layout of the ratings file
    :param sep: field separator
    :return: surprise Dataset, or None on failure
    """
    try:
        self.reader = Reader(line_format=line_format, sep=sep)
        data_path = self.config['path']
        return Dataset.load_from_file(data_path, reader=self.reader)
    except Exception as err:
        Logger('error').get_log().error(err)
        Logger('error').clear()
def evaluate(self, data, measures=None):
    """Cross-validate self.algo on *data*.

    :param data: surprise Dataset
    :param measures: list of metric names; defaults to ['rmse', 'mae']
    :return: the cross_validate result dict, or None on failure
    """
    # BUGFIX: a mutable list default is shared across calls; build it here.
    if measures is None:
        measures = ['rmse', 'mae']
    try:
        print('-----------------' + str(self.algo) + '-----------------------')
        return cross_validate(self.algo, data, measures=measures)
    except Exception as ex:
        Logger('error').get_log().error(ex)
        Logger('error').clear()
def adapter(self, sep, name_list):
    """Assign each distinct name a numeric id (starting at 1) and build both maps.

    :param sep: unused, kept for interface compatibility
    :param name_list: iterable of names (duplicates are collapsed)
    :return: (rid_to_name, name_to_rid) dictionaries, or None on failure
    """
    try:
        Logger().get_log().error('adapter is built')
        name_to_rid = {}
        next_id = 0
        for name in name_list:
            if name not in name_to_rid:
                next_id += 1
                name_to_rid[name] = next_id
        # invert the mapping in one pass
        rid_to_name = {rid: name for name, rid in name_to_rid.items()}
        return rid_to_name, name_to_rid
    except Exception as err:
        Logger('error').get_log().error(err)
def getRS(self):
    """Instantiate the candidate recommender models for self.config['type'].

    Leaves self.models untouched when the type is unknown, exactly like the
    original if/elif chain.
    """
    try:
        print('获取推荐系统')
        path = self.config['path']
        # lazy builders so only the selected family is instantiated
        builders = {
            'KNN': lambda: [KNNBaselineRS(path), KNNBasicRS(path)],
            'BaselineAlgorithms': lambda: [BaselineOnlyRS(path), NormalPredictorRS(path)],
            'MatrixFactorization': lambda: [SVDRS(path), SVDppRS(path), NMFRS(path)],
        }
        build = builders.get(self.config['type'])
        if build is not None:
            self.models = build()
    except Exception as err:
        Logger('error').get_log().error(err)
        Logger().clear()
def getNeighbors(self, namelist: list, target, n):
    """Return the names of the *n* nearest neighbours of *target*.

    :param namelist: names of all users
    :param target: label of the target user
    :param n: number of neighbours
    :return: list of neighbour labels, or None on failure
    """
    try:
        # name <-> raw-id mappings
        rid_to_name, name_to_rid = self.read_item_names(namelist)
        raw_id = name_to_rid[target]
        inner_id = self.algo.trainset.to_inner_iid(raw_id)
        # inner ids of the nearest neighbours
        neighbor_inner_ids = self.algo.get_neighbors(inner_id, k=n)
        # BUGFIX: materialise a list; the original chained generators and
        # returned one that the print loop below had already exhausted.
        neighbors = [
            rid_to_name[self.algo.trainset.to_raw_iid(iid)]
            for iid in neighbor_inner_ids
        ]
        # BUGFIX: the original concatenated the int n to a str (TypeError).
        print('The' + str(n) + ' nearest neighbors of ' + target + ' are:')
        for user in neighbors:
            print(user)
        return neighbors
    except Exception as ex:
        Logger('error').get_log().error(ex)
def reader(self, line_format='user item rating', sep=','):
    """Create a surprise Reader and load self.path into self.data."""
    try:
        rdr = Reader(line_format=line_format, sep=sep)
        self.reader = rdr
        loaded = Dataset.load_from_file(self.path, reader=rdr)
        self.data = loaded
        print(type(loaded))
    except Exception as err:
        Logger('error').get_log().error(err)
def add(self, index, config=None):
    """Create an elasticsearch index with the given shard configuration.

    :param index: index name
    :param config: dict with 'number_of_shards' / 'number_of_replicas';
        defaults to 5 shards and 0 replicas
    """
    # BUGFIX: a mutable dict default is shared across calls; build it here.
    if config is None:
        config = {'number_of_shards': 5, 'number_of_replicas': 0}
    settings = {
        "settings": {
            "number_of_shards": config['number_of_shards'],     # shard count
            "number_of_replicas": config['number_of_replicas']  # replica count
        },
        "mappings": {
            "Document": {
                "dynamic": "strict",  # reject documents with unmapped fields
                "properties": {
                    "content": {"type": "text"},
                    "file_name": {"type": "text"},
                    "Date": {"type": "date"}
                }
            }
        }
    }
    try:
        if not self._es.indices.exists(index):
            self._es.indices.create(index=index, ignore=400, body=settings)
            print('Created Index')
    except Exception as ex:
        Logger('error').get_log().error(ex, '创建失败')
def reader(self, line_format='user item rating', sep=','):
    """Load self.path with a surprise Reader and build the id/name maps."""
    try:
        rdr = Reader(line_format=line_format, sep=sep)
        self.reader = rdr
        self.data = Dataset.load_from_file(self.path, reader=rdr)
        self.rid_to_name, self.name_to_rid = self.read_item_names(sep)
    except Exception as err:
        Logger('error').get_log().error(err)
def filter(self):
    """Load data for and evaluate up to four candidate models, pick one.

    Stage 1 loads each model's dataset via the configured reader, stage 2
    cross-validates the models in a process pool, then the model with the
    largest mean RMSE (falling back to MAE) is selected.

    :return: the selected model, or None on failure
    """
    try:
        print('run start', datetime.datetime.now())
        self.result, self.data = {}, {}
        print('筛选推荐系统')
        if self.config['reader'] == 'normal':
            self.reader = self.normalReader
        elif self.config['reader'] == 'json':
            self.reader = self.jsonReader
        # stage 1: load the dataset for each model
        pool = multiprocessing.Pool(processes=4)
        for index, model in enumerate(self.models[:4]):
            print('model' + str(index) + '读取数据')
            self.data[index] = pool.apply(
                self.reader, (self.config['line_format'], self.config['sep']))
        pool.close()
        pool.join()
        print('读取数据完毕')
        # stage 2: evaluate every model asynchronously
        pool = multiprocessing.Pool(processes=min(4, len(self.models)))
        for index, model in enumerate(self.models[:4]):
            print('模型' + str(model) + '评估')
            self.result[index] = pool.apply_async(
                model.evaluate, (self.data[index], ['rmse', 'mae']))
        pool.close()
        pool.join()
        MaxRMSE, MaxMAE, index = 0, 0, 0
        for key, res in self.result.items():
            res = res.get()
            print(res)
            if 'test_rmse' in res.keys() and res['test_rmse'].tolist():
                mean_rmse = np.mean(res['test_rmse'].tolist())
                # BUGFIX: the original never updated MaxRMSE, so the last
                # model with any positive RMSE always won the comparison.
                # NOTE(review): selecting the LARGEST error looks suspicious
                # (lower RMSE is usually better) — confirm intent.
                if mean_rmse > MaxRMSE:
                    MaxRMSE = mean_rmse
                    index = key
            elif 'test_mae' in res.keys() and res['test_mae'].tolist():
                # BUGFIX: the original read the non-existent 'test_mse' key,
                # raising KeyError on this branch.
                mean_mae = np.mean(res['test_mae'].tolist())
                if mean_mae > MaxMAE:
                    MaxMAE = mean_mae
                    index = key
            else:
                index = 0
        print('模型评估完毕, 最终选择' + str(index) + '号模型')
        print('run finish', datetime.datetime.now())
        return self.models[index]
    except Exception as ex:
        Logger('error').get_log().error(ex)
        return None
def connect(self, name):
    """Open a MongoClient using mongo_config and select database *name*."""
    try:
        uri = "mongodb://" + mongo_config['host'] + ':' + mongo_config['port']
        self.myclient = pymongo.MongoClient(uri)
        self.db = self.myclient[name]
    except Exception as err:
        Logger('error').get_log().error(err)
def insert(self, index, datas: list):
    """Bulk-insert *datas* into elasticsearch index *index*.

    :param index: target index name
    :param datas: list of source documents
    """
    actions = []
    for data in datas:
        action = {
            "_index": index,
            # NOTE(review): this stores the *builtin* `type` object; a
            # document-type string was probably intended — confirm against
            # the "Document" mapping used when the index is created.
            "_type": type,
            "_id": None,
            "_source": data
        }
        actions.append(action)
    startime = datetime.datetime.now()
    if len(actions):
        try:
            helpers.bulk(self._es, actions, request_timeout=100)
            # BUGFIX: datetime + str raises TypeError; stringify first.
            Logger().get_log().error(
                str(startime) + '开始' + '本次共写入{}条数据'.format(len(actions)))
        except Exception as ex:
            Logger('error').get_log().error(ex)
def insert(self, table):
    """Return the collection *table*, reporting when it already exists.

    MongoDB creates collections lazily, so the collection object is
    returned whether or not it exists yet.
    """
    try:
        existing = self.db.list_collection_names()
        if table in existing:
            print('数据表已经存在,无需重复创建')
        return self.db[table]
    except Exception as err:
        Logger('error').get_log().error(err)
def drop(self, table):
    """Drop collection *table* if it exists, printing the outcome."""
    try:
        known = self.db.list_collection_names()
        if table not in known:
            print('删除{}表失败'.format(table))
        else:
            self.db[table].drop()
            print("删除{}表成功".format(table))
    except Exception as err:
        Logger('error').get_log().error(err)
def predict(self, k=4):
    """Return up to 10 labels similar to self.target.

    :param k: number of neighbours requested from the algorithm
    :return: list of neighbour labels (at most 10), or None on failure
    """
    try:
        raw_id = self.name_to_rid[self.target]
        inner_id = self.algo.trainset.to_inner_iid(raw_id)
        neighbor_ids = self.algo.get_neighbors(inner_id, k=k)
        # BUGFIX: the original built generator expressions and then called
        # len() and sliced them, which raises TypeError; use a list.
        neighbors = [
            self.rid_to_name[self.algo.trainset.to_raw_iid(iid)]
            for iid in neighbor_ids
        ]
        return neighbors if len(neighbors) < 10 else neighbors[:10]
    except PredictionImpossible as pl:
        Logger('error').get_log().error(pl)
    except Exception as ex:
        Logger('error').get_log().error(ex)
def rank(self, person, n=5):
    """Return the *n* users most similar to *person*.

    :param person: the reference user
    :param n: number of results to keep
    :return: list of (similarity, user) tuples, most similar first,
        or None on failure
    """
    try:
        # similarity between *person* and every other user
        scores = [(self.sim_distance(person, other), other)
                  for other in self.prefs if other != person]
        scores.sort(reverse=True)  # tuples sort on similarity first
        # BUGFIX: the original condition was inverted — it returned the
        # whole list exactly when there were MORE than n entries. Slicing
        # a shorter list is harmless, so just take the first n.
        return scores[:n]
    except Exception as ex:
        Logger('error').get_log().error(ex)
def read(self):
    """Read self.path as UTF-8 CSV and return all rows as a list of lists.

    :return: list of rows, or None on failure
    """
    try:
        with open(self.path, 'r', encoding='utf-8') as fh:
            return list(csv.reader(fh))
    except Exception as err:
        Logger('error').get_log().error(err)
def read(self, path):
    """Load and return the JSON document at *path*.

    :param path: file path
    :return: parsed object (typically a dict), or None on failure
    """
    try:
        with open(path, 'r') as fh:
            return json.load(fh)
    except Exception as err:
        Logger('error').get_log().error(err)
def dbSave(self, dbname='test'):
    """Collect every document of database *dbname* into a Bunch.

    :param dbname: MongoDB database name
    :return: Bunch with target_name/label/filenames/contents populated
        (possibly partially filled if an error occurred mid-way)
    """
    # BUGFIX: create the Bunch before the try block. The original built it
    # inside and then did `finally: return bunch`, which (a) raised
    # NameError when an early exception fired before bunch existed and
    # (b) silently swallowed every exception via return-in-finally.
    bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
    try:
        Logger().get_log().error('将数据库信息转化为bunch对象')
        pathdict = Path(dbname).get_PathDict()
        catelist = os.listdir(pathdict['DataBasePath'])
        bunch.target_name.extend(catelist)  # store the category names
        mongoDB = MongoDB()
        mongoDB.connect(dbname)
        for table in catelist:
            collection = mongoDB.insert(table)
            for item in collection.find():
                bunch.label.append(item['type'])
                bunch.filenames.append(item['file_name'])
                bunch.contents.append(item['content'].strip())
    except Exception as ex:
        Logger('error').get_log().error(ex)
    return bunch
def jsonReader(self, rating_scale=(1, 5)):
    """Load ratings stored as a JSON column dict into a surprise Dataset.

    Expected JSON shape::

        {'itemID': [...], 'userID': [...], 'rating': [...]}

    :param rating_scale: (min, max) rating bounds
    :return: surprise Dataset, or None on failure
    """
    try:
        raw = Json(self.config['path']).read()
        columns = list(raw.keys())
        frame = pd.DataFrame(raw)
        self.reader = Reader(rating_scale=rating_scale)
        # NOTE(review): surprise expects the columns strictly in
        # (user, item, rating) order; this passes them in file order —
        # confirm the JSON keys are arranged accordingly.
        return Dataset.load_from_df(frame[columns], reader=self.reader)
    except Exception as err:
        Logger('error').get_log().error(err)
        Logger('error').clear()
def read(self, line_format, sep=','):
    """Load self.path with a surprise Reader.

    :param line_format: e.g. 'user item rating'
    :param sep: field separator
    :return: surprise Dataset, or None on failure
    """
    try:
        fmt_reader = Reader(line_format=line_format, sep=sep)
        return Dataset.load_from_file(self.path, reader=fmt_reader)
    except Exception as err:
        Logger('error').get_log().error(err)
def documents_Init(self, DataBasePath, index_name='test'):
    """Read every txt document under *DataBasePath* and bulk-index it.

    :param DataBasePath: root folder of the data files
    :param index_name: elasticsearch index (database) name
    """
    print('将所有文档存储到elasticsearch')
    folder, txtFile = Folder(), TXTFile()
    # kept although unused: listdir also validates that the path exists
    indexnames = os.listdir(DataBasePath)
    save_dict = []
    content = folder.read('txt', txtFile)
    for item in content:
        # BUGFIX: the original stringified the whole `content` list on
        # every iteration instead of the current item.
        result = str(item).replace("\r\n", "").strip()  # drop blank lines/spaces
        cutResult = jieba.cut(result)  # jieba word segmentation
        # NOTE(review): the original left this append commented out (it
        # referenced undefined ChildPath/name), so nothing is ever indexed;
        # restore it once the per-file metadata is available:
        # save_dict.append(
        #     {'file_name': ChildPath, 'content': result, 'type': name, 'keywords': ' '.join(cutResult)})
    try:
        self.insert(index_name, save_dict)  # bulk import into elasticsearch
    except Exception as ex:
        Logger('error').get_log().error(ex)
    else:
        Logger().get_log().error('索引初始化完成')
def build(self):
    """Instantiate and run the engine selected by self.opts."""
    try:
        engine = self.opts['engine']
        if engine == 'TPEngine':
            pass
        elif engine == 'RSEngine':
            option = self.opts.get('option')
            if option:
                module_meta = __import__('Recommand', globals(), locals(), [option], level=1)
                recommender_cls = getattr(module_meta, option)
                recommender_cls(self.opts).run()
    except Exception as err:
        Logger('error').get_log().error(err)
def bunchSave(self, config, dbname='test'):
    """Build a Bunch from every collection listed under config['DataBasePath'].

    :param config: dict holding 'DataBasePath'
    :param dbname: unused, kept for interface compatibility
    :return: populated Bunch, or None on failure
    """
    try:
        print('将数据库信息转换为bunch对象')
        categories = os.listdir(config['DataBasePath'])
        bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
        bunch.target_name.extend(categories)  # store the category names
        for table in categories:
            for doc in self.db[table].find():
                bunch.label.append(doc['type'])
                bunch.filenames.append(doc['file_name'])
                bunch.contents.append(doc['content'].strip())
        return bunch
    except Exception as err:
        Logger('error').get_log().error(err)
def init_Path(self):
    """Register the working paths for self.database in the global Path_Dict.

    :return: the path dict for self.database (a KeyError escapes here if
        the try block failed before the entry was stored, as in the original)
    """
    try:
        base = DataBasePath + "\\" + self.database
        Path_Dict[self.database] = {
            "DataBasePath": base + "\\data",
            "TestBasePath": base + "\\test",
            "inputpath": base + "\\data",
            "outputpath": base + "\\segResult",
            "trainset": base + "\\trainset.dat",
            "tfidfspace": base + "\\tfidfspace.dat",
            "testbunch": base + "\\test_set.dat",
            "predictspace": base + "\\predict.dat",
            "stopwords": base + "\\stopword.txt",
        }
    except Exception as err:
        Logger('error').get_log().error(err)
    return Path_Dict[self.database]
def bunchSave(inputFile, outputFile):
    """Walk a two-level directory tree and pickle its contents as a Bunch.

    :param inputFile: root directory; each sub-directory is one category
    :param outputFile: pickle file to write (its directory must exist)
    """
    print('run bunchSave', time.time())
    start = time.time()
    catelist = os.listdir(inputFile)
    bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
    bunch.target_name.extend(catelist)  # store the category names
    for eachDir in catelist:
        eachPath = inputFile + "/" + eachDir + "/"
        fileList = os.listdir(eachPath)
        for eachFile in fileList:  # every file of this category
            fullName = eachPath + eachFile  # full path of the file
            bunch.label.append(eachDir)  # category label
            bunch.filenames.append(fullName)  # remember the file path
            bunch.contents.append(TXTFile(fullName).read().strip())  # raw text
    with open(outputFile, 'wb') as file_obj:  # pickle needs binary mode
        pickle.dump(bunch, file_obj)
    # BUGFIX: the original passed extra args to a %-style log message that
    # had no placeholders, and computed a NEGATIVE duration (start - now).
    Logger().get_log().error('finish bunch save %s, use time : %s',
                             time.time(), time.time() - start)
def train(self):
    """Fit self.algo on the full trainset built from self.data."""
    try:
        full_trainset = self.data.build_full_trainset()
        self.algo.fit(full_trainset)
    except Exception as err:
        Logger('error').get_log().error(err)
def search_many(self, collection, target: dict):
    """Run a find() query on *collection* with filter *target*.

    :return: the resulting cursor, or None on failure
    """
    try:
        coll = self.db[collection]
        return coll.find(target)
    except Exception as err:
        Logger('error').get_log().error(err)
def run(self):
    """Build the candidate recommenders, then filter for the best one."""
    try:
        for step in (self.getRS, self.filter):
            step()
    except Exception as err:
        Logger('error').get_log().error(err)
def update(self, collection, filter, update):
    """Atomically $set *update* fields on the first doc matching *filter*.

    :return: the matched document (pymongo returns the pre-update doc by
        default), or None on failure
    """
    try:
        coll = self.db[collection]
        return coll.find_one_and_update(filter, {'$set': update})
    except Exception as err:
        Logger('error').get_log().error(err)