def __proportional_function(self, doc, token, new_topic):
    """Compute the unnormalized sampling proportion for assigning *new_topic*.

    Dispatches on the document type:
      - (LDADoc, Token): standard per-token proportion
        (doc-topic count + alpha) * (word-topic count + beta) / (topic total + beta_sum).
      - (SLDADoc, Sentence): sentence-level proportion — the doc-topic factor
        times the word-topic ratio of every word in the sentence.

    When ``new_topic`` equals the token's current topic, counts are decremented
    by one — presumably to exclude the current assignment from the statistics,
    as in collapsed Gibbs sampling (TODO confirm against the sampler caller).

    Returns the proportion (float), or None after logging an error when the
    argument types do not match either supported pair.
    """
    if isinstance(doc, LDADoc) and isinstance(token, Token):
        old_topic = token.topic
        # Document-topic count smoothed by the Dirichlet prior alpha.
        dt_alpha = doc.topic_sum(new_topic) + self.__model.alpha()
        # Word-topic count smoothed by beta.
        wt_beta = self.__model.word_topic_value(token.id, new_topic) + self.__model.beta()
        # Total tokens in the topic smoothed by beta_sum (denominator).
        t_sum_beta_sum = self.__model.topic_sum_value(new_topic) + self.__model.beta_sum()
        if new_topic == old_topic and wt_beta > 1:
            # Remove the current assignment's own contribution from the counts.
            if dt_alpha > 1:
                dt_alpha -= 1
            wt_beta -= 1
            t_sum_beta_sum -= 1
        return dt_alpha * wt_beta / t_sum_beta_sum
    elif isinstance(doc, SLDADoc) and isinstance(token, Sentence):
        sent = token
        old_topic = sent.topic
        # Start from the smoothed document-topic factor.
        result = doc.topic_sum(new_topic) + self.__model.alpha()
        if new_topic == old_topic:
            result -= 1
        # Multiply in the word-topic ratio for every word of the sentence.
        for word_id in sent.tokens:
            wt_beta = self.__model.word_topic_value(word_id, new_topic) + self.__model.beta()
            t_sum_beta_sum = self.__model.topic_sum_value(new_topic) + self.__model.beta_sum()
            if new_topic == old_topic and wt_beta > 1:
                wt_beta -= 1
                t_sum_beta_sum -= 1
            result *= wt_beta / t_sum_beta_sum
        return result
    else:
        logger.error("Wrong input argument type!")
def infer(self, input, doc):
    """Perform LDA topic inference on input, and store the results in doc.

    Args:
        input: a list of strings after tokenization.
        doc: LDADoc type or SLDADoc type.
    """
    fix_random_seed()
    is_plain_lda = isinstance(doc, LDADoc) and not isinstance(doc, SLDADoc)
    if is_plain_lda:
        doc.init(self.__model.num_topics())
        doc.set_alpha(self.__model.alpha())
        # Each in-vocabulary token starts from a random topic assignment.
        for word in input:
            word_id = self.__model.term_id(word)
            if word_id == OOV:
                continue
            doc.add_token(Token(rand_k(self.__model.num_topics()), word_id))
        self.lda_infer(doc, 20, 50)
    elif isinstance(doc, SLDADoc):
        doc.init(self.__model.num_topics())
        doc.set_alpha(self.__model.alpha())
        # Sentence-LDA: one random topic per sentence over its known words.
        for sent in input:
            word_ids = []
            for word in sent:
                word_id = self.__model.term_id(word)
                if word_id != OOV:
                    word_ids.append(word_id)
            doc.add_sentence(Sentence(rand_k(self.__model.num_topics()), word_ids))
        self.slda_infer(doc, 20, 50)
    else:
        logger.error("Wrong Doc Type!")
def check_server(self):
    """Probe each configured server and collect matching serving endpoints.

    For every 'ip:port' entry in ``self.server_list``, sends a status query;
    when the server reports status 0 and serves ``self.model_name``, appends
    the advertised 'ip:serving_port' endpoint to ``self.serving_list``.
    Mismatches and failures are logged, not raised.
    """
    for server in self.server_list:
        parts = server.split(':')
        server_ip = parts[0]
        server_port = int(parts[1])
        client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            client.connect((server_ip, server_port))
            client.send(b'pending server')
            response = client.recv(1024).decode()
        finally:
            # Fix: the socket was previously never closed (fd leak per server).
            client.close()
        response_list = response.split('\t')
        status_code = int(response_list[0].split(':')[1])
        if status_code == 0:
            server_model = response_list[1].split(':')[1]
            if server_model == self.model_name:
                serving_port = response_list[2].split(':')[1]
                self.serving_list.append(server_ip + ':' + serving_port)
            else:
                logger.error(
                    'model_name not match, server {} using : {} '.format(
                        server, server_model))
        else:
            error_msg = response_list[1]
            logger.error('connect server {} failed. {}'.format(
                server, error_msg))
def get_face_landmark(self, image):
    """Predict the 68 facial landmark coordinates and draw them on the image.

    Args:
        image(ndarray): pixel data of a single image.

    Returns:
        (True, landmarks, annotated_img) on success, where ``landmarks`` is
        the list of 68 (x, y) points and ``annotated_img`` is the image with
        landmarks drawn (read back via matplotlib, so RGB channel order);
        (False, None, None) on failure.
    """
    try:
        # Set use_gpu=True (and export CUDA_VISIBLE_DEVICES beforehand) to run on GPU.
        res = self.module.keypoint_detection(images=[image], use_gpu=False)
        tmp_img = image.copy()
        for point in res[0]['data'][0]:
            cv2.circle(tmp_img, (int(point[0]), int(point[1])), 2, (0, 0, 255), -1)
        res_img_path = 'face_landmark.jpg'
        cv2.imwrite(res_img_path, tmp_img)
        # Read the annotated file back so the returned array matches matplotlib's format.
        img = mpimg.imread(res_img_path)
        return True, res[0]['data'][0], img
    except Exception as e:
        logger.error("Get face landmark localization failed! Exception: %s " % e)
        return False, None, None
def encode(self, text):
    """Send *text* (a list of samples) to the serving cluster for encoding.

    Args:
        text(list): samples to encode.

    Returns:
        list of encoded values on success; -1 when no server matches or the
        request ultimately fails after retries.

    Raises:
        TypeError: if *text* is not a list.
    """
    if len(self.serving_list) == 0:
        logger.error('No match server.')
        return -1
    if not isinstance(text, list):
        raise TypeError('Only support list')
    request_msg = self.prepare_data(text)
    response_msg = self.request_server(request_msg)
    retry = 0
    while isinstance(response_msg, str) and response_msg == 'retry':
        if retry < self.retry:
            retry += 1
            logger.info('Try to connect another servers')
            response_msg = self.request_server(request_msg)
        else:
            logger.error('Request failed after {} times retry'.format(
                self.retry))
            break
    # Fix: request_server returns the strings 'retry'/'fail' on failure;
    # previously this fell through and crashed on response_msg["instances"].
    if isinstance(response_msg, str):
        return -1
    result = []
    for msg in response_msg["instances"]:
        for sample in msg["instances"]:
            result.append(sample["values"])
    return result
def _init_with_url(self, url):
    """Download a module package from *url*, uncompress it and initialize
    this object from the extracted module directory. Exits on download
    failure after logging the downloader's message.
    """
    utils.check_url(url)
    success, tips, module_dir = default_downloader.download_file_and_uncompress(
        url, save_path=".")
    if not success:
        logger.error(tips)
        exit(1)
    self._init_with_module_file(module_dir)
def check_requirements(self):
    """Verify that the optional third-party packages shapely and pyclipper
    are importable; log an error and exit the process when they are not.
    """
    try:
        import shapely  # noqa: F401
        import pyclipper  # noqa: F401
    # Fix: a bare `except:` also swallowed KeyboardInterrupt/SystemExit;
    # only a failed import should trigger the message.
    except ImportError:
        logger.error(
            'This module requires the shapely, pyclipper tools. The running enviroment does not meet the requirments. Please install the two packages.'
        )
        exit()
def get_face_landmark(self, image, use_gpu=False):
    """Predict the 68 facial landmark coordinates for one image.

    Args:
        image: pixel data of a single image.
        use_gpu: whether to run the detector on GPU.

    Returns:
        (True, landmarks) on success, (False, None) on failure.
    """
    try:
        outputs = self.module.keypoint_detection(images=[image], use_gpu=use_gpu)
        return True, outputs[0]['data'][0]
    except Exception as e:
        logger.error(
            "Get face landmark localization failed! Exception: %s " % e)
        return False, None
def _init_with_name(self, name, version=None):
    """Install the named module (optionally pinned to *version*) and
    initialize this object from the installed module directory.
    Exits the process when installation fails.
    """
    message = "Installing %s module" % name
    if version:
        message += "-%s" % version
    logger.info(message)
    ok, tips, module_dir = default_module_manager.install_module(
        module_name=name, module_version=version)
    if not ok:
        logger.error(tips)
        exit(1)
    logger.info(tips)
    self._init_with_module_file(module_dir[0])
def get_face_landmark(self, image):
    """Predict the 68 facial landmark coordinates.

    Args:
        image(ndarray): pixel data of a single image.

    Returns:
        (True, landmarks) on success, (False, None) on failure.
    """
    try:
        outputs = self.module.keypoint_detection(images=[image])
        return True, outputs[0]['data'][0]
    except Exception as e:
        logger.error(
            "Get face landmark localization failed! Exception: %s " % e)
        return False, None
def get_face_landmark(self, image):
    """Predict the 68 facial landmark coordinates.

    Args:
        image(ndarray): pixel data of a single image.

    Returns:
        (True, landmarks) on success, (False, None) on failure.
    """
    try:
        # Pass use_gpu=True (with CUDA_VISIBLE_DEVICES set before running) to use the GPU.
        outputs = self.module.keypoint_detection(images=[image], use_gpu=False)
        return True, outputs[0]['data'][0]
    except Exception as e:
        logger.error(
            "Get face landmark localization failed! Exception: %s " % e)
        return False, None
def _init_with_name(self, name, version=None):
    """Install the named module (optionally pinned to *version*) and
    initialize this object from the installed module directory.

    Raises:
        RuntimeError: when the module manager reports a failed install.
    """
    message = "Installing %s module" % name
    if version:
        message += "-%s" % version
    logger.info(message)
    extra = {"command": "install"}
    ok, tips, module_dir = default_module_manager.install_module(
        module_name=name, module_version=version, extra=extra)
    if not ok:
        logger.error(tips)
        raise RuntimeError(tips)
    logger.info(tips)
    self._init_with_module_file(module_dir[0])
def load(self, vocab_file):
    """Load a term<->id vocabulary from a tab-separated file.

    Each line must contain exactly 5 tab-separated fields; the term is
    field 1 and its integer id is field 2 (other fields are ignored).
    Duplicate terms are logged and skipped. Populates ``self.__term2id``
    and ``self.__id2term``.

    Args:
        vocab_file(str): path to the vocabulary file (UTF-8).
    """
    self.__term2id = {}
    self.__id2term = {}
    with open(vocab_file, 'r', encoding='utf-8') as fin:
        # Iterate the file lazily instead of readlines() — avoids holding
        # the whole vocabulary file in memory at once.
        for line in fin:
            fields = line.strip().split('\t')
            assert len(
                fields) == 5, "Vocabulary file [%s] format error!" % (
                    vocab_file)
            term = fields[1]
            id_ = int(fields[2])
            if term in self.__term2id:
                logger.error("Duplicate word [%s] in vocab file!" % (term))
                continue
            self.__term2id[term] = id_
            self.__id2term[id_] = term
def init_with_name(cls, name, version=None, **kwargs):
    """Install the named module under a file lock and build an instance from
    the installed directory.

    The lock file under CACHE_HOME serializes concurrent installs of the
    same module.

    Raises:
        RuntimeError: when the module manager reports a failed install.
    """
    fp_lock = open(os.path.join(CACHE_HOME, name), "a")
    try:
        lock.flock(fp_lock, lock.LOCK_EX)
        log_msg = "Installing %s module" % name
        if version:
            log_msg += "-%s" % version
        logger.info(log_msg)
        extra = {"command": "install"}
        result, tips, module_dir = default_module_manager.install_module(
            module_name=name, module_version=version, extra=extra)
        if not result:
            logger.error(tips)
            raise RuntimeError(tips)
        logger.info(tips)
    finally:
        # Fix: on the failure path the exclusive lock was never released and
        # the lock file handle leaked; always unlock and close.
        lock.flock(fp_lock, lock.LOCK_UN)
        fp_lock.close()
    return cls.init_with_directory(directory=module_dir[0], **kwargs)
def show_topic_keywords(self, topic_id, k=10):
    """
    This interface returns first k keywords under specific topic.

    Args:
        topic_id(int): topic information we want to know.
        k(int): top k keywords.

    Returns:
        results(dict): contains specific topic's keywords and
            corresponding probability. Empty when topic_id is out of
            range (an error is logged).
    """
    EPS = 1e-8  # guards against division by zero for empty topics
    results = {}
    if 0 <= topic_id < self.config.num_topics:
        k = min(k, len(self.topic_words[topic_id]))
        for i in range(k):
            prob = self.topic_words[topic_id][i].count / \
                (self.topic_sum_table[topic_id] + EPS)
            results[self.vocabulary[self.topic_words[topic_id][i].word_id]] = prob
    else:
        logger.error("%d is out of range!" % topic_id)
    # Fix: previously the out-of-range branch implicitly returned None while
    # the success branch returned a dict; always return a dict.
    return results
def request_server(self, request_msg):
    """POST *request_msg* to a serving instance chosen by the configured
    load-balance policy ('round_robin', 'random' or 'bind').

    Returns:
        The decoded JSON response on success; the string 'retry' when this
        attempt failed but another server may be tried; the string 'fail'
        when no server is left.
    """

    def _post_to_current():
        # One HTTP round-trip against the currently selected server.
        conn = httplib.HTTPConnection(self.serving_list[self.con_index])
        try:
            conn.request('POST', "/BertService/inference", request_msg,
                         {"Content-Type": "application/json"})
            return ujson.loads(conn.getresponse().read())
        finally:
            # Fix: connections were previously never closed.
            conn.close()

    def _on_error(err, pick_next_index):
        # Shared failure path: warn, bail out if no servers remain, otherwise
        # advance con_index per the active policy and ask the caller to retry.
        logger.warning("Infer Error with server {} : {}".format(
            self.serving_list[self.con_index], err))
        if len(self.serving_list) == 0:
            logger.error('All server failed, process will exit')
            return 'fail'
        self.con_index = pick_next_index()
        return 'retry'

    if self.load_balance == 'round_robin':
        try:
            response_msg = _post_to_current()
            self.con_index = (self.con_index + 1) % len(self.serving_list)
            return response_msg
        except BaseException as err:
            return _on_error(err, lambda: self.con_index + 1)
    elif self.load_balance == 'random':
        try:
            random.seed()
            self.con_index = random.randint(0, len(self.serving_list) - 1)
            logger.info(self.con_index)
            return _post_to_current()
        except BaseException as err:
            return _on_error(
                err, lambda: random.randint(0, len(self.serving_list) - 1))
    elif self.load_balance == 'bind':
        try:
            self.con_index = int(self.process_id) % len(self.serving_list)
            return _post_to_current()
        except BaseException as err:
            return _on_error(
                err, lambda: int(self.process_id) % len(self.serving_list))
def _load_test_examples(self, version_2_with_negative=False, is_training=False):
    """Stub: this dataset provides no test split.

    Records the absence by setting ``self.test_file`` to None and logs an
    error; the keyword arguments are accepted only to match the sibling
    loaders' signatures and are ignored.
    """
    self.test_file = None
    logger.error("not test_file")
def __init__(
        self,
        num_classes,
        dataset=None,
        feed_list=None,  # Deprecated
        data_reader=None,  # Deprecated
        feature=None,
        token_feature=None,
        network=None,
        startup_program=None,
        config=None,
        hidden_units=None,
        metrics_choices="default"):
    """
    Args:
        num_classes: total labels of the text classification task.
        feed_list(list): the variable name that will be feeded to the main program, Deprecated in paddlehub v1.8.
        data_reader(object): data reader for the task. It must be one of ClassifyReader and LACClassifyReader, Deprecated in paddlehub v1.8..
        feature(Variable): the `feature` will be used to classify texts. It must be the sentence-level feature, shape as [-1, emb_size]. `Token_feature` and `feature` couldn't be setted at the same time. One of them must be setted as not None. Default None.
        token_feature(Variable): the `feature` will be used to connect the pre-defined network. It must be the token-level feature, shape as [-1, seq_len, emb_size]. Default None.
        network(str): the pre-defined network. Choices: 'bilstm', 'bow', 'cnn', 'dpcnn', 'gru' and 'lstm'. Default None. If network is setted, then `token_feature` must be setted and `feature` must be None.
        startup_program (object): the customized startup program, default None.
        config (RunConfig): run config for the task, such as batch_size, epoch, learning_rate setting and so on. Default None.
        hidden_units(list): the element of `hidden_units` list is the full-connect layer size. It will add the full-connect layers to the program. Default None.
        metrics_choices(list): metrics used to the task, default ["acc"]. Choices: acc, precision, recall, f1, matthews.
    """
    # Exactly one of `feature` (sentence-level) and `token_feature`
    # (token-level) must be provided; exit otherwise.
    if (not feature) and (not token_feature):
        logger.error(
            'Both token_feature and feature are None, one of them must be set.'
        )
        exit(1)
    elif feature and token_feature:
        logger.error(
            'Both token_feature and feature are set. One should be set, the other should be None.'
        )
        exit(1)
    # When a pre-defined network is requested it consumes the 3-D
    # token-level feature; otherwise the 2-D sentence-level feature is
    # classified directly.
    if network:
        assert network in [
            'bilstm', 'bow', 'cnn', 'dpcnn', 'gru', 'lstm'
        ], 'network (%s) choice must be one of bilstm, bow, cnn, dpcnn, gru, lstm!' % network
        assert token_feature and (
            not feature
        ), 'If you wanna use network, you must set token_feature ranther than feature for TextClassifierTask!'
        assert len(
            token_feature.shape
        ) == 3, 'When you use network, the parameter token_feature must be the token-level feature([batch_size, max_seq_len, embedding_size]), shape as [-1, 128, 200].'
    else:
        assert feature and (
            not token_feature
        ), 'If you do not use network, you must set feature ranther than token_feature for TextClassifierTask!'
        assert len(
            feature.shape
        ) == 2, 'When you do not use network, the parameter feture must be the sentence-level feature ([batch_size, hidden_size]), such as the pooled_output of ERNIE, BERT, RoBERTa and ELECTRA module.'

    self.network = network

    if metrics_choices == "default":
        metrics_choices = ["acc"]

    # Whichever feature was supplied is forwarded to the base task.
    super(TextClassifierTask, self).__init__(
        dataset=dataset,
        data_reader=data_reader,
        feature=feature if feature else token_feature,
        num_classes=num_classes,
        feed_list=feed_list,
        startup_program=startup_program,
        config=config,
        hidden_units=hidden_units,
        metrics_choices=metrics_choices)