def __init__(self, maxsize=32):
    """Initialize the encoder and start the background batching worker.

    Args:
        maxsize: Capacity of the input queue; producers block once it is full.
    """
    self.input_queue = Queue(maxsize=maxsize)
    # graph_path=None — presumably BertEncode falls back to a default model path; TODO confirm
    self.text2vec = BertEncode(graph_path=None)
    # daemon=True replaces the deprecated t.setDaemon(True) (deprecated since
    # Python 3.10); a daemon thread will not keep the process alive on exit.
    worker = threading.Thread(target=self.run, daemon=True)
    worker.start()
    # ANSI green escape so the readiness message stands out in console logs
    logger.info("\033[1;32mbert initialize ok\033[0m")
def found_in_intersect(status: Status, history: History, rev_root_path: str) -> bool:
    """Report whether the forward and reverse searches have met on a common page.

    The forward search expands via outgoing links and the reverse search via
    incoming links ("links_to"); when both have traversed the same page, the
    goal path is the forward path joined with the reversed reverse path.
    On a hit, results are finalized in BOTH search status records.

    Args:
        status: Status of the current search.
        history: History of the current search.
        rev_root_path: root_path of the mirror search running in reverse.

    Returns:
        True when an intersection was found and results were finalized.
    """
    status_rev = Status(status_db, rev_root_path)
    if not history.traversed_intersection(status.root_path, rev_root_path):
        return False
    path_to_goal = history.intersection_path(status.root_path, rev_root_path)
    status.finalize_results(path_to_goal)
    # The mirror search sees the same route in the opposite direction.
    path_to_goal_rev = path_to_goal[::-1]
    status_rev.finalize_results(path_to_goal_rev)
    logger.info(
        f"Intersection End link found!! path traversed and time to complete: {path_to_goal} or {path_to_goal_rev}"
    )
    return True
def found_in_page(status: Status, history: History, all_links: List[str], rev_root_path: str) -> bool:
    """Report whether the goal page appears among the links just scraped.

    On a hit, the goal is appended to the current page's traversed path and
    the finalized route is written to BOTH the current search's status and
    the mirror (reverse) search's status, the latter with the route reversed.

    Args:
        status: Status of the current search.
        history: History of the current search.
        all_links: New links discovered on the current query page.
        rev_root_path: root_path of the mirror search running in reverse.

    Returns:
        True when the end path was found and results were finalized.
    """
    status_rev = Status(status_db, rev_root_path)
    if status.end_path not in all_links:
        return False
    path = [*history.traversed_path, status.end_path]
    status.finalize_results(path)
    # The mirror search stores the same route in the opposite direction.
    path_rev = path[::-1]
    status_rev.finalize_results(path_rev)
    logger.info(
        f"End link found!! path traversed and time to complete: {path} or {path_rev}"
    )
    return True
def run(self) -> None:
    """Worker loop: drain the input queue, batch sentences, encode, publish vectors.

    Requests are accumulated into `inputs`/`tokens`; a flush happens once more
    than 64 sentences are pending, or some are pending and >0.1s has elapsed
    since the last flush. Runs forever — intended for a daemon thread.
    """
    inputs = []
    tokens = []
    start = time.time()
    while True:
        try:
            # Short timeout so the finally-block flush check runs frequently
            # even when the queue is idle.
            respect = self.input_queue.get(block=True, timeout=0.001)
            token = respect.token
            sentences = respect.values
            inputs.extend(sentences)
            # One token entry per sentence so results can be regrouped by token.
            tokens.extend([token] * len(sentences))
        except Empty as e:
            # Queue idle — `continue` still executes the finally block below,
            # which is what triggers time-based flushes.
            continue
        except Exception as e:
            logger.error(str(e))
            continue
        finally:
            end = time.time()
            interval = end - start
            # Flush on size (>64 pending) or on age (>0.1s with anything pending).
            if len(inputs) > 64 or (interval > 0.1 and len(inputs) > 0):
                logger.info(f"batch size: {len(inputs)}, time: {interval}")
                if len(inputs) > 512:
                    # Very large backlog: encode in chunks of 64 and concatenate.
                    vectors = []
                    n = int(len(inputs) // 64) + 1
                    for i in range(n):
                        sentences = inputs[i * 64:(i + 1) * 64]
                        if len(sentences) == 0:
                            continue
                        vector = self.text2vec.encode(sentences)
                        vectors.append(vector)
                    vectors = np.concatenate(vectors, axis=0)
                else:
                    vectors = self.text2vec.encode(inputs)
                # Group row indices by token so each requester gets back only
                # the vectors for its own sentences, in submission order.
                info = pd.DataFrame(np.array([tokens, inputs]).T,
                                    columns=["tokens", "inputs"]).groupby("tokens").indices
                reply_vectors = {
                    k: [vectors[i] for i in v]
                    for k, v in info.items()
                }
                # NOTE(review): self.output_queue is not defined in the visible
                # __init__ — presumably a shared dict set elsewhere; confirm.
                self.output_queue.update(reply_vectors)
                # Reset the batch and the age timer after a flush.
                inputs = []
                tokens = []
                start = time.time()
import sys

import spacy

from common.config import get_celery_app, logger, SPACY_LANG, SPACY_LOCAL

app = get_celery_app()

#############################################################
# SpaCy setup
logger.info("Loading NLP Info...")

# Location of the spaCy 2.2.5 model when it was downloaded and vendored
# under assets/; pytest runs from a different working directory, hence the
# "../nlp/" prefix in that case.
spacy_file = f"assets/{SPACY_LANG}-2.2.5/{SPACY_LANG}/{SPACY_LANG}-2.2.5"
if "pytest" in sys.modules:
    spacy_file = f"../nlp/{spacy_file}"

# SPACY_LOCAL == 'local' selects the vendored model on disk; otherwise the
# model installed at build time is loaded by name.
nlp = spacy.load(spacy_file) if SPACY_LOCAL == 'local' else spacy.load(SPACY_LANG)

# Words ignored during query/title processing.
stop_list = {"for", "a", "of", "the", "and", "to", "in", "go", "list"}
#############################################################
def __init__(self):
    """Initialize the servicer with a BERT sentence encoder."""
    super().__init__()
    # graph_path=None — presumably BertEncode falls back to a default model path; TODO confirm
    self.text2vec = BertEncode(graph_path=None)
    # ANSI green escape so the readiness message stands out in console logs
    logger.info("\033[1;32mbert initialize ok\033[0m")
from common.config import logger
from bert.bert2vec import BertEncode
from grpc_base import bert_server_queue_pb2, bert_server_queue_pb2_grpc
import platform
import sys
import pandas as pd

# NOTE(review): appended AFTER the imports above, so it cannot affect them;
# presumably needed for imports performed later — verify.
sys.path.append("../../")

# NOTE(review): BUG — this rebinds the module name `sys` to a string
# (platform.system()'s result), shadowing the `sys` module for the rest of
# this file. Rename to e.g. `system = platform.system()`.
sys = platform.system()
if sys == "Linux":
    # Automatically select the GPU with the most free memory.
    # NOTE(review): `os` (and `time`, used below) are not imported in this
    # chunk — presumably imported elsewhere in the file; confirm.
    os.system('nvidia-smi -q -d Memory |grep -A4 GPU|grep Free >tmp')
    memory_gpu = [int(x.split()[2]) for x in open('tmp', 'r').readlines()]
    gpu_id = memory_gpu.index(max(memory_gpu))
    # Pin TensorFlow/PyTorch to the chosen device.
    os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}"
    logger.info(f"\033[1;32m使用{gpu_id}号gpu\033[0m")


# Implements the BertServetServicer defined in the proto file.
class Greeter(bert_server_queue_pb2_grpc.BertServetServicer):
    def __init__(self):
        """Initialize the servicer with a BERT sentence encoder."""
        super().__init__()
        self.text2vec = BertEncode(graph_path=None)
        logger.info("\033[1;32mbert initialize ok\033[0m")

    def get_vectors(self, request, context):
        # Collect all sentences from the streamed request before encoding.
        # NOTE(review): this method appears truncated at the chunk boundary —
        # `tokens` and `start` are populated but never used in the visible code.
        inputs = []
        tokens = []
        start = time.time()
        for texts in request:
            inputs.extend(texts.sentences)