Exemplo n.º 1
0
 def __init__(self, maxsize=32):
     """Create the request queue and encoder, then start the worker thread.

     Args:
         maxsize: Upper bound handed to ``Queue`` for the number of
             requests that may wait in ``self.input_queue``.
     """
     self.input_queue = Queue(maxsize=maxsize)
     self.text2vec = BertEncode(graph_path=None)
     # Pass ``daemon=True`` to the constructor: ``Thread.setDaemon()`` is
     # deprecated since Python 3.10. The flag must be set before ``start()``.
     t = threading.Thread(target=self.run, daemon=True)
     t.start()
     logger.info("\033[1;32mbert initialize ok\033[0m")
Exemplo n.º 2
0
def found_in_intersect(status: Status, history: History,
                       rev_root_path: str) -> bool:
    """Finalize both searches when they have met on a common page.

    The forward search follows ``links`` and the reverse search follows
    ``links_to``, so a page reached by both of them (an intersection) joins
    the two partial routes into one complete wiki race path. When
    ``history.traversed_intersection`` reports such a page, the joined path
    from ``history.intersection_path`` is stored as the result of the current
    search, and a reversed copy of it is stored as the result of the reverse
    search.

    Args:
        status: Status of current search.
        history: History of current search.
        rev_root_path: The root_path of the same search going in reverse.

    Returns:
        True if an intersection was found and both results were finalized,
        False otherwise.
    """
    # The reverse search's status object is built up front, before the
    # intersection lookup, exactly as callers have always observed.
    reverse_status = Status(status_db, rev_root_path)

    if not history.traversed_intersection(status.root_path, rev_root_path):
        return False

    forward_path = history.intersection_path(status.root_path, rev_root_path)
    status.finalize_results(forward_path)

    # The reverse search stores the same route walked in the opposite direction.
    backward_path = forward_path.copy()
    backward_path.reverse()
    reverse_status.finalize_results(backward_path)

    logger.info(
        f"Intersection End link found!! path traversed and time to complete: {forward_path} or {backward_path}"
    )
    return True
Exemplo n.º 3
0
def found_in_page(status: Status, history: History, all_links: List[str],
                  rev_root_path: str) -> bool:
    """Finalize both searches when the end path is linked from this page.

    If ``status.end_path`` is among ``all_links``, the result path is a copy
    of ``history.traversed_path`` with the end path appended. That path is
    stored as the result of the current search, and a reversed copy of it is
    stored as the result of the reverse search.

    Args:
        status: Status of current search.
        history: History of current search.
        all_links: List of new links discovered on current query page.
        rev_root_path: The root_path of the same search going in reverse.

    Returns:
        True if the end path was found and both results were finalized,
        False otherwise.
    """
    # The reverse search's status object is built up front, before the
    # membership test, exactly as callers have always observed.
    reverse_status = Status(status_db, rev_root_path)

    if status.end_path not in all_links:
        return False

    forward_path = history.traversed_path.copy()
    forward_path.append(status.end_path)
    status.finalize_results(forward_path)

    # The reverse search stores the same route walked in the opposite direction.
    backward_path = forward_path.copy()
    backward_path.reverse()
    reverse_status.finalize_results(backward_path)

    logger.info(
        f"End link found!! path traversed and time to complete: {forward_path} or {backward_path}"
    )
    return True
Exemplo n.º 4
0
 def run(self):
     """Worker loop: collect queued requests and encode them in batches.

     Never returns. Every iteration waits up to 1 ms for a request on
     ``self.input_queue``; a request contributes its ``values`` (sentences)
     to the pending batch together with one copy of its ``token`` per
     sentence. Whether or not a request arrived, the batch is then flushed
     if it holds more than 64 sentences, or if it is non-empty and more
     than 0.1 s have passed since the previous flush.

     A flush encodes the pending sentences with ``self.text2vec.encode``
     (in slices of 64 when there are more than 512 of them), groups the
     resulting vectors by token and merges that mapping into
     ``self.output_queue`` through ``update``.
     """
     pending_sentences = []
     pending_tokens = []
     batch_started = time.time()
     while True:
         try:
             request = self.input_queue.get(block=True, timeout=0.001)
             request_token = request.token
             request_sentences = request.values
             pending_sentences.extend(request_sentences)
             pending_tokens.extend(
                 [request_token] * len(request_sentences))
         except Empty:
             # Nothing arrived within the timeout; the flush check in the
             # ``finally`` clause below still runs.
             pass
         except Exception as err:
             logger.error(str(err))
         finally:
             # ``finally`` makes the flush check run on every iteration,
             # including the ones where the queue was empty or failed.
             elapsed = time.time() - batch_started
             batch_size = len(pending_sentences)
             if batch_size > 64 or (elapsed > 0.1 and batch_size > 0):
                 logger.info(f"batch size: {batch_size}, time: {elapsed}")
                 if batch_size > 512:
                     # Very large batches are encoded 64 sentences at a
                     # time and the partial results are stitched together.
                     vectors = np.concatenate(
                         [
                             self.text2vec.encode(
                                 pending_sentences[lo:lo + 64])
                             for lo in range(0, batch_size, 64)
                         ],
                         axis=0,
                     )
                 else:
                     vectors = self.text2vec.encode(pending_sentences)
                 # Map every token to the row positions of its sentences.
                 frame = pd.DataFrame(
                     np.array([pending_tokens, pending_sentences]).T,
                     columns=["tokens", "inputs"],
                 )
                 rows_by_token = frame.groupby("tokens").indices
                 reply_vectors = {
                     token: [vectors[row] for row in rows]
                     for token, rows in rows_by_token.items()
                 }
                 self.output_queue.update(reply_vectors)
                 # Start a fresh batch and restart its timer.
                 pending_sentences = []
                 pending_tokens = []
                 batch_started = time.time()
Exemplo n.º 5
0
import sys

import spacy

from common.config import get_celery_app, logger, SPACY_LANG, SPACY_LOCAL

app = get_celery_app()

#############################################################
# SpaCy setup

logger.info("Loading NLP Info...")

# Location of a spaCy model that was downloaded and stored locally. When
# pytest has been imported the assets are addressed through ../nlp instead.
spacy_file = (
    f"../nlp/assets/{SPACY_LANG}-2.2.5/{SPACY_LANG}/{SPACY_LANG}-2.2.5"
    if "pytest" in sys.modules else
    f"assets/{SPACY_LANG}-2.2.5/{SPACY_LANG}/{SPACY_LANG}-2.2.5")

# SPACY_LOCAL == 'local' loads the locally stored copy from the path above;
# any other value loads the model by name (the one downloaded upon build).
nlp = spacy.load(spacy_file if SPACY_LOCAL == 'local' else SPACY_LANG)

# Words collected in ``stop_list``.
stop_words = "for a of the and to in go list".split()
stop_list = set(stop_words)

#############################################################
Exemplo n.º 6
0
 def __init__(self):
     """Initialise the base class and attach a ``BertEncode`` instance.

     The encoder is created with ``graph_path=None`` and stored on
     ``self.text2vec``; a log line is emitted once construction is done.
     """
     super().__init__()
     encoder = BertEncode(graph_path=None)
     self.text2vec = encoder
     logger.info("\033[1;32mbert initialize ok\033[0m")
Exemplo n.º 7
0
import os
import platform
import subprocess
import sys

import pandas as pd

from common.config import logger
from bert.bert2vec import BertEncode
from grpc_base import bert_server_queue_pb2, bert_server_queue_pb2_grpc

sys.path.append("../../")
# Keep the platform name in its own variable: binding it to ``sys`` would
# shadow the imported ``sys`` module for the rest of the file.
system_name = platform.system()
if system_name == "Linux":
    # Automatically pick the GPU that reports the most free memory.
    # The nvidia-smi output is captured directly instead of being redirected
    # to a file called ``tmp`` in the working directory, which left a stray
    # file behind and an open handle that was never closed.
    gpu_query = subprocess.run(
        'nvidia-smi -q -d Memory |grep -A4 GPU|grep Free',
        shell=True,
        stdout=subprocess.PIPE,
        universal_newlines=True,
    )
    # Each matching line is split on whitespace and its third field is
    # parsed as the free-memory figure.
    memory_gpu = [
        int(line.split()[2])
        for line in gpu_query.stdout.splitlines()
        if line.strip()
    ]
    if memory_gpu:
        gpu_id = memory_gpu.index(max(memory_gpu))
        os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}"
        logger.info(f"\033[1;32m使用{gpu_id}号gpu\033[0m")
    else:
        # No free-memory line was found (for example nvidia-smi is missing):
        # leave CUDA_VISIBLE_DEVICES untouched rather than failing on max([]).
        logger.warning(
            "No GPU free-memory information found; "
            "CUDA_VISIBLE_DEVICES left unchanged")


# 实现 proto 文件中定义的 BertServetServicer
class Greeter(bert_server_queue_pb2_grpc.BertServetServicer):
    def __init__(self):
        """Initialise the servicer base class and attach a ``BertEncode``.

        The encoder is created with ``graph_path=None`` and stored on
        ``self.text2vec``; a log line is emitted once construction is done.
        """
        super().__init__()
        encoder = BertEncode(graph_path=None)
        self.text2vec = encoder
        logger.info("\033[1;32mbert initialize ok\033[0m")

    def get_vectors(self, request, context):
        inputs = []
        tokens = []
        start = time.time()
        for texts in request:
            inputs.extend(texts.sentences)