Exemplo n.º 1
0
def main(task, num_docs, top_k):
    """Check the workspace state for ``task`` and then run the matching handler.

    :param task: name of the task to run. ``index``, ``index_incremental``,
        ``query`` and ``query_restful`` are dispatched; any other value runs
        nothing after the workspace checks.
    :param num_docs: forwarded to ``index`` / ``index_incremental``.
    :param top_k: forwarded to ``query``.

    Exits the process with status 1 when an index-like task finds the workspace
    already present, or a query-like task finds it missing.
    """
    config()
    workspace = os.environ['JINA_WORKSPACE']

    # Substring match on purpose: covers both 'index' and 'index_incremental'.
    if 'index' in task and os.path.exists(workspace):
        logger.error(
            f'\n +------------------------------------------------------------------------------------+ \
                    \n |                                   🤖🤖🤖                                           | \
                    \n | The directory {workspace} already exists. Please remove it before indexing again.  | \
                    \n |                                   🤖🤖🤖                                           | \
                    \n +------------------------------------------------------------------------------------+'
        )
        sys.exit(1)

    # Substring match on purpose: covers both 'query' and 'query_restful'.
    if 'query' in task and not os.path.exists(workspace):
        print(f'The directory {workspace} does not exist. Please index first via `python app.py -t index`')
        sys.exit(1)

    # Ordered (task name, thunk) pairs; the thunks keep handler lookup lazy so
    # only the selected handler is ever resolved and called.
    handlers = (
        ('index', lambda: index(num_docs)),
        ('index_incremental', lambda: index_incremental(num_docs)),
        ('query', lambda: query(top_k)),
        ('query_restful', lambda: query_restful()),
    )
    for name, run in handlers:
        if task == name:
            run()
            break
Exemplo n.º 2
0
def main(task, num_docs_index):
    """Run ``task`` against the workspace named by the ``JINA_WORKSPACE`` env var.

    :param task: name of the task to run. ``index``, ``index_restful``,
        ``query_restful`` and ``dryrun`` are dispatched; any other value runs
        nothing after the workspace check.
    :param num_docs_index: forwarded to ``index`` / ``index_restful``.

    Exits the process with status 1 when an index-like task finds the workspace
    already present, or ``query_restful`` finds it missing.
    """
    config()
    workspace = os.environ['JINA_WORKSPACE']
    # Substring match on purpose: covers both 'index' and 'index_restful'.
    if 'index' in task:
        if os.path.exists(workspace):
            # The banner previously contained U+FFFD replacement characters
            # where the robot emoji had been corrupted; restored here.
            logger.error(
                f'\n +------------------------------------------------------------------------------------+ \
                    \n |                                   🤖🤖🤖                                           | \
                    \n | The directory {workspace} already exists. Please remove it before indexing again.  | \
                    \n |                                   🤖🤖🤖                                           | \
                    \n +------------------------------------------------------------------------------------+'
            )
            sys.exit(1)
    if task == 'index':
        index(num_docs_index)
    elif task == 'index_restful':
        index_restful(num_docs_index)
    elif task == 'query_restful':
        # Querying needs a workspace produced by an earlier index run.
        if not os.path.exists(workspace):
            logger.error(
                f'The directory {workspace} does not exist. Please index first via `python app.py -t index`'
            )
            sys.exit(1)
        query_restful()
    elif task == 'dryrun':
        dryrun()
Exemplo n.º 3
0
 def _parse_pdf(self, doc: Document):
     """Open the PDF carried by ``doc`` with both ``fitz`` and ``pdfplumber``.

     Exactly one source is opened. The in-memory ``doc.buffer`` takes
     precedence over ``doc.uri``; previously, when both were set, the URI
     handles were opened first and then overwritten by the buffer handles,
     leaking two open files.

     :param doc: document whose ``buffer`` holds the PDF bytes or whose
         ``uri`` points at the PDF.
     :return: tuple ``(pdf_img, pdf_text)``. Either element is ``None`` when
         it was not opened (no source set, or opening raised). Failures are
         logged, not raised.
     """
     pdf_img = None
     pdf_text = None
     try:
         # Order matters: the buffer wins when both buffer and uri are present.
         if doc.buffer:
             pdf_img = fitz.open(stream=doc.buffer, filetype='pdf')
             pdf_text = pdfplumber.open(io.BytesIO(doc.buffer))
         elif doc.uri:
             pdf_img = fitz.open(doc.uri)
             pdf_text = pdfplumber.open(doc.uri)
     except Exception as ex:
         # Best effort: the caller receives whatever was opened before the failure.
         logger.error(f'Failed to open due to: {ex}')
     return pdf_img, pdf_text
Exemplo n.º 4
0
def download_mermaid_url(mermaid_url, output) -> None:
    """
    Download the image at mermaid_url and write it to output.

    The download is best-effort: any failure is logged through jina's default
    logger and swallowed, never raised to the caller.

    :param mermaid_url: The URL of the image.
    :param output: A filename specifying the name of the image to be created, the suffix svg/jpg determines the file type of the output image.
    """
    try:
        req = Request(mermaid_url, headers={'User-Agent': 'Mozilla/5.0'})
        # Fetch the whole payload before touching `output`, so a failed download
        # neither truncates an existing file nor leaves an empty one behind.
        # The `with` also guarantees the response is closed.
        with urlopen(req) as resp:
            payload = resp.read()
        with open(output, 'wb') as fp:
            fp.write(payload)
    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt and
        # SystemExit still propagate.
        from jina.logging import default_logger
        default_logger.error('can not download image, please check your graph and the network connections')
Exemplo n.º 5
0
def download_mermaid_url(mermaid_url, output) -> None:
    """
    Download the image at mermaid_url and write it to output.

    The download is best-effort: any failure is logged through jina's default
    logger and swallowed, never raised to the caller.

    :param mermaid_url: the URL the image is fetched from
    :param output: the file path the downloaded bytes are written to
    """
    try:
        req = Request(mermaid_url, headers={'User-Agent': 'Mozilla/5.0'})
        # Fetch the whole payload before touching `output`, so a failed download
        # neither truncates an existing file nor leaves an empty one behind.
        # The `with` also guarantees the response is closed.
        with urlopen(req) as resp:
            payload = resp.read()
        with open(output, 'wb') as fp:
            fp.write(payload)
    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt and
        # SystemExit still propagate.
        from jina.logging import default_logger
        default_logger.error(
            'can not download image, please check your graph and the network connections'
        )
Exemplo n.º 6
0
def index(num_docs: int) -> None:
    """Index up to ``num_docs`` PDF files through the flow in ``flows/index.yml``.

    :param num_docs: upper bound on the number of ``*.pdf`` files taken from
        ``PDF_DATA_PATH``.

    Exits the process with status 1 when the directory named by the
    ``JINA_WORKSPACE`` env var already exists.
    """
    workspace = os.environ['JINA_WORKSPACE']
    if os.path.exists(workspace):
        # The banner previously contained U+FFFD replacement characters where
        # the robot emoji had been corrupted; restored here.
        logger.error(
            f'\n +---------------------------------------------------------------------------------+ \
                        \n |                                   🤖🤖🤖                                        | \
                        \n | The directory {workspace} already exists. Please remove it before indexing again. | \
                        \n |                                   🤖🤖🤖                                        | \
                        \n +---------------------------------------------------------------------------------+'
        )
        sys.exit(1)
    # Slice after globbing: at most `num_docs` files, in whatever order glob yields them.
    pdf_files = glob.glob(os.path.join(PDF_DATA_PATH, '*.pdf'))[:num_docs]
    f = Flow.load_config('flows/index.yml')
    with f:
        # TimeContext wraps the post call and is handed the flow's own logger.
        with TimeContext(f'QPS: indexing {len(pdf_files)}', logger=f.logger):
            f.post('/index', inputs=index_generator(pdf_files))
Exemplo n.º 7
0
def main(task: str, num_docs: int, force: bool):
    """Run ``task`` against the workspace named by the ``JINA_WORKSPACE`` env var.

    :param task: ``index`` or ``query_restful``; any other value does nothing.
    :param num_docs: forwarded to ``index``.
    :param force: when indexing over an existing workspace, delete it first
        instead of aborting.

    Exits the process with status 1 when indexing would overwrite an existing
    workspace without ``force``, or when querying without a workspace.
    """
    workspace = os.environ['JINA_WORKSPACE']
    if task == 'index':
        if os.path.exists(workspace):
            if force:
                # Caller explicitly asked to discard the previous index.
                shutil.rmtree(workspace)
            else:
                # The banner previously contained U+FFFD replacement characters
                # where the robot emoji had been corrupted; restored here.
                logger.error(
                    f'\n +----------------------------------------------------------------------------------+ \
                        \n |                                   🤖🤖🤖                                         | \
                        \n | The directory {workspace} already exists. Please remove it before indexing again.  | \
                        \n |                                   🤖🤖🤖                                         | \
                        \n +----------------------------------------------------------------------------------+'
                )
                sys.exit(1)
        index(num_docs)
    elif task == 'query_restful':
        # Querying needs a workspace produced by an earlier index run.
        if not os.path.exists(workspace):
            logger.error(
                f'The directory {workspace} does not exist. Please index first via `python app.py -t index`'
            )
            sys.exit(1)
        query_restful()