def main(task, num_docs, top_k):
    """Check the workspace state required by ``task`` and run its handler.

    ``config()`` is called first, then the workspace path is read from the
    ``JINA_WORKSPACE`` environment variable. A task whose name contains
    ``index`` requires that the workspace directory does not exist yet; a task
    whose name contains ``query`` requires that it does. When the requirement
    is violated an error is logged and the process exits with status 1.

    :param task: ``index``, ``index_incremental``, ``query`` or
        ``query_restful``; a task matching none of these runs no handler.
    :param num_docs: forwarded to ``index`` / ``index_incremental``.
    :param top_k: forwarded to ``query``.
    :raises KeyError: if ``JINA_WORKSPACE`` is not present in the environment.
    :raises SystemExit: with code 1 when the workspace check fails.
    """
    config()
    workspace = os.environ['JINA_WORKSPACE']
    workspace_exists = os.path.exists(workspace)

    if 'index' in task and workspace_exists:
        # Adjacent literals are used instead of backslash continuations inside
        # the string, which leak source whitespace / invalid escapes into the
        # logged message.
        border = ' +------------------------------------------------------------------------------------+'
        logger.error(
            f'\n{border}'
            '\n | 🤖🤖🤖 |'
            f'\n | The directory {workspace} already exists. Please remove it before indexing again. |'
            '\n | 🤖🤖🤖 |'
            f'\n{border}'
        )
        sys.exit(1)

    if 'query' in task and not workspace_exists:
        # Reported through the logger, like the indexing error above, rather
        # than a bare print.
        logger.error(
            f'The directory {workspace} does not exist. Please index first via `python app.py -t index`'
        )
        sys.exit(1)

    if task == 'index':
        index(num_docs)
    elif task == 'index_incremental':
        index_incremental(num_docs)
    elif task == 'query':
        query(top_k)
    elif task == 'query_restful':
        query_restful()
def main(task, num_docs_index):
    """Check the workspace state required by ``task`` and run its handler.

    ``config()`` is called first, then the workspace path is read from the
    ``JINA_WORKSPACE`` environment variable. A task whose name contains
    ``index`` refuses to run when the workspace directory already exists;
    ``query_restful`` refuses to run when it does not. In both cases an error
    is logged and the process exits with status 1.

    :param task: ``index``, ``index_restful``, ``query_restful`` or ``dryrun``;
        a task matching none of these runs no handler.
    :param num_docs_index: forwarded to ``index`` / ``index_restful``.
    :raises KeyError: if ``JINA_WORKSPACE`` is not present in the environment.
    :raises SystemExit: with code 1 when the workspace check fails.
    """
    config()
    workspace = os.environ['JINA_WORKSPACE']

    if 'index' in task and os.path.exists(workspace):
        # Adjacent literals are used instead of backslash continuations inside
        # the string, which leak source whitespace / invalid escapes into the
        # logged message. The emoji line restores the mis-encoded characters.
        border = ' +------------------------------------------------------------------------------------+'
        logger.error(
            f'\n{border}'
            '\n | 🤖🤖🤖 |'
            f'\n | The directory {workspace} already exists. Please remove it before indexing again. |'
            '\n | 🤖🤖🤖 |'
            f'\n{border}'
        )
        sys.exit(1)

    if task == 'index':
        index(num_docs_index)
    elif task == 'index_restful':
        index_restful(num_docs_index)
    elif task == 'query_restful':
        # Querying only makes sense against an existing index.
        if not os.path.exists(workspace):
            logger.error(
                f'The directory {workspace} does not exist. Please index first via `python app.py -t index`'
            )
            sys.exit(1)
        query_restful()
    elif task == 'dryrun':
        dryrun()
def _parse_pdf(self, doc: Document):
    """Open the PDF carried by ``doc`` with both ``fitz`` and ``pdfplumber``.

    The in-memory ``doc.buffer`` takes priority over ``doc.uri``. Checking the
    buffer first (instead of opening the URI and then overwriting the handles)
    avoids leaking the URI-backed handles and avoids failing on an unreadable
    URI when a usable buffer is available.

    Any exception raised while opening is logged and not propagated; handles
    that were opened before the failure are still returned.

    :param doc: document exposing ``uri`` and/or ``buffer``.
    :return: ``(pdf_img, pdf_text)`` -- the ``fitz`` and ``pdfplumber``
        handles; each is ``None`` if it was not opened.
    """
    pdf_img = None
    pdf_text = None
    try:
        if doc.buffer:
            pdf_img = fitz.open(stream=doc.buffer, filetype='pdf')
            pdf_text = pdfplumber.open(io.BytesIO(doc.buffer))
        elif doc.uri:
            pdf_img = fitz.open(doc.uri)
            pdf_text = pdfplumber.open(doc.uri)
    except Exception as ex:
        # Best effort: the caller receives whatever was opened so far.
        logger.error(f'Failed to open due to: {ex}')
    return pdf_img, pdf_text
def download_mermaid_url(mermaid_url, output) -> None:
    """
    Download the image served at ``mermaid_url`` and write it to ``output``.

    Failures (network, HTTP or file-system errors) are logged through the
    default logger and not re-raised. ``output`` is only created once the
    download has completed, so a failed download leaves no empty file behind.

    :param mermaid_url: The URL of the image.
    :param output: A filename specifying the name of the image to be created,
        the suffix svg/jpg determines the file type of the output image.
    """
    try:
        req = Request(mermaid_url, headers={'User-Agent': 'Mozilla/5.0'})
        # Read the full payload first and close the response deterministically.
        with urlopen(req) as resp:
            payload = resp.read()
        with open(output, 'wb') as fp:
            fp.write(payload)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate.
        from jina.logging import default_logger

        default_logger.error('can not download image, please check your graph and the network connections')
def download_mermaid_url(mermaid_url, output) -> None:
    """
    Fetch the rendered image from ``mermaid_url`` and save it as ``output``.

    An internet connection is needed when ``mermaid_url`` points to a remote
    service. Any failure while downloading or writing is logged through the
    default logger instead of being raised, and ``output`` is not created
    unless the download succeeded.

    :param mermaid_url: the URL the image is downloaded from
    :param output: the file path the image is written to
    """
    try:
        req = Request(mermaid_url, headers={'User-Agent': 'Mozilla/5.0'})
        # Download before opening ``output`` so an error cannot leave an empty
        # file, and close the response as soon as it has been read.
        with urlopen(req) as resp:
            image_bytes = resp.read()
        with open(output, 'wb') as fp:
            fp.write(image_bytes)
    except Exception:
        # Narrower than a bare ``except``: KeyboardInterrupt / SystemExit are
        # no longer swallowed.
        from jina.logging import default_logger

        default_logger.error(
            'can not download image, please check your graph and the network connections'
        )
def index(num_docs: int) -> None:
    """Index up to ``num_docs`` PDF files through the ``flows/index.yml`` flow.

    The workspace path is read from the ``JINA_WORKSPACE`` environment
    variable. If that directory already exists an error is logged and the
    process exits with status 1. Otherwise the ``*.pdf`` files found directly
    under ``PDF_DATA_PATH`` are truncated to the first ``num_docs`` entries
    and posted to the flow's ``/index`` endpoint.

    :param num_docs: maximum number of PDF files to index.
    :raises KeyError: if ``JINA_WORKSPACE`` is not present in the environment.
    :raises SystemExit: with code 1 when the workspace already exists.
    """
    workspace = os.environ['JINA_WORKSPACE']
    if os.path.exists(workspace):
        # Adjacent literals are used instead of backslash continuations inside
        # the string, which leak source whitespace / invalid escapes into the
        # logged message. The emoji line restores the mis-encoded characters.
        border = ' +---------------------------------------------------------------------------------+'
        logger.error(
            f'\n{border}'
            '\n | 🤖🤖🤖 |'
            f'\n | The directory {workspace} already exists. Please remove it before indexing again. |'
            '\n | 🤖🤖🤖 |'
            f'\n{border}'
        )
        sys.exit(1)

    pdf_files = glob.glob(os.path.join(PDF_DATA_PATH, '*.pdf'))[:num_docs]
    flow = Flow.load_config('flows/index.yml')
    with flow:
        with TimeContext(f'QPS: indexing {len(pdf_files)}', logger=flow.logger):
            flow.post('/index', inputs=index_generator(pdf_files))
def main(task: str, num_docs: int, force: bool):
    """Run the ``index`` or ``query_restful`` task after checking the workspace.

    The workspace path is read from the ``JINA_WORKSPACE`` environment
    variable.

    * ``index``: if the workspace already exists it is deleted when ``force``
      is true; otherwise an error is logged and the process exits with
      status 1. Then ``index(num_docs)`` is called.
    * ``query_restful``: if the workspace does not exist an error is logged
      and the process exits with status 1. Otherwise ``query_restful()`` is
      called.

    Any other ``task`` value does nothing.

    :param task: name of the task to run.
    :param num_docs: forwarded to ``index``.
    :param force: remove an existing workspace before indexing instead of
        aborting.
    :raises KeyError: if ``JINA_WORKSPACE`` is not present in the environment.
    :raises SystemExit: with code 1 when the workspace check fails.
    """
    workspace = os.environ['JINA_WORKSPACE']

    if task == 'index':
        if os.path.exists(workspace):
            if not force:
                # Adjacent literals are used instead of backslash
                # continuations inside the string, which leak source
                # whitespace / invalid escapes into the logged message. The
                # emoji line restores the mis-encoded characters.
                border = ' +----------------------------------------------------------------------------------+'
                logger.error(
                    f'\n{border}'
                    '\n | 🤖🤖🤖 |'
                    f'\n | The directory {workspace} already exists. Please remove it before indexing again. |'
                    '\n | 🤖🤖🤖 |'
                    f'\n{border}'
                )
                sys.exit(1)
            shutil.rmtree(workspace)
        index(num_docs)
    elif task == 'query_restful':
        if not os.path.exists(workspace):
            logger.error(
                f'The directory {workspace} does not exist. Please index first via `python app.py -t index`'
            )
            sys.exit(1)
        query_restful()