示例#1
0
def run(delete, load, search):
    """Drive a default ``ElasticRetriever`` from three command-style switches.

    The steps always run in the order load, delete, search.

    Args:
        delete: When truthy, call ``delete(dataset_id='contracts')``.
        load: When truthy, call ``build_index('contracts.parquet')``.
        search: Query value; anything other than the empty string is passed
            to the retriever's ``search`` method. The search result is
            neither returned nor printed.
    """
    retriever = ElasticRetriever()

    if load:
        retriever.build_index('contracts.parquet')

    if delete:
        retriever.delete(dataset_id='contracts')

    # Only the empty string disables searching (compared with ``==``, so a
    # non-string value such as None would still be forwarded).
    if search == '':
        return
    retriever.search(search)
示例#2
0
def run(delete, load, search, entity_search, cls, host):
    """Drive an ``ElasticRetriever`` bound to ``host`` and print one search hit set.

    The steps always run in the order load, delete, search.

    Args:
        delete: When truthy, call ``delete(dataset_id='contracts')``.
        load: When truthy, call ``build_index('contracts.parquet')``.
        search: Query value; anything other than the empty string triggers
            a search whose result is printed.
        entity_search: Forwarded unchanged as the ``entity_search`` keyword.
        cls: Forwarded unchanged as the ``cls`` keyword.
        host: Single host entry; wrapped in a list and passed as ``hosts``.
    """
    retriever = ElasticRetriever(hosts=[host])

    if load:
        retriever.build_index('contracts.parquet')

    if delete:
        retriever.delete(dataset_id='contracts')

    # Only the empty string disables searching.
    if search == '':
        return

    search_options = {
        'entity_search': entity_search,
        'cls': cls,
        'ndocs': 1,  # this script always asks for a single document
    }
    hits = retriever.search(search, **search_options)
    print(hits)
示例#3
0
def create_app():
    """Build and configure the Flask application.

    Reads the following environment variables:
        ELASTIC_ADDRESS: required; passed to both ``ElasticRetriever`` and
            ``ElasticPageRetriever``.
        PREFIX: optional; prepended to the versioned retrieval URL prefix.
        API_VERSION: optional; defaults to ``'v2_beta'``.

    The fastText model at ``/data/vecs.bin`` is loaded on a best-effort
    basis: on failure the error is logged and ``app.word_embeddings_model``
    is left unset.

    Returns:
        The configured Flask application, with the retrieval and embeddings
        blueprints registered and CORS enabled.
    """
    app = Flask(__name__, instance_relative_config=True)
    app.config['JSON_SORT_KEYS'] = False
    app.config.from_mapping(SECRET_KEY='dev')
    app.debug = True

    # The instance folder may already exist; any OSError here is ignored.
    try:
        os.makedirs(app.instance_path)
    except OSError:
        pass

    elastic_address = os.environ['ELASTIC_ADDRESS']
    app.retriever = ElasticRetriever(elastic_address)
    app.page_retriever = ElasticPageRetriever(elastic_address)

    try:
        app.word_embeddings_model = fasttext.load_model('/data/vecs.bin')
    except Exception as e:
        logger.error(f'{e}')

    from . import retrieval
    app.register_blueprint(retrieval.bp)

    # hack to get url prefixes registered as required/desired IAR - 30.Oct.2020
    prefix = os.environ.get('PREFIX')
    if prefix is None:
        logging.info("No prefix stripped.")
        prefix = ''
    else:
        logging.info(f"Stripping {prefix}")
    api_version = os.environ.get('API_VERSION', 'v2_beta')

    # The same blueprint is mounted under several prefixes; the last two
    # exist for backward compatibility.
    # NOTE(review): registering one blueprint repeatedly without distinct
    # names may be rejected by newer Flask releases - confirm against the
    # pinned Flask version.
    legacy_root = '/sets/xdd-covid-19/api'
    retrieval_prefixes = (
        f"{prefix}/{api_version}",
        legacy_root,
        f'{legacy_root}/{api_version}',
    )
    for url_prefix in retrieval_prefixes:
        app.register_blueprint(retrieval.bp, url_prefix=url_prefix)

    # The extraction blueprint is currently disabled:
    #from . import extraction
    #app.register_blueprint(extraction.bp)

    from . import embeddings
    app.register_blueprint(embeddings.bp)

    # Dump the final routing table (emitted at error level).
    logger.error(app.url_map)
    CORS(app)

    return app
示例#4
0
def run(dataset_id, aws_host, host):
    """Connect to an ``ElasticRetriever`` and call ``delete(dataset_id)`` on it.

    Args:
        dataset_id: Identifier passed to the retriever's ``delete`` method.
        aws_host: When not the empty string, connect to this host on port
            443 using ``AWS4Auth`` credentials taken from the environment
            variables ``AWS_ACCESS_KEY_ID``, ``AWS_SECRET_ACCESS_KEY``,
            ``AWS_DEFAULT_REGION`` and ``AWS_SESSION_TOKEN``.
        host: Host entry used when ``aws_host`` is the empty string.
    """
    if aws_host != '':
        # Missing environment variables yield None here (``os.environ.get``).
        auth = AWS4Auth(os.environ.get('AWS_ACCESS_KEY_ID'),
                        os.environ.get('AWS_SECRET_ACCESS_KEY'),
                        os.environ.get('AWS_DEFAULT_REGION'),
                        'es',
                        session_token=os.environ.get('AWS_SESSION_TOKEN'))
        ret = ElasticRetriever(hosts=[{'host': aws_host, 'port': 443}],
                               awsauth=auth)
    else:
        ret = ElasticRetriever(hosts=[host])
    # The message previously said "building indices", which described a
    # different operation from the delete performed below.
    print('Connected to retriever, deleting index')
    ret.delete(dataset_id)
    print('Done deleting index')
示例#5
0
def run(sections_parquet, documents_parquet, tables_parquet, figures_parquet,
        equations_parquet, aws_host, host):
    """Connect to an ``ElasticRetriever`` and build its index from parquet inputs.

    Args:
        sections_parquet: Passed to ``build_index`` as the second argument.
        documents_parquet: Passed to ``build_index`` as the first argument.
        tables_parquet: Passed to ``build_index`` as the third argument.
        figures_parquet: Passed to ``build_index`` as the fourth argument.
        equations_parquet: Passed to ``build_index`` as the fifth argument.
        aws_host: When not the empty string, connect to this host on port
            443 using ``AWS4Auth`` credentials taken from the environment
            variables ``AWS_ACCESS_KEY_ID``, ``AWS_SECRET_ACCESS_KEY``,
            ``AWS_DEFAULT_REGION`` and ``AWS_SESSION_TOKEN``.
        host: Host entry used when ``aws_host`` is the empty string.
    """
    if aws_host == '':
        retriever = ElasticRetriever(hosts=[host])
    else:
        env = os.environ
        # Missing environment variables yield None here (``.get``).
        credentials = AWS4Auth(
            env.get('AWS_ACCESS_KEY_ID'),
            env.get('AWS_SECRET_ACCESS_KEY'),
            env.get('AWS_DEFAULT_REGION'),
            'es',
            session_token=env.get('AWS_SESSION_TOKEN'),
        )
        endpoint = {'host': aws_host, 'port': 443}
        retriever = ElasticRetriever(hosts=[endpoint], awsauth=credentials)

    print('Connected to retriever, building indices')
    # Note the argument order: documents come first even though the
    # function signature lists sections first.
    retriever.build_index(
        documents_parquet,
        sections_parquet,
        tables_parquet,
        figures_parquet,
        equations_parquet,
    )
    print('Done building index')
 def __init__(self, client, hosts=None):
     """Create the underlying Elasticsearch retriever and the BERT reranker.

     Args:
         client: Passed unchanged to ``BertRerankingRetriever``.
         hosts: Passed to ``ElasticRetriever``. When omitted (``None``), a
             one-element list holding the ``ELASTIC_ADDRESS`` environment
             variable is used.

     Raises:
         KeyError: If ``hosts`` is omitted and ``ELASTIC_ADDRESS`` is unset.
     """
     if hosts is None:
         # Resolved at call time: a default of [os.environ[...]] would be
         # evaluated once at definition time (failing on import when the
         # variable is unset) and shared between all calls.
         hosts = [os.environ["ELASTIC_ADDRESS"]]
     self.elastic_retriever = ElasticRetriever(hosts)
     self.reranker = BertRerankingRetriever(client)
class ElasticRerankingRetriever(Retriever):
    """Retriever that queries an ``ElasticRetriever`` and reranks its hits.

    Searching is delegated to ``self.elastic_retriever``; the returned
    contexts are reordered by ``self.reranker`` (a ``BertRerankingRetriever``).
    """

    def __init__(self, client, hosts=None):
        """Create the underlying Elasticsearch retriever and the BERT reranker.

        Args:
            client: Passed unchanged to ``BertRerankingRetriever``.
            hosts: Passed to ``ElasticRetriever``. When omitted (``None``),
                a one-element list holding the ``ELASTIC_ADDRESS``
                environment variable is used.

        Raises:
            KeyError: If ``hosts`` is omitted and ``ELASTIC_ADDRESS`` is unset.
        """
        if hosts is None:
            # Resolved at call time: a default of [os.environ[...]] would be
            # evaluated once at class-definition time (failing on import
            # when the variable is unset) and shared between all instances.
            hosts = [os.environ["ELASTIC_ADDRESS"]]
        self.elastic_retriever = ElasticRetriever(hosts)
        self.reranker = BertRerankingRetriever(client)

    def search(self,
               query,
               ndocs=10,
               page=0,
               cls=None,
               detect_min=None,
               postprocess_min=None,
               return_all=False,
               get_count=False):
        """Search, rerank, and return at most one result per document.

        Args:
            query: Query forwarded to the Elasticsearch retriever and to
                the reranker.
            ndocs: Forwarded to ``ElasticRetriever.search``.
            page: Forwarded to ``ElasticRetriever.search``.
            cls: Forwarded to ``ElasticRetriever.search``.
            detect_min: Forwarded to ``ElasticRetriever.search``.
            postprocess_min: Forwarded to ``ElasticRetriever.search``.
            return_all: When true, return the reranker output as-is.
            get_count: When true, skip reranking and return the number of
                distinct ``'pdf_name'`` values among the retrieved contexts.
                Takes precedence over ``return_all``.

        Returns:
            An ``int`` when ``get_count`` is true; the raw reranker output
            when ``return_all`` is true; otherwise a list of response
            dicts, one per distinct ``'docname'`` in reranked order.
        """
        # Progress message, logged at info level like the ones below.
        logger.info('Starting search.')
        contexts = self.elastic_retriever.search(
            query,
            ndocs=ndocs,
            page=page,
            cls=cls,
            detect_min=detect_min,
            postprocess_min=postprocess_min)
        if get_count:
            return len({context['pdf_name'] for context in contexts})

        logger.info('Starting reranking')
        results = self.rerank(query, contexts)
        logger.info('Finished reranking')
        if return_all:
            return results

        # Keep only the first (highest-ranked) result of every document.
        seen_docs = set()
        top_per_doc = []
        for result in results:
            docname = result['docname']
            if docname in seen_docs:
                continue
            seen_docs.add(docname)
            top_per_doc.append(result)

        objects = [
            self.elastic_retriever.get_object(result['id'])
            for result in top_per_doc
        ]
        return [self._format_object(obj) for obj in objects]

    @staticmethod
    def _format_object(obj):
        """Convert one object returned by ``get_object`` into a response dict."""
        return {
            'header': {},
            'pdf_name': obj.pdf_name,
            'children': [{
                'id': obj.meta.id,
                'bytes': obj.img_pth,
                'cls': obj.cls,
                'postprocessing_confidence': obj.postprocess_score,
                'base_confidence': obj.detect_score,
                'content': obj.content,
                'header_content': obj.header_content,
            }],
            'context_keywords': '',
            'context_summary': '',
            'context_content': '',
            'context_id': obj.meta.id,
        }

    def rerank(self, query, contexts):
        """Return ``contexts`` reordered by the reranker for ``query``."""
        return self.reranker.rerank(query, contexts)

    def build_index(self, document_parquet, entities_parquet, section_parquet,
                    tables_parquet, figures_parquet, equations_parquet):
        """Forward all parquet inputs, in order, to ``ElasticRetriever.build_index``."""
        self.elastic_retriever.build_index(document_parquet, entities_parquet,
                                           section_parquet, tables_parquet,
                                           figures_parquet, equations_parquet)

    def delete(self, dataset_id):
        """Forward ``dataset_id`` to ``ElasticRetriever.delete``."""
        self.elastic_retriever.delete(dataset_id)