def __init__(self, tokenizer, reader_model=None, batch_size=64, qclassifier=None, cuda=False):
    """Initialise the answerer state and build the DrQA pipeline.

    Args:
        tokenizer: Echoed to stdout only.
            NOTE(review): this value is not forwarded to the pipeline,
            which is always built with tokenizer "spacy" -- confirm that
            this is intended.
        reader_model: Forwarded to ``pipeline.DrQA`` as ``reader_model``.
        batch_size: Stored on the instance and forwarded to the pipeline.
        qclassifier: Forwarded to ``Answerer.__init__``.
        cuda: Forwarded to the pipeline as ``cuda``.
    """
    print("Tokenizer", tokenizer)
    Answerer.__init__(self, qclassifier)

    # Fixed retrieval settings plus the caller-supplied batch size.
    self.batch_size = batch_size
    self.n_docs = 5
    self.top_n = 1
    self.ts = TextSimilarity()

    print("Reader model", reader_model, cuda)

    # Neither a TF-IDF index nor a document DB path is supplied here.
    ranker_options = {'tfidf_path': None, 'strict': False}
    db_options = {'db_path': None}
    pipeline_kwargs = dict(
        reader_model=reader_model,
        fixed_candidates=None,
        embedding_file=None,
        tokenizer="spacy",
        batch_size=batch_size,
        cuda=cuda,
        data_parallel=False,
        ranker_config={'options': ranker_options},
        db_config={'options': db_options},
        num_workers=1,
    )
    self.drqa = pipeline.DrQA(**pipeline_kwargs)
def __init__(self, reader, retriever, doc_db, cuda=True):
    """Build the DrQA pipeline used by this object.

    Args:
        reader: Forwarded to ``pipeline.DrQA`` as ``reader_model``.
        retriever: Forwarded as the ranker's ``tfidf_path`` option.
        doc_db: Forwarded as the document DB's ``db_path`` option.
        cuda: Forwarded to the pipeline as ``cuda``. Defaults to ``True``
            (the previously hard-coded value) so existing callers are
            unaffected; pass ``False`` to build the pipeline on a host
            without a GPU.
    """
    self.DrQA = pipeline.DrQA(
        cuda=cuda,
        fixed_candidates=None,
        reader_model=reader,
        ranker_config={'options': {'tfidf_path': retriever}},
        db_config={'options': {'db_path': doc_db}},
        tokenizer=None,
    )
def __init__(self, wiki_path, tfidf_path):
    """Point the DrQA defaults at the given data files and build the pipeline.

    Args:
        wiki_path: Stored in ``DEFAULTS['db_path']``.
        tfidf_path: Stored in ``DEFAULTS['tfidf_path']``.
    """
    DEFAULTS['db_path'] = wiki_path
    DEFAULTS['tfidf_path'] = tfidf_path

    # Log output: send INFO-level records from the root logger to the console.
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    # Attach the console handler only once. Without this guard every new
    # instance adds another handler and each log record is printed again.
    # The exact-type check is deliberate: FileHandler subclasses
    # StreamHandler and must not count as "console already configured".
    has_console = any(
        type(handler) is logging.StreamHandler for handler in logger.handlers
    )
    if not has_console:
        fmt = logging.Formatter('%(asctime)s: [ %(message)s ]',
                                '%m/%d/%Y %I:%M:%S %p')
        console = logging.StreamHandler()
        console.setFormatter(fmt)
        logger.addHandler(console)

    self.qa = pipeline.DrQA()
if args.candidate_file: logger.info('Loading candidates from %s' % args.candidate_file) candidates = set() with open(args.candidate_file) as f: for line in f: line = utils.normalize(line.strip()).lower() candidates.add(line) logger.info('Loaded %d candidates.' % len(candidates)) else: candidates = None logger.info('Initializing pipeline...') DrQA = pipeline.DrQA(cuda=args.cuda, fixed_candidates=candidates, reader_model=args.reader_model, ranker_config={'class': OnsDocRanker}, db_config={'class': OnsSearchDB}, tokenizer=args.tokenizer) # ------------------------------------------------------------------------------ # Drop in to interactive mode # ------------------------------------------------------------------------------ def process(question, candidates=None, top_n=1, n_docs=100): predictions = DrQA.process(question, candidates, top_n, n_docs, return_context=True) table = prettytable.PrettyTable(
candidates = set() with open(args.candidate_file) as f: for line in f: line = utils.normalize(line.strip()).lower() candidates.add(line) logger.info('Loaded %d candidates......' % len(candidates)) else: candidates = None logger.info('Initializing pipeline......') DrQA = pipeline.DrQA( cuda=args.cuda, fixed_candidates=candidates, reader_model=args.reader_model, ranker_config={'options': { 'tfidf_path': args.retriever_model }}, db_config={'options': { 'db_path': args.doc_db }}, tokenizer=args.tokenizer) # ------------------------------------------------------------------------------ # Drop in to interactive mode # ------------------------------------------------------------------------------ def process(question, candidates=None, top_n=1, n_docs=5): predictions = DrQA.process(question, candidates, top_n,
} cuda = torch.cuda.is_available() and not config.get('no-cuda', False) if cuda: torch.cuda.set_device(config.get('gpu', 0)) logger.info('CUDA enabled (GPU %d)' % config.get('gpu', 0)) else: logger.info('Running on CPU only.') logger.info('Initializing pipeline...') DrQA = pipeline.DrQA( cuda=cuda, reader_model=config['reader-model'], ranker_config={'options': { 'tfidf_path': config['retriever-model'] }}, db_config={'options': { 'db_path': config['doc-db'] }}, tokenizer=config['tokenizer'], embedding_file=config['embedding-file'], ) def process(question, candidates=None, top_n=1, n_docs=5): predictions = DrQA.process(question, candidates, top_n, n_docs, return_context=True) answers = [] for i, p in enumerate(predictions, 1):
logger.info('Loaded %d candidates.' % len(candidates)) else: candidates = None logger.info('Initializing pipeline...') DrQA = pipeline.DrQA( reader_model=args.reader_model, fixed_candidates=candidates, embedding_file=args.embedding_file, tokenizer=args.tokenizer, batch_size=args.batch_size, cuda=args.cuda, data_parallel=args.parallel, ranker_config={ 'options': { 'tfidf_path': args.retriever_model, 'strict': False } }, #ranker_config={'options': {'index_path': args.retriever_model, #'strict': False}}, db_config={'options': { 'db_path': args.doc_db }}, num_workers=args.num_workers, ) # ------------------------------------------------------------------------------ # Read in dataset and make predictions # ------------------------------------------------------------------------------
with open(args.candidate_file) as f: for line in f: line = utils.normalize(line.strip()).lower() candidates.add(line) logger.info('Loaded %d candidates.' % len(candidates)) else: candidates = None ''' @app.route('/get_query', methods=['GET']) def process(): #DrQA = pipeline.DrQA() req = request.args question = req['query'] #DrQA = pipeline.DrQA() predictions = DrQA.process(question, candidates=None, top_n=1, n_docs=3, return_context=True) print(predictions[0]['span']) return predictions[0]['span'] if __name__ == '__main__': DrQA = pipeline.DrQA() app.run(host='0.0.0.0', debug=True)
def main():
    """Run every topic's title query through DrQA and append results as JSON lines.

    Command-line arguments select the reader model, document DB, tokenizer,
    optional candidate file, and GPU settings. ``corpus_json`` supplies the
    topics; one JSON object per processed topic is appended to
    ``output_json``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--reader-model', type=str, default=None,
                        help='Path to trained Document Reader model')
    parser.add_argument('--retriever-model', type=str, default=None,
                        help='Path to Document Retriever model (tfidf)')
    parser.add_argument('--doc-db', type=str, default=None,
                        help='Path to Document DB')
    parser.add_argument(
        '--tokenizer', type=str, default=None,
        help="String option specifying tokenizer type to use (e.g. 'corenlp')")
    parser.add_argument(
        '--candidate-file', type=str, default=None,
        help="List of candidates to restrict predictions to, one candidate per line")
    parser.add_argument('--no-cuda', action='store_true',
                        help="Use CPU only")
    parser.add_argument('--gpu', type=int, default=-1,
                        help="Specify GPU device id to use")
    parser.add_argument(
        '--skip-to', metavar='QID',
        help='Start from topic QID and skip over all the previous ones')
    parser.add_argument('--use-desc-topics', metavar='FILE',
                        help='Use desc queries pulled from FILE instead')
    parser.add_argument('corpus_json')
    parser.add_argument('output_json')
    args = parser.parse_args()

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    if args.cuda:
        torch.cuda.set_device(args.gpu)
        logger.info('CUDA enabled (GPU %d)' % args.gpu)
    else:
        logger.info('Running on CPU only.')

    if args.candidate_file:
        logger.info('Loading candidates from %s' % args.candidate_file)
        with open(args.candidate_file) as f:
            candidates = {utils.normalize(line.strip()).lower() for line in f}
        logger.info('Loaded %d candidates.' % len(candidates))
    else:
        candidates = None

    logger.info('Loading query topics from %s (may take a while)' %
                args.corpus_json)
    # Close the corpus handle as soon as it has been parsed.
    with smart_open(args.corpus_json) as corpus_file:
        topics = [rl['topic'] for rl in json.load(corpus_file)]
    logger.info('Loaded %d topics.' % len(topics))

    if args.skip_to:
        # Resume from the first topic whose qid matches; an unknown qid
        # leaves nothing to process.
        found = next(
            (i for i, topic in enumerate(topics)
             if topic['qid'] == args.skip_to),
            None,
        )
        topics = [] if found is None else topics[found:]

    if args.use_desc_topics:
        logger.info('Loading desc topics (to override title queries)')
        with smart_open(args.use_desc_topics) as desc_file:
            desc_topics = {t['qid']: t for t in json.load(desc_file)}
        for topic in topics:
            qid = topic['qid']
            logger.info('%s: %s => %s' %
                        (qid, topic['title'], desc_topics[qid]['desc']))
            topic['title'] = desc_topics[qid]['desc']

    logger.info('Initializing pipeline...')
    DrQA = pipeline.DrQA(
        cuda=args.cuda,
        fixed_candidates=candidates,
        reader_model=args.reader_model,
        ranker_config={
            'class': RetrievedDocRanker,
            'options': {'topics': topics},
        },
        db_config={'options': {'db_path': args.doc_db}},
        tokenizer=args.tokenizer,
        num_workers=16,
        max_loaders=2,
    )

    # --------------------------------------------------------------------------
    # Run each topic through the pipeline and append one JSON line per topic
    # --------------------------------------------------------------------------

    title_queries = [topic['title'] for topic in topics]
    ranked_lists = []
    # The context manager guarantees the appended results are flushed and
    # the handle is released even if a topic raises part-way through.
    with smart_open(args.output_json, 'a') as output:
        for topic in topics:
            predictions = DrQA.process(topic['title'], None, top_n=100,
                                       n_docs=100, return_context=True)
            passages = {}
            psg_scores = {}
            for p in predictions:
                # Drop everything up to and including the first '.' of doc_id.
                docno = p['doc_id'][p['doc_id'].find('.') + 1:]
                passages[docno] = p['context']['text']
                psg_scores[docno] = p['span_score']
            res = {
                'qid': topic['qid'],
                'title': topic['title'],
                'scores': topic['scores'],
                'psg_scores': psg_scores,
                'passages': passages,
            }
            ranked_lists.append(res)
            logger.info('qid %s, %d passage scores returned' %
                        (res['qid'], len(res['psg_scores'])))
            payload = json.dumps(res)
            print(payload, file=output)
candidates.add(line) logger.info("Loaded %d candidates." % len(candidates)) else: candidates = None logger.info("Initializing pipeline...") DrQA = pipeline.DrQA( reader_model=args.reader_model, fixed_candidates=candidates, embedding_file=args.embedding_file, tokenizer=args.tokenizer, batch_size=args.batch_size, cuda=args.cuda, data_parallel=args.parallel, ranker_config={ "options": { "tfidf_path": args.retriever_model, "strict": False } }, db_config={"options": { "db_path": args.doc_db }}, num_workers=args.num_workers, ) # ------------------------------------------------------------------------------ # Read in dataset and make predictions # ------------------------------------------------------------------------------ logger.info("Loading queries from %s" % args.dataset)
import bottle from drqa import pipeline import json import pandas as pd app = bottle.Bottle() query = [] response = "" DrQA = pipeline.DrQA( cuda=False, reader_model="/ml/mfe4ml/raghuvan/nlp/code/DrQA/data/reader/single.mdl", ranker_config={'options': {'tfidf_path': "/ml/mfe4ml/raghuvan/nlp/code/DrQA/data/datasets/helpbot/mpp/mpp-tfidf-ngram=2-hash=16777216-tokenizer=simple.npz"}}, db_config={'options': {'db_path': "/ml/mfe4ml/raghuvan/nlp/code/DrQA/data/datasets/helpbot/mpp/mpp.db"}} ) @app.get("/") def home(): with open('/ml/mfe4ml/raghuvan/nlp/code/DrQA/scripts/pipeline/demo.html', 'r') as fl: html = fl.read() return html @app.post('/answer') def answer(): question = bottle.request.json['question'] print("received question: {}".format(question)) global query, response predictions = DrQA.process( question, candidates=None, top_n=2, n_docs=5, return_context=True ) dfr = pd.DataFrame(predictions) print("[info] RESULTS DF: ")
import code

from drqa import retriever
from drqa import pipeline
from drqa.retriever import utils

database_path = './rough/DrQA/data/wikipedia/'
# NOTE(review): sqlite3 is not imported in the visible part of this file --
# confirm it is imported above.
conn = sqlite3.connect(database_path + 'docs.db')
cursor = conn.cursor()

print("setting up DrQA")
DrQA = pipeline.DrQA(
    cuda=True,
    fixed_candidates=None,
    reader_model=None,
    ranker_config={
        'options': {
            'tfidf_path': '/docs-tfidf-ngram=2-hash=16777216-tokenizer=simple.npz'
        }
    },
    db_config={'options': {'db_path': database_path + 'docs.db'}},
    tokenizer=None,
)
#ranker = retriever.get_class('tfidf')(tfidf_path='/docs-tfidf-ngram=2-hash=16777216-tokenizer=simple.npz')
print('ranker loaded')


def get_docs(query, k=1):
    """Print the ids and scores of the closest documents and return the ids.

    Args:
        query: Query passed to ``ranker.closest_docs``.
        k: Number of documents requested from the ranker.

    Returns:
        The document ids returned by ``ranker.closest_docs``.
    """
    # NOTE(review): the only visible assignment of `ranker` is commented out
    # above; unless it is defined elsewhere this call raises NameError.
    doc_id, doc_score = ranker.closest_docs(query, k)
    # Print each document next to its own score (previously the whole score
    # list was printed on every row).
    for i in range(len(doc_score)):
        print(str(doc_id[i]) + ' ' + str(doc_score[i]))
    return doc_id
logger.info('Running on CPU only.') if args.ranker.lower().startswith('s'): ranker = retriever.get_class('sql')(db_path=args.db_path) elif args.ranker.lower().startswith('l'): ranker = retriever.get_class('lucene')(index_path=args.db_path) else: ranker = retriever.get_class('tfidf')(tfidf_path=args.retriever_model, db_path=args.db_path) logger.info('Initializing pipeline...') DrQA = pipeline.DrQA(reader_model=args.reader_model, normalize=args.normalize, tokenizer=args.tokenizer, batch_size=args.batch_size, cuda=args.cuda, data_parallel=args.parallel, ranker=ranker, num_workers=args.num_workers, et_model=args.et_model, et_threshold=args.et_threshold) # ------------------------------------------------------------------------------ # Read in dataset and make predictions # ------------------------------------------------------------------------------ logger.info('Loading queries from %s' % args.dataset) queries = [] for line in open(args.dataset): data = json.loads(line) queries.append(data['question'])
default=None, help='Path to Document DB or index') args = parser.parse_args() args.cuda = not args.no_cuda and torch.cuda.is_available() if args.cuda: torch.cuda.set_device(args.gpu) logger.info('CUDA enabled (GPU %d)' % args.gpu) else: logger.info('Running on CPU only.') logger.info('Initializing pipeline...') DrQA = pipeline.DrQA(cuda=args.cuda, reader_model=args.reader_model, normalize=args.normalize, ranker=LuceneRanker, tokenizer=args.tokenizer) # ------------------------------------------------------------------------------ # Drop in to interactive mode # ------------------------------------------------------------------------------ def process(question, top_n=1, n_docs=5): predictions = DrQA.process_single(question, top_n, n_docs, return_context=True) table = prettytable.PrettyTable( ['Rank', 'Answer', 'Doc', 'Answer Score', 'Doc Score'])
def process(self, question, candidates=None, top_n=3, n_docs=10):
    """Answer ``question`` with DrQA, print the results, and return them.

    The pipeline is built on the first call and cached on ``self.DrQA``.

    Args:
        question: Question passed to ``DrQA.process``.
        candidates: Optional candidates passed to ``DrQA.process``. When
            ``self.candidate_file_arg`` is set, the candidates loaded from
            that file take precedence over this argument.
        top_n: Forwarded to ``DrQA.process``.
        n_docs: Forwarded to ``DrQA.process``.

    Returns:
        A list with one dict per prediction, with keys ``answer``,
        ``docid``, ``docscore``, ``answerscore`` and ``doc`` (the context
        text with the answer span colored).
    """
    print('retriever_model_arg:', self.retriever_model_arg)

    # Point the CoreNLP tokenizer at the local CoreNLP jars.
    import drqa.tokenizers
    drqa.tokenizers.set_default('corenlp_classpath', '/home/ubuntu/spacework/DrQA/data/corenlp/*')

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    # Attach the console handler only once: this method runs per question,
    # and adding a handler on every call makes each log record print again.
    # The exact-type check is deliberate (FileHandler subclasses
    # StreamHandler and must not count as a console handler).
    has_console = any(
        type(handler) is logging.StreamHandler for handler in logger.handlers
    )
    if not has_console:
        fmt = logging.Formatter('%(asctime)s: [ %(message)s ]', '%m/%d/%Y %I:%M:%S %p')
        console = logging.StreamHandler()
        console.setFormatter(fmt)
        logger.addHandler(console)

    # Fall back to the bundled GDPR index / database when none is configured.
    if self.retriever_model_arg is None:
        self.retriever_model_arg = '/home/ubuntu/spacework/DrQA/data/gdpr/gdpr_all_en_articles-tfidf-ngram=2-hash=16777216-tokenizer=simple.npz'
    if self.doc_db_arg is None:
        self.doc_db_arg = '/home/ubuntu/spacework/DrQA/data/gdpr/gdpr_all_en_articles.db'

    cuda_arg = not self.no_cuda_arg and torch.cuda.is_available()
    if cuda_arg:
        torch.cuda.set_device(self.gpu_arg)
        logger.info('CUDA enabled (GPU %d)' % self.gpu_arg)
    else:
        logger.info('Running on CPU only.')

    # A configured candidate file overrides the argument. Otherwise the
    # caller's ``candidates`` is kept (it used to be reset to None).
    if self.candidate_file_arg:
        logger.info('Loading candidates from %s' % self.candidate_file_arg)
        with open(self.candidate_file_arg) as f:
            candidates = {utils.normalize(line.strip()).lower() for line in f}
        logger.info('Loaded %d candidates.' % len(candidates))

    print('DrQA:', self.DrQA)
    if self.DrQA is None:
        logger.info('Initializing pipeline...')
        self.DrQA = pipeline.DrQA(
            cuda=cuda_arg,
            fixed_candidates=candidates,
            reader_model=self.reader_model_arg,
            ranker_config={'options': {'tfidf_path': self.retriever_model_arg}},
            db_config={'options': {'db_path': self.doc_db_arg}},
            tokenizer=self.tokenizer_arg
        )

    predictions = self.DrQA.process(question, candidates, top_n, n_docs, return_context=True)

    table = prettytable.PrettyTable(['Rank', 'Answer', 'Doc', 'Answer Score', 'Doc Score'])
    dico_result_list = []
    for i, p in enumerate(predictions, 1):
        table.add_row([i, p['span'], p['doc_id'], '%.5g' % p['span_score'], '%.5g' % p['doc_score']])

        # Highlight the answer span inside its context once; the result is
        # both returned and printed below.
        text = p['context']['text']
        start = p['context']['start']
        end = p['context']['end']
        output = (text[:start] + colored(text[start: end], 'green', attrs=['bold']) + text[end:])

        dico_result_list.append({
            'answer': p['span'],
            'docid': p['doc_id'],
            # These two values were previously swapped.
            'docscore': p['doc_score'],
            'answerscore': p['span_score'],
            'doc': output,
        })

    print('Top Predictions:')
    print(table)
    print('\nContexts:')
    for p, dico in zip(predictions, dico_result_list):
        print('[ Doc = %s ]' % p['doc_id'])
        print(dico['doc'] + '\n')

    for dico in dico_result_list:
        print(dico)
    return dico_result_list
import prettytable import logging import os from termcolor import colored from drqa import pipeline from drqa.retriever import utils print("import done!") os.system("export CLASSPATH=$CLASSPATH:/home/shellphish/DrQA/data/corenlp/*") #logger.info('Initializing pipeline...') DrQA = pipeline.DrQA(cuda=None, fixed_candidates=None, reader_model=None, ranker_config={'options': { 'tfidf_path': None }}, db_config={'options': { 'db_path': None }}, tokenizer=None) print("Pipeline ready") # ------------------------------------------------------------------------------ # Drop in to interactive mode # ------------------------------------------------------------------------------ def process(question, candidates=None, top_n=1, n_docs=10): print("Processing") predictions = DrQA.process(question,
candidates = set() with open(args.candidate_file) as f: for line in f: line = utils.normalize(line.strip()).lower() candidates.add(line) logger.info('Loaded %d candidates.' % len(candidates)) else: candidates = None logger.info('Initializing pipeline...') DrQA = pipeline.DrQA( cuda=args.cuda, fixed_candidates=candidates, reader_model=args.reader_model, ranker_config={'options': {'tfidf_path': args.retriever_model}}, db_config={'options': {'db_path': args.doc_db}}, tokenizer=args.tokenizer, num_workers=1, max_loaders=1, embedding_file='data/vector/zh200.vec' ) # ------------------------------------------------------------------------------ # Drop in to interactive mode # ------------------------------------------------------------------------------ def process(question, candidates=None, top_n=1, n_docs=5): predictions = DrQA.process( question, candidates, top_n, n_docs, return_context=True
#print(output + '\n') return answers # # # # # FUNCOES # # # # # # Instanciando DrQA com a base de dados e modelo Wikipedia #drqaDir = '../DrQA' #reader_model = drqaDir + '/data/reader/multitask.mdl' #retriever_model = drqaDir + '/data/wikipedia/docs-tfidf-ngram=2-hash=16777216-tokenizer=simple.npz' #doc_db = drqaDir + '/data/wikipedia/docs.db' #tokenizer = 'corenlp' # Carregando modelo e base Wikipedia if os.environ.get("WERKZEUG_RUN_MAIN") == "true": print('Carregando modelo de QA e base Wikipedia/2016...', end = '') DrQA = pipeline.DrQA( cuda = torch.cuda.is_available() ) # # DrQA = pipeline.DrQA( # cuda = torch.cuda.is_available(), # Disponibilidade do CUDA (proc. paralelo) # fixed_candidates = None, # reader_model = reader_model, # ranker_config = {'options': {'tfidf_path': retriever_model}}, # db_config = {'options': {'db_path': doc_db}}, # tokenizer = tokenizer # ) print(' Ok!') app = Flask(__name__) # for CORS @app.after_request