def read(self, fieldnames, query, callback):
    with self.ix.searcher() as searcher:
        start = time.time()
        query = SimpleParser(fieldnames, self._whoosh_schema).parse(query)
        end = time.time()
        print('query took', end - start, 'seconds')
        callback(searcher.search(query))
def searchBodyAndHighlight(q):
    # ix, s, schema and colorIpythonFormatter come from the surrounding module
    parser = SimpleParser("body", schema=ix.schema)
    q = parser.parse(q)
    terms = [text for fieldname, text in q.all_terms() if fieldname == "body"]
    r = s.search(q)
    analyzer = schema["body"].format.analyzer
    print("will tokenize with", terms)
    fragmenter = highlight.ContextFragmenter(maxchars=400, surround=80)
    # formatter = highlight.HtmlFormatter()
    formatter = colorIpythonFormatter  # custom formatter defined elsewhere
    for d in r:
        # The text argument to highlight() is the stored text of the body field
        text = d["body"]
        res = highlight.highlight(text, terms, analyzer, fragmenter, formatter)
        print(unicodedata.normalize('NFKC', res))
        print("-" * 8)
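# A minimal, self-contained sketch of the same parse-search-highlight pipeline
# against a throwaway in-memory index (assumes only that Whoosh is installed;
# the sample document text and query are illustrative):
from whoosh import highlight
from whoosh.fields import Schema, TEXT
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import SimpleParser

schema = Schema(body=TEXT(stored=True))
ix = RamStorage().create_index(schema)
with ix.writer() as w:
    w.add_document(body=u"the quick brown fox jumps over the lazy dog")

with ix.searcher() as s:
    q = SimpleParser("body", schema=ix.schema).parse(u"fox dog")
    results = s.search(q)
    # same fragmenter as above, attached the Whoosh 2.x way
    results.fragmenter = highlight.ContextFragmenter(maxchars=40, surround=10)
    for hit in results:
        print(hit.highlights("body"))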
def read(self, fieldnames, query, callback):
    start = time.time()
    ix = open_dir(self._need_base, self._need_index)
    end = time.time()
    print('opendir took', end - start, 'seconds')
    with ix.searcher() as searcher:
        start = time.time()
        # query = QueryParser(field, self._whoosh_schema).parse(query)
        query = SimpleParser(fieldnames, self._whoosh_schema).parse(query)
        end = time.time()
        print('query took', end - start, 'seconds')
        callback(searcher.search(query))
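# Calling open_dir() on every read() is wasteful; the earlier read() variant
# avoids that cost by keeping an open index on self.ix. A minimal sketch of
# that caching pattern (this __init__ is hypothetical; the _need_base and
# _need_index names are carried over from the snippet above):
def __init__(self, need_base, need_index):
    self._need_base = need_base
    self._need_index = need_index
    self.ix = open_dir(self._need_base, self._need_index)  # open once, reuse per query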
def search(q):
    # ix, schema and highlight_extracts come from the surrounding module
    s = ix.searcher()
    parser = SimpleParser("body", schema=ix.schema)
    q = parser.parse(q)
    terms = [text for fieldname, text in q.all_terms() if fieldname == "body"]
    r = s.search(q)
    analyzer = schema["body"].format.analyzer
    # fragmenter = highlight.ContextFragmenter(maxchars=500, surround=40)
    fragmenter = highlight.SentenceFragmenter()  # just extract sentences for the first one
    search_results = []
    for d in r:
        # The text argument to highlight() is the stored text of the body field
        text = d["body"]
        path = d["path"]
        for ex in highlight_extracts(path, text, terms):
            search_results.append((path, ex[0], ex[1], ex[2], ex[3]))
    search_results.sort(key=lambda x: x[1], reverse=True)
    return search_results[:20], terms
def search_engine(analyzer=StemmingAnalyzer(), max_res=150, multifield_flag=1,
                  only_title_flag=0,
                  directory_containing_the_index=r"C:\Users\claba\Desktop\DMT works\HW_1\Index_part_1",
                  query_dir=r"C:\Users\claba\Desktop\DMT works\HW_1\part_1\Cranfield_DATASET\cran_Queries.tsv",
                  gt_dir=r"C:\Users\claba\Desktop\DMT works\HW_1\part_1\Cranfield_DATASET\cran_Ground_Truth.tsv",
                  doc_dir=r"C:\Users\claba\Desktop\DMT works\HW_1\part_1\Cranfield_DATASET\DOCUMENTS\\",
                  conf_label="Not Specified", mrr_eps=.32,
                  k_interval_for_nDCG=range(1, 151)):
    ###
    ### Create a Schema
    ###
    schema = Schema(id=ID(stored=True),
                    title=TEXT(stored=False, analyzer=analyzer),
                    content=TEXT(stored=False, analyzer=analyzer))
    ###
    ### Create an empty index according to the schema just defined
    ###
    ix = create_in(directory_containing_the_index, schema)
    ###
    ### Get the query set (reset index due to missing values in the IDs)
    ###
    query_set = pd.read_csv(query_dir, engine="python", sep="\t", index_col="Query_ID").reset_index()
    ###
    ### Get the ground truth (a little manipulation to group by query and align IDs)
    ###
    gt_tmp = pd.read_csv(gt_dir, engine="python", sep="\t")
    gt_tmp = gt_tmp.groupby('Query_id')['Relevant_Doc_id'].apply(lambda x: x.tolist()).to_dict()
    gt = defaultdict(list)
    j = 1
    for i in range(len(gt_tmp)):
        while gt[i] == []:
            try:
                gt[i] = gt_tmp[j]
                j += 1
            except KeyError:
                j += 1
    number_of_queries = len(query_set)
    num_of_docs = 1400
    ###
    ### We'll iterate over the following lists to switch the SE scoring function and get their names
    ###
    scoring_functions_list = [scoring.PL2(), scoring.Frequency(), scoring.BM25F(), scoring.TF_IDF()]
    scoring_name = [re.findall(r"(?<=scoring\.)[\w\W]*(?=object)", str(score))[0]
                    for score in scoring_functions_list]
    ###
    ### Fill the index
    ###
    writer = ix.writer()
    for doc in range(num_of_docs):
        id_ = str(doc + 1)
        title, content = doc_retriver(doc_dir + "______" + str(doc + 1) + ".html")
        writer.add_document(id=id_, title=title, content=content)
    writer.commit()
    ###
    ### This """tensor""" stores all the results we need.
    ### Its dimensions are #Results x #Queries x #SE_configs
    ###
    results_mat = np.zeros([max_res, number_of_queries, len(scoring_functions_list)])
    evaluations_summary = {}   # dict to store MRR and R-precision distribution summaries
    ndcg = defaultdict(list)   # nDCG values for varying k, for all SEs with MRR > .32
    ###
    ### Run the SEs
    ###
    for idx_s, scorer in enumerate(scoring_functions_list):
        for idx, query in enumerate(query_set["Query"]):
            input_query = query
            ###
            ### Select a scoring function
            ###
            scoring_function = scorer
            ###
            ### Create a QueryParser for parsing the input_query,
            ### based on the SE configuration chosen by the user
            ###
            if multifield_flag:
                qp = MultifieldParser(["title", "content"], ix.schema)
                parsed_query = qp.parse(input_query)  # parse the query
            elif only_title_flag:
                qp = SimpleParser("title", ix.schema)
                parsed_query = qp.parse(input_query)  # parse the query
            else:
                qp = SimpleParser("content", ix.schema)
                parsed_query = qp.parse(input_query)  # parse the query
            ###
            ### Create a searcher for the index with the selected scoring function
            ###
            searcher = ix.searcher(weighting=scoring_function)
            ###
            ### Perform a search and store the results
            ###
            results = searcher.search(parsed_query, limit=max_res)
            results_mat[0:len(results), idx, idx_s] = [hit["id"] for hit in results]
            searcher.close()
        mrr_res = mrr(results_mat[:, :, idx_s], gt)
        if mrr_res >= mrr_eps:
            ###
            ### Compute and summarize the R-precision distribution
            ###
            r_res = r_precision(results_mat[:, :, idx_s], gt)
            mean = np.mean(list(r_res.values()))
            first_q = np.percentile(list(r_res.values()), 25)
            third_q = np.percentile(list(r_res.values()), 75)
            median = np.median(list(r_res.values()))
            minr = min(list(r_res.values()))
            maxr = max(list(r_res.values()))
            evaluations_summary[conf_label + "," + scoring_name[idx_s]] = \
                [mrr_res, mean, minr, first_q, median, third_q, maxr]
            ###
            ### Compute nDCG@k for varying k, for each scoring function
            ###
            for k in k_interval_for_nDCG:
                tmp_res = np.mean(list(nDCG(results_mat[:, :, idx_s], gt, k=k).values()))
                ndcg[conf_label + "," + scoring_name[idx_s]].append(tmp_res)
        else:
            evaluations_summary[conf_label + "," + scoring_name[idx_s]] = [mrr_res]
        ###
        ### Just to see what's happening
        ###
        print("Configuration: " + conf_label + "," + scoring_name[idx_s] + " ==> MRR = " + str(mrr_res))
    # For SEs with MRR < .32, the summary contains only the MRR value
    return evaluations_summary, ndcg
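# A hypothetical invocation of search_engine() as defined above. The defaults
# point at the author's local Cranfield dataset layout, so only the label and
# flags are passed here; conf_label is illustrative:
evaluations_summary, ndcg = search_engine(analyzer=StemmingAnalyzer(),
                                          multifield_flag=1,
                                          conf_label="Stemming+Multifield")
print(evaluations_summary)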
def get_parser(self):
    return SimpleParser('name', schema=OffersSchema())
# Imports assume django-haystack and Whoosh are installed; haystack exposes
# the default operator as haystack.constants.DEFAULT_OPERATOR.
from haystack.constants import DEFAULT_OPERATOR as HAYSTACK_DEFAULT_OPERATOR
from haystack.query import SQ
from whoosh.qparser import (SimpleParser, FieldsPlugin, OperatorsPlugin,
                            PhrasePlugin, SingleQuotePlugin, GroupPlugin,
                            PrefixPlugin, GtLtPlugin, RangePlugin)
from whoosh.query import (Term, And, AndMaybe, AndNot, Not, Or, Phrase,
                          Prefix, TermRange)


class QueryParser(object):

    def __init__(self, fieldname):
        ''' Constructor '''
        self.w_parser = SimpleParser(fieldname, None)
        self.w_parser.add_plugin(FieldsPlugin())
        self.w_parser.add_plugin(OperatorsPlugin())
        self.w_parser.add_plugin(PhrasePlugin())
        self.w_parser.add_plugin(SingleQuotePlugin())
        self.w_parser.add_plugin(GroupPlugin())
        self.w_parser.add_plugin(PrefixPlugin())
        self.w_parser.add_plugin(GtLtPlugin())
        self.w_parser.add_plugin(RangePlugin())
        self.query = None
        self.current_node_stack = []

    def parse(self, query):
        self.query = SQ()
        self.current_node_stack = [(self.query, HAYSTACK_DEFAULT_OPERATOR)]
        wquery = self.w_parser.parse(query)
        self.visit(wquery)
        if len(self.query) == 1 and isinstance(self.query.children[0], SQ):
            return self.query.children[0]
        else:
            return self.query

    def visit(self, q):
        # Dispatch on the Whoosh query node type and translate it into the
        # corresponding haystack SQ construct.
        if isinstance(q, Term):
            current_node, current_connector = self.current_node_stack.pop()
            current_node.add(SQ(**{q.fieldname: q.text}), current_connector)
            self.current_node_stack.append((current_node, current_connector))
        elif isinstance(q, And):
            self._add_compound_query(q, SQ.AND)
        elif isinstance(q, AndMaybe):
            self._add_andmaybe(q)
        elif isinstance(q, Or):
            self._add_compound_query(q, SQ.OR)
        elif isinstance(q, AndNot):
            self._add_andnot(q)
        elif isinstance(q, Not):
            self._add_not(q)
        elif isinstance(q, Phrase):
            self._add_phrase(q)
        elif isinstance(q, Prefix):
            self._add_prefix(q)
        elif isinstance(q, TermRange):
            self._add_range(q)

    def _add_compound_query(self, q, connector):
        new_node = SQ()
        self.current_node_stack.append((new_node, connector))
        for subquery in q.subqueries:
            self.visit(subquery)
        self.current_node_stack.pop()
        if len(new_node) == 1 and isinstance(new_node.children[0], SQ):
            new_node = new_node.children[0]
        current_node, current_connector = self.current_node_stack[-1]
        current_node.add(new_node, current_connector)

    def _add_andnot(self, q):
        new_node = SQ()
        self.current_node_stack.append((new_node, SQ.AND))
        self.visit(q.a)
        self.visit(Not(q.b))
        self.current_node_stack.pop()
        if len(new_node) == 1 and isinstance(new_node.children[0], SQ):
            new_node = new_node.children[0]
        current_node, current_connector = self.current_node_stack[-1]
        current_node.add(new_node, current_connector)

    def _add_andmaybe(self, q):
        new_node = SQ()
        self.current_node_stack.append((new_node, SQ.AND))
        self.visit(q.a)
        self.visit(q.b)
        self.current_node_stack.pop()
        if len(new_node) == 1 and isinstance(new_node.children[0], SQ):
            new_node = new_node.children[0]
        current_node, current_connector = self.current_node_stack[-1]
        current_node.add(new_node, current_connector)

    def _add_not(self, q):
        new_node = SQ()
        self.current_node_stack.append((new_node, SQ.AND))
        self.visit(q.query)
        self.current_node_stack.pop()
        if len(new_node) == 1 and isinstance(new_node.children[0], SQ):
            new_node = new_node.children[0]
        current_node, current_connector = self.current_node_stack[-1]
        current_node.add(~new_node, current_connector)

    def _add_phrase(self, q):
        new_node = SQ(**{q.fieldname + "__exact": " ".join(q.words)})
        current_node, current_connector = self.current_node_stack[-1]
        current_node.add(new_node, current_connector)

    def _add_prefix(self, q):
        new_node = SQ(**{q.fieldname + "__startswith": q.text})
        current_node, current_connector = self.current_node_stack[-1]
        current_node.add(new_node, current_connector)

    def _add_range(self, q):
        if q.start is None:
            postfix = "__lt" if q.endexcl else "__lte"
            new_node = SQ(**{q.fieldname + postfix: self.__convert_nb(q.end)})
        elif q.end is None:
            postfix = "__gt" if q.startexcl else "__gte"
            new_node = SQ(**{q.fieldname + postfix: self.__convert_nb(q.start)})
        else:
            new_node = SQ(**{
                q.fieldname + "__range": [self.__convert_nb(q.start),
                                          self.__convert_nb(q.end)]
            })
        current_node, current_connector = self.current_node_stack[-1]
        current_node.add(new_node, current_connector)

    def __convert_nb(self, str_nb):
        # Interpret range bounds as int, then float, falling back to the raw string.
        try:
            return int(str_nb)
        except ValueError:
            try:
                return float(str_nb)
            except ValueError:
                return str_nb
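# A hypothetical usage sketch of the class above (assumes a configured
# django-haystack project; the field names and query string are illustrative):
parser = QueryParser("content")
sq = parser.parse(u'title:hello OR "exact phrase" price:[10 to 20]')
# sq is a haystack SQ tree that can be handed to SearchQuerySet().filter(sq)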