Example #1
    def search_metaindex_by_keyword(self, text, limit=None, timelimit=1):
        """
            Performs a query in the metadata search index by the 'key' field.
            Arguments:
                text: String used to perform the search in the index.
                limit: Maximum number of results to be returned. By default there is no limit.
                timelimit: Maximum number of seconds to execute the search. Searches that
                           take longer than timelimit will return only partial results.
            Returns:
                A list of dictionaries, each containing the fields in the metadata
                index, whose values match the query text in the 'key' field.
        """
        results_list = []
        if self.metaindex:
            with self.metaindex.searcher() as searcher:
                query = QueryParser('key', self.metaindex.schema).parse(text)
                coll = searcher.collector(limit)
                tlc = TimeLimitCollector(coll, timelimit, use_alarm=False)

                # Try searching
                try:
                    searcher.search_with_collector(query, tlc)
                except TimeLimit:
                print(
                    "search_metaindex_by_keyword: Index search took too long, aborting!"
                )

                # get partial results, if available
                results = tlc.results()
                for res in results:
                    results_list.append(dict(res))

        return results_list
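
All of the examples on this page share the same Whoosh pattern: get a collector from the searcher, wrap it in a TimeLimitCollector, run search_with_collector, and read (possibly partial) results off the collector afterwards. Below is a minimal self-contained sketch of that pattern with the imports the snippets assume; the schema, field names, and sample document are illustrative, not taken from any of the projects shown here.

import tempfile

from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser
from whoosh.collectors import TimeLimitCollector
from whoosh.searching import TimeLimit

# Throwaway index with a hypothetical 'key' field, mirroring Example #1
schema = Schema(key=TEXT(stored=True), value=ID(stored=True))
ix = create_in(tempfile.mkdtemp(), schema)

writer = ix.writer()
writer.add_document(key=u"alpha beta", value=u"1")
writer.commit()

with ix.searcher() as searcher:
    query = QueryParser('key', ix.schema).parse(u"alpha")
    collector = searcher.collector(limit=10)
    tlc = TimeLimitCollector(collector, timelimit=1.0, use_alarm=False)
    try:
        searcher.search_with_collector(query, tlc)
    except TimeLimit:
        print("Search took too long, keeping partial results")
    for hit in tlc.results():   # partial results are still available here
        print(hit.fields())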
Example #2
 def doSearch(self, text):
     q = self.qp.parse(text)  # build query
     with self.ix.searcher(
             weighting=scoring.Frequency) as s:  # simple scorer may help
         c = s.collector(limit=self.MaxResults)
         c = TimeLimitCollector(c, 0.5)
         try:
             s.search_with_collector(q, c)
         except TimeLimit:
             print("TIMEOUT!")
         results = c.results()  # partial results if hung
         self.searchResults.clear()
         #my_cf = highlight.PinpointFragmenter(maxchars=100, surround=60)
         my_cf = highlight.ContextFragmenter(maxchars=160, surround=30)
         #my_cf = highlight.SentenceFragmenter(maxchars=200, sentencechars='\n')
         results.fragmenter = my_cf
         if len(results) > 0:
             for res in results:
                 res.fragmenter = my_cf
                 # self.searchResults.append(res.highlights('Text',top=1) + '*--*\n' + res['MeetingLink']+ '\n')
                 self.searchResults.append(res.highlights('Text', top=1))
                 self.searchResults.append('-Link to Meeting -')
                 self.searchResults.append(res['MeetingLink'] + '\n')
                 self.searchResults.append('----------')
                 self.searchResults.append('----------')
         cursor = self.searchResults.moveCursor(QtGui.QTextCursor.Start)
Example #3
    def full_search(self, query, time_limit=-1, search_limit=50,
                    edit_dist=0):
        val = {}

        try:
            searcher = self._index.searcher(weighting=scoring.TF_IDF())
            if time_limit > 0:
                c = searcher.collector(limit=search_limit)
                tlc = TimeLimitCollector(c, timelimit=time_limit)
                try:
                    searcher.search_with_collector(query, tlc)
                except TimeLimit:
                    pass  # partial results are still read below
                try:
                    res = tlc.results()
                except TimeLimit:
                    res = []
            else:
                res = searcher.search(query, limit=search_limit)

            for ii in res:
                val[ii['title']] = (ii.docnum, self.scale(ii.score))
        finally:
            searcher.close()
        return val
Example #4
 def doSearch(self, text):
     q = self.qp.parse(text)          # build query with event-provided search key
     with self.ix.searcher(weighting = scoring.BM25F) as s:    # there are several NLP style scorers for Whoosh
         c = s.collector(limit=self.MaxResults)                # The "collector" allows setting the timeout for a search. In this case it's 0.5 seconds which is a little long...
         c = TimeLimitCollector(c,0.5)               
         try:
             s.search_with_collector(q,c)
         except TimeLimit:
             print("TIMEOUT!")                       # DEBUG output to console if we're timing out a lot
         results = c.results()                       # If we do get a timeout, still return whatever we've got, i.e. partial results 
                                                     #-----------------------------------------------------
         self.searchResults.clear()                  # ** Now format the results for display ** 
         results.fragmenter = WholeFragmenter()      # we want the full technical name not just the local context.
         self.MaudeResults.clear()                  # Clear
         if len(results)> 0:
             self.results = [] 
             for res in results:
                 self.results.append(res['msid'])
                 HighLightedMsid = res.highlights('msid')  # construct MSID string with highlights, if that's where the match is... 
                 if len(HighLightedMsid) >0:
                     msid_str = HighLightedMsid
                 else:
                     msid_str = res['msid']
                 HighLightedTechName = res.highlights('technical_name')  # construct technical_name string with highlights, if relevant
                 if len(HighLightedTechName) >0:
                     tech_str = HighLightedTechName
                 else:
                     tech_str = res['technical_name']
                 self.searchResults.append(msid_str + ' - ' + tech_str)
         cursor = self.searchResults.moveCursor(QtGui.QTextCursor.Start)     # return cursor to beginning of search results     
Example #5
File: ir.py Project: BinbinBian/qb
    def full_search(self, query, time_limit=-1, search_limit=50,
                    edit_dist=0):
        val = {}

        try:
            searcher = self._index.searcher(weighting=scoring.TF_IDF())
            if time_limit > 0:
                c = searcher.collector(limit=search_limit)
                tlc = TimeLimitCollector(c, timelimit=time_limit)
                try:
                    searcher.search_with_collector(query, tlc)
                except TimeLimit:
                    pass  # partial results are still read below
                try:
                    res = tlc.results()
                except TimeLimit:
                    res = []
            else:
                res = searcher.search(query, limit=search_limit)

            for ii in res:
                val[ii['title']] = (ii.docnum, self.scale(ii.score))
        finally:
            searcher.close()
        return val
Example #6
    def search(self, text: str, limit: int, timelimit=3.0):
        with self.index.searcher() as searcher:
            or_group = OrGroup.factory(.9)
            parser = MultifieldParser(['content', 'quiz_bowl'],
                                      schema=self.schema,
                                      group=or_group)
            text_query = parser.parse(text)
            collector = searcher.collector(limit=limit)
            tlc = TimeLimitCollector(collector, timelimit=timelimit)
            partial = True
            try:
                searcher.search_with_collector(text_query, tlc)
                partial = False
            except searching.TimeLimit:
                pass

            # There is a bug in whoosh that makes calling len directly or indirectly fail,
            # which is why we don't use list()
            results = [(r['page'], r.score) for r in tlc.results()]

            # Logging via the partial flag instead of logging directly is required due to a
            # mysterious race condition between whoosh time limits and log.info. It's important
            # that all of whoosh's functions, including search_with_collector() and
            # tlc.results(), are called before logging anything
            if partial:
                log.info(
                    'Search took longer than {}s, getting partial results'.
                    format(timelimit))

            if len(results) == 0:
                return [('<UNK_ANSWER>', 0)]

            return results
Example #7
 def search_for_track(self, querystring):
     if len(querystring) >= 3:
         with self.ix.searcher() as searcher:
             collector = searcher.collector(limit=20)
             tlc = TimeLimitCollector(collector, timelimit=1.4, use_alarm=False)
             parser = MultifieldParser(["artist", "album", "title"], self.ix.schema)
             parser.add_plugin(qparser.FuzzyTermPlugin())
             myquery = parser.parse(querystring)
             try:
                 searcher.search_with_collector(myquery, tlc)
                 if len(tlc.results()) == 0:
                     myquery = parser.parse(" ".join(word + "~2" for word in querystring.split()))
                     searcher.search_with_collector(myquery, tlc)
             except TimeLimit:
                 logging.info("Time Limit for query reached!")
             logging.debug("czas zapytania: ", collector.runtime)
             ret = [self.__tracks[int(result["id"])] for result in tlc.results()]
             return ret
     else:
         return []
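
The fallback in the example above relies on Whoosh's FuzzyTermPlugin, which enables the "word~n" query syntax (n = maximum edit distance). A hedged sketch of just that parsing step, with an illustrative schema and query string:

import tempfile

from whoosh import qparser
from whoosh.fields import Schema, TEXT
from whoosh.index import create_in
from whoosh.qparser import MultifieldParser

# Illustrative index; the real one above is built elsewhere in the class
ix = create_in(tempfile.mkdtemp(), Schema(artist=TEXT, album=TEXT, title=TEXT))
parser = MultifieldParser(["artist", "album", "title"], ix.schema)
parser.add_plugin(qparser.FuzzyTermPlugin())

querystring = "beatles yesterdy"            # note the typo
exact = parser.parse(querystring)
# Same rewrite as the fallback branch above: allow edit distance 2 per word
fuzzy = parser.parse(" ".join(word + "~2" for word in querystring.split()))
print(exact)
print(fuzzy)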
Example #8
def test_media():
    index_path = os.path.join(config.index_root_dir, 'media')
    storage = FileStorage(index_path)
    ix = storage.open_index()
    with ix.searcher() as searcher:
        #print(list(searcher.lexicon('title')))
        myquery = Term('title', u'尾巴')
        #myquery = Term('movieid', u'mi1022160')

        tc = searcher.collector(limit=200)
        tlc = TimeLimitCollector(tc, timelimit=1)  # limit search time
        searcher.search_with_collector(myquery, tlc)
        for hit in tlc.results():
            print(hit.fields())
Example #9
class WhooshWrap():
    '''
        Wrapper class to make the Whoosh API a little simpler.
        Initialize by pointing to an existing Whoosh index and specifying the searchable fields,
        the maximum number of results, and a timeout.
        Query by running self.doSearch, providing a query string and the fields to return.
        Results of the last search are stored on the object as a Whoosh results object (which
        requires an open index to access) and returned as a plain Python dictionary.
    '''
    def __init__(self, MSID_index_dir, Searchable, MaxResults=10, Timeout=0.5):
        ''' Initializes the wrapper object with an index reference and preferences
            parameter MSID_index_dir = (string) Existing Whoosh index directory
            parameter Searchable     = (list of strings) Fieldnames of the index to search
            parameter MaxResults     = (numeric) Maximum # of results to return
            parameter Timeout        = (numeric) Maximum # of seconds to wait before ending a search
        '''
        self.ix = index.open_dir(MSID_index_dir)
        self.qp = MultifieldParser(Searchable, schema=self.ix.schema)    # Search all the specified fields
        #self.qp = QueryParser(Searchable[0], schema=self.ix.schema)     # Search ONLY the first field
        #self.s = self.ix.searcher(weighting=scoring.Frequency)          # Simple scorer
        self.s = self.ix.searcher(weighting=scoring.BM25F)               # Fancy scorer
        c = self.s.collector(limit=MaxResults)          # The "collector" allows setting a timeout for the search
        self.c = TimeLimitCollector(c, Timeout)
        self.Searchable = Searchable
        self.LastResults = None
        
    def doSearch(self, qstring, ReturnFields):
        ''' Performs a search on the index with the provided query and returns a dict of results
            parameter qstring       = (string) Search key
            parameter ReturnFields  = (list of strings) Fieldnames to include in the results.
                                      NOTE: may differ from Searchable, but the fields must exist in the index
            returnval ResultsDict   = dict mapping each field name to a list of result strings, i.e.
                                      {'Field 1': [result strings], 'Field 2': [result strings], ...}
        '''
        q = self.qp.parse(qstring)          # build query with event-provided search key        
        try:
            self.s.search_with_collector(q, self.c)
        except TimeLimit:
            print("TIMEOUT!")                       # DEBUG output to console if we're timing out a lot
        results = self.c.results()                  # If we do get a timeout, still return whatever we've got, i.e. partial results
        self.LastResults = results
        ResultsDict = {}
        for field in ReturnFields:
            ResultsDict[field] = []
            for res in results:
                ResultsDict[field].append(res[field]) # should check that field is in results
        return ResultsDict
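
A hypothetical usage of the wrapper above; the index directory, field names, and query string are illustrative assumptions, not taken from the original project:

# Assumes an existing Whoosh index with 'msid' and 'technical_name' fields
wrapper = WhooshWrap('msid_index/', ['msid', 'technical_name'],
                     MaxResults=10, Timeout=0.5)
found = wrapper.doSearch('battery voltage', ['msid', 'technical_name'])
for msid, name in zip(found['msid'], found['technical_name']):
    print(msid, '-', name)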
            
Example #10
File: qdb.py Project: BinbinBian/qb
    def find_closest(self, raw_query, threshold=50):
        """
        Returns the best score of similarity
        """
        from whoosh import qparser
        from whoosh.qparser import QueryParser
        from fuzzywuzzy import fuzz
        from extractors.ir import IrIndex

        if self.parser is None:
            og = qparser.OrGroup.factory(0.9)
            self.parser = QueryParser("text", schema=self.schema, group=og)

        query_text, query_len = IrIndex.prepare_query(raw_query.lower())
        print("Query: %s" % query_text)
        query = self.parser.parse(query_text)
        print("-------------")
        closest_question = -1
        with self.index.searcher() as s:
            c = s.collector(limit=10)
            tlc = TimeLimitCollector(c, timelimit=5)
            try:
                s.search_with_collector(query, tlc)
            except TimeLimit:
                pass  # partial results are handled below
            try:
                results = tlc.results()
            except TimeLimit:
                print("Time limit reached!")
                return -1

            print(results[0]['id'], self.raw[results[0]['id']][:50])
            similarity = fuzz.ratio(self.raw[results[0]['id']],
                                    raw_query.lower())
            if similarity > threshold:
                closest_question = results[0]['id']
                print("Old!", closest_question, similarity)
            else:
                print("NEW!  %f" % similarity)
        print("-------------")
        return closest_question
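
For reference, fuzz.ratio from fuzzywuzzy returns an integer similarity score from 0 to 100, so the threshold of 50 above means "at least half similar". A quick illustrative check:

from fuzzywuzzy import fuzz

print(fuzz.ratio("who wrote hamlet", "who wrote hamlet?"))  # high: near-duplicate
print(fuzz.ratio("who wrote hamlet", "capital of france"))  # low: unrelated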
Example #11
	def searchIndex(self, sq):
		query = MultifieldParser(["query", "target"], schema=self.schema).parse(unicode(sq))
		with self.ix.searcher() as s:
			collector = s.collector(limit=None)
			timed_collector = TimeLimitCollector(collector, timelimit=30.0)
			
			try:
				s.search_with_collector(query, timed_collector)
			except TimeLimit:
				print 'Search time limit of 30 seconds exceeded.'
			
			hits = timed_collector.results()
			
			# Convert result structure into a jsonable list
			# TODO: improve this structure
			matches = []
			for i in hits:
				matches.append({"sourcelang": i["query"],
								"targetlang": i["target"],
								"distance": (1.0/i.score)})
			return matches
Example #12
File: qdb.py Project: jankim/qb
    def find_closest(self, raw_query, threshold=50):
        """
        Returns the best score of similarity
        """
        from whoosh import qparser
        from whoosh.qparser import QueryParser
        from fuzzywuzzy import fuzz
        from extractors.ir import IrIndex

        if self.parser is None:
            og = qparser.OrGroup.factory(0.9)
            self.parser = QueryParser("text", schema=self.schema, group=og)

        query_text, query_len = IrIndex.prepare_query(raw_query.lower())
        print ("Query: %s" % query_text)
        query = self.parser.parse(query_text)
        print ("-------------")
        closest_question = -1
        with self.index.searcher() as s:
            c = s.collector(limit=10)
            tlc = TimeLimitCollector(c, timelimit=5)
            try:
                s.search_with_collector(query, tlc)
            except TimeLimit:
                pass  # partial results are handled below
            try:
                results = tlc.results()
            except TimeLimit:
                print ("Time limit reached!")
                return -1

            print (results[0]["id"], self.raw[results[0]["id"]][:50])
            similarity = fuzz.ratio(self.raw[results[0]["id"]], raw_query.lower())
            if similarity > threshold:
                closest_question = results[0]["id"]
                print ("Old!", closest_question, similarity)
            else:
                print ("NEW!  %f" % similarity)
        print ("-------------")
        return closest_question
Example #13
def cal_sim(train_data_path,
            test_data_path,
            dst_result_path=None,
            save_n_best_search=1):
    schema = Schema(context=TEXT(stored=True),
                    response=STORED,
                    post=TEXT(stored=True))
    index_i = re.findall(r'\d', train_data_path)[0]

    index_path = "../tmp/ix_index/" + index_i
    if not os.path.exists(index_path):
        os.makedirs(index_path)

    ix = create_in(index_path, schema)
    writer = ix.writer()

    def get_cpr(line):
        # NOTE: the .decode('utf-8') calls below assume Python 2 byte strings
        lines = line.lower().strip().split('\t')
        context = ''
        post = lines[0]
        response = lines[1]
        return context.strip().decode('utf-8'), response.decode(
            'utf-8'), post.decode('utf-8')

    def load_train_data(file_name, writer):
        f = open(file_name)
        for line in f:
            context, response, post = get_cpr(line)
            if context != '':
                writer.add_document(context=context,
                                    response=response,
                                    post=post)
            else:
                writer.add_document(response=response, post=post)
        writer.commit()

    def get_query(line, ix):
        lines = line.strip().split('\t')
        post = lines[0].decode('utf-8')
        q2 = QueryParser("post", ix.schema).parse(post)
        terms = list(q2.all_terms())
        query = Or([Term(*x) for x in terms])
        return query

    load_train_data(train_data_path, writer)

    f = open(test_data_path, 'r')
    fw_search = open(dst_result_path, 'w')
    with ix.searcher(weighting=scoring.TF_IDF()) as searcher:
        c = searcher.collector(limit=10)
        tlc = TimeLimitCollector(c, timelimit=10.0)
        for line in f:
            try:
                query = get_query(line, ix)
                searcher.search_with_collector(query, tlc)
                results = tlc.results()
                for i in range(min(len(results), save_n_best_search)):
                    fw_search.write(line.strip() + '\t' +
                                    str(results[i]["post"]) + '\t' +
                                    str(results[i]["response"]) + '\n')
            except Exception as e:
                print('Search failed (likely TimeLimit), ignoring this line: %s' % e)
                print(line)
    fw_search.close()
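
A hypothetical invocation of cal_sim; the paths are illustrative assumptions. Note that train_data_path must contain a digit, since the function uses the first digit found in the path to name the index directory under ../tmp/ix_index/:

cal_sim('../data/train_1.txt', '../data/test_1.txt',
        dst_result_path='../tmp/search_1.txt',
        save_n_best_search=1)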
Example #14
        print("Finished loading searcher")
        for i, k in zip(range(0, len(fullSentence)),
                        tqdm(range(len(overlaps)))):
            # Get a collector object
            c = searcher.collector(limit=50, terms=True)
            # Wrap it in a TimeLimitCollector and set the time limit to 120 seconds
            tlc = TimeLimitCollector(c, timelimit=120.0)

            # Try searching

            try:
                searcher.search_with_collector(query[i], tlc)
            except TimeLimit:
                print("Search took too long, aborting!")
            results = tlc.results()

            #results = searcher.search(query, terms=True,limit=10)

            #results= searcher.search(query,limit=10)
            if results.scored_length() > 0:
                overlapCount += 1
                for j in range(0, results.scored_length()):
                    if j == 0:
                        print(fullSentence[i])
                        print(sentenceToBeParsed[i])
                        print("Top evidence sentence:")
                        print(results[j]['content'])
                        print(results[j].score)
                        numberOfOverlaps = len(results[j].matched_terms())
                        numberOfComponents = len(query[i].all_terms())
Example #15
def home(request):
    title = "Search text"
    form = SearchForm(request.POST or None)

    context = {
        "title": title,
        "form": form
    }

    if form.is_valid():
        instance = form.save(commit=False)
        instance.save()

        message = "You will get search results for: %s via %s soon" % (instance.searching_text, instance.email)
        context = {
            "title": "Thank you",
            "message": message,
        }

        with ix.searcher() as searcher:
            query = QueryParser("text", ix.schema).parse(instance.searching_text)
            # Get a collector object
            c = searcher.collector(limit=None)
            # Wrap it in a TimeLimitCollector and set the time limit from the form input
            tlc = TimeLimitCollector(c, timelimit=instance.t_limit)
            # Try searching
            try:
                searcher.search_with_collector(query, tlc)
            except TimeLimit:
                pass
            # You can still get partial results from the collector
            results = tlc.results()
            lst = []
            for i in range(0, len(results)):
                st = ''
                st += 'Book: '
                st += results[i]["book"]
                st += ', chapter: '
                st += results[i]["chapter"]
                st += ', page: '
                st += str(results[i]["page"])
                lst.append(st)

        # with ix.searcher() as searcher:
        #     query = QueryParser("text", ix.schema).parse(instance.searching_text)
        #     results = searcher.search(query)
        #     lst = []
        #     for i in range(0, len(results)):
        #         st = ''
        #         st += 'Book: '
        #         st += results[i]["book"]
        #         st += ', chapter: '
        #         st += results[i]["chapter"]
        #         st += ', page: '
        #         st += str(results[i]["page"])
        #         lst.append(st)

        logging.basicConfig(format=u'%(levelname)-8s [%(asctime)s] %(message)s', level=logging.DEBUG,
                            filename=u'mylog.log')
        time_diff = datetime.datetime.now(timezone.utc) - instance.timestamp
        logging.info(time_diff.total_seconds())

        subject = 'Search results for: ' + form.cleaned_data.get('searching_text')
        message = 'Search results for: ' + form.cleaned_data.get('searching_text') + '\n'
        for i in range(0, len(lst)):
            message += str(i+1)+') '
            message += lst[i]
            message += '\n'
        from_email = settings.EMAIL_HOST_USER
        to_email = form.cleaned_data.get('email')
        send_mail(subject,
                  message,
                  from_email,
                  [to_email],
                  fail_silently=True)
    return render(request, "home.html", context)
Example #16
def home(request):
    title = "Search text"
    form = SearchForm(request.POST or None)

    context = {"title": title, "form": form}

    if form.is_valid():
        instance = form.save(commit=False)
        instance.save()

        message = "You will get search results for: %s via %s soon" % (
            instance.searching_text, instance.email)
        context = {
            "title": "Thank you",
            "message": message,
        }

        with ix.searcher() as searcher:
            query = QueryParser("text",
                                ix.schema).parse(instance.searching_text)
            # Get a collector object
            c = searcher.collector(limit=None)
            # Wrap it in a TimeLimitCollector and set the time limit from the form input
            tlc = TimeLimitCollector(c, timelimit=instance.t_limit)
            # Try searching
            try:
                searcher.search_with_collector(query, tlc)
            except TimeLimit:
                pass
            # You can still get partial results from the collector
            results = tlc.results()
            lst = []
            for i in range(0, len(results)):
                st = ''
                st += 'Book: '
                st += results[i]["book"]
                st += ', chapter: '
                st += results[i]["chapter"]
                st += ', page: '
                st += str(results[i]["page"])
                lst.append(st)

        # with ix.searcher() as searcher:
        #     query = QueryParser("text", ix.schema).parse(instance.searching_text)
        #     results = searcher.search(query)
        #     lst = []
        #     for i in range(0, len(results)):
        #         st = ''
        #         st += 'Book: '
        #         st += results[i]["book"]
        #         st += ', chapter: '
        #         st += results[i]["chapter"]
        #         st += ', page: '
        #         st += str(results[i]["page"])
        #         lst.append(st)

        logging.basicConfig(
            format=u'%(levelname)-8s [%(asctime)s] %(message)s',
            level=logging.DEBUG,
            filename=u'mylog.log')
        time_diff = datetime.datetime.now(timezone.utc) - instance.timestamp
        logging.info(time_diff.total_seconds())

        subject = 'Search results for: ' + form.cleaned_data.get(
            'searching_text')
        message = 'Search results for: ' + form.cleaned_data.get(
            'searching_text') + '\n'
        for i in range(0, len(lst)):
            message += str(i + 1) + ') '
            message += lst[i]
            message += '\n'
        from_email = settings.EMAIL_HOST_USER
        to_email = form.cleaned_data.get('email')
        send_mail(subject, message, from_email, [to_email], fail_silently=True)
    return render(request, "home.html", context)