Exemplo n.º 1
0
class WhooshSearchInterface(BaseSearchInterface):
    """
    A search interface making use of the Whoosh indexing library - and the ifind search components.

    Set model = 0 for TFIDF
    Set model = 1 for BM25 (defaults to b=0.75), set pval to change b.
    Set model = 2 for PL2 (defaults to c=10.), set pval to change c.

    """
    def __init__(self, whoosh_index_dir, model=2, implicit_or=True, pval=None):
        """
        Opens the Whoosh index found at whoosh_index_dir and creates the Whooshtrec engine used for querying.

        whoosh_index_dir: path of the Whoosh index directory.
        model: retrieval model selector (0, 1 or 2 - see the class docstring).
        implicit_or: handed through unchanged to Whooshtrec.
        pval: optional model parameter; when it is not None the engine's model is set again with this value.
        """
        super(WhooshSearchInterface, self).__init__()
        log.debug("Whoosh Index to open: {0}".format(whoosh_index_dir))
        self.__index = open_dir(whoosh_index_dir)
        self.__reader = self.__index.reader()

        self.__engine = Whooshtrec(whoosh_index_dir=whoosh_index_dir,
                                   model=model,
                                   implicit_or=implicit_or)

        # Test against None, not truthiness, so that an explicit pval of 0
        # (for instance b=0.0 with BM25) is not silently ignored.
        if pval is not None:
            self.__engine.set_model(model, pval)

    def issue_query(self, query, top=100):
        """
        Allows one to issue a query to the underlying search engine. Takes an ifind Query object.

        The query's top attribute is overwritten with the top parameter before searching.
        Both the query and the response are remembered as the last query/response, and the response is returned.
        """
        query.top = top
        response = self.__engine.search(query)

        self._last_query = query
        self._last_response = response

        return response

    def get_document(self, document_id):
        """
        Retrieves a Document object for the given document specified by parameter document_id.

        document_id is converted with int() and used to look up the stored fields in the index reader.
        The stored 'title', 'content', 'docid', 'timedate' and 'source' fields are copied onto the returned Document.
        """
        fields = self.__reader.stored_fields(int(document_id))

        document = Document(id=document_id,
                            title=fields['title'],
                            content=fields['content'])
        document.date = fields['timedate']
        document.doc_id = fields['docid']
        document.source = fields['source']

        return document
Exemplo n.º 2
0
    def __init__(self, whoosh_index_dir, model=2, implicit_or=True, pval=None):
        """
        Opens the Whoosh index found at whoosh_index_dir and creates the Whooshtrec engine used for querying.

        whoosh_index_dir: path of the Whoosh index directory.
        model: retrieval model selector, handed to Whooshtrec.
        implicit_or: handed through unchanged to Whooshtrec.
        pval: optional model parameter; when it is not None the engine's model is set again with this value.
        """
        super(WhooshSearchInterface, self).__init__()
        log.debug("Whoosh Index to open: {0}".format(whoosh_index_dir))
        self.__index = open_dir(whoosh_index_dir)
        self.__reader = self.__index.reader()

        self.__engine = Whooshtrec(whoosh_index_dir=whoosh_index_dir,
                                   model=model,
                                   implicit_or=implicit_or)

        # Test against None, not truthiness, so that an explicit pval of 0
        # is passed on to the engine instead of being silently ignored.
        if pval is not None:
            self.__engine.set_model(model, pval)
Exemplo n.º 3
0
    def __init__(self,
                 whoosh_index_dir,
                 model=2,
                 implicit_or=True,
                 pval=None,
                 frag_type=2,
                 frag_size=2,
                 frag_surround=40,
                 host=None,
                 port=0):
        """
        Opens the Whoosh index found at whoosh_index_dir and creates the Whooshtrec engine used for querying.

        whoosh_index_dir: path of the Whoosh index directory.
        model: retrieval model selector, handed to Whooshtrec.
        implicit_or: handed through unchanged to Whooshtrec.
        pval: optional model parameter; when it is not None the engine's model is set again with this value.
        frag_type, frag_surround: handed to the engine's set_fragmenter().
        frag_size: assigned to the engine's snippet_size attribute.
        host, port: when host is not None, the engine is created with cache='engine' and this host/port.
        """
        super(WhooshSearchInterface, self).__init__()
        log.debug("Whoosh Index to open: {0}".format(whoosh_index_dir))
        self.__index = open_dir(whoosh_index_dir)
        self.__reader = self.__index.reader()
        self.__redis_conn = None

        # Build the engine arguments once; the cache arguments are only
        # supplied when a host has been given.
        engine_kwargs = {
            'whoosh_index_dir': whoosh_index_dir,
            'model': model,
            'implicit_or': implicit_or,
        }

        if host is not None:
            engine_kwargs.update(cache='engine', host=host, port=port)

        self.__engine = Whooshtrec(**engine_kwargs)

        # Update (2017-05-02) for snippet fragment tweaking.
        # SIGIR Study (2017) uses frag_type==1 (2 doesn't give sensible results), surround==40, snippet_sizes==2,0,1,4
        self.__engine.snippet_size = frag_size
        self.__engine.set_fragmenter(frag_type=frag_type,
                                     surround=frag_surround)

        # Test against None, not truthiness, so that an explicit pval of 0
        # is passed on to the engine instead of being silently ignored.
        if pval is not None:
            self.__engine.set_model(model, pval)
from ifind.common.language_model import LanguageModel
from compute_snippet_len_gain import make_query, get_words_from_snippet, compute_length, compute_info_gain
import sys
from ifind.search.engines.whooshtrec import Whooshtrec
from ifind.search import Query

# Module-level search engine instance, created at import time.
# NOTE(review): model=1 is presumably BM25 (going by the variable name) - confirm against Whooshtrec.
# NOTE(review): the index directory is an absolute, machine-specific path.
bm25_search_engine = Whooshtrec(
    whoosh_index_dir='/Users/david/Workspace/indexes/aquaint_test500_whoosh/',
    stopwords_file='',
    model=1,
    newschema=True)

# Set the engine's snippet_size attribute to 40.
bm25_search_engine.snippet_size = 40


def main():
	"""
	Reads the log file named by the first command-line argument and splits each line into fields.

	NOTE(review): this function looks cut off after the per-line field extraction; none of the
	values gathered here are used in the visible code.
	"""
	log_file = sys.argv[1]
	# Language model loaded from the 'vocab.in' file.
	lm = LanguageModel(file='vocab.in')
	
	# Interface...      1   2   3   4
	snippet_sizes    = [2,  0,  1,  4]
	snippet_surround = [40, 40, 40, 40]
	
	with open(log_file) as f:
		
		for s in f:
			# Each line is split on whitespace; columns are picked out by position.
			fields = s.strip().split()
			amtid = fields[3]
			interface = fields[5]
			order = fields[6]
			topic = fields[7]
Exemplo n.º 5
0
class WhooshSearchInterface(BaseSearchInterface):
    """
    A search interface making use of the Whoosh indexing library - and the ifind search components.

    Set model = 0 for TFIDF
    Set model = 1 for BM25 (defaults to b=0.75), set pval to change b.
    Set model = 2 for PL2 (defaults to c=10.), set pval to change c.
    """
    def __init__(self,
                 whoosh_index_dir,
                 model=2,
                 implicit_or=True,
                 pval=None,
                 frag_type=2,
                 frag_size=2,
                 frag_surround=40,
                 host=None,
                 port=0):
        """
        Opens the Whoosh index found at whoosh_index_dir and creates the Whooshtrec engine used for querying.

        whoosh_index_dir: path of the Whoosh index directory.
        model: retrieval model selector (0, 1 or 2 - see the class docstring).
        implicit_or: handed through unchanged to Whooshtrec.
        pval: optional model parameter; when it is not None the engine's model is set again with this value.
        frag_type, frag_surround: handed to the engine's set_fragmenter().
        frag_size: assigned to the engine's snippet_size attribute.
        host, port: when host is not None, the engine is created with cache='engine' and this host/port.
        """
        super(WhooshSearchInterface, self).__init__()
        log.debug("Whoosh Index to open: {0}".format(whoosh_index_dir))
        self.__index = open_dir(whoosh_index_dir)
        self.__reader = self.__index.reader()
        self.__redis_conn = None

        # Build the engine arguments once; the cache arguments are only
        # supplied when a host has been given.
        engine_kwargs = {
            'whoosh_index_dir': whoosh_index_dir,
            'model': model,
            'implicit_or': implicit_or,
        }

        if host is not None:
            engine_kwargs.update(cache='engine', host=host, port=port)

        self.__engine = Whooshtrec(**engine_kwargs)

        # Update (2017-05-02) for snippet fragment tweaking.
        # SIGIR Study (2017) uses frag_type==1 (2 doesn't give sensible results), surround==40, snippet_sizes==2,0,1,4
        self.__engine.snippet_size = frag_size
        self.__engine.set_fragmenter(frag_type=frag_type,
                                     surround=frag_surround)

        # Test against None, not truthiness, so that an explicit pval of 0
        # (for instance b=0.0 with BM25) is not silently ignored.
        if pval is not None:
            self.__engine.set_model(model, pval)

    def issue_query(self, query, top=100):
        """
        Allows one to issue a query to the underlying search engine. Takes an ifind Query object.

        The query's top attribute is overwritten with the top parameter before searching.
        Both the query and the response are remembered as the last query/response, and the response is returned.
        """
        query.top = top
        response = self.__engine.search(query)

        self._last_query = query
        self._last_response = response
        return response

    def get_document(self, document_id):
        """
        Retrieves a Document object for the given document specified by parameter document_id.

        document_id is converted with int() and used to look up the stored fields in the index reader.
        The stored 'title', 'content', 'docid', 'timedate' and 'source' fields are copied onto the returned Document.
        """
        fields = self.__reader.stored_fields(int(document_id))

        document = Document(id=document_id,
                            title=fields['title'],
                            content=fields['content'])
        document.date = fields['timedate']
        document.doc_id = fields['docid']
        document.source = fields['source']

        return document
Exemplo n.º 6
0
def main():
    """
    Runs the sample query 'Sea Pirates' against the 'fullindex/' index and prints per-result diagnostics.

    For each result this prints, separated by dashed lines:
    the result counter (starting at 1) and the length of its summary,
    the set built from extract_nouns() on the summary text,
    the list of tokens that nltk.pos_tag() labels NN / NNP / NNS / NNPS,
    and the set of names gathered by extract_entity_names() over the chunked sentences.

    All print calls take a single argument so the output is the same whether
    print is the Python 2 statement or the Python 3 function.
    """
    separator = "--------------"
    noun_tags = ('NN', 'NNP', 'NNS', 'NNPS')

    bm25 = Whooshtrec(
        whoosh_index_dir='fullindex/',
        stopwords_file='',
        model=1,
        newschema=True)

    query = Query('Sea Pirates')
    query.skip = 1
    query.top = 5

    bm25.snippet_size = 3

    response = bm25.search(query)

    for i, result in enumerate(response.results, 1):
        print("{0} {1}".format(i, len(result.summary)))

        # The summary is markup; reduce it to plain text before tagging.
        soup = BeautifulSoup(result.summary, 'html.parser')
        text = soup.getText()

        print(separator)

        n = extract_nouns(text)
        print(set(n))
        print(separator)

        sentences = nltk.sent_tokenize(text)
        tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]

        # Tag all tokens of the summary as one flat sequence.
        cat_sentences = [w for ts in tokenized_sentences for w in ts]

        tagged = nltk.pos_tag(cat_sentences)
        nouns = [word for word, pos in tagged if pos in noun_tags]
        print(str(nouns))

        print(separator)

        # Named-entity chunking works sentence by sentence.
        tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
        chunked_sentences = nltk.chunk.ne_chunk_sents(tagged_sentences, binary=True)

        entity_names = []
        for tree in chunked_sentences:
            entity_names.extend(extract_entity_names(tree))

        print(set(entity_names))
    'pretask/3/', 'search/3/',
    'pretask/4/', 'search/4/','endexperiment/',
    'logout/'
]

# Module-level autocomplete trie; its stopword, vocabulary and trie files are
# all located under work_dir's "data" directory.
# NOTE(review): the exact meaning of min_occurrences / suggestion_count /
# include_stopwords is defined by AutocompleteTrie - confirm there.
suggestion_trie = AutocompleteTrie(
    min_occurrences=3,
    suggestion_count=8,
    include_stopwords=False,
    stopwords_path=os.path.join(work_dir, "data/stopwords.txt"),
    vocab_path=os.path.join(work_dir, "data/vocab.txt"),
    vocab_trie_path=os.path.join(work_dir, "data/vocab_trie.dat"))

# Module-level search engine over the index at my_whoosh_doc_index_dir, using stopword_file.
# NOTE(review): model=1 is presumably BM25, matching the 'bm25' key_name assigned below - confirm against Whooshtrec.
search_engine = Whooshtrec(
    whoosh_index_dir=my_whoosh_doc_index_dir,
    stopwords_file=stopword_file,
    model=1,
    newschema=True)

# Label the engine and configure its snippet fragmenter (frag_type 2, surround 30).
search_engine.key_name = 'bm25'
search_engine.set_fragmenter(frag_type=2, surround=30)

exp_test = ExperimentSetup(
    workflow=snippet_flow,
    engine=search_engine,
    practice_topic='367',
    topics=['347', '341', '435','408'],
    rpp=10,
    practice_interface=1,
    interface=[1, 2, 3, 4],
    rotation_type=1,
Exemplo n.º 8
0
import sys
from ifind.search import Query
from ifind.search.engines.whooshtrec import Whooshtrec

from whoosh.index import open_dir
from whoosh.qparser import QueryParser

# Command-line arguments: the Whoosh index directory, then the stopwords file.
whoosh_path = sys.argv[1]
stopwords_path = sys.argv[2]

# Values assigned to the query's skip and top attributes below.
page = 3
page_len = 10

search_engine = Whooshtrec(whoosh_index_dir=whoosh_path,
                           stopwords_file=stopwords_path,
                           model=1,
                           newschema=True)

query = Query('wildlife extinction')
query.skip = page
query.top = page_len

response = search_engine.search(query)

# Every print call takes a single argument, so the output is the same whether
# print is the Python 2 statement or the Python 3 function.
for result in response:
    print('{0} {1}'.format(result.whooshid, result.rank))

print(response.result_total)
print(response.results_on_page)
print(response.actual_page)