Example #1
 def __init__(self,
              index_path,
              field,
              similarity="boolean",
              use_relevance_feedback=False,
              feedback_index_path=None):
     self.reader = DirectoryReader.open(
         FSDirectory.open(Paths.get(index_path)))
     self.searcher = IndexSearcher(self.reader)
     if use_relevance_feedback and feedback_index_path is not None:
         self.feedback_reader = DirectoryReader.open(
             FSDirectory.open(Paths.get(feedback_index_path)))
         self.feedback_searcher = IndexSearcher(self.feedback_reader)
     self.similarity = similarity
     self.stopwords = stop_words()
     if similarity == "boolean":
         self.searcher.setSimilarity(BooleanSimilarity())
     elif similarity == "tf":
         self.searcher.setSimilarity(TFSimilarity())
     elif similarity == "tfidf":
         self.searcher.setSimilarity(ClassicSimilarity())
     elif similarity == "BM25":
         self.searcher.setSimilarity(BM25Similarity(1.2, 0.2))
     else:
         print("Unknown similarity, so we use BM25(1.2, 0.2) as default")
         self.searcher.setSimilarity(BM25Similarity(1.2, 0.2))
     analyzer = StandardAnalyzer()
     print(self.searcher.getSimilarity())
     self.parser = QueryParser(field, analyzer)
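
A minimal usage sketch for the constructor above. The enclosing class name Retriever is hypothetical (the snippet shows only __init__), and the index path and query are placeholders:

# Hypothetical usage; `Retriever` is an assumed name for the class that
# owns the __init__ above.
import lucene
lucene.initVM()
retriever = Retriever("path/to/index", "contents", similarity="BM25")
query = retriever.parser.parse("test query")
top_docs = retriever.searcher.search(query, 10)
for score_doc in top_docs.scoreDocs:
    print(retriever.searcher.doc(score_doc.doc))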
Example #2
 def __init__(self, directory):
     self.directory = directory
     # create Directories for the search index and the taxonomy index,
     # either in RAM or on disk
     #indexDir = RAMDirectory()
     #taxoDir = RAMDirectory()
     self.indexDir = FSDirectory.open(File(os.path.join(self.directory,
                                                        INDEX_DIR)))
     self.taxoDir = FSDirectory.open(File(os.path.join(self.directory,
                                                       TAXONOMY_DIR)))
Example #3
 def __init__(self, directory):
     self.directory = directory
     # create Directories for the search index and the taxonomy index,
     # either in RAM or on disk
     #indexDir = RAMDirectory()
     #taxoDir = RAMDirectory()
     self.indexDir = FSDirectory.open(Paths.get(os.path.join(self.directory,
                                                             INDEX_DIR)))
     self.taxoDir = FSDirectory.open(Paths.get(os.path.join(self.directory,
                                                            TAXONOMY_DIR)))
     # FacetConfig
     self.facets_config = FacetsConfig()
     self.facets_config.setHierarchical("Categories", True)
     self.facets_config.setMultiValued("Categories", True)
Example #4
    def __init__(self, index_dir, mode, date_format='%Y-%m-%dT%H:%M:%S'):
        """Constructor of Indexer.

        Parameters
        ----------
        index_dir : string
            The location of lucene index
        mode : string
            The mode when opening lucene index. Available values are:
                'create', open new index and overwriting over index,
                'append', open existed index and append.
                'create_or_append', if `index_dir` exists, 'append',
                else 'create'
        date_format : string
            We save datetime field as string, `date_format` specify how to
            format datetime into string.
        """
        # self.store = FSDirectory.open(File(index_dir))
        self.store = FSDirectory.open(Paths.get(index_dir))
        # self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.analyzer = StandardAnalyzer()
        # self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.config = IndexWriterConfig(self.analyzer)
        self.mode = mode
        self.date_format = date_format
        if mode == 'create_or_append':
            self.config.setOpenMode(
                IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        elif mode == 'create':
            self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        elif mode == 'append':
            self.config.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
        else:
            raise ValueError('Invalid mode %s' % mode)
        self.writer = IndexWriter(self.store, self.config)
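
A short usage sketch for the Indexer above; the index path is a placeholder, and nothing beyond the documented constructor is assumed:

# Hedged usage sketch: open (or create) an index and release the write lock.
indexer = Indexer('lucene_index', mode='create_or_append')
# 'create_or_append' maps to IndexWriterConfig.OpenMode.CREATE_OR_APPEND,
# so the same call works whether or not 'lucene_index' already exists.
indexer.writer.close()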
Example #5
def search(querystr):
    print('lucene', lucene.VERSION)
    # lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    directory = FSDirectory.open(Paths.get("index"))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer()

    q = QueryParser("name", analyzer).parse(querystr)

    hitsPerPage = 20
    docs = searcher.search(q, hitsPerPage)
    hits = docs.scoreDocs

    people = []
    number = 1
    for hit in hits:
        # print(hit.doc, hit.score)
        d = searcher.doc(hit.doc)
        person = {}
        print(number, d.get("name"))
        person['Name'] = (d.get("name"))
        person['Birth date'] = (d.get("birth_date"))
        person['Death date'] = (d.get("death_date"))
        person['Birth note'] = (d.get("birth_note"))
        person['Death note'] = (d.get("death_note"))
        people.append(person)
        number += 1

    return people
Example #6
def l_searcher(query_string, directory, number_documents):
	lucene.initVM()

	# analyzer = StandardAnalyzer()
	reader = DirectoryReader.open(FSDirectory.open(Paths.get(directory)))
	searcher = IndexSearcher(reader)

	# Top 'n' documents as result
	topN = number_documents

	try:
		# query = QueryParser("question", analyzer).parse(query_string)
		query = FuzzyQuery(Term("question", query_string), 2)
		print("The query was: {}".format(query))

		hits = searcher.search(query, topN)

		print("The hits were: ")

		options = []
		options_answers = []

		# print(hits.totalHits)

		for hit in hits.scoreDocs:
			print(hit.doc)
			# print(hit.score, hit.doc, hit.toString())
			doc = searcher.doc(hit.doc)
			options_answers.append(doc.get("answer"))
			options.append(doc.get("question"))
			# print(doc.get("answer"))

		return options, options_answers
	except IndexError:
		return None
Example #7
def find_all_text_occurrences(objects: list) -> (dict, DirectoryReader):
    docs_lookup = dict()
    # noinspection PyUnresolvedReferences
    lucene.initVM(initialheap='32m', maxheap='4G')
    file = Paths.get(r"D:\GitHubD\BREDS\wiki_text_index\WIKI_TEXT")
    directory = FSDirectory.open(file)
    reader = DirectoryReader.open(directory)
    searcher = IndexSearcher(reader)
    parser = QueryParser('contents', StandardAnalyzer())

    logging.warning(
        'FOR MULTI-WORD OBJECTS, ALL DOCUMENTS WITH BOTH TERMS SEPARATELY WILL BE RETRIEVED'
    )

    for obj in objects:
        tokens = obj.split(' ')

        doc_sets = []
        for token in tokens:
            q = parser.parse(f'"{token}"')
            # TODO maybe use minimum score
            topdocs = searcher.search(q, 99999999)
            results = set([topdoc.doc for topdoc in topdocs.scoreDocs])
            doc_sets.append(results)
        docs_lookup[obj] = set.intersection(*doc_sets)

    return docs_lookup, reader
Example #8
    def __init__(self, searchDir):

        self.analyzer = MyPythonEnglishAnalyzer(
            stopwords=Indexer.ENGLISH_STOP_WORDS_SET)
        self.directory = FSDirectory.open(Paths.get(searchDir))
        self.reader = DirectoryReader.open(self.directory)
        self.searcher = IndexSearcher(self.reader)
Example #9
    def __init__(self, baseDir, indexDirectory="IR.Index"):
        """
        :param baseDir: The directory where this querier is run
        :param indexDirectory: Directory of indices, default value = 'IR.Index'
        """
        indexDir = FSDirectory.open(Paths.get(os.path.join(baseDir, indexDirectory)))

        self.reader = DirectoryReader.open(indexDir)
Example #10
 def __init__(self, dir, data_file):
     self.dir = dir
     self.data_file = data_file
     index_dir = FSDirectory.open(Paths.get(self.dir))
     analyzer = StandardAnalyzer()
     writer_config = IndexWriterConfig(analyzer)
     writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
     self.writer = IndexWriter(index_dir, writer_config)
Example #11
def build_corpus(n=0):
    sbcs = texeval_corpus.test_subcorpora
    sbc = sbcs[n]
    # Hack for parallelizing queries, uses one index per domain.
    directory = FSDirectory.open(File(wiki_index+'-'+sbc))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    build_corpus_from_terms_with_wiki(sbc, searcher, analyzer)
Example #12
 def create_index_dir(self):
     """
      Create the directory where the index is stored
     :return: index directory
     """
     path = Paths.get('index')
     indexDir = FSDirectory.open(path)
     return indexDir
Example #13
def createIndexWriter(indexDir):
    if not os.path.exists(indexDir):
        os.mkdir(indexDir)
    directory = FSDirectory.open(Paths.get(indexDir))
    config = IndexWriterConfig(WhitespaceAnalyzer())
    #config = config.setRAMBufferSizeMB(ramBufferSize)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    return IndexWriter(directory, config)
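
A short usage sketch for createIndexWriter above; the path and field name are illustrative, and a running JVM (lucene.initVM()) is assumed:

# Illustrative only, using the standard PyLucene document classes.
from org.apache.lucene.document import Document, Field, TextField

writer = createIndexWriter('my_index')
doc = Document()
doc.add(TextField('contents', 'hello lucene', Field.Store.YES))
writer.addDocument(doc)
writer.commit()
writer.close()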
Example #14
def getLucene(path):
    directory = FSDirectory.open(Paths.get(path))
    analyzer = WhitespaceAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setIndexSort(Sort(SortField(NUMERIC_STAMP_FIELD, SortField.Type.LONG)))
    writer = IndexWriter(directory, config)
    reader = writer.getReader()
    searcher = IndexSearcher(reader)
    return writer, reader, searcher
Example #15
 def __init__(self, index_path, update=False):
     directory = FSDirectory.open(Paths.get(index_path))
     analyzer = StandardAnalyzer()
     iwc = IndexWriterConfig(analyzer)
     if update:
         iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
     else:
         iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
     self.writer = IndexWriter(directory, iwc)
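
The update flag above toggles between rebuilding and extending an index. A sketch, with Writer as a hypothetical name for the enclosing class and a placeholder path:

# Hypothetical usage; `Writer` stands in for the class owning this __init__.
w = Writer('/tmp/idx')               # OpenMode.CREATE: wipes any existing index
w.writer.close()
w = Writer('/tmp/idx', update=True)  # OpenMode.CREATE_OR_APPEND: keeps it
w.writer.close()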
Example #16
 def _getLucene(self, path):
     directory = FSDirectory.open(Paths.get(path))
     config = IndexWriterConfig(None)
     config.setRAMBufferSizeMB(256.0)  # faster
     config.setUseCompoundFile(False)  # faster, for Lucene 4.4 and later
     writer = IndexWriter(directory, config)
     reader = writer.getReader()
     searcher = IndexSearcher(reader)
     return writer, reader, searcher
Example #17
def index_scan():
    print("Scanning the index")
    #pdb.set_trace()
    indexPath = File("indexOut/").toPath()
    indexDir = FSDirectory.open(indexPath)
    reader = DirectoryReader.open(indexDir)
    fields = MultiFields.getFields(reader)
    for field in fields:
        term = MultiFields.getTerms(reader,field)
        print(field, "->", term)
Example #18
def getLucene(path):
    directory = FSDirectory.open(File(path))
    analyzer = WhitespaceAnalyzer()
    config = IndexWriterConfig(Version.LATEST, analyzer)
    mergePolicy = config.getMergePolicy()
    sortingMergePolicy = SortingMergePolicy(mergePolicy, Sort(SortField(NUMERIC_STAMP_FIELD, SortField.Type.LONG)))
    config.setMergePolicy(sortingMergePolicy)
    writer = IndexWriter(directory, config)
    reader = writer.getReader()
    searcher = IndexSearcher(reader)
    return writer, reader, searcher
Example #19
    def __init__(self):
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])

        paths_dict = util.getPaths()
        # textSearcher = TextSearcher(paths_dict['fs_directory'])
        fs_directory = FSDirectory.open(Paths.get(paths_dict['fs_directory']))
        index_reader = DirectoryReader.open(fs_directory)
        self.lucene_dictionary = LuceneDictionary(index_reader, 'contents')
        self.analyzer = StandardAnalyzer()
        self.searcher = IndexSearcher(index_reader)
        self.formatter = SimpleHTMLFormatter()
Example #20
 def init_index(self):
     """
         Initializes the Lucene index and creates a StandardAnalyzer and an IndexSearcher.
         Sets:
             vm: The initialized Java VM.
             analyzer: StandardAnalyzer word analyzer.
             searcher: Searcher over the Lucene index.
     """
     self.vm = lucene.initVM(vmargs=['-Djava.awt.headless=true'])
     ldir = FSDirectory.open(Paths.get(settings.LUCENE_INDEX))
     self.analyzer = StandardAnalyzer()
     self.searcher = IndexSearcher(DirectoryReader.open(ldir))
Example #21
def open_writer(path):
    from java.io import File
    from org.apache.lucene.analysis.core import WhitespaceAnalyzer
    from org.apache.lucene.analysis.standard import StandardAnalyzer
    from org.apache.lucene.index import IndexWriter, IndexWriterConfig
    from org.apache.lucene.store import FSDirectory
    from org.apache.lucene.util import Version
    directory = FSDirectory.open(File(path))
    analyzer = StandardAnalyzer(Version.LUCENE_43)
    config = IndexWriterConfig(Version.LUCENE_43, analyzer)
    writer = IndexWriter(directory, config)
    return writer
Example #22
def createIndex():
    print(lucene.VERSION)
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])

    index = FSDirectory.open(Paths.get('index'))
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)

    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(index, config)

    openCSV('../output/outputSpark_full_index.csv', writer)
    writer.close()
Example #23
def retrieve_wiki(text_query, index_directory_name):
    lucene.initVM()
    directory = FSDirectory.open(File(index_directory_name))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    
    txt = text_query
    query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(txt)
    scoreDocs = searcher.search(query, 1000).scoreDocs
    
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        yield doc.get('contents')    
Example #24
    def __init__(self, indexDir, root="testdocs"):
        # create and open an index writer
        indexDir = FSDirectory.open(Paths.get(indexDir))

        # TODO: make an appropriate analyzer and add it to the config
        analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)
        config = IndexWriterConfig(analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        iw = IndexWriter(indexDir, config)
        self.authorcount = 0
        self.titlecount = 0
        self.errorcount = 0

        self.indexDocs(root, iw)
Example #25
    def __init__(self, index_dir):
        """

        :param index_dir: the dir where to store the index.
        """
        self.indexDir = index_dir
        if not os.path.exists(index_dir):
            os.mkdir(index_dir)
        self.analyzer = MyPythonEnglishAnalyzer(
            stopwords=self.ENGLISH_STOP_WORDS_SET)
        conf = IndexWriterConfig(self.analyzer)
        conf.setUseCompoundFile(False)
        directory = FSDirectory.open(Paths.get(index_dir))
        self.writer = IndexWriter(directory, conf)
Example #26
    def search(self, query):
        lucene.initVM()
        luceneDirectory = "/index/"

        path = str(os.path.abspath(os.getcwd()) + luceneDirectory)
        directory = FSDirectory.open(Paths.get(path))
        reader = DirectoryReader.open(directory)
        searcher = IndexSearcher(reader)
        analyzer = StandardAnalyzer()

        #args = len(sys.argv) - 1

        #if args < 1:
        #   print ("\n No query was submitted! \n")
        #else:
        #query_string = ""
        #position = 1
        #while(args >= position):
        #query_string = query_string + str(sys.argv[position]) + " "
        #position = position + 1

        print("Searching for '" + query + "'")

        fields_to_search = ["text", "page title", "date"]
        filter_date = 'date:"May 25"'

        filtered_query = filter_date + " AND " + query

        parser = MultiFieldQueryParser(fields_to_search, analyzer)
        updated_query = MultiFieldQueryParser.parse(parser, filtered_query)
        scored_documents = searcher.search(updated_query,
                                           10).scoreDocs  # array of docs

        print("Found " + str((len(scored_documents))) +
              " matches in the collection.")

        results = []
        for doc in scored_documents:
            scoredTweet = dict()
            scoredTweet['score'] = doc.score
            result = searcher.doc(doc.doc)
            scoredTweet['username'] = result.get("username")
            scoredTweet['tweet_body'] = result.get("text")
            scoredTweet['date'] = result.get("date")
            results.append(scoredTweet)
            print(scoredTweet)

        return results
Example #27
def author_search(qry, limit):
    helper.initPyLucene()
    RNLP_ctxt = _get_rnlp_ctxt()
    entry_map = RNLP_ctxt.get_entry_map()
    rootdir = OUT_RAW_DIR

    from org.apache.lucene.index import DirectoryReader
    from org.apache.lucene.search import IndexSearcher
    from org.apache.lucene.queryparser.classic import QueryParser
    from org.apache.lucene.analysis.standard import StandardAnalyzer
    from org.apache.lucene.store import FSDirectory
    from org.apache.lucene.util import Version
    from java.io import File
    
    reader = DirectoryReader.open(FSDirectory.open(File(IDX_DIR)))
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer(Version.LUCENE_40)

    field = 'contents'
    parser = QueryParser(Version.LUCENE_40, field, analyzer)

    query = parser.parse(qry)
    print('Searching for:', query.toString(field))
    raw_results = searcher.search(query, limit)
    
    hits = raw_results.scoreDocs
    numTotalHits = raw_results.totalHits
    print(numTotalHits, 'total matching documents')
    
    results = {}
    for hit in hits:
        doc = searcher.doc(hit.doc)
        entry_id = doc.get('entry_id')

        entry = entry_map.get(entry_id)

        short_title = entry['short_title']
        print(entry['prim_author'])

        if qry in entry['prim_author'].lower():
            fname = short_title + CONTENT_EXT
            results[entry_id] = {'title': short_title, 'file': fname}
    
    f = open('/Users/Nelle/Documents/coding/text_analysis/newsvn/RenaissanceNLP/data/dataResults/authorPaths/' + qry + '.json', 'w')
    f.write(json.dumps(results))
    f.close()
    return json.dumps(results)
Example #28
def index_wiki(wiki_xmlfile, index_directory_name):
    # Initialize index directory and analyzer.
    version = Version.LUCENE_CURRENT
    store = FSDirectory.open(File(index_directory_name))
    analyzer = StandardAnalyzer(version)
    # Create the index writer config.
    config = IndexWriterConfig(version, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)
    # Set document content field type.
    content_fieldtype = FieldType()
    content_fieldtype.setIndexed(True)
    content_fieldtype.setStored(True)
    content_fieldtype.setTokenized(True)
    content_fieldtype.setIndexOptions(
        FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # Set document title field type.
    title_fieldtype = FieldType()
    title_fieldtype.setIndexed(True)
    title_fieldtype.setStored(True)
    title_fieldtype.setTokenized(True)
    title_fieldtype.setIndexOptions(
        FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # Set document url field type.
    url_fieldtype = FieldType()
    url_fieldtype.setIndexed(True)
    url_fieldtype.setStored(True)
    url_fieldtype.setTokenized(False)
    url_fieldtype.setIndexOptions(
        FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for xmldoc in wikicorpusxml((wiki_xmlfile)):
        content = xmldoc.partition('>')[2].partition('<')[0].strip()
        title = xmldoc.partition(' title="')[2].partition('"')[0].strip()
        url = xmldoc.partition(' url="')[2].partition('"')[0].strip()
        doc = Document()
        doc.add(Field("contents", content, content_fieldtype))
        doc.add(Field("title", title, title_fieldtype))
        doc.add(Field("url", url, url_fieldtype))
        writer.addDocument(doc)

    writer.commit()
    writer.close()
Example #29
def index_wiki(wiki_xmlfile, index_directory_name):
    lucene.initVM()
    # Initialize index directory and analyzer.
    version = Version.LUCENE_CURRENT
    store = FSDirectory.open(File(index_directory_name))
    analyzer = StandardAnalyzer(version)
    # Create the index writer config.
    config = IndexWriterConfig(version, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)
    # Set document content field type.
    content_fieldtype = FieldType()
    content_fieldtype.setIndexed(True)
    content_fieldtype.setStored(True)
    content_fieldtype.setTokenized(True)
    content_fieldtype.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    
    # Set document title field type.
    title_fieldtype = FieldType()
    title_fieldtype.setIndexed(True)
    title_fieldtype.setStored(True)
    title_fieldtype.setTokenized(True)
    title_fieldtype.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    
    # Set document url field type.
    url_fieldtype = FieldType()
    url_fieldtype.setIndexed(True)
    url_fieldtype.setStored(True)
    url_fieldtype.setTokenized(False)
    url_fieldtype.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    
    
    for xmldoc in wikicorpusxml((wiki_xmlfile)):
        content = xmldoc.partition('>')[2].partition('<')[0].strip()
        title = xmldoc.partition(' title="')[2].partition('"')[0].strip()
        url = xmldoc.partition(' url="')[2].partition('"')[0].strip()
        doc = Document()
        doc.add(Field("contents", content, content_fieldtype))
        doc.add(Field("title", title, title_fieldtype))
        doc.add(Field("url", url, url_fieldtype))
        writer.addDocument(doc)
     
    writer.commit()
    writer.close()
Example #30
def find_frequencies_wikipedia(terms: List[str], index_location: str):
    """Find frequencies using a Lucene index of wikipedia."""
    # TODO doesn't find any n>1 grams due to missing location index on contents!

    logger.warning('Not working! Does not find any n>1 grams')
    # noinspection PyUnresolvedReferences
    lucene.initVM(initialheap='32m', maxheap='4G')
    file = Paths.get(index_location)
    directory = FSDirectory.open(file)
    reader = DirectoryReader.open(directory)

    freqs = {}
    for term_str in tqdm.tqdm(terms):
        term = Term("contents", term_str)
        freq = reader.totalTermFreq(term)
        freqs[term_str] = freq

    reader.close()
    return freqs
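
An illustrative call of the function above; the term list and index path are placeholders:

# Illustrative only: prints the collection-wide term frequency of each
# single-word term (n>1 grams return 0, per the warning above).
freqs = find_frequencies_wikipedia(['python', 'lucene'], '/path/to/wiki_index')
for term, freq in freqs.items():
    print(term, freq)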
Example #31
def custom_search(qry, limit):
    helper.initPyLucene()
    RNLP_ctxt = _get_rnlp_ctxt()
    entry_map = RNLP_ctxt.get_entry_map()
    rootdir = OUT_RAW_DIR

    from org.apache.lucene.index import DirectoryReader
    from org.apache.lucene.search import IndexSearcher
    from org.apache.lucene.queryparser.classic import QueryParser
    from org.apache.lucene.analysis.standard import StandardAnalyzer
    from org.apache.lucene.store import FSDirectory
    from org.apache.lucene.util import Version
    from java.io import File
    
    reader = DirectoryReader.open(FSDirectory.open(File(IDX_DIR)))
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer(Version.LUCENE_40)

    field = 'contents'
    parser = QueryParser(Version.LUCENE_40, field, analyzer)

    query = parser.parse(qry)
    print('Searching for:', query.toString(field))
    raw_results = searcher.search(query, limit)
    
    hits = raw_results.scoreDocs
    numTotalHits = raw_results.totalHits
    print(numTotalHits, 'total matching documents')
    print(rootdir)
    
    results = {}
    for hit in hits:
        doc = searcher.doc(hit.doc)
        entry_id = doc.get('entry_id')

        entry = entry_map.get(entry_id)

        short_title = entry['short_title']
        year = entry['publ_year']

        fname = short_title + CONTENT_EXT
        results[fname] = year

    return results
Example #32
    def index(self):
        # if sent_index exists, delete it and create a new one
        doc_tool.cleardir(index_root)
        doc_tool.mkdir(index_root)

        index_dir = FSDirectory.open(Paths.get(index_root))
        writer_config = IndexWriterConfig(StandardAnalyzer())
        writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(index_dir, writer_config)

        ft1 = FieldType()
        ft1.setStored(True)
        ft1.setIndexOptions(IndexOptions.NONE)

        ft2 = FieldType()
        ft2.setStored(False)
        ft2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        doc_list = self.doc()
        file_path = os.path.join(config.SENT_RETRIEVAL_ROOT, "merge_doc")
        file_list = os.listdir(file_path)

        num = 0
        for file in file_list:
            docs = doc_tool.load_json_file(file_path, file)
            for page_identifier in docs:
                if page_identifier in doc_list:
                    num += 1
                    for sent_number in docs[page_identifier]:
                        sentence_text = self.process_sent(
                            docs[page_identifier][sent_number])
                        doc = Document()
                        doc.add(Field("page_identifier", page_identifier, ft1))
                        doc.add(Field("sentence_number", sent_number, ft1))
                        doc.add(Field("sentence_text", sentence_text, ft2))
                        writer.addDocument(doc)
                    print(num)

        writer.commit()
        writer.close()
        index_dir.close()
Example #33
def do_search(qry, limit):
    helper.initPyLucene()
    RNLP_ctxt = _get_rnlp_ctxt()
    entry_map = RNLP_ctxt.get_entry_map()

    from org.apache.lucene.index import DirectoryReader
    from org.apache.lucene.search import IndexSearcher
    from org.apache.lucene.queryparser.classic import QueryParser
    from org.apache.lucene.analysis.standard import StandardAnalyzer
    from org.apache.lucene.store import FSDirectory
    from org.apache.lucene.util import Version
    from java.io import File
    
    print(os.path.abspath(os.path.pardir))
    
    reader = DirectoryReader.open(FSDirectory.open(File(IDX_DIR)))
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer(Version.LUCENE_40)

    field = 'contents'
    parser = QueryParser(Version.LUCENE_40, field, analyzer)

    query = parser.parse(qry)
    print('Searching for:', query.toString(field))
    raw_results = searcher.search(query, limit)
    
    hits = raw_results.scoreDocs
    numTotalHits = raw_results.totalHits
    print(numTotalHits, 'total matching documents')
    results = []
    for hit in hits:
        doc = searcher.doc(hit.doc)
        entry_id = doc.get('entry_id')

        entry = entry_map.get(entry_id)
        #print 'entry:', entry
        score = hit.score
        #print 'Hit:', entry['short_title'], score
        results.append((score, doc, entry))
        
    return results
Example #34
def l_indexer(directory, load_path):
	lucene.initVM()

	# index_dir = SimpleFSDirectory(File(directory))
	index_dir = FSDirectory.open(Paths.get(directory))
	writer_config = IndexWriterConfig(PortugueseAnalyzer())
	# writer_config = IndexWriterConfig(customPortugueseAnalyser())
	writer = IndexWriter(index_dir, writer_config)

	with open(load_path) as subtles_file:
		subtles_corpus = subtles_file.read().splitlines()

	for i in range(0, len(subtles_corpus), 2):
		doc = Document()
		doc.add(Field("question", subtles_corpus[i], StringField.TYPE_STORED))
		doc.add(Field("answer", subtles_corpus[i+1], StringField.TYPE_STORED))

		writer.addDocument(doc)

	writer.close()
	print("Index successfully created!")
Example #35
    def __init__(self,
                 index_dir,
                 search_fields=['canonical_url', 'title', 'meta', 'content'],
                 unique_field='uq_id_str',
                 boost=dict(canonical_url=4.0,
                            title=8.0,
                            meta=2.0,
                            content=1.0),
                 date_format='%Y-%m-%dT%H:%M:%S'):
        """Constructor of Searcher.

        Parameters
        ----------
        index_dir : string
            The location of lucene index.
        search_fields : list
            A list of field names indicating fields to search on.
        unique_field : string
            The field name, on which the duplication should avoid.
        boost : dict
            This dict control the weight when computing score.
        date_format : string
            Convert the string into datetime. Should consistent with the
            index part.
        """
        self.index_dir = index_dir
        self.search_fields = search_fields
        self.sort_by_recent = Sort(
            SortField('date_published', SortField.Type.STRING, True))
        self.store = FSDirectory.open(File(index_dir))
        self.reader = DirectoryReader.open(self.store)
        self.isearcher = IndexSearcher(self.reader)
        self.analyzer = StandardAnalyzer()
        self.dup_filter = DuplicateFilter(unique_field)
        self.boost_map = HashMap()
        for k, v in boost.iteritems():
            self.boost_map.put(k, Float(v))
        self.mul_parser = MultiFieldQueryParser(search_fields, self.analyzer,
                                                self.boost_map)
        self.date_format = date_format
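
A hedged instantiation sketch for the Searcher above; only index_dir is required, everything else falls back to the documented defaults:

# Illustrative only; the index path is a placeholder.
s = Searcher('lucene_index')
# With the default boost map, a term match in 'title' (8.0) contributes
# eight times the score of the same match in 'content' (1.0),
# other factors being equal.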
Example #36
 def __index(self, emailInfo):
     from org.apache.lucene.index import IndexWriterConfig
     from org.apache.lucene.util import Version
     from org.apache.lucene.analysis.standard import StandardAnalyzer
     analyser = StandardAnalyzer(Version.LUCENE_33)
     conf = IndexWriterConfig(Version.LUCENE_33, analyser)
     from org.apache.lucene.store import FSDirectory
     from java.io import File
     storage = File.createTempFile(u'Tubelight-', '.index')
     storage.delete()
     storage.mkdir()
     storage.deleteOnExit()
     self.storage = storage.getAbsolutePath()
     from java.io import File
     self.session.setAttribute('directory', storage.getAbsolutePath()+File.separator+'mail.idx')
     directory = FSDirectory.open(storage)
     from org.apache.lucene.index import IndexWriter
     iw = IndexWriter(directory, conf)
     from us.d8u.tubelight import Configuration
     addr = emailInfo[Configuration.EmailAddressKey]
     (username, server) = addr.split('@')
     from java.lang import System
     System.setProperty("mail.imap.partialfetch", "false")
     urlPrefix = (("imap://%s@%s:%d/Inbox") % (username, server, int(emailInfo[Configuration.EmailPortKey])))
     from javax.mail import Session
     session = Session.getDefaultInstance(System.getProperties(), None).getStore(emailInfo[Configuration.EmailProtocolKey])
     session.connect(server, username, emailInfo[Configuration.EmailPasswordKey])
     folder = session.getDefaultFolder()
     for m in folder.getMessages():
         from org.apache.lucene.document import Document, Field
         d = Document()
         subject = Field("subject", m.getSubject(), Field.Store.YES, Field.Index.ANALYZED)
         toSrc = u', '.join(str(r) for r in m.getAllRecipients())
         to = Field("to", toSrc, Field.Store.YES, Field.Index.ANALYZED)
         d.add(to)
         d.add(subject)
         iw.addDocument(d)
     iw.commit()
     self.searcher = IndexSearcher(directory)
Example #37
def retrieving(searchword):
    indexPath = File("indexOut/").toPath()
    indexDir = FSDirectory.open(indexPath)
    reader = DirectoryReader.open(indexDir)
    idxDocs = reader.maxDoc()
    print("We have ", idxDocs, " indexed documents")
    searcher = IndexSearcher(reader)
    idx_analyzer = EnglishAnalyzer()
    # Search for the input term in the field stored as text.
    # To search multiple fields, try MultiFieldQueryParser, but it is not recommended.
    # It's best to club everything we want to search into a single field and use wildcard matching on it.
    query = QueryParser("text", idx_analyzer).parse(searchword)
    MAX = 1000
    hits = searcher.search(query, MAX)
    print ("Found %d document(s) that matched query '%s':" % (hits.totalHits, query))
    try:
        for hit in hits.scoreDocs:
            print (hit.score, hit.doc, hit.toString())
            doc = searcher.doc(hit.doc)
            print (doc.get("text").encode("utf-8"))
    except:
        print("Could not find the word")
Example #38
    def __init__(self, dest=None):
        """
        create an Apache Lucene indexer

        input:
            dest    destination to store index information. If not set, use
                    RAM.

        """
        # where to store information file or ram
        if dest:
            _dir = FSDirectory.open(java.io.File(dest))
        else:
            _dir = RAMDirectory()
        self.directory = _dir

        # analyser
        self.analyser = StandardAnalyzer(Version.LUCENE_CURRENT)

        # index writer
        cfg = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyser)
        cfg.setDefaultWriteLockTimeout(6000)
        self.idx_writer = IndexWriter(self.directory, cfg)
Example #39
def getReader(path):
    return DirectoryReader.open(FSDirectory.open(Paths.get(path)))
Example #40
def getReader(path):
    return DirectoryReader.open(FSDirectory.open(File(path)))
Example #41
        # Use MoreLikeThis for query-by-document
        mlt = MoreLikeThis(reader)
        mlt.setFieldNames(["title", "director", "writer", "genre", "cast", "fullplot"])
        mlt.setMinTermFreq(0)
        mlt.setMinDocFreq(0)
        mlt.setAnalyzer(self.analyzer)
        mlt_query = mlt.like(results.scoreDocs[0].doc)

        # Filter the original film
        filtered_query = BooleanQuery()
        filtered_query.add(mlt_query, BooleanClause.Occur.MUST)
        filtered_query.add(film_query, BooleanClause.Occur.MUST_NOT)
        score_docs = self.searcher.search(filtered_query, count).scoreDocs

        return self._retrieve_in_order(score_docs)


# Initialize Lucene
lucene.initVM()
logger = logging.getLogger(__name__)
logger.info('Initialising Lucene VM')
base_dir = os.path.abspath(os.path.curdir)
index_file = os.path.join(base_dir, settings.LUCENE['PATH'])
index = FSDirectory.open(File(index_file))
try:
    reader = DirectoryReader.open(index)
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
except lucene.JavaError as e:
    logger.error('Lucene not loaded')
Example #42
"""

# lucene modules needed for this script
import lucene
from java.io import File
from org.apache.lucene.util import Version
from org.apache.lucene.analysis.core import WhitespaceAnalyzer
from org.apache.lucene.store import FSDirectory
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.document import Document, Field, TextField

# start Java VM 
lucene.initVM(vmargs=['-Djava.awt.headless=true'])

# indexing directory
indexDir = FSDirectory.open(File("lucene_index.Index"))

# input which will be indexed with Lucene
title1 = "text of title1"
title2 = "title2"
abstract1 = "abstract1 has many words, e.g. hellow world can be the text"
abstract2 = "text of abstract2"

# configure indexing
config = IndexWriterConfig(Version.LUCENE_CURRENT, WhitespaceAnalyzer(Version.LUCENE_CURRENT))
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
iw = IndexWriter(indexDir, config)

# count number of documents processed
nDocsAdded = 0