Example #1
    def __init__(self):
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()

        indexDir = SimpleFSDirectory(File(app.config['INDEX_PATH']))
        self.searcher = IndexSearcher(DirectoryReader.open(indexDir))

        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

        self.parser = QueryParser(Version.LUCENE_CURRENT, "contents", self.analyzer)
Example #2
    def document_to_query(self, doc):
        """ Given a document it transforms the source code related fields to a lucene query string"""
        query = ""
        for field in ["description"]:
            for val in doc.getFields(field):
                if val.stringValue().strip():
                    term = QueryParser.escape(val.stringValue())
                    #tokenize
                    term = self.tokenize_string(StandardAnalyzer(), term)
                    #CamelCase
                    temp = []
                    for t in term:
                        temp += self.camel_case_split(t)
                    #stopwords
                    temp_2 = []
                    for t in temp:
                        if t not in english_stop_words:
                            temp_2.append(t)
                    #stemming
                    temp_3 = []
                    for t in temp_2:
                        temp_3.append(stem(t))
                    #stopwords
                    temp_4 = []
                    for t in temp_3:
                        if t not in english_stop_words:
                            temp_4.append(t)
                    #query generation
                    for term in temp_4:
                        query += "%s:%s " % (field, term)

        for field in [
                "typed_method_call", "methods", "used_classes",
                "class_instance_creation", "methods_called"
        ]:  # "extends", "annotations", "literals"
            for val in doc.getFields(field):
                if val.stringValue().strip():
                    term = QueryParser.escape(val.stringValue())
                    stoplist = ["java.lang.Object"]
                    if term not in stoplist:
                        query += "%s:%s " % (field, term)

        if len(doc.getFields("code_hints")) > 0:
            hints = [
                hint.stringValue() for hint in doc.getFields("code_hints")
            ]
            hints_str = " ".join(hints)
            for term in hints:
                if term:
                    term = QueryParser.escape(term)
                    if term not in english_stop_words:
                        # print "Including 'code_hints' from Doc_To_Query TERMs... //", term
                        query += "code_hints:%s " % term
        return query
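The method above relies on helpers (tokenize_string, camel_case_split, stem, english_stop_words) that are defined elsewhere in the project and not shown here. As a rough illustration only, the two tokenization helpers could look like this, assuming PyLucene's CharTermAttribute API; the regex-based CamelCase split is a guess, not the project's actual code:

    # assumed module-level imports: re, java.io.StringReader,
    # org.apache.lucene.analysis.tokenattributes.CharTermAttribute
    def tokenize_string(self, analyzer, data):
        """Run data through the given analyzer and return its tokens as a list of strings."""
        result = []
        stream = analyzer.tokenStream("contents", StringReader(data))
        term_attr = stream.addAttribute(CharTermAttribute.class_)
        stream.reset()
        while stream.incrementToken():
            result.append(term_attr.toString())
        stream.end()
        stream.close()
        return result

    def camel_case_split(self, token):
        """Split a CamelCase identifier, e.g. 'IndexSearcher' -> ['Index', 'Searcher']."""
        matches = re.finditer(
            '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', token)
        return [m.group(0) for m in matches]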
Example #3
class SearchIndex(object):

    def __init__(self):
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()

        indexDir = SimpleFSDirectory(File(app.config['INDEX_PATH']))
        self.searcher = IndexSearcher(DirectoryReader.open(indexDir))

        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

        self.parser = QueryParser(Version.LUCENE_CURRENT, "contents", self.analyzer)


    def search(self, q, page = 1, duplicates = False):
        query = self.parser.parse(q)

        if not duplicates:
            query = self.addDuplicatesQuery(query)
        
        perPage = 10
        start = (page - 1) * perPage

        results = TopScoreDocCollector.create(1000, True)
        self.searcher.search(query, results)

        highlighter = Highlighter(QueryScorer(query))
        highlighter.setTextFragmenter(SimpleFragmenter(40))

        docs = []
        for scoreDoc in results.topDocs(start, perPage).scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            tokenStream = self.analyzer.tokenStream("contents", StringReader(doc['contents']))
            highlight = highlighter.getBestFragments(tokenStream, doc['contents'], 3, "...")
            
            docs.append({
                'title': doc['title'],
                'url': doc['url'],
                'duplicate': doc['duplicate'],
                'highlight': highlight}
            )

        del self.searcher
        
        totalPages = int(math.ceil(results.getTotalHits()/float(perPage)))

        return totalPages, docs

    def addDuplicatesQuery(self, query):
        not_duplicate = TermQuery(Term('duplicate', 'false'))
        booleanQuery = BooleanQuery()
        booleanQuery.add(not_duplicate, BooleanClause.Occur.MUST)
        booleanQuery.add(query, BooleanClause.Occur.MUST)
        return booleanQuery
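Assuming lucene.initVM() has already been called and app.config['INDEX_PATH'] points at an existing index (both come from the surrounding application, not from this snippet), the class above could be exercised like this:

index = SearchIndex()
total_pages, docs = index.search("lucene highlighting", page=1)
for d in docs:
    print("%s -> %s" % (d['title'], d['url']))
    print(d['highlight'])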
Example #4
    def testOverrideBooleanQuery(self):
        class TestQueryParser(BooleanTestMixin, PythonQueryParser):
            def getFieldQuery_quoted(_self, field, queryText, quoted):
                return super(TestQueryParser,
                             _self).getFieldQuery_quoted_super(
                                 field, queryText, quoted)

        qp = TestQueryParser(Version.LUCENE_CURRENT, 'all',
                             StandardAnalyzer(Version.LUCENE_CURRENT))

        q = qp.parse("foo bar")
        self.assertEquals(str(q), "all:foo all:bar all:extra_clause")
Example #5
    def __enter__(self):
        """
        Used by "with" statement. Like an "open" / "init" method.
        """
        if lucene.getVMEnv() is None:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        index_path = Path(self.index_root_loc).joinpath('%s/' %
                                                        self.index_subdir_name)
        index_path.mkdir(parents=True, exist_ok=True)
        store = SimpleFSDirectory(Paths.get(str(index_path)))
        self.analyzer = StandardAnalyzer()
        config = IndexWriterConfig(self.analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        # IndexWriter
        self.writer = IndexWriter(store, config)
        # IndexReader
        self.reader = DirectoryReader.open(self.writer)
        # IndexSearcher
        self.searcher = IndexSearcher(self.reader)

        return self
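The __enter__ above is only half of the context-manager protocol; the matching __exit__ is not shown in this excerpt. A minimal sketch, assuming it only has to release the writer and reader created above:

    def __exit__(self, exc_type, exc_value, traceback):
        """
        Used by "with" statement. Commits the writer and closes the reader/writer.
        """
        self.reader.close()
        self.writer.commit()
        self.writer.close()
        return False  # never suppress exceptions raised inside the "with" block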
Example #6
    def __init__(self, indexDir):
        if not os.path.exists(indexDir):
            os.mkdir(indexDir)

        store = SimpleFSDirectory(File(indexDir))

        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)

        self.writer = IndexWriter(store, config)
Example #7
    def __init__(self):

        self.env = lucene.initVM(initialheap='6g', maxheap='6g', vmargs=['-Djava.awt.headless=true'])
        self.vocab = None

        BooleanQuery.setMaxClauseCount(2048)

        if not os.path.exists(prm.index_folder):
            print('Creating index at', prm.index_folder)
            if prm.docs_path == prm.docs_path_term:
                add_terms = True
            else:
                add_terms = False
            self.create_index(prm.index_folder, prm.docs_path, add_terms)

        if prm.local_index_folder:
            print('copying index from', prm.index_folder, 'to', prm.local_index_folder)
            if os.path.exists(prm.local_index_folder):
                print('Folder', prm.local_index_folder, 'already exists! Doing nothing.')
            else:
                shutil.copytree(prm.index_folder, prm.local_index_folder)
            self.index_folder = prm.local_index_folder
        else:
            self.index_folder = prm.index_folder

        fsDir = MMapDirectory(Paths.get(prm.index_folder))
        self.searcher = IndexSearcher(DirectoryReader.open(fsDir))

        self.searcher.setSimilarity(BM25Similarity())

        if prm.docs_path != prm.docs_path_term:
            if not os.path.exists(prm.index_folder_term):
                print('Creating index at', prm.index_folder_term)
                self.create_index(prm.index_folder_term, prm.docs_path_term, add_terms=True)

            if prm.local_index_folder_term:
                print('copying index from', prm.index_folder_term, 'to', prm.local_index_folder_term)
                if os.path.exists(prm.local_index_folder_term):
                    print('Folder', prm.local_index_folder_term, 'already exists! Doing nothing.')
                else:
                    shutil.copytree(prm.index_folder_term, prm.local_index_folder_term)
                self.index_folder_term = prm.local_index_folder_term
            else:
                self.index_folder_term = prm.index_folder_term
            fsDir_term = MMapDirectory(Paths.get(prm.index_folder_term))
            self.searcher_term = IndexSearcher(DirectoryReader.open(fsDir_term))

        self.analyzer = StandardAnalyzer()
        self.pool = ThreadPool(processes=prm.n_threads)
        self.cache = {}
        
        print('Loading Text-ID mapping...')
        self.text_id_map, self.id_text_map = self.get_text_id_map()
Example #8
    def __init__(self, path, topn=DEF_TOPN):

        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.path = path
        self.topn = topn
        self._analyzer = StandardAnalyzer()
        self._store = SimpleFSDirectory(Paths.get(os.path.abspath(self.path)))
        self._searcher = IndexSearcher(DirectoryReader.open(self._store))
        self.purpose_is_w2v = None
        self.purpose_is_not_w2v = None
        self.mechanics_is_w2v = None
        self.mechanics_is_not_w2v = None
Example #9
    def testThroughLayerException(self):
        class TestException(Exception):
            pass

        class TestQueryParser(PythonQueryParser):
            def getFieldQuery_quoted(_self, field, queryText, quoted):
                raise TestException("TestException")

        qp = TestQueryParser('all', StandardAnalyzer())

        with self.assertRaises(TestException):
            qp.parse("foo bar")
Example #10
 def init_index(self):
     """
         Initializes the Lucene index, creating a StandardAnalyzer and an IndexSearcher.
         Returns:
             vm: The initialized Java VM
             analyzer: StandardAnalyzer word analyzer.
             searcher: Searcher over the lucene index.
     """
     self.vm = lucene.initVM(vmargs=['-Djava.awt.headless=true'])
     ldir = FSDirectory.open(Paths.get(settings.LUCENE_INDEX))
     self.analyzer = StandardAnalyzer()
     self.searcher = IndexSearcher(DirectoryReader.open(ldir))
Example #11
def func_mid(command):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index_tb_new"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    query = QueryParser(Version.LUCENE_CURRENT, "mid", analyzer).parse(command)
    scoreDocs = searcher.search(query, 212).scoreDocs
    results = process(scoreDocs, searcher)
    return results
Example #12
    def __init__(self, docDir, indexDir, analyzer):
        #set index dir
        if not os.path.exists(indexDir):
            os.makedirs(indexDir)
        self.indexDir = SimpleFSDirectory(Paths.get(indexDir))
        self.docDir = docDir

        self.analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)
        writerConfig = IndexWriterConfig(self.analyzer)
        writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.writer = IndexWriter(self.indexDir, writerConfig)
        self.indexing()
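The constructor finishes by calling self.indexing(), which is not part of this excerpt. A rough sketch of what such a method might do, assuming plain-text files under self.docDir and assuming os plus Document, Field, StringField and TextField from org.apache.lucene.document are imported:

    def indexing(self):
        # walk the document directory and add one Lucene Document per file
        for root, _, files in os.walk(self.docDir):
            for name in files:
                path = os.path.join(root, name)
                with open(path, 'r', encoding='utf-8', errors='ignore') as f:
                    contents = f.read()
                doc = Document()
                doc.add(StringField("name", name, Field.Store.YES))
                doc.add(TextField("contents", contents, Field.Store.NO))
                self.writer.addDocument(doc)
        self.writer.commit()
        self.writer.close()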
Example #13
    def index(self):
        """
        This function is used to index the preprocessed data.
        The inverted index will be saved to ./index/ folder
        business_id, name, address, categories, review and tip data are indexed.
        """
        print "INDEXING..."
        lucene.initVM()
        indexDir = SimpleFSDirectory(File("index/"))
        writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1,
                                         StandardAnalyzer())
        writer = IndexWriter(indexDir, writerConfig)

        # each business indexed as a document
        for key, business in self.data.items():
            doc = Document()
            text = ""
            doc.add(
                Field("id", business["business_id"], Field.Store.YES,
                      Field.Index.ANALYZED))
            doc.add(
                Field("name", business["name"], Field.Store.YES,
                      Field.Index.ANALYZED))
            doc.add(
                Field("address", business["full_address"], Field.Store.YES,
                      Field.Index.ANALYZED))
            cat_text = "\n".join(business["categories"])
            doc.add(
                Field("category", cat_text, Field.Store.YES,
                      Field.Index.ANALYZED))

            # combine all reviews of a business together
            review_text = ""
            for review in business["review"]:
                review_text += review["text"]
            # combine all tip of a business together
            tip_text = ""
            for tip in business["tip"]:
                tip_text += tip["text"]

            # concatenate the data to be indexed and add it as one field
            text += business["name"]
            text += business["full_address"]
            text += cat_text
            text += review_text
            text += tip_text
            doc.add(Field("text", text, Field.Store.YES, Field.Index.ANALYZED))

            # add the business doc to writer
            writer.addDocument(doc)

        writer.close()
Example #14
def create_miniindex(docs):
    index_store = RAMDirectory()
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(index_store, config)

    for doc in docs:
        writer.addDocument(doc)

    writer.commit()
    writer.close()
    return index_store
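A short usage sketch for create_miniindex, assuming a running JVM and the usual imports (Document, Field, TextField from org.apache.lucene.document; IndexSearcher, DirectoryReader, QueryParser and StandardAnalyzer as in the other examples); the field name "contents" is arbitrary:

docs = []
for text in ["lucene is a search library", "pylucene wraps lucene for python"]:
    doc = Document()
    doc.add(Field("contents", text, TextField.TYPE_STORED))
    docs.append(doc)

index_store = create_miniindex(docs)

# query the freshly built in-memory index
searcher = IndexSearcher(DirectoryReader.open(index_store))
query = QueryParser("contents", StandardAnalyzer()).parse("lucene")
for score_doc in searcher.search(query, 10).scoreDocs:
    print(searcher.doc(score_doc.doc).get("contents"))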
Example #15
def delete(indexDir: str, id: str):
    index_dir = SimpleFSDirectory(Paths.get(indexDir))
    config = IndexWriterConfig(StandardAnalyzer())

    index_writer = IndexWriter(index_dir, config)

    delete_term_query = RegexpQuery(Term('id', id))
    delete_reg_query = RegexpQuery(Term('id', id + '\..*'))

    index_writer.deleteDocuments(delete_term_query)
    index_writer.deleteDocuments(delete_reg_query)
    index_writer.commit()
    index_writer.close()
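A hedged usage sketch for delete: assuming documents were indexed with an id field such as 'doc42' and derived entries as 'doc42.1', 'doc42.2', the call below removes the parent and all of its children in a single commit (the index path is made up for illustration):

import lucene
lucene.initVM(vmargs=['-Djava.awt.headless=true'])

delete('/tmp/example_index', 'doc42')  # also matches 'doc42.1', 'doc42.2', ...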
Example #16
def get_wiki_docids(data_file, wikipedia_index):
    from questions import get_input_data
    data = get_input_data(data_file)

    lucene.initVM()
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    reader = IndexReader.open(SimpleFSDirectory(File(wikipedia_index)))
    searcher = IndexSearcher(reader)

    generate_docids(data, data_file, analyzer, searcher)
Example #17
def createIndex():
    print(lucene.VERSION)
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])

    index = FSDirectory.open(Paths.get('index'))
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)

    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(index, config)

    openCSV('../output/outputSpark_full_index.csv', writer)
    writer.close()
Example #18
    def setUp(self):
        super(HighlighterTestCase, self).setUp()

        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

        writer = self.getWriter(analyzer=self.analyzer)
        for text in self.texts:
            self.addDoc(writer, text)

        writer.commit()
        writer.close()
        self.reader = self.getReader()
        self.numHighlights = 0;
Example #19
File: somali.py  Project: hzatarain/somali
def lucene_search(query, MAX, showHighlight):
    dir = os.getcwd()
    lucene.initVM()
    index_dir = SimpleFSDirectory(File(dir))
    index_reader = DirectoryReader.open(index_dir)
    lucene_searcher = IndexSearcher(index_reader)
    lucene_analyzer = StandardAnalyzer(Version.LUCENE_48)
    my_query = QueryParser(Version.LUCENE_48, "text",
                           lucene_analyzer).parse(query)
    #We can define the MAX number of results (default 10)
    total_hits = lucene_searcher.search(my_query, MAX)

    query_scorer = QueryScorer(my_query)
    formatter = SimpleHTMLFormatter()
    highlighter = Highlighter(formatter, query_scorer)
    # Set the fragment size. We break text into fragments of 50 characters
    fragmenter = SimpleSpanFragmenter(query_scorer, 50)
    highlighter.setTextFragmenter(fragmenter)

    print "Only shows at most %s documents" % MAX
    if showHighlight:
        print "<br>"

    for hit in total_hits.scoreDocs:

        doc = lucene_searcher.doc(hit.doc)
        text = doc.get("text")
        ts = lucene_analyzer.tokenStream("text", StringReader(text))
        
        if showHighlight:
            print "<p>"

        print doc.get("title")

        if showHighlight:
            print "<br>"
            print highlighter.getBestFragments(ts, text, 3, "...")
            print "</p>"
Example #20
def func_nr(command):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index_tb_new"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    query = QueryParser(Version.LUCENE_CURRENT, "name",
                        analyzer).parse(command)
    scoreDocs = searcher.search(
        query, 50, Sort([SortField("rate", SortField.Type.DOUBLE,
                                   True)])).scoreDocs
    results = process(scoreDocs, searcher)
    return results
Example #21
def search(music_tags, dir_path):
    lucene.initVM()

    query_str = "content:" + " ".join(music_tags)
    index_dir = SimpleFSDirectory(Paths.get(dir_path))
    lucene_analyzer = StandardAnalyzer()
    lucene_searcher = IndexSearcher(DirectoryReader.open(index_dir))

    my_query = QueryParser("content", lucene_analyzer).parse(query_str)
    total_hits = lucene_searcher.search(my_query, 50)

    for hit in total_hits.scoreDocs:
        doc = lucene_searcher.doc(hit.doc)
        print doc
Example #22
    def __init__(self, indexDir, root="testdocs"):
        # create and open an index writer
        indexDir = FSDirectory.open(Paths.get(indexDir))

        # TODO make appropriate analyzer add to config
        analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)
        config = IndexWriterConfig(analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        iw = IndexWriter(indexDir, config)
        self.authorcount = 0
        self.titlecount = 0
        self.errorcount = 0

        self.indexDocs(root, iw)
Example #23
    def similarityOfSynopsis(self):
        directory = SimpleFSDirectory(File(settings.SYNOPSIS_INDEX))
        ireader = DirectoryReader.open(directory)
        searcher = IndexSearcher(ireader)
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS,
                                  analyzer)
        for root, dirnames, filenames in os.walk(settings.SYNOPSIS):
            filenames = [int(item) for item in filenames]
            filenames.sort()
            filenames = [str(item) for item in filenames]
            for filename in filenames:
                path = os.path.join(root, filename)
                major_movie = models.Movie.objects.get(pk=filename)
                with open(path, 'r') as moviedoc:
                    content = moviedoc.read().replace('\n', ' ')
                    content = re.sub('[^A-Za-z0-9 ]+', '', content)
                    while True:
                        try:
                            query = queryParser.parse(
                                QueryParser.escape(content))
                        except Exception as e:
                            self.boolean_query.setMaxClauseCount(
                                self.boolean_query.maxClauseCount * 2)
                            print self.boolean_query.maxClauseCount
                            continue
                        break

                    topDocs = searcher.search(query, len(filenames))
                    scoreDocs = topDocs.scoreDocs
                    for scoreDoc in scoreDocs:
                        doc = searcher.doc(scoreDoc.doc)
                        movie_id = int(doc.get(FIELD_PATH))
                        if movie_id <= major_movie.id:
                            continue
                        minor_movie = models.Movie.objects.get(pk=movie_id)
                        try:
                            similarity = models.Similarities.objects.filter(
                                first_movie=major_movie,
                                second_movie=minor_movie).first()
                            if not similarity:
                                similarity = models.Similarities.objects.filter(
                                    first_movie=minor_movie,
                                    second_movie=major_movie).first()
                            similarity.synopsis = scoreDoc.score
                            similarity.save()
                        except Exception as e:
                            print major_movie.id, minor_movie.id
                            raise e
                print u"{0} completed.".format(major_movie.id)
Example #24
def indexing(directory):
    # lucene.initVM(classpath=lucene.CLASSPATH)
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    start = datetime.now()
    try:
        base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
        IndexFiles(directory, os.path.join(base_dir, INDEX_DIR),
                   StandardAnalyzer())
        end = datetime.now()
        print end - start
    except Exception, e:
        print "Failed: ", e
        raise e
Example #25
def populate_data(path, args):
    name = path.split('/')[-1]
    print(f"Processing {name}")
    all_senses = {}

    all_senses[args.lang1] = {}
    all_senses[args.lang2] = {}

    if args.pivot_lang is not None:
        all_senses[args.pivot_lang] = {}

    all_translation_mappings = []
    if args.pivot_lang is not None:
        all_translation_pivot1_mappings = []
        all_translation_pivot2_mappings = []

    store = SimpleFSDirectory(Paths.get(path))
    dr = DirectoryReader.open(store)
    searcher = IndexSearcher(dr)
    analyzer = StandardAnalyzer()
    query = QueryParser("title", analyzer).parse("*:*")
    topDocs = searcher.search(query, 1000000000)
    for scoreDoc in topDocs.scoreDocs:
        doc = scoreDoc.doc
        language_lemmas = searcher.doc(doc).getValues("LANGUAGE_LEMMA")
        sense_ids = searcher.doc(doc).getValues("ID_SENSE")
        for language_lemma, sense_id in zip(language_lemmas, sense_ids):
            lang = language_lemma[:2]
            lemma = language_lemma[3:]
            if language_lemma[:2] in LANGUAGES_OF_INTEREST:
                all_senses[lang] = {sense_id: lemma}
            if args.pivot_lang is not None and language_lemma[:2] == args.pivot_lang:
                all_senses[args.pivot_lang] = {sense_id: lemma}
        translation_mappings = searcher.doc(doc).getValues("TRANSLATION_MAPPING")
        create_translation_mapping(translation_mappings, all_senses, all_translation_mappings, LANGUAGES_OF_INTEREST)
        if args.pivot_lang is not None:
            create_translation_mapping(translation_mappings, all_senses, all_translation_pivot1_mappings, [args.lang1, args.pivot_lang])
            create_translation_mapping(translation_mappings, all_senses, all_translation_pivot2_mappings, [args.lang2, args.pivot_lang])

    output = open(f'{args.internal_data_path}/{name}.pkl', 'wb')
    pickle.dump(all_translation_mappings, output)
    output.close()
    if args.pivot_lang is not None:
        output = open(f'{args.internal_data_path}/{name}_{args.lang1}-{args.pivot_lang}.pkl', 'wb')
        pickle.dump(all_translation_pivot1_mappings, output)
        output.close()
        output = open(f'{args.internal_data_path}/{name}_{args.lang2}-{args.pivot_lang}.pkl', 'wb')
        pickle.dump(all_translation_pivot2_mappings, output)
        output.close()
Example #26
 def GET(self, name):
     STORE_DIR_GOOD = "index_good"
     STORE_DIR_BAD = "index_bad"
     vm_env.attachCurrentThread()
     directory_good = SimpleFSDirectory(File(STORE_DIR_GOOD))
     searcher_good = IndexSearcher(DirectoryReader.open(directory_good))
     directory_bad = SimpleFSDirectory(File(STORE_DIR_BAD))
     searcher_bad = IndexSearcher(DirectoryReader.open(directory_bad))
     analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
     user_data = web.input(name=None)
     command = yourInput(user_data.shop)
     res = Run_GoodRate(searcher_good, searcher_bad, analyzer, command,
                        user_data.brand)
     res.append(command)
     return render.SearchResult(res)
Example #27
def main():
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])

    queries = makeQueryList(args["queryFile"])
    print 'lucene', lucene.VERSION
    print "\n"

    directory = SimpleFSDirectory(Paths.get(os.getcwd(), INDEX_DIR))
    print directory.getDirectory()
    searcher = IndexSearcher(DirectoryReader.open(directory))
    searcher.setSimilarity(ClassicSimilarity())
    analyzer = StandardAnalyzer()

    run(searcher, analyzer, queries)
    del searcher
Example #28
    def doc_search(self, keywords):

        analyzer = StandardAnalyzer()
        parser = QueryParser('Title', analyzer)
        query = parser.parse(keywords)

        try:
            collector = TopScoreDocCollector.create(3000)
            self.lSearcher.search(query, collector)
            hits = collector.topDocs().scoreDocs

        except RuntimeError:
            print "Score docoment run fail"
        self.hits = hits
        return hits
Example #29
    def search(self, query):
        lucene.initVM()
        luceneDirectory = "/index/"

        path = str(os.path.abspath(os.getcwd()) + luceneDirectory)
        directory = FSDirectory.open(Paths.get(path))
        reader = DirectoryReader.open(directory)
        searcher = IndexSearcher(reader)
        analyzer = StandardAnalyzer()

        #args = len(sys.argv) - 1

        #if args < 1:
        #   print ("\n No query was submitted! \n")
        #else:
        #query_string = ""
        #position = 1
        #while(args >= position):
        #query_string = query_string + str(sys.argv[position]) + " "
        #position = position + 1

        print("Searching for '" + query + "'")

        fields_to_search = ["text", "page title", "date"]
        filter_date = 'date:"May 25"'

        filtered_query = filter_date + " AND " + query

        parser = MultiFieldQueryParser(fields_to_search, analyzer)
        updated_query = MultiFieldQueryParser.parse(parser, filtered_query)
        scored_documents = searcher.search(updated_query,
                                           10).scoreDocs  # array of docs

        print("Found " + str((len(scored_documents))) +
              " matches in the collection.")

        results = []
        for doc in scored_documents:
            scoredTweet = dict()
            scoredTweet['score'] = doc.score
            result = searcher.doc(doc.doc)
            scoredTweet['username'] = result.get("username")
            scoredTweet['tweet_body'] = result.get("text")
            scoredTweet['date'] = result.get("date")
            results.append(scoredTweet)
            print(scoredTweet)

        return results
Example #30
def index(infile, TitlemagNoneZijn, limit=10000):
    try:
        context = etree.iterparse(infile)
    except:
        print("cannot open file: {}".format(infile))
        return
    store = openStore()
    writer = None
    p = re.compile(r'<.*?>')
    try:
        analyzer = StandardAnalyzer()
        writer = getWriter(store, analyzer, True)
        counter = 0
        countwithtitle = 0
        for event, elem in context:
            if (counter > limit):
                break
            ## debugging info to see how fast indexing is progressing
            # print(counter)
            # print(countwithtitle)
            counter += 1
            doc = Document()
            hasTitle = False
            for key in elem.attrib:
                # keep only a handful of the attribute tags
                if key in [
                        "CreationDate", "Score", "Body", "CommentCount",
                        "LastActivityDate", "Id", "Tags", "Title",
                        "AnswerCount", "FavoriteCount"
                ]:
                    if key == "Title":
                        # print(elem.attrib[key])
                        countwithtitle += 1
                        hasTitle = True

                    doc.add(
                        Field(key, p.sub('', elem.attrib[key]),
                              TextField.TYPE_STORED))
            if (hasTitle or TitlemagNoneZijn):
                writer.addDocument(doc)
            elem.clear()
            for ancestor in elem.xpath('ancestor-or-self::*'):
                while ancestor.getprevious() is not None:
                    del ancestor.getparent()[0]

    finally:
        del context
        writer.close()
Example #31
def main():
    try:
        print "Indexing..."
        #########################################  PATH  ####################################
        indexDestination = File(
            "/Users/Falcon/Desktop/New_Indices/Stack_A_Indices")

        #writer = IndexWriter(SimpleFSDirectory(indexDestination), StandardAnalyzer(), True, IndexWriter.MaxFieldLength.UNLIMITED)
        analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
        a = {
            "typed_method_call": analyzer,
            "extends": analyzer,
            "used_classes": analyzer,
            "methods": analyzer,
            "class_instance_creation": analyzer,
            "methods_called": analyzer,
            "view_count": KeywordAnalyzer(),
            "code_hints": JavaCodeAnalyzer()
        }
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        writer = IndexWriter(SimpleFSDirectory(indexDestination), config)

        # analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
        # a = {"typed_method_call": KeywordAnalyzer(), "extends": KeywordAnalyzer(),
        # 	 "used_classes": KeywordAnalyzer(), "methods": KeywordAnalyzer(),
        # 	 "class_instance_creation": KeywordAnalyzer(), "methods_called": KeywordAnalyzer(),
        # 	 "view_count": KeywordAnalyzer(), "code_hints": JavaCodeAnalyzer()}
        # wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
        # config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        # writer = IndexWriter(SimpleFSDirectory(indexDestination), config)

        counter = Counter()
        index_code_snippet(writer, counter)
        writer.commit()
        writer.close()

        print "Done"
        print str(counter)

    except CorruptIndexException as e:  #when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  #when other writer is using the index
        e.printStackTrace()
    except IOException as e:  #when directory can't be read/written
        e.printStackTrace()
    except SQLException as e:  #when Database error occurs
        e.printStackTrace()
Example #32
    def setUp(self):
        super(BooleanOrTestCase, self).setUp()

        # add the doc to a ram index
        writer = self.getWriter(analyzer=StandardAnalyzer(Version.LUCENE_CURRENT))
        d = Document()
        d.add(Field(self.FIELD_T, "Optimize not deleting all files",
                    TextField.TYPE_STORED))
        d.add(Field(self.FIELD_C,
                    "Deleted When I run an optimize in our production environment.",
                    TextField.TYPE_STORED))

        writer.addDocument(d)
        writer.close()

        self.searcher = self.getSearcher()
Example #33
def rollback(collection_name):
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT

    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    #setting writer configurations
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    writer = IndexWriter(direc, config)

    writer.rollback()
    writer.close()
Example #34
def delete(primary_keys_map, collection_name, todelete, commit=False):
    INDEX_DIR_DEFAULT = "IndexFiles.index"
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT

    try:
        tofind_keyvalue_pairs = json.loads(todelete)
    except:
        return 100

    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    #setting writer configurations
    try:
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        writer = IndexWriter(direc, config)
        ireader = IndexReader.open(direc)
    except:
        return 105

    ### As of now, deletion of documents is supported only on indexed keys. ###
    tofind_primary_keyvalue_pairs = {}
    tofind_nonprimary_keyvalue_pairs = {}

    #separating out primary and non_primary keys
    for key in tofind_keyvalue_pairs.keys():
        if key in primary_keys_map:
            tofind_primary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]
        else:
            tofind_nonprimary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]

    #filtering documents according to primary keys
    query = BooleanQuery()
    for key in tofind_primary_keyvalue_pairs.keys():
        temp = QueryParser(Version.LUCENE_CURRENT, key,
                           analyzer).parse(tofind_primary_keyvalue_pairs[key])
        query.add(BooleanClause(temp, BooleanClause.Occur.MUST))

    a = writer.deleteDocuments(query)
    if commit == True:
        writer.commit()
    writer.close()
    return 000
Example #35

class PorterStemmerAnalyzer(PythonAnalyzer):
    def createComponents(self, fieldName, reader):
        source = StandardTokenizer(Version.LUCENE_CURRENT, reader)
        filter = StandardFilter(Version.LUCENE_CURRENT, source)
        filter = LowerCaseFilter(Version.LUCENE_CURRENT, filter)
        filter = PorterStemFilter(filter)
        filter = StopFilter(Version.LUCENE_CURRENT, filter,
                            StopAnalyzer.ENGLISH_STOP_WORDS_SET)
        return self.TokenStreamComponents(source, filter)



lucene.initVM(vmargs=['-Djava.awt.headless=true'])
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
input = 'this is a test string for Analyzer'
ts = analyzer.tokenStream("dummy", StringReader(input))

#matchVersion = Version.LUCENE_XY; ##Substitute desired Lucene version for XY

offsetAtt = ts.addAttribute(OffsetAttribute.class_)
termAtt = ts.addAttribute(CharTermAttribute.class_)
#posAtt = ts.addAttribute(PartOfSpeechAttribute.class_)

def testStandard():
    ts.reset(); ##Resets this stream to the beginning. (Required)
    while ts.incrementToken():
        #print ts.r
        #print ts.reflectAsString(True)
        print offsetAtt.startOffset()
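testStandard above prints only start offsets and never releases the stream. A variant sketch (the function name is made up for illustration) that also reads the token text via the CharTermAttribute and closes the stream once it is exhausted:

def testStandardWithTerms():
    # a fresh stream is created because a fully consumed TokenStream cannot be re-read
    stream = analyzer.tokenStream("dummy", StringReader(input))
    term = stream.addAttribute(CharTermAttribute.class_)
    offset = stream.addAttribute(OffsetAttribute.class_)
    stream.reset()
    while stream.incrementToken():
        print("%s [%d..%d]" % (term.toString(),
                               offset.startOffset(), offset.endOffset()))
    stream.end()    # end-of-stream bookkeeping
    stream.close()  # release the underlying reader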
Example #36
class HighlighterTestCase(PyLuceneTestCase):
    """
    Unit tests ported from Java Lucene.
    2004 by Yura Smolsky ;)
    """

    FIELD_NAME = "contents"
    texts = [ "A wicked problem is one for which each attempt to create a solution changes the understanding of the problem.  Wicked problems cannot be solved in a traditional linear fashion, because the problem definition evolves as new possible solutions are considered and/or implemented."
              "Wicked problems always occur in a social context -- the wickedness of the problem reflects the diversity among the stakeholders in the problem."
              "From http://cognexus.org/id42.htm"
              "Most projects in organizations -- and virtually all technology-related projects these days -- are about wicked problems.  Indeed, it is the social complexity of these problems, not their technical complexity, that overwhelms most current problem solving and project management approaches."
              "This text has a typo in referring to whicked problems" ];

    def __init__(self, *args):
        super(HighlighterTestCase, self).__init__(*args)

        self.parser = QueryParser(Version.LUCENE_CURRENT, self.FIELD_NAME,
                                  StandardAnalyzer(Version.LUCENE_CURRENT))

    def setUp(self):
        super(HighlighterTestCase, self).setUp()

        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

        writer = self.getWriter(analyzer=self.analyzer)
        for text in self.texts:
            self.addDoc(writer, text)

        writer.commit()
        writer.close()
        self.reader = self.getReader()
        self.numHighlights = 0;

    def testSimpleHighlighter(self):

        self.doSearching("Wicked")
        highlighter = Highlighter(QueryScorer(self.query))
        highlighter.setTextFragmenter(SimpleFragmenter(40))
        maxNumFragmentsRequired = 2

        for scoreDoc in self.scoreDocs:
            text = self.searcher.doc(scoreDoc.doc).get(self.FIELD_NAME)
            tokenStream = self.analyzer.tokenStream(self.FIELD_NAME,
                                                    StringReader(text))

            result = highlighter.getBestFragments(tokenStream, text,
                                                  maxNumFragmentsRequired,
                                                  "...")
            print "\t", result

        # Not sure we can assert anything here - just running to check we don't
        # throw any exceptions

    def testGetBestFragmentsSimpleQuery(self):

        self.doSearching("Wicked")
        self.doStandardHighlights()
        self.assert_(self.numHighlights == 3,
                     ("Failed to find correct number of highlights, %d found"
                      %(self.numHighlights)))
        
    def doSearching(self, queryString):

        self.searcher = self.getSearcher()
        self.query = self.parser.parse(queryString)
        # for any multi-term queries to work (prefix, wildcard, range,
        # fuzzy etc) you must use a rewritten query!
        self.query = self.query.rewrite(self.reader)

        print "Searching for:", self.query.toString(self.FIELD_NAME)
        self.scoreDocs = self.searcher.search(self.query, 100).scoreDocs
        self.numHighlights = 0

    def doStandardHighlights(self):
        
        formatter = TestFormatter(self)
        
        highlighter = Highlighter(formatter, QueryScorer(self.query))
        highlighter.setTextFragmenter(SimpleFragmenter(20))
        for scoreDoc in self.scoreDocs:
            text = self.searcher.doc(scoreDoc.doc).get(self.FIELD_NAME)
            maxNumFragmentsRequired = 2
            fragmentSeparator = "..."
            tokenStream = self.analyzer.tokenStream(self.FIELD_NAME,
                                                    StringReader(text))

            result = highlighter.getBestFragments(tokenStream,
                                                  text,
                                                  maxNumFragmentsRequired,
                                                  fragmentSeparator)
            print "\t", result
            
    def countHighlightTerm(self):

        self.numHighlights += 1 # update stats used in assertions
        
    def addDoc(self, writer, text):

        d = Document()
        f = Field(self.FIELD_NAME, text, TextField.TYPE_STORED)

        d.add(f)
        writer.addDocument(d)
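doStandardHighlights above constructs a TestFormatter(self) that is not included in this excerpt. In PyLucene such a formatter is typically a PythonFormatter subclass (from org.apache.pylucene.search.highlight) that counts highlighted terms; a minimal sketch along those lines:

class TestFormatter(PythonFormatter):
    """Wraps highlighted terms in <b> tags and bumps the test case's highlight counter."""

    def __init__(self, testCase):
        super(TestFormatter, self).__init__()
        self.testCase = testCase

    def highlightTerm(self, originalText, group):
        if group.getTotalScore() <= 0:
            return originalText
        self.testCase.countHighlightTerm()
        return "<b>" + originalText + "</b>"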