Example #1
class SearchIndex(object):
    def __init__(self):
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()

        indexDir = SimpleFSDirectory(File(app.config['INDEX_PATH']))
        self.searcher = IndexSearcher(DirectoryReader.open(indexDir))

        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

        self.parser = QueryParser(Version.LUCENE_CURRENT, "contents",
                                  self.analyzer)

    def search(self, q, page=1, duplicates=False):
        query = self.parser.parse(q)

        if not duplicates:
            query = self.addDuplicatesQuery(query)

        perPage = 10
        start = (page - 1) * perPage

        results = TopScoreDocCollector.create(1000, True)
        self.searcher.search(query, results)

        highlighter = Highlighter(QueryScorer(query))
        highlighter.setTextFragmenter(SimpleFragmenter(40))

        docs = []
        for scoreDoc in results.topDocs(start, perPage).scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            tokenStream = self.analyzer.tokenStream(
                "contents", StringReader(doc['contents']))
            highlight = highlighter.getBestFragments(tokenStream,
                                                     doc['contents'], 3, "...")

            docs.append({
                'title': doc['title'],
                'url': doc['url'],
                'duplicate': doc['duplicate'],
                'highlight': highlight
            })

        del self.searcher

        totalPages = int(math.ceil(results.getTotalHits() / float(perPage)))

        return totalPages, docs

    def addDuplicatesQuery(self, query):
        not_duplicate = TermQuery(Term('duplicate', 'false'))
        booleanQuery = BooleanQuery()
        booleanQuery.add(not_duplicate, BooleanClause.Occur.MUST)
        booleanQuery.add(query, BooleanClause.Occur.MUST)
        return booleanQuery
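
In later Lucene releases BooleanQuery is immutable and is assembled through BooleanQuery.Builder (Example #5 below uses the same builder API). A minimal sketch of the duplicate filter above rewritten against that builder; 'query' is assumed to be the parsed user query from search():

# Sketch: the "exclude duplicates" filter from addDuplicatesQuery(), expressed
# with the newer BooleanQuery.Builder API. 'query' is the parsed user query.
builder = BooleanQuery.Builder()
builder.add(TermQuery(Term('duplicate', 'false')), BooleanClause.Occur.MUST)
builder.add(query, BooleanClause.Occur.MUST)
filtered_query = builder.build()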
Example #2
class SearchIndex(object):

    def __init__(self):
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()

        indexDir = SimpleFSDirectory(File(app.config['INDEX_PATH']))
        self.searcher = IndexSearcher(DirectoryReader.open(indexDir))

        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

        self.parser = QueryParser(Version.LUCENE_CURRENT, "contents", self.analyzer)


    def search(self, q, page=1, duplicates=False):
        query = self.parser.parse(q)

        if not duplicates:
            query = self.addDuplicatesQuery(query)
        
        perPage = 10
        start = (page - 1) * perPage

        results = TopScoreDocCollector.create(1000, True)
        self.searcher.search(query, results)

        highlighter = Highlighter(QueryScorer(query))
        highlighter.setTextFragmenter(SimpleFragmenter(40))

        docs = []
        for scoreDoc in results.topDocs(start, perPage).scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            tokenStream = self.analyzer.tokenStream("contents", StringReader(doc['contents']))
            highlight = highlighter.getBestFragments(tokenStream, doc['contents'], 3, "...")
            
            docs.append({
                'title': doc['title'],
                'url': doc['url'],
                'duplicate': doc['duplicate'],
                'highlight': highlight
            })

        del self.searcher
        
        totalPages = int(math.ceil(results.getTotalHits()/float(perPage)))

        return totalPages, docs

    def addDuplicatesQuery(self, query):
        not_duplicate = TermQuery(Term('duplicate', 'false'))
        booleanQuery = BooleanQuery()
        booleanQuery.add(not_duplicate, BooleanClause.Occur.MUST)
        booleanQuery.add(query, BooleanClause.Occur.MUST)
        return booleanQuery
Example #3
File: somali.py  Project: hzatarain/somali
def lucene_search(query, MAX, showHighlight):
    dir = os.getcwd()
    lucene.initVM()
    index_dir = SimpleFSDirectory(File(dir))
    index_reader = DirectoryReader.open(index_dir)
    lucene_searcher = IndexSearcher(index_reader)
    lucene_analyzer = StandardAnalyzer(Version.LUCENE_48)
    my_query = QueryParser(Version.LUCENE_48, "text",
                           lucene_analyzer).parse(query)
    #We can define the MAX number of results (default 10)
    total_hits = lucene_searcher.search(my_query, MAX)

    query_scorer = QueryScorer(my_query)
    formatter = SimpleHTMLFormatter()
    highlighter = Highlighter(formatter, query_scorer)
    # Set the fragment size. We break the text into fragments of 50 characters.
    fragmenter = SimpleSpanFragmenter(query_scorer, 50)
    highlighter.setTextFragmenter(fragmenter)

    print "Only shows at most %s documents" % MAX
    if showHighlight:
        print "<br>"

    for hit in total_hits.scoreDocs:

        doc = lucene_searcher.doc(hit.doc)
        text = doc.get("text")
        ts = lucene_analyzer.tokenStream("text", StringReader(text))
        
        if showHighlight:
            print "<p>"

        print doc.get("title")

        if showHighlight:
            print "<br>"
            print highlighter.getBestFragments(ts, text, 3, "...")
            print "</p>"
Example #4
class HighlighterTestCase(PyLuceneTestCase):
    """
    Unit tests ported from Java Lucene.
    2004 by Yura Smolsky ;)
    """

    FIELD_NAME = "contents"
    texts = [
        "A wicked problem is one for which each attempt to create a solution changes the understanding of the problem.  Wicked problems cannot be solved in a traditional linear fashion, because the problem definition evolves as new possible solutions are considered and/or implemented."
        "Wicked problems always occur in a social context -- the wickedness of the problem reflects the diversity among the stakeholders in the problem."
        "From http://cognexus.org/id42.htm"
        "Most projects in organizations -- and virtually all technology-related projects these days -- are about wicked problems.  Indeed, it is the social complexity of these problems, not their technical complexity, that overwhelms most current problem solving and project management approaches."
        "This text has a typo in referring to whicked problems"
    ]

    def __init__(self, *args):
        super(HighlighterTestCase, self).__init__(*args)

        self.parser = QueryParser(self.FIELD_NAME, StandardAnalyzer())

    def setUp(self):
        super(HighlighterTestCase, self).setUp()

        self.analyzer = StandardAnalyzer()

        writer = self.getWriter(analyzer=self.analyzer)
        for text in self.texts:
            self.addDoc(writer, text)

        writer.commit()
        writer.close()
        self.reader = self.getReader()
        self.numHighlights = 0

    def testSimpleHighlighter(self):

        self.doSearching("Wicked")
        highlighter = Highlighter(QueryScorer(self.query))
        highlighter.setTextFragmenter(SimpleFragmenter(40))
        maxNumFragmentsRequired = 2

        for scoreDoc in self.scoreDocs:
            text = self.searcher.doc(scoreDoc.doc).get(self.FIELD_NAME)
            tokenStream = self.analyzer.tokenStream(self.FIELD_NAME,
                                                    StringReader(text))

            result = highlighter.getBestFragments(tokenStream, text,
                                                  maxNumFragmentsRequired,
                                                  "...")
            print "\t", result

        # Not sure we can assert anything here - just running to check we don't
        # throw any exceptions

    def testGetBestFragmentsSimpleQuery(self):

        self.doSearching("Wicked")
        self.doStandardHighlights()
        self.assert_(self.numHighlights == 3,
                     ("Failed to find correct number of highlights, %d found" %
                      (self.numHighlights)))

    def doSearching(self, queryString):

        self.searcher = self.getSearcher()
        self.query = self.parser.parse(queryString)
        # for any multi-term queries to work (prefix, wildcard, range,
        # fuzzy etc) you must use a rewritten query!
        self.query = self.query.rewrite(self.reader)

        print "Searching for:", self.query.toString(self.FIELD_NAME)
        self.scoreDocs = self.searcher.search(self.query, 100).scoreDocs
        self.numHighlights = 0

    def doStandardHighlights(self):

        formatter = TestFormatter(self)

        highlighter = Highlighter(formatter, QueryScorer(self.query))
        highlighter.setTextFragmenter(SimpleFragmenter(20))
        for scoreDoc in self.scoreDocs:
            text = self.searcher.doc(scoreDoc.doc).get(self.FIELD_NAME)
            maxNumFragmentsRequired = 2
            fragmentSeparator = "..."
            tokenStream = self.analyzer.tokenStream(self.FIELD_NAME,
                                                    StringReader(text))

            result = highlighter.getBestFragments(tokenStream, text,
                                                  maxNumFragmentsRequired,
                                                  fragmentSeparator)
            print "\t", result

    def countHighlightTerm(self):

        self.numHighlights += 1  # update stats used in assertions

    def addDoc(self, writer, text):

        d = Document()
        f = Field(self.FIELD_NAME, text, TextField.TYPE_STORED)

        d.add(f)
        writer.addDocument(d)
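
The comment in doSearching() about rewriting is what makes highlighting work for multi-term queries: a wildcard or prefix query only exposes concrete terms to QueryScorer after rewrite(). A standalone sketch of that step, assuming an open reader and a StandardAnalyzer set up as in the test case above:

# Sketch: rewrite a wildcard query before handing it to the highlighter.
# 'analyzer' and 'reader' are assumed to be set up as in setUp() above.
parser = QueryParser("contents", analyzer)
query = parser.parse("wick*")
query = query.rewrite(reader)  # expands the wildcard into primitive term queries
highlighter = Highlighter(QueryScorer(query))
highlighter.setTextFragmenter(SimpleFragmenter(40))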
Example #5
class LuceneManager(object):
    def __init__(self, index_root_loc, index_subdir_name='.siftindex/index'):
        self.index_root_loc = index_root_loc
        self.index_subdir_name = index_subdir_name

    def __enter__(self):
        """
        Used by "with" statement. Like an "open" / "init" method.
        """
        if lucene.getVMEnv() is None:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        index_path = Path(self.index_root_loc).joinpath('%s/' %
                                                        self.index_subdir_name)
        index_path.mkdir(parents=True, exist_ok=True)
        store = SimpleFSDirectory(Paths.get(str(index_path)))
        self.analyzer = StandardAnalyzer()
        config = IndexWriterConfig(self.analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        # IndexWriter
        self.writer = IndexWriter(store, config)
        # IndexReader
        self.reader = DirectoryReader.open(self.writer)
        # IndexSearcher
        self.searcher = IndexSearcher(self.reader)

        return self

    def insert(self, document):
        self.writer.addDocument(document)
        return document['key']

    def delete(self, key):
        self.writer.deleteDocuments(Term('key', key))
        return key

    def delete_all(self):
        self.writer.deleteAll()

    def num_docs(self):
        return self.reader.numDocs()

    def update(self, key, document):
        # atomic delete and add
        self.writer.updateDocument(Term('key', key), document)
        return key

    def exists(self, key):
        boolean_query = BooleanQuery.Builder()
        boolean_query.add(TermQuery(Term('key', key)),
                          BooleanClause.Occur.MUST)
        results = self.searcher.search(boolean_query.build(), 1)
        return results.totalHits > 0

    def commit(self):
        self.writer.commit()
        # make IndexReader reflect index updates
        # TODO: try IndexReader.isCurrent()
        new_reader = DirectoryReader.openIfChanged(self.reader)
        if new_reader is not None:
            self.reader.close()  # note: not thread safe, may need to revisit
            self.reader = new_reader
            self.searcher = IndexSearcher(self.reader)

    def _process_search_result(self, result, highlighter=None):
        docid = result.doc  # this is not a stable identifier
        # obtain document through an IndexReader
        doc = self.searcher.doc(docid)
        # doc.getFields() -> field.name(), field.stringValue()
        # use highlighter to extract relevant part of body
        highlighted_text = ''
        if highlighter:
            contents = doc['body']
            token_stream = self.analyzer.tokenStream('body', contents)
            n_fragments = 3
            fragment_separator = '...'
            highlighted_text = highlighter.getBestFragments(
                token_stream, contents, n_fragments, fragment_separator)
        return {
            'fullpath': doc['fullpath'],
            'last_modified_time': doc['last_modified_time'],
            'score': result.score,
            'excerpt': highlighted_text
        }

    def search(self, terms, n_hits=5):
        """
        Run search query.
        """
        # TODO: support date range queries

        # build query
        parser = MultiFieldQueryParser(['fullpath', 'body'], self.analyzer)
        #parser.setDefaultOperator(QueryParser.Operator.AND) # defaults to OR unless terms have modifier
        query = MultiFieldQueryParser.parse(
            parser, terms)  # https://stackoverflow.com/a/26853987/130164
        # create a highlighter
        highlighter = Highlighter(SimpleHTMLFormatter('*', '*'),
                                  QueryScorer(query))
        # execute search for top N hits
        return [
            self._process_search_result(result, highlighter)
            for result in self.searcher.search(query, n_hits).scoreDocs
        ]

    def get_all_docs(self, n_hits=1000):
        # debug method
        return [
            self._process_search_result(result) for result in
            self.searcher.search(MatchAllDocsQuery(), n_hits).scoreDocs
        ]

    def __exit__(self, type, value, traceback):
        """
        Used by the "with" statement. Handles close.
        TODO: error handling
        """
        self.writer.close()
        self.reader.close()

    def debug_analyzer(self, text):
        """
        Debug what StandardAnalyzer will give on this text.
        Ref: https://lucene.apache.org/core/7_6_0/core/org/apache/lucene/analysis/package-summary.html
        Ref: pylucene tests --> test_Analyzers.py, BaseTokenStreamTestCase.py
        """
        token_stream = self.analyzer.tokenStream('field', text)
        termAtt = token_stream.getAttribute(CharTermAttribute.class_)
        token_stream.reset()
        tokens = []
        while token_stream.incrementToken():
            #tokens.append(token_stream.reflectAsString(True))
            tokens.append(termAtt.toString())
        token_stream.end()
        token_stream.close()
        return tokens
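
A minimal usage sketch of the context-manager pattern above. The field names ('key', 'fullpath', 'body', 'last_modified_time') follow what insert() and _process_search_result() expect; the Document/StringField/TextField/Field imports (from org.apache.lucene.document) and the index location are assumptions, not part of the original snippet.

# Sketch: index one document and query it back through LuceneManager.
doc = Document()
doc.add(StringField('key', '/tmp/notes/a.txt', Field.Store.YES))
doc.add(StringField('fullpath', '/tmp/notes/a.txt', Field.Store.YES))
doc.add(TextField('body', 'a wicked indexing problem', Field.Store.YES))
doc.add(StringField('last_modified_time', '2020-01-01T00:00:00Z', Field.Store.YES))

with LuceneManager('/tmp/siftindex-demo') as manager:
    manager.insert(doc)
    manager.commit()                 # reopens the reader so the searcher sees the new doc
    print(manager.search('wicked'))  # list of dicts with fullpath/score/excerpt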
Example #6
class HighlighterTestCase(PyLuceneTestCase):
    """
    Unit tests ported from Java Lucene.
    2004 by Yura Smolsky ;)
    """

    FIELD_NAME = "contents"
    texts = [ "A wicked problem is one for which each attempt to create a solution changes the understanding of the problem.  Wicked problems cannot be solved in a traditional linear fashion, because the problem definition evolves as new possible solutions are considered and/or implemented."
              "Wicked problems always occur in a social context -- the wickedness of the problem reflects the diversity among the stakeholders in the problem."
              "From http://cognexus.org/id42.htm"
              "Most projects in organizations -- and virtually all technology-related projects these days -- are about wicked problems.  Indeed, it is the social complexity of these problems, not their technical complexity, that overwhelms most current problem solving and project management approaches."
              "This text has a typo in referring to whicked problems" ];

    def __init__(self, *args):
        super(HighlighterTestCase, self).__init__(*args)

        self.parser = QueryParser(Version.LUCENE_CURRENT, self.FIELD_NAME,
                                  StandardAnalyzer(Version.LUCENE_CURRENT))

    def setUp(self):
        super(HighlighterTestCase, self).setUp()

        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

        writer = self.getWriter(analyzer=self.analyzer)
        for text in self.texts:
            self.addDoc(writer, text)

        writer.commit()
        writer.close()
        self.reader = self.getReader()
        self.numHighlights = 0

    def testSimpleHighlighter(self):

        self.doSearching("Wicked")
        highlighter = Highlighter(QueryScorer(self.query))
        highlighter.setTextFragmenter(SimpleFragmenter(40))
        maxNumFragmentsRequired = 2

        for scoreDoc in self.scoreDocs:
            text = self.searcher.doc(scoreDoc.doc).get(self.FIELD_NAME)
            tokenStream = self.analyzer.tokenStream(self.FIELD_NAME,
                                                    StringReader(text))

            result = highlighter.getBestFragments(tokenStream, text,
                                                  maxNumFragmentsRequired,
                                                  "...")
            print "\t", result

        # Not sure we can assert anything here - just running to check we don't
        # throw any exceptions

    def testGetBestFragmentsSimpleQuery(self):

        self.doSearching("Wicked")
        self.doStandardHighlights()
        self.assert_(self.numHighlights == 3,
                     ("Failed to find correct number of highlights, %d found"
                      %(self.numHighlights)))
        
    def doSearching(self, queryString):

        self.searcher = self.getSearcher()
        self.query = self.parser.parse(queryString)
        # for any multi-term queries to work (prefix, wildcard, range,
        # fuzzy etc) you must use a rewritten query!
        self.query = self.query.rewrite(self.reader)

        print "Searching for:", self.query.toString(self.FIELD_NAME)
        self.scoreDocs = self.searcher.search(self.query, 100).scoreDocs
        self.numHighlights = 0

    def doStandardHighlights(self):
        
        formatter = TestFormatter(self)
        
        highlighter = Highlighter(formatter, QueryScorer(self.query))
        highlighter.setTextFragmenter(SimpleFragmenter(20))
        for scoreDoc in self.scoreDocs:
            text = self.searcher.doc(scoreDoc.doc).get(self.FIELD_NAME)
            maxNumFragmentsRequired = 2
            fragmentSeparator = "..."
            tokenStream = self.analyzer.tokenStream(self.FIELD_NAME,
                                                    StringReader(text))

            result = highlighter.getBestFragments(tokenStream,
                                                  text,
                                                  maxNumFragmentsRequired,
                                                  fragmentSeparator)
            print "\t", result
            
    def countHighlightTerm(self):

        self.numHighlights += 1 # update stats used in assertions
        
    def addDoc(self, writer, text):

        d = Document()
        f = Field(self.FIELD_NAME, text, TextField.TYPE_STORED)

        d.add(f)
        writer.addDocument(d)
Example #7
# Basic tokenizer example.

test = "This is how we do it."
tokenizer = StandardTokenizer()
tokenizer.setReader(StringReader(test))
charTermAttrib = tokenizer.getAttribute(CharTermAttribute.class_)
tokenizer.reset()
tokens = []

while tokenizer.incrementToken():
    tokens.append(charTermAttrib.toString())
    print(tokens)

# StandardAnalyzer example.

analyzer = StandardAnalyzer()
stream = analyzer.tokenStream("", StringReader(test))
stream.reset()
tokens = []

while stream.incrementToken():
    tokens.append(stream.getAttribute(CharTermAttribute.class_).toString())
    print(tokens)

# JapaneseAnalyzer example.
analyzer = JapaneseAnalyzer()
test = "寿司が食べたい。"
stream = analyzer.tokenStream("", StringReader(test))
stream.reset()
tokens = []

while stream.incrementToken():
    tokens.append(stream.getAttribute(CharTermAttribute.class_).toString())
    print(tokens)
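
The loops above never call end() or close() on their streams. A minimal sketch of the full token-stream lifecycle (reset, incrementToken, end, close), following the same pattern as debug_analyzer in Example #5:

# Sketch: complete TokenStream lifecycle with explicit end()/close().
analyzer = StandardAnalyzer()
stream = analyzer.tokenStream("", StringReader("This is how we do it."))
term_att = stream.getAttribute(CharTermAttribute.class_)
stream.reset()
tokens = []
while stream.incrementToken():
    tokens.append(term_att.toString())
stream.end()
stream.close()
print(tokens)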
Example #8
def rocchio(ireader,
            query,
            relevantIDs,
            nonrelevantIDs,
            maxaddedterms=5,
            tfidfthresh=0.0,
            alpha=1.0,
            beta=0.75,
            gamma=0.15):
    """
    Performs the Rocchio Query Expansion algorithm to provide additional relevant query terms.

    :param ireader: IndexReader object (you'll probably want to use the one available during search).
    :param query: Query string, the raw string from the user.
    :param relevantIDs: Document IDs of all relevant documents.
    :param nonrelevantIDs: Document IDs of all non-relevant documents.
    :param maxaddedterms: The maximum amount of terms appended to the expanded query (can be combined with tfidfthresh). Default: 5
    :param tfidfthresh: TF-IDF threshold for all query terms (can be combined with maxaddedterms). Default: 0
    :param alpha: Rocchio alpha weight (original query vector). Default: 1.0
    :param beta: Rocchio beta weight (relevant docs query vector). Default: 0.75
    :param gamma: Rocchio gamma weight (non-relevant docs query vector). Default: 0.15
    :return: Query string.

    Weight defaults were sourced from: https://nlp.stanford.edu/IR-book/html/htmledition/the-rocchio71-algorithm-1.html
    """
    # All score vectors we'll be using
    q0 = {}
    drv = {}
    dnrv = {}

    # Process query into tokens
    analyzer = StandardAnalyzer()
    stream = analyzer.tokenStream("", StringReader(query))
    stream.reset()
    tokens = []
    while stream.incrementToken():
        tokens.append(stream.getAttribute(CharTermAttribute.class_).toString())

    # (Re)generate score vector for current query
    q0 = get_score_vector(ireader, alpha, terms=tokens)

    # Generate score vector for relevant documents
    if relevantIDs:
        drv = get_score_vector(ireader, beta, docs=relevantIDs)

    # Generate score vector for nonrelevant documents
    if nonrelevantIDs:
        dnrv = get_score_vector(ireader, gamma, docs=nonrelevantIDs)

    # Merge score vectors following Rocchio formula. Weights have already been applied
    q1 = q0
    for key, value in drv.items():
        if key in q1:
            q1[key] += value
        else:
            q1[key] = value
    for key, value in dnrv.items():
        if key in q1:
            q1[key] -= value
        else:
            q1[key] = value

    # Return all the best terms
    # Terms are narrowed down using both a TF-IDF threshold and a maximum amount of terms
    # The TF-IDF threshold is 0 (i.e. ignored) by default
    bestterms = sorted([t for t in q1.keys() if q1[t] > tfidfthresh],
                       key=lambda x: q1[x],
                       reverse=True)
    best = bestterms[:len(tokens) + maxaddedterms]
    # print([(t, q1[t]) for t in best])
    return " ".join(best)
Example #9
class Indexer:
    """
    Indexer Class
    """
    (NAME, CONTENT, DATE, URL, TAGS, TIMESTAMP) = ("name", "content", "date",
                                                   "url", "tags", "timestamp")

    def __init__(self, indexDir="", debug=False, verbose=False):
        """
        :Parameters:
        - `indexDir`: Path where the Index will be saved. (Str)
        - `debug`: Create the Index in RAM Memory (indexDir will be ignored). (Boolean)
        - `verbose`: Provide additional information about the initialization process. (Boolean)
        """
        self.__verbose = verbose
        if indexDir != "":
            INDEX_DIR = indexDir
        else:
            INDEX_DIR = os.path.dirname(
                os.path.realpath(__file__)) + "/luceneIndex"

        if not os.path.exists(INDEX_DIR):
            os.makedirs(INDEX_DIR)
            self.__boAppend = False
        else:
            self.__boAppend = True
        # Initialize lucene and JVM
        lucene.initVM()
        # Get index storage
        if debug:
            # Store the index in memory
            self.__indexDir = RAMDirectory()
            self.__boAppend = False
            INDEX_DIR = "RAM Memory"
        else:
            # Store an index on disk
            self.__indexDir = SimpleFSDirectory(Paths.get(INDEX_DIR))

        # Create Content FieldType
        self.__contentType = FieldType()
        self.__contentType.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
        self.__contentType.setTokenized(True)
        self.__contentType.setStored(True)
        self.__contentType.setStoreTermVectors(True)
        self.__contentType.setStoreTermVectorPositions(True)
        self.__contentType.freeze()

        # Get the Analyzer
        self.__analyzer = StandardAnalyzer(
            StandardAnalyzer.ENGLISH_STOP_WORDS_SET)

        # Print Indexer Information
        print("Lucene version is: ", lucene.VERSION)
        print("Index Directory: ", INDEX_DIR)

    def __del__(self):
        self.__indexDir.close()

    ##################################################
    #Private Methods
    ##################################################
    @staticmethod
    def __getTimestamp(dateTime):
        """
        Converts the document's date to an integer timestamp

        :Parameters:
        - `dateTime`: Document's date  (Str)

        :Returns:
        - Date timestamp (Int)
        """
        tm = time.strptime(dateTime, '%Y-%m-%dT%H:%M:%SZ')
        sTime = "{0:0>4}{1:0>2}{2:0>2}{3:0>2}{4:0>2}{5:0>2}".format(
            tm.tm_year, tm.tm_mon, tm.tm_mday, tm.tm_hour, tm.tm_min,
            tm.tm_sec)
        return int(sTime)

    @staticmethod
    def __getDateTime(timeStamp):
        """
        Converts the document's timestamp to date

        :Parameters:
        - `timeStamp`: Document's timestamp

        :Returns:
        - Date (Str)
        """
        date = datetime.datetime(year=int(timeStamp[0:4]),
                                 month=int(timeStamp[4:6]),
                                 day=int(timeStamp[6:8]),
                                 hour=int(timeStamp[8:10]),
                                 minute=int(timeStamp[10:12]),
                                 second=int(timeStamp[12:14]))
        return date.strftime('%Y-%m-%d %H:%M:%S')

    @staticmethod
    def __qualifyTags(tags):
        """
        Creates the pipe-delimited qualifier string for tags

        :Parameters:
        - `tags`: List of document's tags

        :Return:
        - Qualify Tags (Str)
        """
        sTags = ""
        for tag in tags:
            sTags += tag + '|'
        return sTags[:-1]

    @staticmethod
    def __scatterMatrix(numDocs, freqMtx):
        print("Scattering Frequency Matrix...")
        pB = ProgressBar(len(freqMtx), prefix='Progress:')
        matrix = []
        innerMatrix = ['Term']

        #Generate Document Columns
        for docIdx in range(numDocs):
            innerMatrix.append("D{0:0>4}".format(docIdx))
        matrix.append(innerMatrix)

        #Generate Word Rows and Columns
        for word in sorted(freqMtx):
            innerMatrix = []
            innerMatrix.append(word)
            for docIdx in range(numDocs):
                try:
                    termCount = round(freqMtx[word][str(docIdx)], 3)
                    innerMatrix.append(termCount)
                except KeyError:
                    innerMatrix.append(0)
            matrix.append(innerMatrix)
            pB.updateProgress()
        return matrix

    @staticmethod
    def __saveMatrix(numDocs, freqMtx):
        pathMatrix = os.path.dirname(
            os.path.realpath(__file__)) + "/freqMtx.txt"
        fMatrix = open(pathMatrix, 'w')

        print("Saving Frequency Matrix File: ", pathMatrix)
        pB = ProgressBar(len(freqMtx), prefix='Progress:')
        # File Generation Start
        print("+========= Frequency Matrix =========+", file=fMatrix)
        print("%20s" % (' '), end=' ', file=fMatrix)
        for docIdx in range(numDocs):
            print("D{0:0>4}".format(docIdx), end=' ', file=fMatrix)
        print(file=fMatrix)
        for word in sorted(freqMtx):
            print("%20s" % (word), end=' ', file=fMatrix)
            for docIdx in range(numDocs):
                try:
                    termCount = freqMtx[word][str(docIdx)]
                    print("%02.03f" % (termCount), end=' ', file=fMatrix)
                except KeyError:
                    print("  0  ", end=' ', file=fMatrix)
            print(file=fMatrix)
            pB.updateProgress()
        # Close File
        fMatrix.close()

    def __stemString(self, stringToStem):
        stemmedTerms = []
        tknStream = self.__analyzer.tokenStream('STEM', stringToStem)
        stemmed = SnowballFilter(tknStream, "English")
        stemmed.reset()
        while stemmed.incrementToken():
            stemmedTerms.append(
                stemmed.getAttribute(CharTermAttribute.class_).toString())

        tknStream.close()
        return stemmedTerms

    @staticmethod
    def __normalize(qVector, freqMtx):
        for term in qVector:
            for docId in freqMtx:
                if (term in freqMtx[docId]) and (freqMtx[docId][term] >
                                                 qVector[term]):
                    qVector[term] = freqMtx[docId][term]

    @staticmethod
    def __dotProduct(aVector, bVector):
        """
        Calculate Dot Product

        :Parameters:
        - `aVector`: A Vector. (Dict)
        - `bVector`: B Vector. (Dict)

        :Returns:
        - Dot Product. (Int)
        """
        dotProduct = 0
        for term in aVector:
            if term in bVector:
                product = aVector[term] * bVector[term]
                dotProduct += product

        return dotProduct

    @staticmethod
    def __magnitude(vector):
        """
        Calculate Vector Magnitude

        :Parameters:
        - `vector`: Query Vector. (Dict)

        :Returns:
        - Vector Magnitude. (Int)
        """
        # Magnitude of the vector is the square root of the dot product of the vector with itself.
        vectorMagnitude = Indexer.__dotProduct(vector, vector)
        vectorMagnitude = math.sqrt(vectorMagnitude)

        return vectorMagnitude

    ##################################################
    #Public Methods
    ##################################################
    def IndexDocs(self, documents):
        """
        Index documents under the directory

        :Parameters:
        - `documents`: Documents to be indexed (List)
        """
        # Get the Writer Configuration
        writerConfig = IndexWriterConfig(self.__analyzer)
        # Get index writer
        writer = IndexWriter(self.__indexDir, writerConfig)

        for document in documents:
            # Create a document that will be added to the index
            doc = Document()
            # Add a field to this document
            doc.add(TextField(Indexer.NAME, document['name'], Field.Store.YES))
            doc.add(
                Field(Indexer.CONTENT, document['content'],
                      self.__contentType))
            doc.add(
                StringField(Indexer.DATE, document['date'], Field.Store.YES))
            doc.add(StringField(Indexer.URL, document['url'], Field.Store.YES))
            doc.add(
                TextField(Indexer.TAGS, self.__qualifyTags(document['tags']),
                          Field.Store.YES))
            doc.add(
                LongPoint(Indexer.TIMESTAMP,
                          self.__getTimestamp(document['date'])))
            # Add or update the document to the index
            if not self.__boAppend:
                # New index, so we just add the document (no old document can be there):
                if self.__verbose:
                    print("Adding " + document['name'])
                writer.addDocument(doc)
            else:
                # Existing index (an old copy of this document may have been indexed) so
                # we use updateDocument instead to replace the old one matching the exact
                # path, if present:
                if self.__verbose:
                    print("Updating " + document['name'])
                writer.updateDocument(Term(Indexer.NAME, document['name']),
                                      doc)

        # Print index information and close writer
        print("Indexed %d documents (%d docs in index)" %
              (len(documents), writer.numDocs()))
        writer.close()

    def Search(self, query, field=NAME, maxResult=1000):
        """
        Search for documents in the Lucene index

        :Parameters:
        - `query`: Request to be made to the Index (Str).
        - `field`: Field to be consulted by the query (NAME, CONTENT, DATE, URL, TAGS).
        - `maxResult`: Maximum number of results.
        """
        # Get the Index Directory
        reader = DirectoryReader.open(self.__indexDir)
        searcher = IndexSearcher(reader)
        # Create a query
        queryParser = QueryParser(field, self.__analyzer).parse(query)
        # Do a search
        hits = searcher.search(queryParser, maxResult)
        print("Found %d document(s) that matched query '%s':" %
              (hits.totalHits, queryParser))
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            print("Document Nº: %d - Score: %.5f" % (hit.doc, hit.score))
            print("Name: " + doc.get('name'))
            print("Tags: " + doc.get('tags') + "\n")
        reader.close()

    def StemDocument(self, docIdx):
        """
        Return an array of the document's stemmed terms

        :Parameters:
        - `docIdx`: Document's index ID (Int).
        """
        reader = DirectoryReader.open(self.__indexDir)
        doc = reader.document(docIdx).get(Indexer.CONTENT)
        reader.close()

        return self.__stemString(doc)

    def FreqMatrix(self, scattered=False, byTerms=True, saveMtx=False):
        """
        Generates a Frequency Matrix of the current Index

        :Parameters:
        - `saveMtx`: Save the Frequency Matrix to a .txt file. (Boolean)
        """
        freqMtx = {}  # Terms - DocumentID Matrix
        reader = DirectoryReader.open(self.__indexDir)
        numDocs = reader.numDocs()
        print("Generating Frequency Matrix...")
        pB = ProgressBar(numDocs - 1, prefix='Progress:')
        for docIdx in range(numDocs):
            termItr = self.StemDocument(docIdx)
            termSize = len(termItr)
            docStr = '{0}'.format(docIdx)
            termDict = {}
            for termText in termItr:
                if byTerms:
                    # Check if the term exists
                    if termText in freqMtx:
                        # Check if the document exists
                        if docStr in freqMtx[termText]:
                            termCount = int(
                                math.ceil(
                                    ((freqMtx[termText][docStr] * termSize) /
                                     100)))
                            freqMtx[termText].update(
                                {docStr: ((termCount + 1) / termSize) * 100})
                        else:
                            freqMtx[termText].update(
                                {docStr: (1 / termSize) * 100})
                    else:
                        termIdx = {termText: {docStr: (1 / termSize) * 100}}
                        freqMtx.update(termIdx)
                else:
                    # Check if the term exists
                    termText = termText.replace('.', '_')
                    if termText in termDict:
                        termCount = int(
                            math.ceil((termDict[termText] * termSize) / 100))
                        termDict[termText] = ((termCount + 1) / termSize) * 100
                    else:
                        termIdx = {termText: (1 / termSize) * 100}
                        termDict.update(termIdx)
            if not byTerms:
                freqMtx.update({docStr: termDict})
            pB.updateProgress()

        if saveMtx and byTerms:
            self.__saveMatrix(numDocs, freqMtx)

        if scattered and byTerms:
            freqMtx = self.__scatterMatrix(numDocs, freqMtx)

        # Close IndexReader
        reader.close()

        return freqMtx

    def GetSimilarity(self, query, freqMtx):
        """
        Cosine Similarity
        """
        qVector = {}
        qList = self.__stemString(query)
        for stem in qList:
            qVector.update({stem: 0})
        self.__normalize(qVector, freqMtx)

        qList = []
        #Get similarity between query and doc[n]
        for docIdx, dVector in freqMtx.items():
            dP = self.__dotProduct(qVector, dVector)
            qM = self.__magnitude(qVector)
            dM = self.__magnitude(dVector)
            cosSimilarity = dP / (qM * dM)
            qList.append((docIdx, cosSimilarity))

        return sorted(qList,
                      key=lambda similarity: similarity[1],
                      reverse=True)

    def AnalyzeDocument(self, docIdx):
        """
        Runs named-entity recognition over the document and returns a feature
        collection of the Geo-Political Entities (GPE) found.

        :Parameters:
        - `docIdx`: Document's index ID (Int).
        """
        gpeList = {}
        geolocator = Geocode()
        reader = DirectoryReader.open(self.__indexDir)
        doc = reader.document(docIdx)
        # Load NLTK Data
        nltkPath = os.path.dirname(
            os.path.realpath(__file__)) + '/../tools/nltk_data'
        nltk.data.path.append(nltkPath)

        # Named Entity Recognition
        content = doc.get(Indexer.CONTENT)
        sentences = nltk.sent_tokenize(content)

        #ProgressBar
        print("Analazing Document {0}".format(docIdx))

        pB = ProgressBar(len(sentences), prefix='Progress:')
        # Loop over each sentence and tokenize it separately
        for sentence in sentences:
            ner = nltk.word_tokenize(sentence)
            ner = nltk.pos_tag(ner)
            ner = nltk.ne_chunk(ner)
            # Get all the Geo-Political Entities
            for subtrees in list(
                    ner.subtrees(
                        filter=lambda subtree: subtree.label() == 'GPE')):
                entityName = ' '.join([child[0] for child in subtrees])
                if entityName not in gpeList:
                    location = geolocator.GetGPE(entityName)
                    if location:
                        gpeList.update(location)
            pB.updateProgress()
        gpeList = geolocator.GetFeatureCollection(gpeList)

        return gpeList

    def GetDocField(self, docIdx, field=CONTENT):
        """
        Get the document's field

        :Parameters:
        - `docIdx`: Document's index ID (Int).
        - `field`: Field to retrieve (Str).

        :Returns:
        - Document's field. (Str)
        """
        reader = DirectoryReader.open(self.__indexDir)
        doc = reader.document(docIdx)
        content = doc.get(field)
        reader.close()

        return content
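
GetSimilarity combines the private __dotProduct and __magnitude helpers into a standard cosine similarity. A small worked example of that arithmetic on two hand-made term vectors (the values are illustrative, not real frequency-matrix scores):

# Worked example: cosine similarity between a query vector and a document vector.
import math

q = {'wicked': 2.0, 'problem': 1.0}
d = {'wicked': 1.0, 'problem': 3.0, 'social': 1.0}

dot = sum(q[t] * d[t] for t in q if t in d)        # 2*1 + 1*3 = 5.0
q_mag = math.sqrt(sum(v * v for v in q.values()))  # sqrt(5)
d_mag = math.sqrt(sum(v * v for v in d.values()))  # sqrt(11)
print(dot / (q_mag * d_mag))                       # ~0.674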
Example #10
class PorterStemmerAnalyzer(PythonAnalyzer):
    def createComponents(self, fieldName, reader):
        source = StandardTokenizer(Version.LUCENE_CURRENT, reader)
        filter = StandardFilter(Version.LUCENE_CURRENT, source)
        filter = LowerCaseFilter(Version.LUCENE_CURRENT, filter)
        filter = PorterStemFilter(filter)
        filter = StopFilter(Version.LUCENE_CURRENT, filter,
                            StopAnalyzer.ENGLISH_STOP_WORDS_SET)
        return self.TokenStreamComponents(source, filter)



lucene.initVM(vmargs=['-Djava.awt.headless=true'])
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
input = 'this is a test string for Analyzer'
ts = analyzer.tokenStream("dummy", StringReader(input))

#matchVersion = Version.LUCENE_XY; ##Substitute desired Lucene version for XY

offsetAtt = ts.addAttribute(OffsetAttribute.class_)
termAtt = ts.addAttribute(CharTermAttribute.class_)
#posAtt = ts.addAttribute(PartOfSpeechAttribute.class_)

def testStandard():
    ts.reset()  # Resets this stream to the beginning. (Required before incrementToken().)
    while ts.incrementToken():
        #print ts.r
        #print ts.reflectAsString(True)
        print offsetAtt.startOffset()
        print offsetAtt.endOffset()
        print termAtt.toString() #, posAtt.getPartOfSpeech()
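
A short usage sketch of the custom analyzer defined above, assuming the same PyLucene release as the class (its createComponents still takes a Reader) and the JVM already initialized earlier in this example:

# Sketch: run PorterStemmerAnalyzer over a string with the usual
# reset/incrementToken loop used in testStandard() above.
stem_analyzer = PorterStemmerAnalyzer()
stem_ts = stem_analyzer.tokenStream("dummy", StringReader("testing analyzers with stemming"))
stem_term = stem_ts.addAttribute(CharTermAttribute.class_)
stem_ts.reset()
while stem_ts.incrementToken():
    print(stem_term.toString())  # lowercased, Porter-stemmed tokens (stop filtering runs last)
stem_ts.close()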