예제 #1
0
def _createNoTermsFrequencyFieldType():
    """Build a frozen FieldType that indexes document IDs only.

    The field is indexed and tokenized, norms are omitted, and the index
    options are restricted to DOCS_ONLY so no term-frequency or position
    data is recorded.
    """
    field_type = FieldType()
    field_type.setIndexed(True)
    field_type.setTokenized(True)
    field_type.setOmitNorms(True)
    field_type.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY)
    field_type.freeze()
    return field_type
예제 #2
0
def _createNoTermsFrequencyFieldType():
    """Return an immutable FieldType with term frequencies disabled (DOCS_ONLY)."""
    ft = FieldType()
    # Indexed + tokenized, but without norms: we only care about which
    # documents contain a term, not how often it occurs.
    for configure in (ft.setIndexed, ft.setTokenized, ft.setOmitNorms):
        configure(True)
    ft.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY)
    ft.freeze()
    return ft
예제 #3
0
def main():
    """Index every OCR text file under DOCUMENTS_DIR into a Lucene index.

    Each file becomes one Document with three fields: "identifier" (the
    file's basename, stored), "vectext" (full lowercased text with term
    vectors enabled, for vector/similarity work), and "text" (plain stored
    copy of the same text).
    """
    INDEX_DIR = "full_index1"
    DOCUMENTS_DIR = "/media/joseph/Windows8_OS/Users/Joseph/AppData/Local/lxss/home/jwymbs23/data_science_projects/french_pamphlets/frc-data-master/OCR_text/"
    # Initialize lucene and JVM
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print("lucene version is:", lucene.VERSION, '\n')

    store = getStore(INDEX_DIR)

    analyzer = getAnalyzer()

    writer = getWriter(store=store, analyzer=analyzer, create=True)

    #get list of documents
    doc_list = getDoclist(DOCUMENTS_DIR)

    # Field type for "vectext": tokenized, indexed with positions, and
    # storing term vectors so the index supports similarity queries.
    ftype = FieldType()
    ftype.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    ftype.setTokenized(True)
    ftype.setStoreTermVectors(True)
    ftype.freeze()

    for cd, doc_name in enumerate(doc_list):
        if not cd % 1000:
            # Lightweight progress report every 1000 documents.
            print(cd, '--', len(doc_list))
        # Read the file eagerly so it is closed before indexing work starts.
        with open(doc_name, 'r') as d:
            doc_lines = d.readlines()
        full_text = ''.join(line.strip() for line in doc_lines).lower()
        try:
            # create a document that would be added to the index
            doc = Document()

            # Add fields to this document
            #could process fname here instead of in the dataframe later
            doc.add(
                Field("identifier",
                      doc_name.split('/')[-1], TextField.TYPE_STORED)
            )
            doc.add(Field("vectext", full_text, ftype))
            doc.add(Field("text", full_text, TextField.TYPE_STORED))
            # Add the document to the index
            writer.addDocument(doc)
        except Exception as exc:
            # Bug fix: the original bare `except:` also swallowed
            # KeyboardInterrupt/SystemExit and hid the failure cause.
            print("Failed in indexDocs: ", doc_name, '--', exc)
    #writer.optimize()
    writer.commit()
예제 #4
0
    def setUp(self):
        """Populate a one-document index (with term vectors) for the tests."""
        super(Test_Bug1842, self).setUp()

        self.analyzer = StandardAnalyzer()

        writer = self.getWriter(analyzer=self.analyzer)

        # "all" field: tokenized, indexed with positions, term vectors on.
        vec_type = FieldType()
        vec_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        vec_type.setTokenized(True)
        vec_type.setStoreTermVectors(True)
        vec_type.freeze()

        document = Document()
        document.add(Field("all", "blah blah blah Gesundheit", vec_type))
        document.add(Field('id', '1', StringField.TYPE_NOT_STORED))

        writer.addDocument(document)
        writer.close()
예제 #5
0
    def setUp(self):
        """Index a single document whose "all" field carries term vectors."""
        super(Test_Bug1842, self).setUp()

        self.analyzer = StandardAnalyzer()

        index_writer = self.getWriter(analyzer=self.analyzer)

        # Build the field type first: positions + term vectors enabled,
        # then frozen so it cannot be mutated after use.
        term_vector_type = FieldType()
        term_vector_type.setIndexOptions(
            IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        term_vector_type.setTokenized(True)
        term_vector_type.setStoreTermVectors(True)
        term_vector_type.freeze()

        test_doc = Document()
        test_doc.add(Field("all", "blah blah blah Gesundheit",
                           term_vector_type))
        test_doc.add(Field('id', '1', StringField.TYPE_NOT_STORED))

        index_writer.addDocument(test_doc)
        index_writer.close()
예제 #6
0
class Indexer:
    """
    Indexer Class

    Wraps a Lucene index: creates or appends to it from document dicts,
    searches it, and derives term-frequency matrices, cosine similarities
    and named-entity (GPE) analyses from the stored content.
    """
    # Canonical field names of the index schema.
    (NAME, CONTENT, DATE, URL, TAGS, TIMESTAMP) = ("name", "content", "date",
                                                   "url", "tags", "timestamp")

    def __init__(self, indexDir="", debug=False, verbose=False):
        """
        :Parameters:
        - `indexDir`: Path where the Index will be saved. (Str)
        - `debug`: Create the Index in RAM Memory (indexDir will be ignored). (Boolean)
        - `verbose`: Provide additional information about the initialization process. (Boolean)
        """
        self.__verbose = verbose
        if indexDir != "":
            INDEX_DIR = indexDir
        else:
            # Default: a "luceneIndex" directory next to this source file.
            INDEX_DIR = os.path.dirname(
                os.path.realpath(__file__)) + "/luceneIndex"

        if not os.path.exists(INDEX_DIR):
            os.makedirs(INDEX_DIR)
            self.__boAppend = False
        else:
            # Directory already exists: treat it as an existing index and
            # update documents rather than adding duplicates (see IndexDocs).
            self.__boAppend = True
        # Initialize lucene and JVM
        lucene.initVM()
        # Get index storage
        if debug:
            # Store the index in memory
            self.__indexDir = RAMDirectory()
            self.__boAppend = False
            INDEX_DIR = "RAM Memory"
        else:
            # Store an index on disk
            self.__indexDir = SimpleFSDirectory(Paths.get(INDEX_DIR))

        # Create Content FieldType: stored, tokenized, with term vectors
        # and positions so frequency matrices can be rebuilt from the index.
        self.__contentType = FieldType()
        self.__contentType.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
        self.__contentType.setTokenized(True)
        self.__contentType.setStored(True)
        self.__contentType.setStoreTermVectors(True)
        self.__contentType.setStoreTermVectorPositions(True)
        self.__contentType.freeze()

        # Get the Analyzer (English stop words removed at tokenization time).
        self.__analyzer = StandardAnalyzer(
            StandardAnalyzer.ENGLISH_STOP_WORDS_SET)

        # Print Indexer Information
        print("Lucene version is: ", lucene.VERSION)
        print("Index Directory: ", INDEX_DIR)

    def __del__(self):
        # Release the underlying Directory when the Indexer is collected.
        # NOTE(review): __del__ timing is not guaranteed; relies on GC.
        self.__indexDir.close()

    ##################################################
    #Private Methods
    ##################################################
    @staticmethod
    def __getTimestamp(dateTime):
        """
        Converts the document's date to an integer timestamp

        :Parameters:
        - `dateTime`: Document's date in '%Y-%m-%dT%H:%M:%SZ' format (Str)

        :Returns:
        - Date timestamp as YYYYMMDDhhmmss (Int)
        """
        tm = time.strptime(dateTime, '%Y-%m-%dT%H:%M:%SZ')
        # Zero-pad each component so the concatenation sorts chronologically.
        sTime = "{0:0>4}{1:0>2}{2:0>2}{3:0>2}{4:0>2}{5:0>2}".format(
            tm.tm_year, tm.tm_mon, tm.tm_mday, tm.tm_hour, tm.tm_min,
            tm.tm_sec)
        return int(sTime)

    @staticmethod
    def __getDateTime(timeStamp):
        """
        Converts the document's timestamp to date

        :Parameters:
        - `timeStamp`: Document's timestamp, YYYYMMDDhhmmss digits (Str)

        :Returns:
        - Date formatted '%Y-%m-%d %H:%M:%S' (Str)
        """
        # Slice the fixed-width timestamp back into its date components.
        date = datetime.datetime(year=int(timeStamp[0:4]),
                                 month=int(timeStamp[4:6]),
                                 day=int(timeStamp[6:8]),
                                 hour=int(timeStamp[8:10]),
                                 minute=int(timeStamp[10:12]),
                                 second=int(timeStamp[12:14]))
        return date.strftime('%Y-%m-%d %H:%M:%S')

    @staticmethod
    def __qualifyTags(tags):
        """
        Creates the qualify string for tags

        :Parameters:
        - `tags`: List of document's tags

        :Return:
        - Pipe-separated tags, e.g. "a|b|c" (Str)
        """
        sTags = ""
        for tag in tags:
            sTags += tag + '|'
        # Drop the trailing '|' separator.
        return sTags[:-1]

    @staticmethod
    def __scatterMatrix(numDocs, freqMtx):
        # Convert the {term: {docId: freq}} mapping into a 2-D list whose
        # first row is a header ('Term', 'D0000', 'D0001', ...) and whose
        # remaining rows hold per-term frequencies (0 where absent).
        print("Scattering Frequency Matrix...")
        pB = ProgressBar(len(freqMtx), prefix='Progress:')
        matrix = []
        innerMatrix = ['Term']

        #Generate Document Columns
        for docIdx in range(numDocs):
            innerMatrix.append("D{0:0>4}".format(docIdx))
        matrix.append(innerMatrix)

        #Generate Word Rows and Columns
        for word in sorted(freqMtx):
            innerMatrix = []
            innerMatrix.append(word)
            for docIdx in range(numDocs):
                try:
                    termCount = round(freqMtx[word][str(docIdx)], 3)
                    innerMatrix.append(termCount)
                except KeyError:
                    # Term does not occur in this document.
                    innerMatrix.append(0)
            matrix.append(innerMatrix)
            pB.updateProgress()
        return matrix

    @staticmethod
    def __saveMatrix(numDocs, freqMtx):
        # Write the term/document frequency matrix as a fixed-width text
        # table to freqMtx.txt next to this source file.
        pathMatrix = os.path.dirname(
            os.path.realpath(__file__)) + "/freqMtx.txt"
        fMatrix = open(pathMatrix, 'w')

        print("Saving Frequency Matrix File: ", pathMatrix)
        pB = ProgressBar(len(freqMtx), prefix='Progress:')
        # File Generation Start
        print("+========= Frequency Matrix =========+", file=fMatrix)
        print("%20s" % (' '), end=' ', file=fMatrix)
        for docIdx in range(numDocs):
            print("D{0:0>4}".format(docIdx), end=' ', file=fMatrix)
        print(file=fMatrix)
        for word in sorted(freqMtx):
            print("%20s" % (word), end=' ', file=fMatrix)
            for docIdx in range(numDocs):
                try:
                    termCount = freqMtx[word][str(docIdx)]
                    print("%02.03f" % (termCount), end=' ', file=fMatrix)
                except KeyError:
                    # Term absent from this document: print a zero column.
                    print("  0  ", end=' ', file=fMatrix)
            print(file=fMatrix)
            pB.updateProgress()
        # Close File
        fMatrix.close()

    def __stemString(self, stringToStem):
        # Tokenize with the shared analyzer, then run the tokens through a
        # Snowball (English) stemmer, collecting the stemmed terms.
        stemmedTerms = []
        tknStream = self.__analyzer.tokenStream('STEM', stringToStem)
        stemmed = SnowballFilter(tknStream, "English")
        stemmed.reset()
        while stemmed.incrementToken():
            stemmedTerms.append(
                stemmed.getAttribute(CharTermAttribute.class_).toString())

        tknStream.close()
        return stemmedTerms

    @staticmethod
    def __normalize(qVector, freqMtx):
        # Raise each query-term weight to the maximum frequency that term
        # reaches in any document, so the query vector is comparable to
        # the document vectors. Mutates qVector in place.
        for term in qVector:
            for docId in freqMtx:
                if (term in freqMtx[docId]) and (freqMtx[docId][term] >
                                                 qVector[term]):
                    qVector[term] = freqMtx[docId][term]

    @staticmethod
    def __dotProduct(aVector, bVector):
        """
        Calculate Dot Product

        :Parameters:
        - `aVector`: A Vector. (Dict)
        - `bVector`: B Vector. (Dict)

        :Returns:
        - Dot Product over the terms common to both vectors. (Int)
        """
        dotProduct = 0
        for term in aVector:
            if term in bVector:
                product = aVector[term] * bVector[term]
                dotProduct += product

        return dotProduct

    @staticmethod
    def __magnitude(vector):
        """
        Calculate Vector Magnitude (Euclidean norm)

        :Parameters:
        - `vector`: Query Vector. (Dict)

        :Returns:
        - Vector Magnitude. (Int)
        """
        # Magnitude of the vector is the square root of the dot product of the vector with itself.
        vectorMagnitude = Indexer.__dotProduct(vector, vector)
        vectorMagnitude = math.sqrt(vectorMagnitude)

        return vectorMagnitude

    ##################################################
    #Public Methods
    ##################################################
    def IndexDocs(self, documents):
        """
        Index documents under the directory

        :Parameters:
        - `documents`: Documents to be indexed (List)
          Each item is expected to be a dict with 'name', 'content',
          'date', 'url' and 'tags' keys — TODO confirm against callers.
        """
        # Get the Writer Configuration
        writerConfig = IndexWriterConfig(self.__analyzer)
        # Get index writer
        writer = IndexWriter(self.__indexDir, writerConfig)

        for document in documents:
            # Create a document that would be added to the index
            doc = Document()
            # Add a field to this document
            doc.add(TextField(Indexer.NAME, document['name'], Field.Store.YES))
            doc.add(
                Field(Indexer.CONTENT, document['content'],
                      self.__contentType))
            doc.add(
                StringField(Indexer.DATE, document['date'], Field.Store.YES))
            doc.add(StringField(Indexer.URL, document['url'], Field.Store.YES))
            doc.add(
                TextField(Indexer.TAGS, self.__qualifyTags(document['tags']),
                          Field.Store.YES))
            doc.add(
                LongPoint(Indexer.TIMESTAMP,
                          self.__getTimestamp(document['date'])))
            # Add or update the document to the index
            if not self.__boAppend:
                # New index, so we just add the document (no old document can be there):
                if self.__verbose:
                    print("Adding " + document['name'])
                writer.addDocument(doc)
            else:
                # Existing index (an old copy of this document may have been indexed) so
                # we use updateDocument instead to replace the old one matching the exact
                # path, if present:
                if self.__verbose:
                    print("Updating " + document['name'])
                writer.updateDocument(Term(Indexer.NAME, document['name']),
                                      doc)

        # Print index information and close writer
        print("Indexed %d documents (%d docs in index)" %
              (len(documents), writer.numDocs()))
        writer.close()

    def Search(self, query, field=NAME, maxResult=1000):
        """
        Search for a document into the Lucene's Index

        :Parameters:
        - `query`: Request to be made to the Index (Str).
        - `field`: Field to be consulted by the query (NAME, CONTENT, DATE, URL, TAGS).
        - `maxResult`: Maximum number of results.
        """
        # Get the Index Directory
        reader = DirectoryReader.open(self.__indexDir)
        searcher = IndexSearcher(reader)
        # Create a query
        queryParser = QueryParser(field, self.__analyzer).parse(query)
        # Do a search
        hits = searcher.search(queryParser, maxResult)
        print("Found %d document(s) that matched query '%s':" %
              (hits.totalHits, queryParser))
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            print("Document Nº: %d - Score: %.5f" % (hit.doc, hit.score))
            print("Name: " + doc.get('name'))
            print("Tags: " + doc.get('tags') + "\n")
        reader.close()

    def StemDocument(self, docIdx):
        """
        Return an array of the document's stemmed terms

        :Parameters:
        - `docIdx`: Document's index ID (Int).
        """
        reader = DirectoryReader.open(self.__indexDir)
        doc = reader.document(docIdx).get(Indexer.CONTENT)
        reader.close()

        return self.__stemString(doc)

    def FreqMatrix(self, scattered=False, byTerms=True, saveMtx=False):
        """
        Generates a Frequency Matrix of the current Index

        :Parameters:
        - `scattered`: Return the matrix as a 2-D list instead of a dict
          (only when `byTerms` is True). (Boolean)
        - `byTerms`: Key the matrix by term ({term: {docId: freq}});
          otherwise key it by document ({docId: {term: freq}}). (Boolean)
        - `saveMtx`: Save the Frequency Matrix to a .txt file. (Boolean)

        Frequencies are stored as percentages: (occurrences / termSize) * 100.
        """
        freqMtx = {}  # Terms - DocumentID Matrix
        reader = DirectoryReader.open(self.__indexDir)
        numDocs = reader.numDocs()
        print("Generating Frequency Matrix...")
        pB = ProgressBar(numDocs - 1, prefix='Progress:')
        for docIdx in range(numDocs):
            termItr = self.StemDocument(docIdx)
            termSize = len(termItr)
            docStr = '{0}'.format(docIdx)
            termDict = {}
            for termText in termItr:
                if byTerms:
                    # Check if the term exists
                    if termText in freqMtx:
                        # Check if the document exists
                        # Recover the raw occurrence count from the stored
                        # percentage, increment it, and store it back as a
                        # percentage again.
                        if docStr in freqMtx[termText]:
                            termCount = int(
                                math.ceil(
                                    ((freqMtx[termText][docStr] * termSize) /
                                     100)))
                            freqMtx[termText].update(
                                {docStr: ((termCount + 1) / termSize) * 100})
                        else:
                            freqMtx[termText].update(
                                {docStr: (1 / termSize) * 100})
                    else:
                        termIdx = {termText: {docStr: (1 / termSize) * 100}}
                        freqMtx.update(termIdx)
                else:
                    # Check if the term exists
                    # '.' is replaced so terms remain safe as flat keys.
                    termText = termText.replace('.', '_')
                    if termText in termDict:
                        termCount = int(
                            math.ceil((termDict[termText] * termSize) / 100))
                        termDict[termText] = ((termCount + 1) / termSize) * 100
                    else:
                        termIdx = {termText: (1 / termSize) * 100}
                        termDict.update(termIdx)
            if not byTerms:
                freqMtx.update({docStr: termDict})
            pB.updateProgress()

        if saveMtx and byTerms:
            self.__saveMatrix(numDocs, freqMtx)

        if scattered and byTerms:
            freqMtx = self.__scatterMatrix(numDocs, freqMtx)

        # Close IndexReader
        reader.close()

        return freqMtx

    def GetSimilarity(self, query, freqMtx):
        """
        Cosine Similarity

        :Parameters:
        - `query`: Free-text query to compare against each document. (Str)
        - `freqMtx`: Document-keyed frequency matrix, i.e. the output of
          FreqMatrix(byTerms=False). (Dict)

        :Returns:
        - List of (docId, cosineSimilarity) sorted by similarity, descending.
        """
        qVector = {}
        qList = self.__stemString(query)
        for stem in qList:
            qVector.update({stem: 0})
        self.__normalize(qVector, freqMtx)

        qList = []
        #Get similarity between query and doc[n]
        for docIdx, dVector in freqMtx.items():
            dP = self.__dotProduct(qVector, dVector)
            qM = self.__magnitude(qVector)
            dM = self.__magnitude(dVector)
            cosSimilarity = dP / (qM * dM)
            qList.append((docIdx, cosSimilarity))

        return sorted(qList,
                      key=lambda similarity: similarity[1],
                      reverse=True)

    def AnalyzeDocument(self, docIdx):
        """
        Generates a list of (entity, relation, entity) tuples as its output.

        Runs NLTK NER over the document's content, collects Geo-Political
        Entities (GPE), geocodes them, and returns a feature collection.

        :Parameters:
        - `docIdx`: Document's index ID (Int).
        """
        gpeList = {}
        geolocator = Geocode()
        reader = DirectoryReader.open(self.__indexDir)
        doc = reader.document(docIdx)
        # Load NLTK Data
        nltkPath = os.path.dirname(
            os.path.realpath(__file__)) + '/../tools/nltk_data'
        nltk.data.path.append(nltkPath)

        # Named Entity Recognition
        content = doc.get(Indexer.CONTENT)
        sentences = nltk.sent_tokenize(content)

        #ProgressBar
        print("Analazing Document {0}".format(docIdx))

        pB = ProgressBar(len(sentences), prefix='Progress:')
        # Loop over each sentence and tokenize it separately
        for sentence in sentences:
            ner = nltk.word_tokenize(sentence)
            ner = nltk.pos_tag(ner)
            ner = nltk.ne_chunk(ner)
            # Get all the Geo-Political Entities
            for subtrees in list(
                    ner.subtrees(
                        filter=lambda subtree: subtree.label() == 'GPE')):
                entityName = ' '.join([child[0] for child in subtrees])
                if entityName not in gpeList:
                    # Geocode each distinct entity only once.
                    location = geolocator.GetGPE(entityName)
                    if location:
                        gpeList.update(location)
            pB.updateProgress()
        gpeList = geolocator.GetFeatureCollection(gpeList)

        return gpeList

    def GetDocField(self, docIdx, field=CONTENT):
        """
        Get the document's field

        :Parameters:
        - `docIdx`: Document's index ID (Int).
        - `field`: Field to retrieve (Str).

        :Returns:
        - Document's field. (Str)
        """
        reader = DirectoryReader.open(self.__indexDir)
        doc = reader.document(docIdx)
        content = doc.get(field)
        reader.close()

        return content