Example No. 1
    def index_docs(self, tweets, writer):
        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(True)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        t1.setStoreTermVectors(True)
        t1.setStoreTermVectorOffsets(True)

        # add each tweet to the index
        for tweet in tweets:
            try:
                # strip out URLs because they provide false index matches
                contents = []
                for word in tweet[1].text.split():
                    if word.startswith("http://") or word.startswith("https://"):
                        continue
                    contents.append(word)
                contents = " ".join(contents)

                if len(contents) == 0: continue

                doc = Document()
                doc.add(Field("contents", contents, t1))
                writer.addDocument(doc)
            except Exception, e:
                print "Failed in index_docs:", e
Example No. 2
    def index_docs(self, train_set, writer):

        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(True)
        t2.setTokenized(True)
        t2.setStoreTermVectorOffsets(True)
        t2.setStoreTermVectorPayloads(True)
        t2.setStoreTermVectorPositions(True)
        t2.setStoreTermVectors(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for ii in train_set:
            doc = Document()
            doc.add(Field("answer", ii['Answer'], t1))
            doc.add(Field("qid", ii['Question ID'], t1))
            doc.add(Field("category", ii['category'], t1))
            doc.add(Field("position", ii['Sentence Position'], t1))
            doc.add(Field("question", ii['Question Text'], t2))
            doc.add(Field("wiki_plain",
                          self.wiki_reader.get_text(ii['Answer']), t2))
            writer.addDocument(doc)
Example No. 3
    def indexDocs(self, root, iw):

        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(True)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        t2 = FieldType()
        t2.setStored(True)
        t2.setTokenized(True)
        t2.setStoreTermVectors(True)
        t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for filename in os.listdir(root):
            if not filename.endswith(".txt"):
                print("file is not a txt file. we skip it.")
                continue
            print("adding", filename)
            path = os.path.join(root, filename)
            self.parseBook(path, t1, t2, iw)

        # Prints a set of statistics displaying missing data
        # Authorerror = number of authors not found
        # Titleerror = number of titles not found
        # Documenterror = number of documents where text could not be extracted so entire document was indexed
        print("AuthorError: {}".format(self.authorcount))
        print("TitleError: {}".format(self.titlecount))
        print("DocumentError: {}".format(self.errorcount))
        iw.close()
Example No. 4
	def index(self, root):

		t = FieldType()
		t.setIndexed(True)
		t.setStored(True)
		t.setTokenized(True)
		t.setStoreTermVectors(True)
		
		for path, dirs, files in os.walk(root):
			
			for file in files:
				
				filePath = os.path.join(path, file)
				fd = open(filePath)
				content = unicode(fd.read(), 'iso-8859-1')
				fd.close()
				
				doc = Document()
				doc.add(Field('name', file, StringField.TYPE_STORED))

				parent = os.path.split(path)[1]
				doc.add(Field('parent', parent, StringField.TYPE_STORED))

				if len(content) > 0:
					doc.add(Field('content', content, t))

				print 'Indexing %s' % file
				self.mWriter.addDocument(doc)

		self.mWriter.commit()
		self.mWriter.close()
Example No. 5
    def indexDocs(self, root, writer):

        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setStoreTermVectors(True)
        t2.setStoreTermVectorOffsets(True)
        t2.setStoreTermVectorPositions(True)
        t2.setIndexOptions(
            IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

        file_path = root + 'r52-train-all-terms.txt'
        fd = open(file_path)
        contents = fd.readlines()
        fd.close()
        contents_list = [x.strip() for x in contents]
        for i in xrange(len(contents_list)):
            try:
                [topic, content] = contents_list[i].split('\t')
                doc = Document()
                doc.add(Field("id", str(i), t1))
                doc.add(Field("topic", topic, t1))
                doc.add(Field("contents", content, t2))
                writer.addDocument(doc)
            except Exception, e:
                print "Failed in indexDocs:", e
Example No. 6
def create_minidoc(termstring, field='text'):
    # To store term vectors (used for query expansion) we have to use a custom fieldtype
    customfield = FieldType()
    customfield.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
    customfield.setStored(True)
    customfield.setTokenized(True)
    customfield.setStoreTermVectors(True)

    doc = Document()
    doc.add(Field(field, termstring, customfield))
    return doc
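
For context, a minimal end-to-end sketch (not part of the original example) of how a document built by create_minidoc above could be indexed and its term vector read back for query expansion. The Document/Field/FieldType/IndexOptions imports from the snippet are assumed to be in scope, and the index lives in a throwaway RAMDirectory:

import lucene
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.store import RAMDirectory
from org.apache.lucene.index import IndexWriter, IndexWriterConfig, DirectoryReader
from org.apache.lucene.util import BytesRefIterator

lucene.initVM(vmargs=['-Djava.awt.headless=true'])

directory = RAMDirectory()
writer = IndexWriter(directory, IndexWriterConfig(StandardAnalyzer()))
writer.addDocument(create_minidoc("lucene stores term vectors for expansion"))
writer.commit()
writer.close()

reader = DirectoryReader.open(directory)
terms = reader.getTermVector(0, 'text')               # term vector of the only document
for term in BytesRefIterator.cast_(terms.iterator()):
    print(term.utf8ToString())                        # candidate expansion terms
reader.close()
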
Example No. 7
def main():
    INDEX_DIR = "full_index1"
    DOCUMENTS_DIR = "/media/joseph/Windows8_OS/Users/Joseph/AppData/Local/lxss/home/jwymbs23/data_science_projects/french_pamphlets/frc-data-master/OCR_text/"
    # Initialize lucene and JVM
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print("lucene version is:", lucene.VERSION, '\n')

    store = getStore(INDEX_DIR)

    analyzer = getAnalyzer()

    writer = getWriter(store=store, analyzer=analyzer, create=True)

    #get list of documents
    doc_list = getDoclist(DOCUMENTS_DIR)

    ftype = FieldType()
    ftype.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    ftype.setTokenized(True)
    ftype.setStoreTermVectors(True)
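    # freeze() makes this FieldType immutable, so its configuration cannot change once fields start using it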
    ftype.freeze()

    for cd, doc_name in enumerate(doc_list):
        if not cd % 1000:
            print(cd, '--', len(doc_list))
        with open(doc_name, 'r') as d:
            doc_lines = d.readlines()
            full_text = ''.join([i.strip() for i in doc_lines]).lower()
            try:
                # create a document that will be added to the index
                doc = Document()

                # Add fields to this document
                #could process fname here instead of in the dataframe later
                doc.add(
                    Field("identifier",
                          doc_name.split('/')[-1], TextField.TYPE_STORED)
                )  #Store.YES))#, Field.Index.ANALYZED))
                doc.add(
                    Field("vectext", full_text, ftype)
                )  #TextField.TYPE_STORED, TermVector.YES, ))#Store.YES))#, Field.Index.ANALYZED))
                doc.add(Field("text", full_text, TextField.TYPE_STORED))
                # Add the document to the index
                writer.addDocument(doc)
            except:
                print("Failed in indexDocs: ", doc_name)
    #writer.optimize()
    writer.commit()
Example No. 8
    def setUp(self):
        super(Test_Bug1842, self).setUp()

        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

        w1 = self.getWriter(analyzer=self.analyzer)
        doc1 = Document()

        ftype = FieldType()
        ftype.setStored(False)
        ftype.setIndexed(True)
        ftype.setStoreTermVectors(True)
        doc1.add(Field("all", "blah blah blah Gesundheit", ftype))
        doc1.add(Field('id', '1', StringField.TYPE_NOT_STORED))

        w1.addDocument(doc1)
        w1.close()
Example No. 9
    def setUp(self):
        super(Test_Bug1842, self).setUp()

        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        
        w1 = self.getWriter(analyzer=self.analyzer)
        doc1 = Document()

        ftype = FieldType()
        ftype.setStored(False)
        ftype.setIndexed(True)
        ftype.setStoreTermVectors(True)
        doc1.add(Field("all", "blah blah blah Gesundheit", ftype))
        doc1.add(Field('id', '1', StringField.TYPE_NOT_STORED))

        w1.addDocument(doc1)
        w1.close()
Example No. 10
    def indexDocs(self, root, writer):

        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(True)
        t2.setTokenized(True)
        t2.setStoreTermVectorOffsets(True)
        t2.setStoreTermVectorPayloads(True)
        t2.setStoreTermVectorPositions(True)
        t2.setStoreTermVectors(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for root, dirnames, filenames in os.walk(root):
            for filename in filenames:
                if not filename.endswith('.json'):
                    continue
                print "adding", filename
                try:
                    path = os.path.join(root, filename)
                    file = open(path)
                    contents = json.load(file)
                    file.close()
                    doc = Document()
                    doc.add(Field("title", contents['title'], t2))
                    doc.add(Field("path", root, t1))
                    doc.add(Field("playStoreURL", contents['playStoreURL'],
                                  t1))
                    doc.add(Field("creator", contents['creator'], t1))
                    extendedInfo = contents['extendedInfo']
                    if len(extendedInfo['description']) > 0:
                        doc.add(
                            Field("description", extendedInfo['description'],
                                  t2))
                    else:
                        print "warning: no description in %s" % filename
                    writer.addDocument(doc)
                except Exception, e:
                    print "Failed in indexDocs:", e
Example No. 11
    def setUp(self):
        super(Test_Bug1842, self).setUp()

        self.analyzer = StandardAnalyzer()

        w1 = self.getWriter(analyzer=self.analyzer)
        doc1 = Document()

        ftype = FieldType()
        ftype.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        ftype.setTokenized(True)
        ftype.setStoreTermVectors(True)
        ftype.freeze()

        doc1.add(Field("all", "blah blah blah Gesundheit", ftype))
        doc1.add(Field('id', '1', StringField.TYPE_NOT_STORED))

        w1.addDocument(doc1)
        w1.close()
Example No. 12
    def setUp(self):
        super(Test_Bug1842, self).setUp()

        self.analyzer = StandardAnalyzer()

        w1 = self.getWriter(analyzer=self.analyzer)
        doc1 = Document()

        ftype = FieldType()
        ftype.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        ftype.setTokenized(True)
        ftype.setStoreTermVectors(True)
        ftype.freeze()

        doc1.add(Field("all", "blah blah blah Gesundheit", ftype))
        doc1.add(Field('id', '1', StringField.TYPE_NOT_STORED))

        w1.addDocument(doc1)
        w1.close()
Example No. 13
 def index_docs(self, input_documents):
     for document in tqdm(input_documents, total=len(input_documents)):
         doc = Document()
         doc.add(StringField(".I", document[".I"].lower(), Field.Store.YES))
         doc.add(StringField(".U", document[".U"].lower(), Field.Store.YES))
         type = FieldType()
         type.setIndexOptions(
             IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
         type.setStored(True)
         type.setStoreTermVectors(True)
         type.setTokenized(True)
         if ".W" in document and ".M" in document:
             doc.add(
                 Field(
                     "text", " ".join(
                         tokenizer.tokenize(document[".M"].lower() + " " +
                                            document[".T"].lower() +
                                            document[".W"].lower())), type))
         elif ".M" in document and ".W" not in document:
             doc.add(
                 Field(
                     "text", " ".join(
                         tokenizer.tokenize(document[".M"].lower() + " " +
                                            document[".T"].lower())), type))
         elif ".M" not in document and ".W" in document:
             doc.add(
                 Field(
                     "text", " ".join(
                         tokenizer.tokenize(document[".T"].lower() +
                                            document[".W"].lower())), type))
         elif ".M" not in document and ".W" not in document:
             doc.add(
                 Field("text",
                       " ".join(tokenizer.tokenize(document[".T"].lower())),
                       type))
         if self.writer.getConfig().getOpenMode(
         ) == IndexWriterConfig.OpenMode.CREATE:
             self.writer.addDocument(doc)
         else:
             self.writer.updateDocument(Term(".U", document[".U"]), doc)
     self.writer.close()
Example No. 14
class LuceneDocumentField(object):
    """Internal handler class for possible field types"""

    def __init__(self):
        """Init possible field types"""

        # FIELD_ID: stored, indexed, non-tokenized
        self.field_id = FieldType()
        self.field_id.setIndexed(True)
        self.field_id.setStored(True)
        self.field_id.setTokenized(False)

        # FIELD_ID_TV: stored, indexed, not tokenized, with term vectors (without positions)
        # for storing IDs with term vector info
        self.field_id_tv = FieldType()
        self.field_id_tv.setIndexed(True)
        self.field_id_tv.setStored(True)
        self.field_id_tv.setTokenized(False)
        self.field_id_tv.setStoreTermVectors(True)

        # FIELD_TEXT: stored, indexed, tokenized, with positions
        self.field_text = FieldType()
        self.field_text.setIndexed(True)
        self.field_text.setStored(True)
        self.field_text.setTokenized(True)

        # FIELD_TEXT_TV: stored, indexed, tokenized, with term vectors (without positions)
        self.field_text_tv = FieldType()
        self.field_text_tv.setIndexed(True)
        self.field_text_tv.setStored(True)
        self.field_text_tv.setTokenized(True)
        self.field_text_tv.setStoreTermVectors(True)

        # FIELD_TEXT_TVP: stored, indexed, tokenized, with term vectors and positions
        # (but no character offsets)
        self.field_text_tvp = FieldType()
        self.field_text_tvp.setIndexed(True)
        self.field_text_tvp.setStored(True)
        self.field_text_tvp.setTokenized(True)
        self.field_text_tvp.setStoreTermVectors(True)
        self.field_text_tvp.setStoreTermVectorPositions(True)

    def get_field(self, type):
        """Get Lucene FieldType object for the corresponding internal FIELDTYPE_ value"""
        if type == Lucene.FIELDTYPE_ID:
            return self.field_id
        elif type == Lucene.FIELDTYPE_ID_TV:
            return self.field_id_tv
        elif type == Lucene.FIELDTYPE_TEXT:
            return self.field_text
        elif type == Lucene.FIELDTYPE_TEXT_TV:
            return self.field_text_tv
        elif type == Lucene.FIELDTYPE_TEXT_TVP:
            return self.field_text_tvp
        else:
            raise Exception("Unknown field type")
Example No. 15
class LuceneDocumentField(object):
    """Internal handler class for possible field types"""
    def __init__(self):
        """Init possible field types"""

        # FIELD_ID: stored, indexed, non-tokenized
        self.field_id = FieldType()
        self.field_id.setIndexed(True)
        self.field_id.setStored(True)
        self.field_id.setTokenized(False)

        # FIELD_ID_TV: stored, indexed, not tokenized, with term vectors (without positions)
        # for storing IDs with term vector info
        self.field_id_tv = FieldType()
        self.field_id_tv.setIndexed(True)
        self.field_id_tv.setStored(True)
        self.field_id_tv.setTokenized(False)
        self.field_id_tv.setStoreTermVectors(True)

        # FIELD_TEXT: stored, indexed, tokenized, with positions
        self.field_text = FieldType()
        self.field_text.setIndexed(True)
        self.field_text.setStored(True)
        self.field_text.setTokenized(True)

        # FIELD_TEXT_TV: stored, indexed, tokenized, with term vectors (without positions)
        self.field_text_tv = FieldType()
        self.field_text_tv.setIndexed(True)
        self.field_text_tv.setStored(True)
        self.field_text_tv.setTokenized(True)
        self.field_text_tv.setStoreTermVectors(True)

        # FIELD_TEXT_TVP: stored, indexed, tokenized, with term vectors and positions
        # (but no character offsets)
        self.field_text_tvp = FieldType()
        self.field_text_tvp.setIndexed(True)
        self.field_text_tvp.setStored(True)
        self.field_text_tvp.setTokenized(True)
        self.field_text_tvp.setStoreTermVectors(True)
        self.field_text_tvp.setStoreTermVectorPositions(True)

    def get_field(self, type):
        """Get Lucene FieldType object for the corresponding internal FIELDTYPE_ value"""
        if type == Lucene.FIELDTYPE_ID:
            return self.field_id
        elif type == Lucene.FIELDTYPE_ID_TV:
            return self.field_id_tv
        elif type == Lucene.FIELDTYPE_TEXT:
            return self.field_text
        elif type == Lucene.FIELDTYPE_TEXT_TV:
            return self.field_text_tv
        elif type == Lucene.FIELDTYPE_TEXT_TVP:
            return self.field_text_tvp
        else:
            raise Exception("Unknown field type")
Example No. 16
import lucene
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.store import RAMDirectory
from org.apache.lucene.document import Document, Field, FieldType
from org.apache.lucene.util import BytesRef, BytesRefIterator
from org.apache.lucene.index import \
    IndexWriterConfig, IndexWriter, DirectoryReader, IndexOptions

if __name__ == '__main__':
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])

directory = RAMDirectory()
iconfig = IndexWriterConfig(LimitTokenCountAnalyzer(StandardAnalyzer(), 100))
iwriter = IndexWriter(directory, iconfig)

ft = FieldType()
ft.setStored(True)
ft.setTokenized(True)
ft.setStoreTermVectors(True)
ft.setStoreTermVectorOffsets(True)
ft.setStoreTermVectorPositions(True)
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

ts = ["this bernhard is the text to be index text",
      "this claudia is the text to be indexed"]
for t in ts:
    doc = Document()
    doc.add(Field("fieldname", t, ft))
    iwriter.addDocument(doc)

iwriter.commit()
iwriter.close()
ireader = DirectoryReader.open(directory)
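
The example stops right after opening the reader. As a hedged continuation (a sketch, not part of the original), the stored term vector of the first document can be walked to print each term with its positions and character offsets, which is exactly what the DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS configuration above enables:

from org.apache.lucene.index import PostingsEnum

terms = ireader.getTermVector(0, "fieldname")         # term vector of the first document
termsEnum = terms.iterator()
for term in BytesRefIterator.cast_(termsEnum):
    postings = termsEnum.postings(None, PostingsEnum.ALL)
    postings.nextDoc()                                # advance to the (only) document
    occurrences = []
    for _ in range(postings.freq()):
        position = postings.nextPosition()
        occurrences.append((position, postings.startOffset(), postings.endOffset()))
    print(term.utf8ToString(), occurrences)
ireader.close()
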
Example No. 17
    # Open DB
    import sqlite3
    conn = sqlite3.connect(args.db)
    c = conn.cursor()

    # Field types
    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setStored(True)
    t2.setTokenized(True)
    t2.setStoreTermVectors(True)
    t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    t3 = FieldType()
    t3.setStored(True)
    t3.setTokenized(False)
    t3.setIndexOptions(IndexOptions.DOCS)

    # Fields to index (comments)
    fields = {
        'id': t1,
        'subreddit': t1,
        'subreddit_id': t1,
        'ups': t3,
        'author': t1,
        'name': t1,
Example No. 18
class Indexer:
    """
    Indexer Class
    """
    (NAME, CONTENT, DATE, URL, TAGS, TIMESTAMP) = ("name", "content", "date",
                                                   "url", "tags", "timestamp")

    def __init__(self, indexDir="", debug=False, verbose=False):
        """
        :Parameters:
        - `indexDir`: Path where the Index will be saved. (Str)
        - `debug`: Create the Index in RAM Memory (indexDir will be ignored). (Boolean)
        - `verbose`: Provide additional information about the initialization process. (Boolean)
        """
        self.__verbose = verbose
        if indexDir != "":
            INDEX_DIR = indexDir
        else:
            INDEX_DIR = os.path.dirname(
                os.path.realpath(__file__)) + "/luceneIndex"

        if not os.path.exists(INDEX_DIR):
            os.makedirs(INDEX_DIR)
            self.__boAppend = False
        else:
            self.__boAppend = True
        # Initialize lucene and JVM
        lucene.initVM()
        # Get index storage
        if debug:
            # Store the index in memory
            self.__indexDir = RAMDirectory()
            self.__boAppend = False
            INDEX_DIR = "RAM Memory"
        else:
            # Store an index on disk
            self.__indexDir = SimpleFSDirectory(Paths.get(INDEX_DIR))

        # Create Content FieldType
        self.__contentType = FieldType()
        self.__contentType.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
        self.__contentType.setTokenized(True)
        self.__contentType.setStored(True)
        self.__contentType.setStoreTermVectors(True)
        self.__contentType.setStoreTermVectorPositions(True)
        self.__contentType.freeze()

        # Get the Analyzer
        self.__analyzer = StandardAnalyzer(
            StandardAnalyzer.ENGLISH_STOP_WORDS_SET)

        # Print Indexer Information
        print("Lucene version is: ", lucene.VERSION)
        print("Index Directory: ", INDEX_DIR)

    def __del__(self):
        self.__indexDir.close()

    ##################################################
    #Private Methods
    ##################################################
    @staticmethod
    def __getTimestamp(dateTime):
        """
        Converts the document's date to an integer timestamp

        :Parameters:
        - `dateTime`: Document's date  (Str)

        :Returns:
        - Date timestamp (Int)
        """
        tm = time.strptime(dateTime, '%Y-%m-%dT%H:%M:%SZ')
        sTime = "{0:0>4}{1:0>2}{2:0>2}{3:0>2}{4:0>2}{5:0>2}".format(
            tm.tm_year, tm.tm_mon, tm.tm_mday, tm.tm_hour, tm.tm_min,
            tm.tm_sec)
        return int(sTime)

    @staticmethod
    def __getDateTime(timeStamp):
        """
        Converts the document's timestamp to date

        :Parameters:
        - `timeStamp`: Document's timestamp

        :Returns:
        - Date (Str)
        """
        date = datetime.datetime(year=int(timeStamp[0:4]),
                                 month=int(timeStamp[4:6]),
                                 day=int(timeStamp[6:8]),
                                 hour=int(timeStamp[8:10]),
                                 minute=int(timeStamp[10:12]),
                                 second=int(timeStamp[12:14]))
        return date.strftime('%Y-%m-%d %H:%M:%S')

    @staticmethod
    def __qualifyTags(tags):
        """
        Creates the qualify string for tags

        :Parameters:
        - `tags`: List of document's tags

        :Return:
        - Qualify Tags (Str)
        """
        sTags = ""
        for tag in tags:
            sTags += tag + '|'
        return sTags[:-1]

    @staticmethod
    def __scatterMatrix(numDocs, freqMtx):
        print("Scattering Frequency Matrix...")
        pB = ProgressBar(len(freqMtx), prefix='Progress:')
        matrix = []
        innerMatrix = ['Term']

        #Generate Document Columns
        for docIdx in range(numDocs):
            innerMatrix.append("D{0:0>4}".format(docIdx))
        matrix.append(innerMatrix)

        #Generate Word Rows and Columns
        for word in sorted(freqMtx):
            innerMatrix = []
            innerMatrix.append(word)
            for docIdx in range(numDocs):
                try:
                    termCount = round(freqMtx[word][str(docIdx)], 3)
                    innerMatrix.append(termCount)
                except KeyError:
                    innerMatrix.append(0)
            matrix.append(innerMatrix)
            pB.updateProgress()
        return matrix

    @staticmethod
    def __saveMatrix(numDocs, freqMtx):
        pathMatrix = os.path.dirname(
            os.path.realpath(__file__)) + "/freqMtx.txt"
        fMatrix = open(pathMatrix, 'w')

        print("Saving Frequency Matrix File: ", pathMatrix)
        pB = ProgressBar(len(freqMtx), prefix='Progress:')
        # File Generation Start
        print("+========= Frequency Matrix =========+", file=fMatrix)
        print("%20s" % (' '), end=' ', file=fMatrix)
        for docIdx in range(numDocs):
            print("D{0:0>4}".format(docIdx), end=' ', file=fMatrix)
        print(file=fMatrix)
        for word in sorted(freqMtx):
            print("%20s" % (word), end=' ', file=fMatrix)
            for docIdx in range(numDocs):
                try:
                    termCount = freqMtx[word][str(docIdx)]
                    print("%02.03f" % (termCount), end=' ', file=fMatrix)
                except KeyError:
                    print("  0  ", end=' ', file=fMatrix)
            print(file=fMatrix)
            pB.updateProgress()
        # Close File
        fMatrix.close()

    def __stemString(self, stringToStem):
        stemmedTerms = []
        tknStream = self.__analyzer.tokenStream('STEM', stringToStem)
        stemmed = SnowballFilter(tknStream, "English")
        stemmed.reset()
        while stemmed.incrementToken():
            stemmedTerms.append(
                stemmed.getAttribute(CharTermAttribute.class_).toString())

        tknStream.close()
        return stemmedTerms

    @staticmethod
    def __normalize(qVector, freqMtx):
        for term in qVector:
            for docId in freqMtx:
                if (term in freqMtx[docId]) and (freqMtx[docId][term] >
                                                 qVector[term]):
                    qVector[term] = freqMtx[docId][term]

    @staticmethod
    def __dotProduct(aVector, bVector):
        """
        Calculate Dot Product

        :Parameters:
        - `aVector`: A Vector. (Dict)
        - `bVector`: B Vector. (Dict)

        :Returns:
        - Dot Product. (Int)
        """
        dotProduct = 0
        for term in aVector:
            if term in bVector:
                product = aVector[term] * bVector[term]
                dotProduct += product

        return dotProduct

    @staticmethod
    def __magnitude(vector):
        """
        Calculate the vector's magnitude

        :Parameters:
        - `vector`: Query Vector. (Dict)

        :Returns:
        - Vector Magnitude. (Int)
        """
        # Magnitude of the vector is the square root of the dot product of the vector with itself.
        vectorMagnitude = Indexer.__dotProduct(vector, vector)
        vectorMagnitude = math.sqrt(vectorMagnitude)

        return vectorMagnitude

    ##################################################
    #Public Methods
    ##################################################
    def IndexDocs(self, documents):
        """
        Index documents under the directory

        :Parameters:
        - `documents`: Documents to be indexed (List)
        """
        # Get the Writer Configuration
        writerConfig = IndexWriterConfig(self.__analyzer)
        # Get index writer
        writer = IndexWriter(self.__indexDir, writerConfig)

        for document in documents:
            # Create a document that will be added to the index
            doc = Document()
            # Add a field to this document
            doc.add(TextField(Indexer.NAME, document['name'], Field.Store.YES))
            doc.add(
                Field(Indexer.CONTENT, document['content'],
                      self.__contentType))
            doc.add(
                StringField(Indexer.DATE, document['date'], Field.Store.YES))
            doc.add(StringField(Indexer.URL, document['url'], Field.Store.YES))
            doc.add(
                TextField(Indexer.TAGS, self.__qualifyTags(document['tags']),
                          Field.Store.YES))
            doc.add(
                LongPoint(Indexer.TIMESTAMP,
                          self.__getTimestamp(document['date'])))
            # Add or update the document to the index
            if not self.__boAppend:
                # New index, so we just add the document (no old document can be there):
                if self.__verbose:
                    print("Adding " + document['name'])
                writer.addDocument(doc)
            else:
                # Existing index (an old copy of this document may have been indexed) so
                # we use updateDocument instead to replace the old one matching the exact
                # path, if present:
                if self.__verbose:
                    print("Updating " + document['name'])
                writer.updateDocument(Term(Indexer.NAME, document['name']),
                                      doc)

        # Print index information and close writer
        print("Indexed %d documents (%d docs in index)" %
              (len(documents), writer.numDocs()))
        writer.close()

    def Search(self, query, field=NAME, maxResult=1000):
        """
        Search for a document into the Lucene's Index

        :Parameters:
        - `query`: Request to be made to the Index (Str).
        - `field`: Field to be consulted by the query (NAME, CONTENT, DATE, URL, TAGS).
        - `maxResult`: Maximum number of results.
        """
        # Get the Index Directory
        reader = DirectoryReader.open(self.__indexDir)
        searcher = IndexSearcher(reader)
        # Create a query
        queryParser = QueryParser(field, self.__analyzer).parse(query)
        # Do a search
        hits = searcher.search(queryParser, maxResult)
        print("Found %d document(s) that matched query '%s':" %
              (hits.totalHits, queryParser))
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            print("Document Nº: %d - Score: %.5f" % (hit.doc, hit.score))
            print("Name: " + doc.get('name'))
            print("Tags: " + doc.get('tags') + "\n")
        reader.close()

    def StemDocument(self, docIdx):
        """
        Return an array of the document's stemmed terms

        :Parameters:
        - `docIdx`: Document's index ID (Int).
        """
        reader = DirectoryReader.open(self.__indexDir)
        doc = reader.document(docIdx).get(Indexer.CONTENT)
        reader.close()

        return self.__stemString(doc)

    def FreqMatrix(self, scattered=False, byTerms=True, saveMtx=False):
        """
        Generates a Frequency Matrix of the current Index

        :Parameters:
        - `saveMtx`: Save the Frequency Matrix to a .txt file. (Boolean)
        """
        freqMtx = {}  # Terms - DocumentID Matrix
        reader = DirectoryReader.open(self.__indexDir)
        numDocs = reader.numDocs()
        print("Generating Frequency Matrix...")
        pB = ProgressBar(numDocs - 1, prefix='Progress:')
        for docIdx in range(numDocs):
            termItr = self.StemDocument(docIdx)
            termSize = len(termItr)
            docStr = '{0}'.format(docIdx)
            termDict = {}
            for termText in termItr:
                if byTerms:
                    # Check if the term exists
                    if termText in freqMtx:
                        # Check if the document exists
                        if docStr in freqMtx[termText]:
                            termCount = int(
                                math.ceil(
                                    ((freqMtx[termText][docStr] * termSize) /
                                     100)))
                            freqMtx[termText].update(
                                {docStr: ((termCount + 1) / termSize) * 100})
                        else:
                            freqMtx[termText].update(
                                {docStr: (1 / termSize) * 100})
                    else:
                        termIdx = {termText: {docStr: (1 / termSize) * 100}}
                        freqMtx.update(termIdx)
                else:
                    # Check if the term exists
                    termText = termText.replace('.', '_')
                    if termText in termDict:
                        termCount = int(
                            math.ceil((termDict[termText] * termSize) / 100))
                        termDict[termText] = ((termCount + 1) / termSize) * 100
                    else:
                        termIdx = {termText: (1 / termSize) * 100}
                        termDict.update(termIdx)
            if not byTerms:
                freqMtx.update({docStr: termDict})
            pB.updateProgress()

        if saveMtx and byTerms:
            self.__saveMatrix(numDocs, freqMtx)

        if scattered and byTerms:
            freqMtx = self.__scatterMatrix(numDocs, freqMtx)

        # Close IndexReader
        reader.close()

        return freqMtx

    def GetSimilarity(self, query, freqMtx):
        """
        Cosine Similarity
        """
        qVector = {}
        qList = self.__stemString(query)
        for stem in qList:
            qVector.update({stem: 0})
        self.__normalize(qVector, freqMtx)

        qList = []
        #Get similarity between query and doc[n]
        for docIdx, dVector in freqMtx.items():
            dP = self.__dotProduct(qVector, dVector)
            qM = self.__magnitude(qVector)
            dM = self.__magnitude(dVector)
            cosSimilarity = dP / (qM * dM)
            qList.append((docIdx, cosSimilarity))

        return sorted(qList,
                      key=lambda similarity: similarity[1],
                      reverse=True)

    def AnalyzeDocument(self, docIdx):
        """
        Runs named-entity recognition over the document and returns a feature collection of the geo-political entities (GPEs) found.

        :Parameters:
        - `docIdx`: Document's index ID (Int).
        """
        gpeList = {}
        geolocator = Geocode()
        reader = DirectoryReader.open(self.__indexDir)
        doc = reader.document(docIdx)
        # Load NLTK Data
        nltkPath = os.path.dirname(
            os.path.realpath(__file__)) + '/../tools/nltk_data'
        nltk.data.path.append(nltkPath)

        # Named Entity Recognition
        content = doc.get(Indexer.CONTENT)
        sentences = nltk.sent_tokenize(content)

        #ProgressBar
        print("Analazing Document {0}".format(docIdx))

        pB = ProgressBar(len(sentences), prefix='Progress:')
        # Loop over each sentence and tokenize it separately
        for sentence in sentences:
            ner = nltk.word_tokenize(sentence)
            ner = nltk.pos_tag(ner)
            ner = nltk.ne_chunk(ner)
            # Get all the Geo-Political Entities
            for subtrees in list(
                    ner.subtrees(
                        filter=lambda subtree: subtree.label() == 'GPE')):
                entityName = ' '.join([child[0] for child in subtrees])
                if entityName not in gpeList:
                    location = geolocator.GetGPE(entityName)
                    if location:
                        gpeList.update(location)
            pB.updateProgress()
        gpeList = geolocator.GetFeatureCollection(gpeList)

        return gpeList

    def GetDocField(self, docIdx, field=CONTENT):
        """
        Get the document's field

        :Parameters:
        - `docIdx`: Document's index ID (Int).
        - `field`: Field to retrieve (Str).

        :Returns:
        - Document's field. (Str)
        """
        reader = DirectoryReader.open(self.__indexDir)
        doc = reader.document(docIdx)
        content = doc.get(field)
        reader.close()

        return content
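
A hypothetical driver for the Indexer class above (document dictionaries, paths and query strings are placeholders, and the class's own helpers such as ProgressBar are assumed to be importable):

indexer = Indexer(indexDir="./luceneIndex", verbose=True)
indexer.IndexDocs([
    {'name': 'doc-1',
     'content': 'full text of the first document',
     'date': '2020-01-01T00:00:00Z',
     'url': 'http://example.org/1',
     'tags': ['news']},
])
indexer.Search("first", field=Indexer.CONTENT, maxResult=10)
freqMtx = indexer.FreqMatrix(byTerms=False)           # document -> term frequency vectors
print(indexer.GetSimilarity("first document", freqMtx))
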
Example No. 19
class LuceneDocumentField(object):
    """Internal handler class for possible field types."""
    def __init__(self):
        """Init possible field types.
        DOCS
            Only documents are indexed: term frequencies and positions are omitted.
        DOCS_AND_FREQS
            Only documents and term frequencies are indexed: positions are omitted.
        DOCS_AND_FREQS_AND_POSITIONS
            Indexes documents, frequencies and positions.
        DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS
            Indexes documents, frequencies, positions and offsets.
        NONE
            Not indexed
        """

        # FIELD_ID: stored, indexed, non-tokenized
        self.field_id = FieldType()
        self.field_id.setIndexOptions(IndexOptions.DOCS)
        self.field_id.setStored(True)
        self.field_id.setTokenized(False)

        # FIELD_ID_TV: stored, indexed, not tokenized, with term vectors (without positions)
        # for storing IDs with term vector info
        self.field_id_tv = FieldType()
        self.field_id_tv.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
        self.field_id_tv.setStored(True)
        self.field_id_tv.setTokenized(False)
        self.field_id_tv.setStoreTermVectors(True)

        # FIELD_TEXT: stored, indexed, tokenized, with positions
        self.field_text = FieldType()
        self.field_text.setIndexOptions(
            IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        self.field_text.setStored(True)
        self.field_text.setTokenized(True)

        # FIELD_TEXT_TV: stored, indexed, tokenized, with term vectors (without positions)
        self.field_text_tv = FieldType()
        self.field_text_tv.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
        self.field_text_tv.setStored(True)
        self.field_text_tv.setTokenized(True)
        self.field_text_tv.setStoreTermVectors(True)

        # FIELD_TEXT_TVP: stored, indexed, tokenized, with term vectors and positions
        # (but no character offsets)
        self.field_text_tvp = FieldType()
        self.field_text_tvp.setIndexOptions(
            IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        self.field_text_tvp.setStored(True)
        self.field_text_tvp.setTokenized(True)
        self.field_text_tvp.setStoreTermVectors(True)
        self.field_text_tvp.setStoreTermVectorPositions(True)

        # FIELD_TEXT_NTV:  not stored, indexed, tokenized, with term vectors (without positions)
        self.field_text_ntv = FieldType()
        self.field_text_ntv.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
        self.field_text_ntv.setStored(False)
        self.field_text_ntv.setTokenized(True)
        self.field_text_ntv.setStoreTermVectors(True)

        # FIELD_TEXT_NTVP: not stored, indexed, tokenized, with term vectors and positions
        # (but no character offsets)
        self.field_text_ntvp = FieldType()
        self.field_text_ntvp.setIndexOptions(
            IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        self.field_text_ntvp.setStored(False)
        self.field_text_ntvp.setTokenized(True)
        self.field_text_ntvp.setStoreTermVectors(True)
        self.field_text_ntvp.setStoreTermVectorPositions(True)

    def get_field(self, type):
        """Gets Lucene FieldType object for the corresponding internal FIELDTYPE_ value."""
        if type == Lucene.FIELDTYPE_ID:
            return self.field_id
        elif type == Lucene.FIELDTYPE_ID_TV:
            return self.field_id_tv
        elif type == Lucene.FIELDTYPE_TEXT:
            return self.field_text
        elif type == Lucene.FIELDTYPE_TEXT_TV:
            return self.field_text_tv
        elif type == Lucene.FIELDTYPE_TEXT_TVP:
            return self.field_text_tvp
        elif type == Lucene.FIELDTYPE_TEXT_NTV:
            return self.field_text_ntv
        elif type == Lucene.FIELDTYPE_TEXT_NTVP:
            return self.field_text_ntvp
        else:
            raise Exception("Unknown field type")
Example No. 20
import lucene
# for customized field
from org.apache.lucene.document import Field
from org.apache.lucene.document import FieldType
from org.apache.lucene.index import IndexOptions

lucene.initVM(vmargs=['-Djava.awt.headless=true'])

CUSTOM_FIELD_TEXT=FieldType()
CUSTOM_FIELD_TEXT.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
CUSTOM_FIELD_TEXT.setStored(True)
CUSTOM_FIELD_TEXT.setStoreTermVectors(True)
CUSTOM_FIELD_TEXT.setStoreTermVectorPositions(True)
CUSTOM_FIELD_TEXT.setStoreTermVectorOffsets(True)
#CUSTOM_FIELD_TEXT.setStoreTermVectorPayloads(True)
CUSTOM_FIELD_TEXT.setTokenized(True)

CUSTOM_FIELD_TEXT_BF=FieldType()
CUSTOM_FIELD_TEXT_BF.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
CUSTOM_FIELD_TEXT_BF.setStored(False)
CUSTOM_FIELD_TEXT_BF.setStoreTermVectors(True)
CUSTOM_FIELD_TEXT_BF.setStoreTermVectorPositions(True)
CUSTOM_FIELD_TEXT_BF.setStoreTermVectorOffsets(True)
CUSTOM_FIELD_TEXT_BF.setTokenized(True)

CUSTOM_FIELD_TEXT_DF=FieldType()
CUSTOM_FIELD_TEXT_DF.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
CUSTOM_FIELD_TEXT_DF.setStored(False)
CUSTOM_FIELD_TEXT_DF.setStoreTermVectors(True)
CUSTOM_FIELD_TEXT_DF.setStoreTermVectorPositions(False)
CUSTOM_FIELD_TEXT_DF.setStoreTermVectorOffsets(False)
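
For illustration only (Document is imported below; an IndexWriter named writer is assumed, as in the surrounding examples), the custom field types above would typically be attached to fields like this:

from org.apache.lucene.document import Document

doc = Document()
doc.add(Field("body", "stored text with term vectors, positions and offsets", CUSTOM_FIELD_TEXT))
doc.add(Field("body_bf", "unstored text with term vectors, positions and offsets", CUSTOM_FIELD_TEXT_BF))
doc.add(Field("body_df", "unstored text with term vectors but no positions or offsets", CUSTOM_FIELD_TEXT_DF))
# writer.addDocument(doc)    # assuming an IndexWriter named `writer` exists
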
Example No. 21
def BuildSearchEngine(start, maxPages,domain,first):  
	#only initiate VM if it's the first being called
	if first == True:
		lucene.initVM(vmargs=['-Djava.awt.headless=true'])
	print ('lucene'), lucene.VERSION
	if not os.path.exists("IndexFiles.index"):
		os.mkdir("IndexFiles.index")
	store = SimpleFSDirectory(Paths.get("IndexFiles.index"))
	config = IndexWriterConfig(StandardAnalyzer(StandardAnalyzer.STOP_WORDS_SET))
	#if first time being called, create new index, otherwise only append new pages into old index
	if first == True:
		config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
	else:
		config.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
	writer = IndexWriter(store, config)
	
	#configure settings for pages being saved
	t1 = FieldType()
	t1.setStored(True)
	t1.setTokenized(False)
	t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
	t2 = FieldType()
	t2.setStored(False)
	t2.setTokenized(True)
	t2.setStoreTermVectors(True)
	t2.setStoreTermVectorOffsets(True)
	t2.setStoreTermVectorPositions(True)
	t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
	
	pagesToVisit = [start]
	hashtable=dict()
	hashtable[start]=1
	numberVisited = 0
	rp = robotparser.RobotFileParser()
	robotFileLocation = "http://www."+domain+"/robots.txt"
	rp.set_url(robotFileLocation)
	rp.read()
	# The main loop. Create a LinkParser and get all the links on the page.
	while numberVisited < maxPages and pagesToVisit != []:
		numberVisited = numberVisited + 1
		# Start from the beginning of our collection of pages to visit:
		url = pagesToVisit[0]
		pagesToVisit = pagesToVisit[1:]
		try:
			print(numberVisited, "Visiting:", url)
			parser = LinkParser()
			data, links,hashtable = parser.getLinks(url,domain,hashtable,rp)
			# Add the pages that we visited to the end of our collection
			# of pages to visit:
			print(" **Success!**")
			path = "files/a.html"
			urllib.urlretrieve(url,path)
			file = open("files/a.html")
			contents = removeTag(file);
			file.close()
			file = open("files/a.html","w")
			file.write(contents)
			file.close()
			file = open("files/a.html")
			contents = file.read()
			file.close()
			doc = Document()
			doc.add(Field("name", "a.html", t1))
			doc.add(Field("path", "files", t1))
			#index the url
			doc.add(Field("url", url, t1))
			if len(contents) > 0:
				doc.add(Field("contents", contents.decode("utf-8").replace(u"\u2019","'").replace(u"\u2018","'").replace(u"\ufe0f","'").replace(u"\u20e3","'"), t2))
			else:
				print ("warning: no content in %s") % filename
			writer.addDocument(doc)
			pagesToVisit = pagesToVisit + links
		except Exception,e:
			print Exception,":",e
			print(" **Failed!**")
Example No. 22
File: tirza.py Project: komax/tirza
    writer = IndexWriter(directory, config)
    return writer

def open_searcher(writer):
    from org.apache.lucene.search import IndexSearcher
    reader = writer.getReader()
    searcher = IndexSearcher(reader)
    return reader, searcher

from org.apache.lucene.document import Document, Field, FieldType, TextField, StringField
from org.apache.lucene.util import BytesRef, BytesRefIterator
from org.apache.lucene.index import Term
vectorFieldType = FieldType(TextField.TYPE_NOT_STORED)
vectorFieldType.setIndexed(True)
vectorFieldType.setTokenized(True)
vectorFieldType.setStoreTermVectors(True)
vectorFieldType.setStoreTermVectorPositions(False)

writer = open_writer('data/index')

def addToIndex(lxmlNode):
    uri = xpathFirst(lxmlNode, '//oa:hasTarget/@rdf:resource')
    print uri
    seen = set()
    doc = Document()
    for fieldName in FIELD_NAMES:
        seen.clear()
        for subpath in [
            '', '/*/rdfs:label', '/*/skos:prefLabel', '/*/skos:altLabel',
            '/*/dcterms:title', '/*/foaf:name']:
            for value in xpath(lxmlNode, '//%(fieldName)s%(subpath)s/text()' % locals()):
Example No. 23
import lucene
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.store import RAMDirectory
from org.apache.lucene.document import Document, Field, FieldType
from org.apache.lucene.util import BytesRef, BytesRefIterator
from org.apache.lucene.index import \
    IndexWriterConfig, IndexWriter, DirectoryReader, IndexOptions

if __name__ == '__main__':
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])

directory = RAMDirectory()
iconfig = IndexWriterConfig(LimitTokenCountAnalyzer(StandardAnalyzer(), 100))
iwriter = IndexWriter(directory, iconfig)

ft = FieldType()
ft.setStored(True)
ft.setTokenized(True)
ft.setStoreTermVectors(True)
ft.setStoreTermVectorOffsets(True)
ft.setStoreTermVectorPositions(True)
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

ts = [
    "this bernhard is the text to be index text",
    "this claudia is the text to be indexed"
]
for t in ts:
    doc = Document()
    doc.add(Field("fieldname", t, ft))
    iwriter.addDocument(doc)

iwriter.commit()
iwriter.close()
Example No. 24
def create_index_from_folder(folder, index_file):
    """Lets Lucene create an index of all database files within a specified folder

    :param folder: absolute or relative path to database files
    :param index_file: absolute or relative output location for index

    Notes:
    - Does not go through database folder recursively, i.e. all files have to be at the root of the folder
    - Only CSV files are supported
    - Column headers are hardcoded and should follow:
        ID, text, Reddit ID, subreddit, meta, time, author, ups, downs, authorlinkkarma, authorkarma, authorisgold
    """
    # Set up Lucene
    print()
    print("Starting Lucene ...")
    lucene.initVM()
    index_store = SimpleFSDirectory.open(File(index_file).toPath())
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(index_store, config)

    print()
    # Go through files, add rows of each as Documents to writer
    for file in os.listdir(folder):
        if file.endswith(".csv"):
            print("Indexing {} ...".format(file), end=" ", flush=True)
            with open(os.path.join(folder, file), newline='') as db:
                reader = csv.reader(db)

                # The Reddit database seems to carry a lot of duplicate posts, so we try to skip those
                post_ids = set()
                duplicate_counter = 0

                # To store term vectors (used for query expansion) we have to use a custom fieldtype
                customfield = FieldType()
                customfield.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
                customfield.setStored(True)
                customfield.setTokenized(True)
                customfield.setStoreTermVectors(True)

                # CSV files have a useless first row...
                skipfirst = True
                # ... and a useless first column. Skip both.
                for _, text, rid, subreddit, meta, time, author, ups, downs, authorlinkkarma, authorkarma, authorisgold in reader:
                    if skipfirst:
                        skipfirst = False
                        continue
                    doc = Document()

                    if rid in post_ids:
                        duplicate_counter += 1
                        continue  # skip
                    else:
                        post_ids.add(rid)

                    # Tokenize, index and store
                    doc.add(Field("text", text, customfield))

                    # Index and store
                    doc.add(StringField("id", rid, Field.Store.YES))
                    doc.add(
                        StringField("subreddit", subreddit, Field.Store.YES))
                    doc.add(StringField("meta", meta, Field.Store.YES))
                    doc.add(StringField("time", time, Field.Store.YES))
                    doc.add(StringField("author", author, Field.Store.YES))

                    # Store only
                    doc.add(StoredField("ups", ups))
                    doc.add(StoredField("downs", downs))
                    doc.add(StoredField("authorlinkkarma", authorlinkkarma))
                    doc.add(StoredField("authorkarma", authorkarma))
                    doc.add(StoredField("authorisgold", authorisgold))

                    writer.addDocument(doc)

            print("DONE!\t(Duplicate posts skipped: {})".format(
                duplicate_counter))

    writer.commit()
    writer.close()

    print()
    print("Finished indexing!")
Example No. 25
class IndexInterface(TextInterface):
    """
    Abstract class that takes care of serializing and deserializing text in an indexed structure.
    It uses the Lucene library.

    Args:
        directory (str): Path of the directory where the content will be serialized
    """
    def __init__(self, directory: str):
        super().__init__(directory)
        self.__doc = None
        self.__writer = None
        self.__field_type_frequency = None
        self.__field_type_searching = None

    def __str__(self):
        return "IndexInterface"

    def init_writing(self):
        self.__field_type_searching = FieldType(TextField.TYPE_STORED)
        self.__field_type_frequency = FieldType(StringField.TYPE_STORED)
        self.__field_type_frequency.setStored(True)
        self.__field_type_frequency.setTokenized(False)
        self.__field_type_frequency.setStoreTermVectors(True)
        self.__field_type_frequency.setStoreTermVectorPositions(True)
        self.__field_type_frequency.\
            setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
        fs_directory = SimpleFSDirectory(Paths.get(self.directory))
        self.__writer = IndexWriter(fs_directory, IndexWriterConfig())

    def new_content(self):
        """
        In the lucene index case the new content
        is a new document in the index
        """
        self.__doc = Document()

    def new_field(self, field_name: str, field_data):
        """
        Add a new field

        Args:
            field_name (str): Name of the new field
            field_data: Data to put into the field
        """
        if isinstance(field_data, list):
            for word in field_data:
                self.__doc.add(
                    Field(field_name, word, self.__field_type_frequency))
        else:
            self.__doc.add(
                Field(field_name, field_data, self.__field_type_frequency))

    def new_searching_field(self, field_name, field_data):
        """
        Add a new searching field. It will be used by the search engine recommender

        Args:
            field_name (str): Name of the new field
            field_data: Data to put into the field
        """
        self.__doc.add(
            Field(field_name, field_data, self.__field_type_searching))

    def serialize_content(self):
        """
        Serialize the content
        """
        doc_index = self.__writer.addDocument(self.__doc)
        return doc_index - 1

    def stop_writing(self):
        """
        Stop the index writer and commit the operations
        """
        self.__writer.commit()
        self.__writer.close()

    def get_tf_idf(self, field_name: str, content_id: str):
        """
        Calculates the tf-idf for the words contained in the field of the content whose id
        is content_id

        Args:
            field_name (str): Name of the field containing the words for which calculate the tf-idf
            content_id (str): Id of the content that contains the specified field

        Returns:
             words_bag (Dict <str, float>):
             Dictionary whose keys are the words contained in the field,
             and the corresponding values are the tf-idf values.
        """
        searcher = IndexSearcher(
            DirectoryReader.open(SimpleFSDirectory(Paths.get(self.directory))))
        query = QueryParser("testo_libero",
                            KeywordAnalyzer()).parse("content_id:\"" +
                                                     content_id + "\"")
        score_docs = searcher.search(query, 1).scoreDocs
        document_offset = -1
        for score_doc in score_docs:
            document_offset = score_doc.doc

        reader = searcher.getIndexReader()
        words_bag = {}
        term_vector = reader.getTermVector(document_offset, field_name)
        term_enum = term_vector.iterator()
        for term in BytesRefIterator.cast_(term_enum):
            term_text = term.utf8ToString()
            postings = term_enum.postings(None)
            postings.nextDoc()
            term_frequency = 1 + math.log10(
                postings.freq())  # normalized term frequency
            inverse_document_frequency = math.log10(
                reader.maxDoc() / reader.docFreq(Term(field_name, term)))
            tf_idf = term_frequency * inverse_document_frequency
            words_bag[term_text] = tf_idf

        reader.close()
        return words_bag

    def delete_index(self):
        shutil.rmtree(self.directory, ignore_errors=True)
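
A brief, hedged usage sketch of IndexInterface (the directory, field names and values are placeholders):

index = IndexInterface("./index_dir")
index.init_writing()
index.new_content()
index.new_field("content_id", "item_42")
index.new_field("plot", ["heist", "movie", "crew", "safe", "crackers"])
index.serialize_content()
index.stop_writing()
print(index.get_tf_idf("plot", "item_42"))    # {term: tf-idf weight, ...}
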
Example No. 26
    def tweetIndexer(self, writer):

        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
        
        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(True)
        t2.setTokenized(True)
        t2.setStoreTermVectorOffsets(True)
        t2.setStoreTermVectorPayloads(True)
        t2.setStoreTermVectorPositions(True)
        t2.setStoreTermVectors(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        x = 0
        for i in range(0,500):
            if not os.path.isfile("json/tweets-" + str(i) + ".json"):
                break

            print "adding tweets-" + str(i) + ".json"
            tweets = open("json/tweets-" + str(i) + ".json", "r")

            for line in tweets.readlines():
                tweet = json.loads(line)
                if 'limit' in tweet:
                    continue
                try:
                    doc = Document()
                    doc.add(Field("file", "json/tweets-" + str(i) + ".json", t1))
                    sname = tweet['user']['screen_name']
                    tid = str(tweet['id'])
                    text = tweet['text']
                    uname = tweet['user']['name']
                    created = tweet['created_at']
                    tstamp = tweet['timestamp_ms']
                    place = ""
                    if tweet['place']:
                        place = tweet['place']['full_name'] + ", " + tweet['place']['country']
                    lat = ""
                    lng = ""
                    titles = ""
                    urls = ""
                    exist = "false"

                    if tweet['coordinates']:
                        lat = str(tweet['coordinates']['coordinates'][1])
                        lng = str(tweet['coordinates']['coordinates'][0])
                    else:
                        lat = str((tweet['place']['bounding_box']['coordinates'][0][0][1] + tweet['place']['bounding_box']['coordinates'][0][2][1])/2)
                        lng = str((tweet['place']['bounding_box']['coordinates'][0][0][0] + tweet['place']['bounding_box']['coordinates'][0][2][0])/2)
                    
                    if len(tweet['entities']['urls']) != 0:
                        exist = "true"
                        for index in range(len(tweet['entities']['urls'])):
                            title = tweet['entities']['urls'][index]['url_title']
                            if title == None:
                                titles += ",-"
                            else:
                                title = title.encode('ascii','ignore')
                                titles += "," + str(title)
                            urls += " " + str(tweet['entities']['urls'][index]['expanded_url'])


                    searchable = text + " " + urls + " " + uname + " " + sname + " " + place
                    doc.add(Field("lookup", searchable, t2))
                    doc.add(Field("text", text, t2))
                    doc.add(Field("user_name", uname, t2)) 
                    doc.add(Field("screen_name", sname, t2))                    
                    doc.add(Field("tweet_id", tid, t2))
                    doc.add(Field("created_at", created, t2))
                    doc.add(Field("geo_lat", lat, t2))
                    doc.add(Field("geo_lng", lng, t2))
                    doc.add(Field("url_exist", exist, t2))
                    doc.add(Field("url_url", urls, t2))
                    doc.add(Field("url_title", titles, t2))
                    doc.add(Field("timestamp", tstamp, t2))
                    writer.addDocument(doc)
                    x += 1
                except Exception, e:
                    pass
            tweets.close()
Example No. 27
class IndexFiles:
    def __init__(self, path, analyzer):
        self.path = path
        self._analyzer = analyzer
        self.errors = []
        self._initialize()

    def index(self, csvs_path):

        all_csvs = [x for x in os.listdir(csvs_path) if x.endswith('csv')]

        for i, csv_file in enumerate(all_csvs, 1):
            print("\nProcessing CSV #{}".format(i), flush=True)

            patents = self._read_csv(csvs_path + "/" + csv_file)
            if patents is None:
                continue
            print("\rProcessed {}/{} patents in file".format(0, len(patents)),
                  end='',
                  flush=True)
            for j, patent in enumerate(patents, 1):

                pid, date, title, author, icn, org, acn, abstract, description, purpose, mechanics, uid = patent

                try:
                    doc = Document()
                    doc.add(Field('id', pid, self._ft1))
                    doc.add(Field('date', date, self._ft1))
                    doc.add(Field('title', title, self._ft2))
                    doc.add(Field('author', author, self._ft1))
                    doc.add(Field('icn', icn, self._ft1))
                    doc.add(Field('organization', org, self._ft1))
                    doc.add(Field('acn', acn, self._ft1))
                    doc.add(Field('abstract', abstract, self._ft2))
                    doc.add(Field('description', description, self._ft2))
                    doc.add(Field('purpose', purpose, self._ft2))
                    doc.add(Field('mechanics', mechanics, self._ft2))
                    doc.add(Field('uid', uid, self._ft1))

                    self._writer.addDocument(doc)

                except Exception as e:
                    print("\nFailed to index '{}': {}\n".format(csvs_path, e))
                print("\rProcessed {}/{} patents in file".format(
                    j, len(patents)),
                      end='',
                      flush=True)
            print()
        self._commit()
        return self

    def _read_csv(self, path):

        try:
            with open(path, 'rU', newline='') as fs:
                reader = csv.reader(x.replace('\0', '') for x in fs)
                rows = [r for r in reader]

            return rows
        except Exception as e:
            print("\nError reading file '{}' : {} \n".format(path, e))
            return None

    def _commit(self):

        ticker = Ticker()
        print("Commiting index", end='', flush=True)
        threading.Thread(target=ticker.run).start()
        self._writer.commit()
        self._writer.close()
        ticker.tick = False
        print("Done!")

    def _initialize(self):

        if not os.path.exists(self.path):
            os.mkdir(self.path)

        self._analyzer = LimitTokenCountAnalyzer(self._analyzer, 1048576)
        self._store = SimpleFSDirectory(Paths.get(self.path))
        self._config = IndexWriterConfig(self._analyzer)
        self._config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self._writer = IndexWriter(self._store, self._config)
        self._set_fieldtypes()

    def _set_fieldtypes(self):

        self._ft1 = FieldType()
        self._ft1.setStored(True)
        self._ft1.setTokenized(False)
        self._ft1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        self._ft2 = FieldType()
        self._ft2.setStored(True)
        self._ft2.setTokenized(True)
        self._ft2.setStoreTermVectors(True)
        self._ft2.setStoreTermVectorOffsets(True)
        self._ft2.setStoreTermVectorPositions(True)
        self._ft2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
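
Hypothetical invocation of the class above (paths are placeholders; a StandardAnalyzer import is assumed, as in the other examples):

IndexFiles("./patent_index", StandardAnalyzer()).index("./patent_csvs")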