Example #1
def create_index(storage, paths) :
	lucene.initVM()
	indexDir = SimpleFSDirectory(File(storage))
	stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
	for s in stopwords :
		stops.add(s)
	analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
	writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
	writer = IndexWriter(indexDir, writerConfig)

	print "%d docs in index" % writer.numDocs()
	print "Reading Documents"

	import os
	for path in paths :
		for filen in os.listdir(path) :
			text = sent_tokenize(get_data_from_file(path + filen))
			total_sent = len(text)
			for i in range(0, total_sent, 3) :
				doc = Document()
				a = i-5 if i-5 > 0 else 0
				sentence = ' '.join(text[a:i+5])
				doc.add(Field("text", sentence, Field.Store.YES, Field.Index.ANALYZED))
				writer.addDocument(doc)
			print("Done %s" % (path+filen))
			print "Indexed (%d docs in index)" % (writer.numDocs())
	print "Closing index of %d docs..." % writer.numDocs()
	writer.close()
Example #2
def create_index():

    lucene.initVM()
    if os.path.exists(prm.index_folder):
        shutil.rmtree(prm.index_folder)

    indexDir = SimpleFSDirectory(File(prm.index_folder))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    wk = wiki.Wiki(prm.pages_path)

    print "%d docs in index" % writer.numDocs()
    print "Reading files from wikipedia..."
    n = 0
    for l in wk.get_text_iter():
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("id", str(n), Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        n += 1
        if n % 100000 == 0:
            print 'indexing article', n
    print "Indexed %d docs from wikipedia (%d docs in index)" % (n, writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
Example #3
def create_index():

    lucene.initVM()
    if os.path.exists(prm.index_folder):
        shutil.rmtree(prm.index_folder)

    indexDir = SimpleFSDirectory(File(prm.index_folder))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    wk = wiki.Wiki(prm.pages_path)

    print "%d docs in index" % writer.numDocs()
    print "Reading files from wikipedia..."
    n = 0
    for l in wk.get_text_iter():
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("id", str(n), Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        n += 1
        if n % 100000 == 0:
            print 'indexing article', n
    print "Indexed %d docs from wikipedia (%d docs in index)" % (
        n, writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
Example #4
def wikipedia_indexer(storage, wikipedia_file):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"

    f = open(wikipedia_file)

    for i, line in enumerate(f):
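        # Each line is expected to hold "title<TAB>article text"; disambiguation
        # pages and lines without a text column are skipped.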
        text = line.strip().decode('utf-8').split('\t')
        title = text[0]
        if 'disambigu' in text[0] or len(text) < 2:
            continue
        text = text[1]
        doc = Document()
        doc.add(Field("num", str(i), Field.Store.YES, Field.Index.NO))
        doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("text", text, Field.Store.NO, Field.Index.ANALYZED))
        writer.addDocument(doc)
        if writer.numDocs() % 1000 == 0:
            print "Indexed (%d docs in index) Last %d" % (writer.numDocs(), i)

    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
Example #5
def create_index(storage, paths):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"

    import os
    for path in paths:
        for filen in os.listdir(path):
            text = sent_tokenize(get_data_from_file(path + filen))
            total_sent = len(text)
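            # Index overlapping windows of sentences: step through the text three
            # sentences at a time and join up to ten sentences around position i
            # (five before it and five starting at it).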
            for i in range(0, total_sent, 3):
                doc = Document()
                a = i - 5 if i - 5 > 0 else 0
                sentence = ' '.join(text[a:i + 5])
                doc.add(
                    Field("text", sentence, Field.Store.YES,
                          Field.Index.ANALYZED))
                writer.addDocument(doc)
            print("Done %s" % (path + filen))
            print "Indexed (%d docs in index)" % (writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
Example #6
def wikipedia_indexer(storage, wikipedia_file) :
	lucene.initVM()
	indexDir = SimpleFSDirectory(File(storage))
	stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
	for s in stopwords :
		stops.add(s)
	analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
	writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
	writer = IndexWriter(indexDir, writerConfig)

	print "%d docs in index" % writer.numDocs()
	print "Reading Documents"

	f = open(wikipedia_file)

	for i, line in enumerate(f) :
		text = line.strip().decode('utf-8').split('\t')
		title = text[0]
		if 'disambigu' in text[0] or len(text) < 2:
			continue
		text = text[1]
		doc = Document()
		doc.add(Field("num", str(i), Field.Store.YES, Field.Index.NO))
		doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
		doc.add(Field("text", text, Field.Store.NO, Field.Index.ANALYZED))
		writer.addDocument(doc)
		if writer.numDocs() % 1000 == 0 :
			print "Indexed (%d docs in index) Last %d" % (writer.numDocs(), i)
		
	print "Closing index of %d docs..." % writer.numDocs()
	writer.close()	
Example #7
def index_files():
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    BASE_DIR = path.dirname(path.abspath(sys.argv[0]))
    INPUT_DIR = BASE_DIR + "/input/"
    INDEX_DIR = BASE_DIR + "/lucene_index/"

    NoT = 100000  # Number of Tokens
    print "------------------------------------------------------"
    print "PyLucene Demo started (lucene_demo.py)"
    print "Python version: %d.%d.%d" % (
        sys.version_info.major, sys.version_info.minor, sys.version_info.micro)
    print 'Lucene version:', lucene.VERSION
    print "------------------------------------------------------\n"
    # lucene.initVM()

    # directory = RAMDirectory()
    index_path = Paths.get(INDEX_DIR)
    directory = SimpleFSDirectory(index_path)

    analyzer = StandardAnalyzer()
    analyzer = LimitTokenCountAnalyzer(analyzer, NoT)
    config = IndexWriterConfig(analyzer)
    writer = IndexWriter(directory, config)

    print "Number of indexed documents: %d\n" % writer.numDocs()
    for input_file in listdir(INPUT_DIR):  # iterate over all input files
        print "Current file:", input_file
        if input_file.endswith(".json"):
            with open(INPUT_DIR + input_file) as f:
                for line in f:
                    # doc = create_document(line, input_file) # call the create_document function
                    o = json.loads(line)
                    doc = Document()  # create a new document
                    doc.add(TextField("filename", input_file, Field.Store.YES))
                    # print file
                    doc.add(
                        TextField("username", o['user']['screen_name'],
                                  Field.Store.YES))
                    # print "username: "******"text", o['text'], Field.Store.YES))
                    # print "text: " + o['text']
                    if o['user']['location']:
                        doc.add(
                            TextField("location", o['user']['location'],
                                      Field.Store.YES))
                        # print "location: " + o['user']['location']
                    doc.add(TextField("time", o['created_at'],
                                      Field.Store.YES))
                    writer.addDocument(
                        doc)  # add the document to the IndexWriter
    print "\nNumber of indexed documents: %d" % writer.numDocs()
    writer.close()
    print "Finished\n"
    print "-----------------------------------------------------"
Example #8
def retrival_answer(MAX):
    lucene.initVM()
    directory = RAMDirectory()

    indexDir = SimpleFSDirectory(Paths.get('index'))
    writerConfig = IndexWriterConfig(StandardAnalyzer())
    writer = IndexWriter(directory, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading lines from Document..."

    process_doc = open("Huawei_result/document.txt", "r")
    doc_line = process_doc.readlines()
    for l in doc_line:
        doc = Document()
        doc.add(TextField("text", l, Field.Store.YES))
        writer.addDocument(doc)
    print "Indexed from %d docs in index" % (writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()

    accuracy = []
    process_query = open("Huawei_result/query.txt", "r")
    query_line = process_query.readlines()
    for n, one_query in enumerate(query_line):
        analyzer = StandardAnalyzer()
        # reader = IndexReader.open(SimpleFSDirectory(Paths.get('index')))
        searcher = IndexSearcher(DirectoryReader.open(directory))
        # searcher = IndexSearcher(reader)
        query = QueryParser("text", analyzer).parse(one_query)
        hits = searcher.search(query, MAX)
        # print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
        # print "The groundtruth document is:", doc_line[n]
        candidate_doc = []
        for hit in hits.scoreDocs:
            # print hit.score, hit.doc, hit.toString()
            doc = searcher.doc(hit.doc)
            # print doc.get("text").encode("utf-8")
            candidate_doc.append(doc.get("text"))

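        # Fuzzy-match the ground-truth line against the retrieved candidates and
        # count this query as answered correctly if any candidate scores >= 89.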
        choices = process.extract(unicode(doc_line[n]), candidate_doc)
        flag = 0
        for i in range(len(choices)):
            if choices[i][1] >= 89:
                flag = 1
        if flag == 1:
            accuracy.append(1)
        else:
            accuracy.append(0)

    final_accuracy = float(sum(accuracy)) / float(len(accuracy))

    print "the final accuracy is:", final_accuracy
Example #9
class LuceneIndexer:

    def __init__(self, path_to_save):
        self.path_to_save = path_to_save
        self.num_docs = 0
        lucene.initVM()
        self.indexDir = SimpleFSDirectory(File(self.path_to_save))
        self.analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
        self.analyzer2 = WhitespaceAnalyzer(Version.LUCENE_4_10_1)
        self.writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, self.analyzer2)
        self.writer = IndexWriter(self.indexDir, self.writerConfig)

    def add_document(self, fields, header, id_):
        doc = Document()
        if len(fields) > len(header):
            sys.stderr.write('SKIPPED_DOC\tunexpected_num_lines\t%s\n' % str(id_))
            for field in fields:
                sys.stderr.write('%s\n' % field)
            return
        for idx, field in enumerate(fields):
            fname, fieldtype = header[idx]
            if fieldtype is IntField:
                field = int(field)
            doc.add(fieldtype(fname, field, Field.Store.YES))
        self.writer.addDocument(doc)
        self.num_docs += 1

    def close(self):
        print 'Indexed %d lines from stdin (%d docs in index)' % (self.num_docs, self.writer.numDocs())
        self.writer.close()
Example #10
def index():
    # Initialize lucene and the JVM
    #    lucene.initVM()
    GLOBALDIRECTORY = getDirectory()

    #Indexwriter config
    analyzer = StandardAnalyzer()
    analyzer = LimitTokenCountAnalyzer(analyzer, tokenCount)
    config = IndexWriterConfig(analyzer)
    writer = IndexWriter(GLOBALDIRECTORY, config)

    fileNames = getTxtFile(textfileDirectory)  #creates document for each tweet
    for file in fileNames:
        data = getData(file)

        for tweets in data:
            if 'text' in tweets:
                doc = createDocument_tweet(tweets)
                writer.addDocument(doc)  # add the document to  IndexWriter

        print file
    print "\nNumber of indexed documents: %d" % writer.numDocs(
    )  #number of documents indexed for testing
    writer.close()
    print "Indexing done!\n"
    print "------------------------------------------------------"
    return GLOBALDIRECTORY
Example #11
File: somali.py Project: hzatarain/somali
def lucene_indexing():
    lucene.initVM()
    index_dir = os.getcwd()
    dir = SimpleFSDirectory(File(index_dir))
    analyzer = StandardAnalyzer(Version.LUCENE_48)
    index_writer_config = IndexWriterConfig(Version.LUCENE_48, analyzer);
    index_writer = IndexWriter(dir, index_writer_config)

    for tfile in glob.glob(os.path.join(index_dir, '*.txt')):
        print "Indexing: ", tfile
        document = Document()
        with open(tfile, 'r') as f:
            content = f.read()
        document.add(Field("text", content, Field.Store.YES,
                           Field.Index.ANALYZED))
        document.add(Field("title", tfile, Field.Store.YES,
                           Field.Index.ANALYZED))
        index_writer.addDocument(document)
    print index_writer.numDocs()
    index_writer.close()
Example #12
def indexer(docNumber, docText):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File("index/"))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, PorterStemmerAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    doc = Document()
    doc.add(Field("docNumber", docNumber, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("docText", docText, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
Example #13
def create_index(index) :
	indexDir = SimpleFSDirectory(File(index))
	stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
	for s in stopwords :
		stops.add(s)
	analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
	writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
	writer = IndexWriter(indexDir, writerConfig)

	print "%d docs in index" % writer.numDocs()
	print "Reading Documents"

	f = open('f:/nlp/data/questions/combine.txt')
	for line in f :
		line = get_data_from_text(line.decode('utf-8'))
		doc = Document()
		field = Field("text", line, Field.Store.YES, Field.Index.ANALYZED)
		field.setBoost(2.0)
		doc.add(field)
		writer.addDocument(doc)
	
	print "Indexed (%d docs in index)" % (writer.numDocs())
	print "Closing index of %d docs..." % writer.numDocs()
	writer.close()
Example #14
File: syntax.py Project: zoudajia/rencos
def build_index(file_dir):
    indexDir = SimpleFSDirectory(Paths.get(file_dir + "/lucene_index/"))
    config = IndexWriterConfig(WhitespaceAnalyzer())
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(indexDir, config)

    # t1 = FieldType()
    # t1.setStored(True)
    # t1.setTokenized(False)
    # t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
    #
    # t2 = FieldType()
    # t2.setStored(True)
    # t2.setTokenized(True)
    # t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    print("%d docs in index" % writer.numDocs())
    if writer.numDocs():
        print("Index already built.")
        return
    with open(file_dir + "/train/train.ast.src") as fc:

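        # Normalize each source line: collapse runs of non-word characters and
        # drop the Lucene boolean operators AND, NOT and OR before indexing.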
        codes = [
            re.sub("[\W\s]+|AND|NOT|OR", ' ', line.strip())
            for line in fc.readlines()
        ]

    for k, code in enumerate(codes):
        doc = Document()
        doc.add(StoredField("id", str(k)))
        doc.add(TextField("code", code, Field.Store.YES))

        writer.addDocument(doc)

    print("Closing index of %d docs..." % writer.numDocs())
    writer.close()
Example #15
    def IndexDocs(self, documents):
        """
        Index documents under the directory

        :Parameters:
        - `documents`: Documents to be indexed (List)
        """
        # Get the Writer Configuration
        writerConfig = IndexWriterConfig(self.__analyzer)
        # Get index writer
        writer = IndexWriter(self.__indexDir, writerConfig)

        for document in documents:
            # Create a document that would we added to the index
            doc = Document()
            # Add a field to this document
            doc.add(TextField(Indexer.NAME, document['name'], Field.Store.YES))
            doc.add(
                Field(Indexer.CONTENT, document['content'],
                      self.__contentType))
            doc.add(
                StringField(Indexer.DATE, document['date'], Field.Store.YES))
            doc.add(StringField(Indexer.URL, document['url'], Field.Store.YES))
            doc.add(
                TextField(Indexer.TAGS, self.__qualifyTags(document['tags']),
                          Field.Store.YES))
            doc.add(
                LongPoint(Indexer.TIMESTAMP,
                          self.__getTimestamp(document['date'])))
            # Add or update the document to the index
            if not self.__boAppend:
                # New index, so we just add the document (no old document can be there):
                if self.__verbose:
                    print("Adding " + document['name'])
                writer.addDocument(doc)
            else:
                # Existing index (an old copy of this document may have been indexed) so
                # we use updateDocument instead to replace the old one matching the exact
                # path, if present:
                if self.__verbose:
                    print("Updating " + document['name'])
                writer.updateDocument(Term(Indexer.NAME, document['name']),
                                      doc)

        # Print index information and close writer
        print("Indexed %d documents (%d docs in index)" %
              (len(documents), writer.numDocs()))
        writer.close()
Example #16
    def index(self):
        if not (os.path.exists(self._dataDir)
                and os.path.isdir(self._dataDir)):
            raise IOError, "%s isn't existed or is not a directory" % (
                self._dataDir)

        dir = SimpleFSDirectory(Paths.get(self._indexDir))
        writer = IndexWriter(dir, StandardAnalyzer(), True,
                             IndexWriter.MaxFieldLength.LIMITED)
        writer.setUseCompoundFile(False)
        self.indexDirectory(writer, self._dataDir)
        numIndexed = writer.numDocs()
        writer.optimize()
        writer.close()
        dir.close()

        return numIndexed
Example #17
def indexer():
    '''Indexer'''
    lucene.initVM()
    indexDir = SimpleFSDirectory(File("index/"))
    writerConfig = IndexWriterConfig(Version.LUCENE_CURRENT,
                                     StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    movies = MovieInfo.query.limit(10000).all()
    print("Index starting...")
    for n, l in enumerate(movies):
        doc = Document()
        doc.add(Field("name", l.name, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(
            Field("shortcut", l.shortcut, Field.Store.YES,
                  Field.Index.ANALYZED))
        doc.add(Field('url', l.url, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        print("Item {} indexed...".format(n + 1))
    print("Index finished...")
    print("Closing index of %d docs..." % writer.numDocs())
    writer.close()
Example #18
def main():
    """Function to index negative situations and retrive based on input sentence"""

    all_sent_df = pd.read_csv("../data/sentiment_data.csv")
    neg = all_sent_df[all_sent_df["label"] == 1]
    all_neg_phrases = list(neg["phrase"])
    with open("../data/negSituations.txt", "r") as fpointer:
        all_neg_situations = fpointer.readlines()

    all_neg_situations = map(lambda s: s.strip(), all_neg_situations)
    all_neg_phrases = map(lambda s: s.strip(), all_neg_phrases)

    lucene.initVM()
    analyzer = StandardAnalyzer()
    path = Paths.get('negSituationIndex')
    directory = SimpleFSDirectory(path)
    writer_config = IndexWriterConfig(analyzer)
    writer = IndexWriter(directory, writer_config)

    print(writer.numDocs())
    # INDEXING ALL DOCUMENTS/ARTICLES IN THE CORPUS
    for each in all_neg_situations:
        document = Document()
        document.add(Field("negativeSituations", each, TextField.TYPE_STORED))
        writer.addDocument(document)

    print(writer.numDocs())
    writer.close()

    analyzer = StandardAnalyzer()
    reader = DirectoryReader.open(directory)
    searcher = IndexSearcher(reader)

    # QUERYING FOR A QUESTION
    with open("../data/negative_situation_to_retrieve.txt", "r") as fpointer:
        all_test_sent = fpointer.readlines()
    all_test_sent = map(lambda s: s.strip(), all_test_sent)

    query_parser = QueryParser("negativeSituations", analyzer)

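    # For each test sentence, escape Lucene special characters, run the query,
    # and keep the (deduplicated) text of the top 3 matching situations.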
    total_num = 0
    tic = time.time()
    all_ans = []
    for each in all_test_sent:
        total_num = total_num + 1
        if total_num % 1000 == 0:
            print(total_num, time.time() - tic)

        query = query_parser.parse(query_parser.escape(each))
        hits = searcher.search(query, 3)
        docs_scores = [hit.score for hit in hits.scoreDocs]
        current_ans = []
        if docs_scores != []:
            for hit in hits.scoreDocs:
                doc_t = searcher.doc(hit.doc)
                doc_text = doc_t.get("negativeSituations")
                current_ans.append(doc_text)
        else:
            continue

        current_ans = list(set(current_ans))
        all_ans.append(current_ans)

    print(all_ans)
Example #19
class Indexer(object):
	# Creates the index and adds documents to it
	# indexDir is the directory where the index is created
	def __init__(self, indexDir):
		f = Paths.get(indexDir)
		self._dir = SimpleFSDirectory(f)
		analyzer = StandardAnalyzer()
		config = IndexWriterConfig(analyzer)
		config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
		self._writer = IndexWriter(self._dir, config)
		
	def close(self):
		self._writer.close()

	def getDoc(self, file):
		try:
			f = open(os.getcwd()+FILE_DIR+'/'+file, "r")

			try:
				c = []
				s = BeautifulSoup(f, 'html.parser')
				text = s.findAll(text=True)
				c = filter(tag_vis, text)
				try:
					c = ' '.join(c)
				except Exception as e:
					c = b' '.join(c)
			except Exception as e:
				print(str(e))
				return
			content = TextField("contents", c, Field.Store.YES)
			fileName = str(Paths.get(file)).split('/')[-1]
			fileName = fileName[:fileName.find(".")]
			filename = TextField("filename",
							 fileName,
							 Field.Store.YES)
			path = TextField("filepath",
						 str(os.getcwd()+FILE_DIR+'/'+file),
						 Field.Store.NO)
			doc = Document()
			doc.add(content)
			doc.add(filename)
			doc.add(path)
			return doc
		except Exception as e:
			print(type(Exception).__name__)
			print(str(e))
			return

	def indexFile(self, file):
		if ( self.getDoc(file) is not None ):
			self._writer.addDocument(self.getDoc(file))
	#pass in absolute path when calling this function
	def createIndex(self, path):
		for file in os.listdir(path):
			print(file)
			if os.path.isfile(path+"/"+file):
				self.indexFile(file)
		return self._writer.numDocs()
	def closeWriter(self):
		self._writer.close()
Example #20
    

    lucene.initVM()
  
    print "lucene version is:", lucene.VERSION
    # Get the analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # Get index storage
    indexDir = SimpleFSDirectory(File("index/"))

    # Get index writer
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer);
    writer = IndexWriter(indexDir, config);

    print "%d docs in index" % writer.numDocs()

    for d in data:
        rec = d['record']
        if not rec['product_name'] or not rec['uniq_id']:
            logging.info ("Incomplete product ... skipping")
            logging.debug(rec)
            continue
        else:
            doc = Document()
            for k,v in rec.iteritems():
                if k in keys:
                    doc.add(Field(k, v, Field.Store.YES, Field.Index.ANALYZED))
                else:
                    if (k == 'product_specifications'):
                        specs = v['product_specification']
Example #21
trainingFilePath = '/home/tarun/PE/Dataset/training_set.tsv'

lucene.initVM()

# ANALYZER
analyzer = StandardAnalyzer(util.Version.LUCENE_CURRENT) 

# DIRECTORY
directory = SimpleFSDirectory(File(luceneIndexPath))


# INDEX WRITER
writerConfig = IndexWriterConfig(util.Version.LUCENE_CURRENT, analyzer) 
writer = IndexWriter(directory, writerConfig)

print writer.numDocs()
# INDEXING ALL DOCUMENTS/ARTICLES IN THE CORPUS
for fileName in os.listdir(corpus):
	print fileName
	document = Document()
	article = os.path.join(corpus, fileName)
	content = open(article, 'r').read()
	document.add(Field("text", content, Field.Store.YES, Field.Index.ANALYZED))
	writer.addDocument(document)
print writer.numDocs()
writer.close()

# INDEX READER
reader = IndexReader.open(directory)
searcher = IndexSearcher(reader)
Example #22
    del line[0:2]
    line = ' '.join(line)
    qterm = keyterm.replace("_", " ")
    if qterm not in line:
        line = qterm + ' ' + line
    doc.add(TextField("text", line, Field.Store.YES))
    return doc


lucene.initVM()
index_path = File(INDEX_DIR).toPath()
directory = SimpleFSDirectory.open(index_path)
analyzer = StandardAnalyzer()
config = IndexWriterConfig(analyzer)
writer = IndexWriter(directory, config)
print("Number of documents:", writer.numDocs())

for input_file in listdir(INPUT_DIR):
    print("Current file:", input_file)
    if input_file.endswith(".txt"):
        path = INPUT_DIR + input_file
        with open(path) as file:
            line = file.readline()
            while (line):
                line = file.readline()
                if len(line.strip()) != 0:
                    doc = create_document(line)
                    writer.addDocument(doc)
        file.close()
print("finally:", writer.numDocs())
print("Indexing done!")
Example #23
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    analyzer_ws = WhitespaceAnalyzer(Version.LUCENE_4_10_1)
    std_path = "%s/lucene_full_standard/" % (output_path)
    ws_path = "%s/lucene_full_ws/" % (output_path)
    if os.path.exists(std_path):
        os.remove(std_path)
    if os.path.exists(ws_path):
        os.remove(ws_path)
    indexDir1 = SimpleFSDirectory(File(std_path))
    indexDir2 = SimpleFSDirectory(File(ws_path))
    writerConfig1 = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writerConfig2 = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer_ws)
    writer1 = IndexWriter(indexDir1, writerConfig1)
    writer2 = IndexWriter(indexDir2, writerConfig2)

    print "%d docs in index1" % writer1.numDocs()
    print "%d docs in index2" % writer2.numDocs()
    print "Reading lines from sys.stdin..."

    ftypes = open(LUCENE_TYPES_FILE, "w")

    for n, l in enumerate(sys.stdin):
        doc = Document()
        doc_lc = Document()
        fields = l.rstrip().split("\t")
        all_ = []
        if n == 0:
            sys.stdout.write("TYPES_HEADER")
        elif n == 1:
            sys.stdout.write("\n")
        for (idx, field) in enumerate(fields):
Example #24
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, StringField, TextField, StoredField
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version

if __name__ == "__main__":
    lucene.initVM()
    path = Paths.get('index')
    indexDir = SimpleFSDirectory(path)
    analyzer = StandardAnalyzer()
    writerConfig = IndexWriterConfig(analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading lines from sys.stdin..."
    todo = get_all_rawtext_ids()
    for n, i in enumerate(todo):
        try:
            html = get_rawtext_by_id(i).html
            root = LH.fromstring(html)
            text = root.text_content().strip()
        except:
            #print "Failed to parse doc"
            continue
        doc = Document()
        # print text
        doc.add(TextField("text", text, Field.Store.NO))
        doc.add(StoredField("id", i))
        writer.addDocument(doc)
Example #25
File: Indexer.py Project: dongyangli/cs246
	writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
	writer = IndexWriter(indexDir, writerConfig)

	try:
		con = mdb.connect('localhost', 'root', '', 'cs246')
		cur = con.cursor()
		cur.execute("SELECT * FROM article_page;")
		rows = cur.fetchall()
		n = 0
		for row in rows:
			n = n+1
			page_id = str(row[0])
			page_title = str(row[1]).replace('_', ' ')

			doc = Document()
			doc.add(Field("title", page_title, Field.Store.YES, Field.Index.ANALYZED_NO_NORMS))
			doc.add(Field("id", page_id, Field.Store.YES, Field.Index.NO))
			writer.addDocument(doc)
		print "total number of tuples", n
	except mdb.Error, e:
		print "Error %d: %s" % (e.args[0],e.args[1])
		sys.exit(1)
	finally:
		if con:    
			con.close()

	print "Created (%d docs in index)" % (writer.numDocs())
	print "Closing index of %d docs..." % writer.numDocs()
	writer.close()

Example #26
class IndexFiles(object):
    def __init__(self, indexDir):
        if not os.path.exists(indexDir):
            os.mkdir(indexDir)

        store = SimpleFSDirectory(File(indexDir))

        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)

        self.writer = IndexWriter(store, config)

    def index(self, file, duplicates):
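        # Split the reported duplicates: exact matches (sim == 1) are skipped
        # entirely, while near duplicates (sim < 1) are indexed but flagged via
        # the stored "duplicate" field in createDoc.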
        exact = [
            duplicate['duplicate'] for duplicate in duplicates
            if duplicate['sim'] == 1
        ]
        near = [
            duplicate['duplicate'] for duplicate in duplicates
            if duplicate['sim'] < 1
        ]

        with open(file) as file:
            for document in file:
                data = json.loads(document)
                if (data['url'] in exact):
                    continue

                doc = self.createDoc(data['url'], data['html'], data['url']
                                     in near)
                self.writer.addDocument(doc)
                store_outlinks(data['url'], data['outlinks'])

        self.writer.commit()

        return self.writer.numDocs()

    def createDoc(self, url, html, duplicate):
        title, contents = self.parseHtml(url, html)

        doc = Document()
        doc.add(StringField("title", title, Field.Store.YES))
        doc.add(StringField("url", url, Field.Store.YES))
        doc.add(
            StringField("duplicate",
                        str(duplicate).lower(), Field.Store.YES))

        if len(contents) > 0:
            doc.add(TextField("contents", contents, Field.Store.YES))
        else:
            print "Warning: No content in %s" % url

        return doc

    def close(self):
        self.writer.close()

    def parseHtml(self, url, html):
        soup = BeautifulSoup(html, 'lxml')
        title = self.getTitle(url, soup)
        body = self.getBody(soup)

        return title, body

    def getTitle(self, url, soup):
        if soup.title:
            title = soup.title.get_text().strip()
        elif soup.find("h1"):
            title = " ".join(soup.find("h1").get_text().split())
        else:
            title = url.split("/")[-1]

        return title

    def getBody(self, soup):
        comments = soup.findAll(text=lambda text: isinstance(text, Comment))
        [comment.extract() for comment in comments]
        [style.decompose() for style in soup.find_all('style')]
        [script.decompose() for script in soup.find_all('script')]

        if soup.body:
            return soup.body.get_text(" ", strip=True)
        else:
            return soup.get_text(" ", strip=True)
Example #27
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, IntField, StringField, TextField
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version

LUCENE_TYPES={'i':IntField,'s':StringField,'t':TextField}

 
if __name__ == "__main__":
  lucene.initVM()
  indexDir = SimpleFSDirectory(File("data/lucene_full_v1/"))
  writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
  writer = IndexWriter(indexDir, writerConfig)
 
  print "%d docs in index" % writer.numDocs()
  print "Reading lines from sys.stdin..."
  header=[]
  for n, l in enumerate(sys.stdin):
    doc = Document()
    fields = l.rstrip().split("\t")
    #add one more field to header field set, which will index the concatenated set of all fields for general searches
    all_ = []
    if len(fields) < 1 or len(fields[0]) == 0:
        continue
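    # The first stdin line (n == 0) is a header row; each header field ends in a
    # type character ('t', 's' or 'i') that selects TextField, StringField or
    # IntField from LUCENE_TYPES.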
    for (idx,field) in enumerate(fields):
        if n == 0:
            typechar = field[-1]
            if typechar not in set(['t','s','i']):
                sys.stderr.write("unexpected type char in last character position of header field: %s\n" % (field))
                exit(-1) 
Example #28
class QuestionLuceneSearch():

    def __init__(self):

        self.env = lucene.initVM(initialheap='6g', maxheap='6g', vmargs=['-Djava.awt.headless=true'])
        self.vocab = None

        BooleanQuery.setMaxClauseCount(2048)

        if not os.path.exists(prm.index_folder):
            print('Creating index at', prm.index_folder)
            if prm.docs_path == prm.docs_path_term:
                add_terms = True
            else:
                add_terms = False
            self.create_index(prm.index_folder, prm.docs_path, add_terms)

        if prm.local_index_folder:
            print('copying index from', prm.index_folder, 'to', prm.local_index_folder)
            if os.path.exists(prm.local_index_folder):
                print('Folder', prm.local_index_folder, 'already exists! Doing nothing.')
            else:
                shutil.copytree(prm.index_folder, prm.local_index_folder)
            self.index_folder = prm.local_index_folder
        else:
            self.index_folder = prm.index_folder

        fsDir = MMapDirectory(Paths.get(prm.index_folder))
        self.searcher = IndexSearcher(DirectoryReader.open(fsDir))

        self.searcher.setSimilarity(BM25Similarity())

        if prm.docs_path != prm.docs_path_term:
            if not os.path.exists(prm.index_folder_term):
                print('Creating index at', prm.index_folder_term)
                self.create_index(prm.index_folder_term, prm.docs_path_term, add_terms=True)

            if prm.local_index_folder_term:
                print('copying index from', prm.index_folder_term, 'to', prm.local_index_folder_term)
                if os.path.exists(prm.local_index_folder_term):
                    print('Folder', prm.local_index_folder_term, 'already exists! Doing nothing.')
                else:
                    shutil.copytree(prm.index_folder_term, prm.local_index_folder_term)
                self.index_folder_term = prm.local_index_folder_term
            else:
                self.index_folder_term = prm.index_folder_term
            fsDir_term = MMapDirectory(Paths.get(prm.index_folder_term))
            self.searcher_term = IndexSearcher(DirectoryReader.open(fsDir_term))

        self.analyzer = StandardAnalyzer()
        self.pool = ThreadPool(processes=prm.n_threads)
        self.cache = {}
        
        print('Loading Text-ID mapping...')
        self.text_id_map, self.id_text_map = self.get_text_id_map()

    def get_text_id_map(self):

        # get number of docs
        n_docs = self.searcher.getIndexReader().numDocs()

        text_id = {}
        id_text = {}
        query = MatchAllDocsQuery()
        hits = self.searcher.search(query, n_docs)
        for hit in hits.scoreDocs:
            doc = self.searcher.doc(hit.doc)
            idd = int(doc['id'])
            text = doc['text']
            text_id[text] = idd
            id_text[idd] = text

        return text_id, id_text


    # def add_doc(self, doc_id, title, txt, add_terms):
    def add_doc(self, doc_id, txt, add_terms):

        doc = Document()
        txt = utils.clean(txt)

        if add_terms:
            txt_ = txt.lower()
            words_idx, words = utils.text2idx2([txt_], self.vocab, prm.max_terms_per_doc)
            words_idx = words_idx[0]
            words = words[0]

        doc.add(Field("id", str(doc_id), self.t1))
        # doc.add(Field("title", title, self.t1))
        doc.add(Field("text", txt, self.t2))
        if add_terms:
            doc.add(Field("word_idx", ' '.join(map(str,words_idx)), self.t3))
            doc.add(Field("word", '<&>'.join(words), self.t3))
        self.writer.addDocument(doc)


    def create_index(self, index_folder, docs_path, add_terms=False):

        print('Loading Vocab...')
        if not self.vocab:
            self.vocab = utils.load_vocab(prm.vocab_path, prm.n_words)

        os.mkdir(index_folder)

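        # Field types: t1 is stored and indexed with doc ids only (used for "id"),
        # t2 is indexed with term frequencies but not stored (the searchable
        # "text"), and t3 is stored but not indexed (the word/word_idx payloads).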
        self.t1 = FieldType()
        self.t1.setStored(True)
        self.t1.setIndexOptions(IndexOptions.DOCS)

        self.t2 = FieldType()
        self.t2.setStored(False)
        self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        self.t3 = FieldType()
        self.t3.setStored(True)
        self.t3.setIndexOptions(IndexOptions.NONE)
       
        fsDir = MMapDirectory(Paths.get(index_folder))
        writerConfig = IndexWriterConfig(StandardAnalyzer())
        self.writer = IndexWriter(fsDir, writerConfig)
        print("%d docs in index" % self.writer.numDocs())
        print("Indexing documents...")


        # import corpus_hdf5
        # corpus = corpus_hdf5.MSMARCOCorpusHDF5(docs_path)
        import pickle
        with open(docs_path, "rb") as read_file:
            corpus = pickle.load(read_file)
        idx_cnt = 0
        # for doc_id, txt in zip(corpus.get_id_iter(), corpus.get_text_iter()):
        # for doc_id, txt in corpus.items():
        for txt in corpus:
            self.add_doc(idx_cnt, txt, add_terms)  # not lowered
            if idx_cnt % 1000 == 0:
                print('indexing doc', idx_cnt)
            idx_cnt += 1
        print("Index of %d docs..." % self.writer.numDocs())
        self.writer.close()


    def search_multithread(self, qs, max_cand, max_full_cand, searcher):

        self.max_cand = max_cand
        self.max_full_cand = max_full_cand
        self.curr_searcher = searcher
        out = self.pool.map(self.search_multithread_part, qs)
 
        return out


    def search_multithread_part(self, q):

        if not self.env.isCurrentThreadAttached():
            self.env.attachCurrentThread()
    
        if q in self.cache:
            return self.cache[q]
        else:

            try:
                q = q.replace('AND','\\AND').replace('OR','\\OR').replace('NOT','\\NOT')
                query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            except:
                print('Unexpected error when processing query:', str(q))
                print('Using query "dummy".')
                q = 'dummy'
                query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))

            c = OrderedDict()
            hits = self.curr_searcher.search(query, self.max_cand)

            for i, hit in enumerate(hits.scoreDocs):
                doc = self.curr_searcher.doc(hit.doc)
                if i < self.max_full_cand:
                    word_idx = list(map(int, doc['word_idx'].split(' ')))
                    word = doc['word'].split('<&>')
                else:
                    word_idx = []
                    word = []
                # c[int(doc['id'])] = [word_idx, word]
                c[int(doc['id'])] = [word_idx, word, hit.score]
            # print(c)
            return c

    
    def search_singlethread(self, qs, max_cand, max_full_cand, curr_searcher):

        out = []
        for q in qs:
            if q in self.cache:
                out.append(self.cache[q])
            else:
                try:
                    q = q.replace('AND','\\AND').replace('OR','\\OR').replace('NOT','\\NOT')
                    query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
                except:
                    print('Unexpected error when processing query:', str(q))
                    print('Using query "dummy".')
                    query = QueryParser("text", self.analyzer).parse(QueryParser.escape('dummy'))

                c = OrderedDict()
                hits = curr_searcher.search(query, max_cand)

                for i, hit in enumerate(hits.scoreDocs):
                    doc = curr_searcher.doc(hit.doc)
                    if i < max_full_cand:
                        word_idx = list(map(int, doc['word_idx'].split(' ')))
                        word = doc['word'].split('<&>')
                    else:
                        word_idx = []
                        word = []
                    # c[int(doc['id'])] = [word_idx, word]
                    c[int(doc['id'])] = [word_idx, word, hit.score]
                out.append(c)

        return out


    def get_candidates(self, qs, max_cand, max_full_cand=None, save_cache=False, extra_terms=True):
        if not max_full_cand:
            max_full_cand = max_cand

        if prm.docs_path != prm.docs_path_term:
            max_cand2 = 0
        else:
            max_cand2 = max_full_cand
        if prm.n_threads > 1:
            out = self.search_multithread(qs, max_cand, max_cand2, self.searcher)
            if (prm.docs_path != prm.docs_path_term) and extra_terms:
                terms = self.search_multithread(qs, max_full_cand, max_full_cand, self.searcher_term)
        else:
            out = self.search_singlethread(qs, max_cand, max_cand2, self.searcher)
            if (prm.docs_path != prm.docs_path_term) and extra_terms:
                terms = self.search_singlethread(qs, max_full_cand, max_full_cand, self.searcher_term)

        if (prm.docs_path != prm.docs_path_term) and extra_terms:
            for outt, termss in zip(out, terms):                
                for cand_id, term in zip(list(outt.keys())[:max_full_cand], list(termss.values())):
                    outt[cand_id] = term
  
        if save_cache:
            for q, c in zip(qs, out):
                if q not in self.cache:
                    self.cache[q] = c

        return out



    def get_pair_scores(self, q, doc_int, save_cache=False, extra_terms=True):

        # if prm.n_threads > 1:
        #     out = self.search_pair_score_multithread(qs_trailing_doc, self.searcher)
        #     if (prm.docs_path != prm.docs_path_term) and extra_terms:
        #         terms = self.search_pair_score_multithread(qs_trailing_doc, self.searcher_term)
        # else:
        # out = self.search_pair_score_singlethread(qs_trailing_doc, self.searcher)
        # if (prm.docs_path != prm.docs_path_term) and extra_terms:
        #     terms = self.search_pair_score_singlethread(qs_trailing_doc, self.searcher_term)
        out = []

        try:
            q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q))
        except:
            print('Unexpected error when processing query:', str(q))
            print('Using query "dummy".')
            q = 'dummy'
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q))

        c = OrderedDict()
        exp = self.searcher.explain(query, doc_int)
        c[1] = exp
        out.append(c)

        return out

    def search_pair_score_singlethread(self, q, doc_int, searcher):

        out = []

        try:
            q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q))
        except:
            print('Unexpected error when processing query:', str(q))
            print('Using query "dummy".')
            q = 'dummy'
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q))


        c = OrderedDict()
        exp = searcher.explain(query, doc_int)
        c[1] = exp

        out.append(c)

        return out

    def search_pair_score_multithread(self, qs_trailing_doc, searcher):

        self.curr_searcher = searcher
        # out = self.pool.map(self.search_pair_score_multithread_part, product(qs,doc_int))
        out = self.pool.map(self.search_pair_score_multithread_part, qs_trailing_doc)

        return out

    def search_pair_score_multithread_part(self, q_doc_int):

        # print(q_doc_int)
        spl=q_doc_int.split('<|endoftext|>')
        q = spl[0]
        print(q)
        doc_int = int(spl[1])
        print(doc_int)

        if not self.env.isCurrentThreadAttached():
            self.env.attachCurrentThread()

        try:
            q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
        except:
            print('Unexpected error when processing query:', str(q))
            print('Using query "dummy".')
            q = 'dummy'
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))


        c = OrderedDict()
        exp = self.curr_searcher.explain(query, doc_int)
        c[1] = exp

        return c
Example #29
def main(indexDir, inputDir):
	"""Creates a Lucene Index, and indexes every .json file it finds.
	It utilizes a stopwords.txt to filter out stop words"""
	lucene.initVM()

	logger.info("Loading stop words from stopwords.txt")
	f = open('stopwords.txt', 'r')
	stopwords = set([])
	for line in f:
		stopwords.add(line.strip())
	f.close()
	logger.debug('Stop words: %s' % str(stopwords))
	temp = CharArraySet(Version.LUCENE_CURRENT, 1, True)

	for stopword in stopwords:
		temp.add(stopword)

	stopwords = temp

	# Create index
	logger.info("Creating Lucene index [%s]..." % indexDir)

	dir = SimpleFSDirectory(File(indexDir))
	analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopwords)
	writerConfig = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
	writer = IndexWriter(dir, writerConfig)

	logger.info("Currently there are %d documents in the index..." % writer.numDocs())

	# Index documents
	onlyfiles = [ f for f in listdir(inputDir) if isfile(join(inputDir, f)) and f.endswith('.json') ]
	for f in onlyfiles:
		try:
			journal_code = f.split('.')[0]
			f = join(inputDir, f)
			json_data = open(f)
			data = json.load(json_data)
			for entry in data:
				doc = Document()
				doc.add(Field("journal", journal_code, Field.Store.YES, Field.Index.NOT_ANALYZED))
				doc.add(Field("url", entry['url'], Field.Store.YES, Field.Index.NOT_ANALYZED ))
				doc.add(Field("date", entry['date'], Field.Store.YES, Field.Index.NOT_ANALYZED ))
				doc.add(Field("title", entry['title'], Field.Store.YES, Field.Index.ANALYZED))
				writer.addDocument(doc)
			json_data.close()
		except (IOError) as v:
			try:
				(code, message) = v
			except:
				code = 0
				message = v
			logger.error("I/O Error: " + str(message) + " (" + str(code) + ")")
	logger.info("Indexed lines from stdin (%d documents in index)" % writer.numDocs())

	# Wrap it up
	#logger.info("About to optimize index of %d documents..." % writer.numDocs())
	#writer.optimize()
	#logger.info("...done optimizing index of %d documents" % writer.numDocs())

	logger.info("Closing index of %d documents..." % writer.numDocs())
	writer.close()

	reader = IndexReader.open(dir)
	with open('all.csv', 'wb') as csvfile:
		csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
		for i in xrange(0, reader.numDocs()):
			doc = reader.document(i)
			csvwriter.writerow([doc.get('journal'), doc.get('date'), doc.get('url').encode('utf8'), \
				doc.get('title').strip().replace(',', '\,').encode('utf8')])
Example #30
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.util import Version
from org.apache.lucene.index import FieldInfo, IndexWriter, IndexWriterConfig
from org.apache.lucene.document import Document, Field, FieldType
if __name__ == "__main__":
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    indexDir = "../pyFreya/freya/index/actual"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(dir, config)
    with open("data",'r') as f:
        for doc in f.read().split("newDocSep"):
            docr = Document()
            for field in doc.split("csvSep"):
                fieldData = field.split("||")
                try:docr.add(Field(fieldData[1], fieldData[2], Field.Store.YES, Field.Index.ANALYZED))
                except:print "ups"
            print "\n"
            writer.addDocument(docr)
    print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % (writer.numDocs())
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
    writer.commit()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    print >> sys.stderr, "...done closing index of %d documents" % writer.numDocs()
    writer.close()
Example #31
class LuceneSearch():
    def __init__(self):

        self.env = lucene.initVM(initialheap='28g',
                                 maxheap='28g',
                                 vmargs=['-Djava.awt.headless=true'])
        self.vocab = None

        BooleanQuery.setMaxClauseCount(2048)

        if not os.path.exists(prm.index_folder):
            print 'Creating index at', prm.index_folder
            if prm.docs_path == prm.docs_path_term:
                add_terms = True
            else:
                add_terms = False
            self.create_index(prm.index_folder, prm.docs_path, add_terms)

        if prm.local_index_folder:
            print 'copying index from', prm.index_folder, 'to', prm.local_index_folder
            if os.path.exists(prm.local_index_folder):
                print 'Folder', prm.local_index_folder, 'already exists! Doing nothing.'
            else:
                shutil.copytree(prm.index_folder, prm.local_index_folder)
            self.index_folder = prm.local_index_folder
        else:
            self.index_folder = prm.index_folder

        fsDir = MMapDirectory(Paths.get(prm.index_folder))
        self.searcher = IndexSearcher(DirectoryReader.open(fsDir))

        if prm.docs_path != prm.docs_path_term:
            if not os.path.exists(prm.index_folder_term):
                print 'Creating index at', prm.index_folder_term
                self.create_index(prm.index_folder_term,
                                  prm.docs_path_term,
                                  add_terms=True)

            if prm.local_index_folder_term:
                print 'copying index from', prm.index_folder_term, 'to', prm.local_index_folder_term
                if os.path.exists(prm.local_index_folder_term):
                    print 'Folder', prm.local_index_folder_term, 'already exists! Doing nothing.'
                else:
                    shutil.copytree(prm.index_folder_term,
                                    prm.local_index_folder_term)
                self.index_folder_term = prm.local_index_folder_term
            else:
                self.index_folder_term = prm.index_folder_term
            fsDir_term = MMapDirectory(Paths.get(prm.index_folder_term))
            self.searcher_term = IndexSearcher(
                DirectoryReader.open(fsDir_term))

        self.analyzer = StandardAnalyzer()
        self.pool = ThreadPool(processes=prm.n_threads)
        self.cache = {}

        print 'Loading Title-ID mapping...'
        self.title_id_map, self.id_title_map = self.get_title_id_map()

    def get_title_id_map(self):

        # get number of docs
        n_docs = self.searcher.getIndexReader().numDocs()

        title_id = {}
        id_title = {}
        query = MatchAllDocsQuery()
        hits = self.searcher.search(query, n_docs)
        for hit in hits.scoreDocs:
            doc = self.searcher.doc(hit.doc)
            idd = int(doc['id'])
            title = doc['title']
            title_id[title] = idd
            id_title[idd] = title

        return title_id, id_title

    def add_doc(self, doc_id, title, txt, add_terms):

        doc = Document()
        txt = utils.clean(txt)

        if add_terms:
            txt_ = txt.lower()
            words_idx, words = utils.text2idx2([txt_], self.vocab,
                                               prm.max_terms_per_doc)
            words_idx = words_idx[0]
            words = words[0]

        doc.add(Field("id", str(doc_id), self.t1))
        doc.add(Field("title", title, self.t1))
        doc.add(Field("text", txt, self.t2))
        if add_terms:
            doc.add(Field("word_idx", ' '.join(map(str, words_idx)), self.t3))
            doc.add(Field("word", '<&>'.join(words), self.t3))
        self.writer.addDocument(doc)

    def create_index(self, index_folder, docs_path, add_terms=False):

        print 'Loading Vocab...'
        if not self.vocab:
            self.vocab = utils.load_vocab(prm.vocab_path, prm.n_words)

        os.mkdir(index_folder)

        self.t1 = FieldType()
        self.t1.setStored(True)
        self.t1.setIndexOptions(IndexOptions.DOCS)

        self.t2 = FieldType()
        self.t2.setStored(False)
        self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        self.t3 = FieldType()
        self.t3.setStored(True)
        self.t3.setIndexOptions(IndexOptions.NONE)

        fsDir = MMapDirectory(Paths.get(index_folder))
        writerConfig = IndexWriterConfig(StandardAnalyzer())
        self.writer = IndexWriter(fsDir, writerConfig)
        print "%d docs in index" % self.writer.numDocs()
        print "Indexing documents..."

        doc_id = 0

        import corpus_hdf5
        corpus = corpus_hdf5.CorpusHDF5(docs_path)
        for txt in corpus.get_text_iter():
            title = corpus.get_article_title(doc_id)
            self.add_doc(doc_id, title, txt, add_terms)
            if doc_id % 1000 == 0:
                print 'indexing doc', doc_id
            doc_id += 1

        print "Index of %d docs..." % self.writer.numDocs()
        self.writer.close()

    def search_multithread(self, qs, max_cand, max_full_cand, searcher):

        self.max_cand = max_cand
        self.max_full_cand = max_full_cand
        self.curr_searcher = searcher
        out = self.pool.map(self.search_multithread_part, qs)

        return out

    def search_multithread_part(self, q):

        if not self.env.isCurrentThreadAttached():
            self.env.attachCurrentThread()

        if q in self.cache:
            return self.cache[q]
        else:

            try:
                q = q.replace('AND',
                              '\\AND').replace('OR',
                                               '\\OR').replace('NOT', '\\NOT')
                query = QueryParser("text",
                                    self.analyzer).parse(QueryParser.escape(q))
            except:
                print 'Unexpected error when processing query:', str(q)
                print 'Using query "dummy".'
                q = 'dummy'
                query = QueryParser("text",
                                    self.analyzer).parse(QueryParser.escape(q))

            c = OrderedDict()
            hits = self.curr_searcher.search(query, self.max_cand)

            for i, hit in enumerate(hits.scoreDocs):
                doc = self.curr_searcher.doc(hit.doc)
                if i < self.max_full_cand:
                    word_idx = map(int, doc['word_idx'].split(' '))
                    word = doc['word'].split('<&>')
                else:
                    word_idx = []
                    word = []
                c[int(doc['id'])] = [word_idx, word]

            return c

    def search_singlethread(self, qs, max_cand, max_full_cand, curr_searcher):

        out = []
        for q in qs:
            if q in self.cache:
                out.append(self.cache[q])
            else:
                try:
                    q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
                    query = QueryParser("text", self.analyzer).parse(
                        QueryParser.escape(q))
                except:
                    print 'Unexpected error when processing query:', str(q)
                    print 'Using query "dummy".'
                    query = QueryParser("text", self.analyzer).parse(
                        QueryParser.escape('dummy'))

                c = OrderedDict()
                hits = curr_searcher.search(query, max_cand)

                for i, hit in enumerate(hits.scoreDocs):
                    doc = curr_searcher.doc(hit.doc)
                    if i < max_full_cand:
                        word_idx = map(int, doc['word_idx'].split(' '))
                        word = doc['word'].split('<&>')
                    else:
                        word_idx = []
                        word = []
                    c[int(doc['id'])] = [word_idx, word]

                out.append(c)

        return out

    def get_candidates(self,
                       qs,
                       max_cand,
                       max_full_cand=None,
                       save_cache=False,
                       extra_terms=True):
        if not max_full_cand:
            max_full_cand = max_cand

        if prm.docs_path != prm.docs_path_term:
            max_cand2 = 0
        else:
            max_cand2 = max_full_cand
        if prm.n_threads > 1:
            out = self.search_multithread(qs, max_cand, max_cand2,
                                          self.searcher)
            if (prm.docs_path != prm.docs_path_term) and extra_terms:
                terms = self.search_multithread(qs, max_full_cand,
                                                max_full_cand,
                                                self.searcher_term)
        else:
            out = self.search_singlethread(qs, max_cand, max_cand2,
                                           self.searcher)
            if (prm.docs_path != prm.docs_path_term) and extra_terms:
                terms = self.search_singlethread(qs, max_full_cand,
                                                 max_full_cand,
                                                 self.searcher_term)

        if (prm.docs_path != prm.docs_path_term) and extra_terms:
            for outt, termss in itertools.izip(out, terms):
                for cand_id, term in itertools.izip(
                        outt.keys()[:max_full_cand], termss.values()):
                    outt[cand_id] = term

        if save_cache:
            for q, c in itertools.izip(qs, out):
                if q not in self.cache:
                    self.cache[q] = c

        return out
Example #32
0
class LuceneSearch():
    def __init__(self):

        self.env = lucene.initVM(initialheap='28g',
                                 maxheap='28g',
                                 vmargs=['-Djava.awt.headless=true'])
        self.vocab = None

        BooleanQuery.setMaxClauseCount(2048)

        if not os.path.exists(prm.index_folder):
            print 'Creating index at', prm.index_folder
            if prm.docs_path == prm.docs_path_term:
                add_terms = True
            else:
                add_terms = False
            self.create_index(prm.index_folder, prm.docs_path, add_terms)

        if prm.local_index_folder:
            print 'copying index from', prm.index_folder, 'to', prm.local_index_folder
            if os.path.exists(prm.local_index_folder):
                print 'Folder', prm.local_index_folder, 'already exists! Doing nothing.'
            else:
                shutil.copytree(prm.index_folder, prm.local_index_folder)
            self.index_folder = prm.local_index_folder
        else:
            self.index_folder = prm.index_folder

        fsDir = MMapDirectory(Paths.get(prm.index_folder))
        self.searcher = IndexSearcher(DirectoryReader.open(fsDir))

        if prm.docs_path != prm.docs_path_term:
            if not os.path.exists(prm.index_folder_term):
                print 'Creating index at', prm.index_folder_term
                self.create_index(prm.index_folder_term,
                                  prm.docs_path_term,
                                  add_terms=True)

            if prm.local_index_folder_term:
                print 'copying index from', prm.index_folder_term, 'to', prm.local_index_folder_term
                if os.path.exists(prm.local_index_folder_term):
                    print 'Folder', prm.local_index_folder_term, 'already exists! Doing nothing.'
                else:
                    shutil.copytree(prm.index_folder_term,
                                    prm.local_index_folder_term)
                self.index_folder_term = prm.local_index_folder_term
            else:
                self.index_folder_term = prm.index_folder_term
            fsDir_term = MMapDirectory(Paths.get(prm.index_folder_term))
            self.searcher_term = IndexSearcher(
                DirectoryReader.open(fsDir_term))

        self.analyzer = StandardAnalyzer()
        self.pool = ThreadPool(processes=prm.n_threads)
        self.cache = {}

        print 'Loading Title-ID mapping...'
        self.title_id_map, self.id_title_map = self.get_title_id_map()

        if prm.idf_path:
            print 'Loading IDF dictionary...'
            self.idf = pkl.load(open(prm.idf_path))

    def get_title_id_map(self):

        # get number of docs
        n_docs = self.searcher.getIndexReader().numDocs()

        title_id = {}
        id_title = {}
        query = MatchAllDocsQuery()
        hits = self.searcher.search(query, n_docs)
        for hit in hits.scoreDocs:
            doc = self.searcher.doc(hit.doc)
            idd = int(doc['id'])
            title = doc['title']
            title_id[title] = idd
            id_title[idd] = title

        return title_id, id_title

    def add_idf(self, txt):
        txt = utils.clean(txt)
        txt = txt.lower()
        df = set()
        for word in wordpunct_tokenize(txt):
            if word not in df:
                df.add(word)
                self.idf[word] += 1.

    def add_doc(self, doc_id, title, txt, add_terms):

        doc = Document()
        txt = utils.clean(txt)

        if add_terms:
            if prm.top_tfidf > 0:
                words_idx = []
                words, _ = utils.top_tfidf(txt.lower(), self.idf,
                                           prm.top_tfidf, prm.min_term_freq)

                if len(words) == 0:
                    words.append('unk')

                for w in words:
                    if w in self.vocab:
                        words_idx.append(self.vocab[w])
                    else:
                        words_idx.append(-1)  # unknown words.

            else:
                txt_ = txt.lower()
                words_idx, words = utils.text2idx2([txt_], self.vocab,
                                                   prm.max_terms_per_doc)
                words_idx = words_idx[0]
                words = words[0]

        doc.add(Field("id", str(doc_id), self.t1))
        doc.add(Field("title", title, self.t1))
        doc.add(Field("text", txt, self.t2))
        if add_terms:
            doc.add(Field("word_idx", ' '.join(map(str, words_idx)), self.t3))
            doc.add(Field("word", '<&>'.join(words), self.t3))
        self.writer.addDocument(doc)

    def create_index(self, index_folder, docs_path, add_terms=False):

        print 'Loading Vocab...'
        if not self.vocab:
            self.vocab = utils.load_vocab(prm.vocab_path, prm.n_words)

        os.mkdir(index_folder)

        self.t1 = FieldType()
        self.t1.setStored(True)
        self.t1.setIndexOptions(IndexOptions.DOCS)

        self.t2 = FieldType()
        self.t2.setStored(False)
        self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        self.t3 = FieldType()
        self.t3.setStored(True)
        self.t3.setIndexOptions(IndexOptions.NONE)

        if add_terms:
            if prm.top_tfidf > 0 or prm.idf_path:
                print 'Creating IDF dictionary...'
                self.idf = defaultdict(int)
                doc_id = 0
                if docs_path.lower().endswith('.hdf5'):
                    import corpus_hdf5
                    corpus = corpus_hdf5.CorpusHDF5(docs_path)
                    for txt in corpus.get_text_iter():
                        self.add_idf(txt)

                        if doc_id % 1000 == 0:
                            print 'Creating IDF, doc', doc_id
                        doc_id += 1

                else:
                    # ClueWeb09
                    import warc
                    import gzip
                    from bs4 import BeautifulSoup
                    # list all files in the folder.
                    paths = []
                    for root, directories, filenames in os.walk(docs_path):
                        for filename in filenames:
                            paths.append(os.path.join(root, filename))

                    for path in paths:
                        with gzip.open(path, mode='rb') as gzf:
                            for record in warc.WARCFile(fileobj=gzf):
                                # remove html tags
                                txt = BeautifulSoup(
                                    record.payload[:1000 * 1000],
                                    "lxml").get_text()
                                # remove WARC headers.
                                txt = '\n'.join(txt.split('\n')[10:])

                                self.add_idf(txt)

                                if doc_id % 1000 == 0:
                                    print 'Creating IDF, doc', doc_id
                                doc_id += 1

                for key, val in self.idf.items():
                    self.idf[key] = math.log(float(doc_id) / val)

                pkl.dump(self.idf, open(prm.idf_path, 'wb'))

        fsDir = MMapDirectory(Paths.get(index_folder))
        writerConfig = IndexWriterConfig(StandardAnalyzer())
        self.writer = IndexWriter(fsDir, writerConfig)
        print "%d docs in index" % self.writer.numDocs()
        print "Indexing documents..."

        doc_id = 0
        if docs_path.lower().endswith('.hdf5'):
            import corpus_hdf5
            corpus = corpus_hdf5.CorpusHDF5(docs_path)
            for txt in corpus.get_text_iter():
                title = corpus.get_article_title(doc_id)
                self.add_doc(doc_id, title, txt, add_terms)
                if doc_id % 1000 == 0:
                    print 'indexing doc', doc_id
                doc_id += 1
        else:
            # ClueWeb09
            import warc
            import gzip
            from bs4 import BeautifulSoup

            # list all files in the folder.
            paths = []
            for root, directories, filenames in os.walk(docs_path):
                for filename in filenames:
                    paths.append(os.path.join(root, filename))

            for path in paths:
                with gzip.open(path, mode='rb') as gzf:
                    for record in warc.WARCFile(fileobj=gzf):
                        if 'warc-trec-id' in record:
                            title = record['warc-trec-id']
                        else:
                            title = record['warc-record-id']
                        # remove html tags
                        #txt = BeautifulSoup(record.payload[:1000*1000], "lxml").get_text()
                        txt = record.payload[:1000 * 1000]
                        # remove WARC headers.
                        txt = '\n'.join(txt.split('\n')[10:])

                        self.add_doc(doc_id, title, txt, add_terms)
                        if doc_id % 1000 == 0:
                            print 'indexing doc', doc_id
                        doc_id += 1

        print "Index of %d docs..." % self.writer.numDocs()
        self.writer.close()

    def search_multithread(self, qs, max_cand, max_full_cand, searcher):

        self.max_cand = max_cand
        self.max_full_cand = max_full_cand
        self.curr_searcher = searcher
        out = self.pool.map(self.search_multithread_part, qs)

        return out

    def search_multithread_part(self, q):

        if not self.env.isCurrentThreadAttached():
            self.env.attachCurrentThread()

        if q in self.cache:
            return self.cache[q]
        else:

            try:
                q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
                query = QueryParser("text",
                                    self.analyzer).parse(QueryParser.escape(q))
            except:
                print 'Unexpected error when processing query:', str(q)
                print 'Using query "dummy".'
                q = 'dummy'
                query = QueryParser("text",
                                    self.analyzer).parse(QueryParser.escape(q))

            c = OrderedDict()
            hits = self.curr_searcher.search(query, self.max_cand)

            for i, hit in enumerate(hits.scoreDocs):
                doc = self.curr_searcher.doc(hit.doc)
                if i < self.max_full_cand:
                    word_idx = map(int, doc['word_idx'].split(' '))
                    word = doc['word'].split('<&>')
                else:
                    word_idx = []
                    word = []
                c[int(doc['id'])] = [word_idx, word]

            return c

    def search_singlethread(self, qs, max_cand, max_full_cand, curr_searcher):

        out = []
        for q in qs:
            if q in self.cache:
                out.append(self.cache[q])
            else:
                try:
                    q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
                    query = QueryParser("text", self.analyzer).parse(
                        QueryParser.escape(q))
                except:
                    print 'Unexpected error when processing query:', str(q)
                    print 'Using query "dummy".'
                    query = QueryParser("text", self.analyzer).parse(
                        QueryParser.escape('dummy'))

                c = OrderedDict()
                hits = curr_searcher.search(query, max_cand)

                for i, hit in enumerate(hits.scoreDocs):
                    doc = curr_searcher.doc(hit.doc)
                    if i < max_full_cand:
                        word_idx = map(int, doc['word_idx'].split(' '))
                        word = doc['word'].split('<&>')
                    else:
                        word_idx = []
                        word = []
                    c[int(doc['id'])] = [word_idx, word]

                out.append(c)

        return out

    def get_candidates(self,
                       qs,
                       max_cand,
                       max_full_cand=None,
                       save_cache=False,
                       extra_terms=True):
        if not max_full_cand:
            max_full_cand = max_cand

        if prm.docs_path != prm.docs_path_term:
            max_cand2 = 0
        else:
            max_cand2 = max_full_cand
        if prm.n_threads > 1:
            out = self.search_multithread(qs, max_cand, max_cand2,
                                          self.searcher)
            if (prm.docs_path != prm.docs_path_term) and extra_terms:
                terms = self.search_multithread(qs, max_full_cand,
                                                max_full_cand,
                                                self.searcher_term)
        else:
            out = self.search_singlethread(qs, max_cand, max_cand2,
                                           self.searcher)
            if (prm.docs_path != prm.docs_path_term) and extra_terms:
                terms = self.search_singlethread(qs, max_full_cand,
                                                 max_full_cand,
                                                 self.searcher_term)

        if (prm.docs_path != prm.docs_path_term) and extra_terms:
            for outt, termss in itertools.izip(out, terms):
                for cand_id, term in itertools.izip(
                        outt.keys()[:max_full_cand], termss.values()):
                    outt[cand_id] = term

        if save_cache:
            for q, c in itertools.izip(qs, out):
                if q not in self.cache:
                    self.cache[q] = c

        return out
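
A minimal usage sketch for the class above (not part of the original snippet); it assumes the prm module is already configured with the index and document paths that LuceneSearch reads.

# Hypothetical driver; prm must point at an existing (or buildable) index.
ls = LuceneSearch()

queries = ['barack obama', 'information retrieval']
# Up to 20 candidate doc ids per query; stored terms only for the top 5.
results = ls.get_candidates(queries, max_cand=20, max_full_cand=5)

for q, cands in zip(queries, results):
    print q, '->', cands.keys()[:5]  # ids of the top-ranked articles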
Example #33
0
def main(index_dir, input_dir):
    """Creates a Lucene Index, and indexes every .json file it finds.
    It utilizes a stopwords.txt to filter out stop words"""
    lucene.initVM()

    logger.info("Loading stop words from stopwords.txt")
    f = open('stopwords.txt', 'r')
    stopwords = set([])
    for line in f:
        stopwords.add(line.strip())
    f.close()
    logger.debug('Stop words: %s' % str(stopwords))
    temp = CharArraySet(1, True)

    for stopword in stopwords:
        temp.add(stopword)

    stopwords = temp

    # Create index
    logger.info("Creating Lucene index [%s]..." % index_dir)

    fs_dir = SimpleFSDirectory(Paths.get(index_dir))
    analyzer = StandardAnalyzer(stopwords)
    writerConfig = IndexWriterConfig(analyzer)
    writer = IndexWriter(fs_dir, writerConfig)

    logger.info("Currently there are %d documents in the index..." %
                writer.numDocs())

    # Index documents
    onlyfiles = [
        f for f in listdir(input_dir)
        if isfile(join(input_dir, f)) and f.endswith('.json')
    ]
    for f in onlyfiles:
        try:
            journal_code = f.split('.')[0]
            f = join(input_dir, f)
            json_data = open(f)
            data = json.load(json_data)
            for entry in data:
                doc = Document()
                doc.add(StringField("journal", journal_code, Field.Store.YES))
                doc.add(StringField("url", entry['url'], Field.Store.YES))
                doc.add(StringField("date", entry['date'], Field.Store.YES))
                doc.add(TextField("title", entry['title'], Field.Store.YES))
                writer.addDocument(doc)
            json_data.close()
        except IOError as v:
            try:
                (code, message) = v
            except (TypeError, ValueError):
                code = 0
                message = v
            logger.error("I/O Error: " + str(message) + " (" + str(code) + ")")
    logger.info("Indexed lines from stdin (%d documents in index)" %
                writer.numDocs())

    # Wrap it up
    # logger.info("About to optimize index of %d documents..." % writer.numDocs())
    # writer.optimize()
    # logger.info("...done optimizing index of %d documents" % writer.numDocs())

    logger.info("Closing index of %d documents..." % writer.numDocs())
    writer.close()

    reader = DirectoryReader.open(fs_dir)
    with open('all.csv', 'w') as csvfile:
        csvwriter = csv.writer(csvfile,
                               delimiter=',',
                               quotechar='"',
                               quoting=csv.QUOTE_ALL)
        for i in range(0, reader.numDocs()):
            doc = reader.document(i)
            csvwriter.writerow([
                doc.get('journal'),
                doc.get('date'),
                doc.get('url'),
                doc.get('title').strip().replace(',', '\\,')
            ])
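
A possible command-line driver for main() above; the argument handling is an assumption, since the original snippet does not show how the function is invoked.

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Index every .json file in a folder into a Lucene index')
    parser.add_argument('index_dir', help='directory to write the Lucene index to')
    parser.add_argument('input_dir', help='directory containing the .json files')
    args = parser.parse_args()
    main(args.index_dir, args.input_dir)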
Example #34
0
    return doc


# Initialize lucene and the JVM
lucene.initVM()

# Create a new directory. As a SimpleFSDirectory is rather slow ...
directory = RAMDirectory()  # ... we'll use a RAMDirectory!

# Get and configure an IndexWriter
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
analyzer = LimitTokenCountAnalyzer(analyzer, NoT)
config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
writer = IndexWriter(directory, config)

print "Number of indexed documents: %d\n" % writer.numDocs()
for input_file in listdir(INPUT_DIR):  # iterate over all input files
    print "Current file:", input_file
    if input_file.endswith(".txt"):  # consider only .txt files
        doc = create_document(input_file)  # call the create_document function
        writer.addDocument(doc)  # add the document to the IndexWriter

print "\nNumber of indexed documents: %d" % writer.numDocs()
writer.close()
print "Indexing done!\n"
print "------------------------------------------------------"

# --------------------------------------------------------------------------- #
#                    ____      _        _                                     #
#                   |  _ \ ___| |_ _ __(_) _____   _____ _ __                 #
#                   | |_) / _ \ __| '__| |/ _ \ \ / / _ \ '__|                #
Example #35
0
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory

import sqlite3
import pandas as pd
PATH = ''

if __name__ == "__main__":

    PATH = os.getcwd()
    lucene.initVM()
    indexDir = SimpleFSDirectory(Paths.get('index'))
    writerConfig = IndexWriterConfig(StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)

    print("%d docs in index" % writer.numDocs())
    print("Reading lines from sys.stdin...")

    con = sqlite3.connect(PATH + '/imdb.db')
    df = pd.read_sql('select * from movies', con)
    con.close()
    for v in df.values:
        doc = Document()
        doc.add(StringField("id", str(v[0]), Field.Store.YES))
        doc.add(TextField("name", v[1], Field.Store.YES))
        doc.add(StringField("year", str(v[2]), Field.Store.YES))
        writer.addDocument(doc)
    print("Indexed %d lines from stdin (%d docs in index)" % (df.shape[0], writer.numDocs()))
    print("Closing index of %d docs..." % writer.numDocs())
    writer.close()
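
A small follow-up sketch (not part of the original example) showing how the index built above could be queried; the field names mirror the ones stored during indexing, and the query string is illustrative only.

from java.nio.file import Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.store import SimpleFSDirectory

reader = DirectoryReader.open(SimpleFSDirectory(Paths.get('index')))
searcher = IndexSearcher(reader)
query = QueryParser("name", StandardAnalyzer()).parse("godfather")
hits = searcher.search(query, 10)
for hit in hits.scoreDocs:
    doc = searcher.doc(hit.doc)
    print("%s (%s) score=%.3f" % (doc.get("name"), doc.get("year"), hit.score))
reader.close()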
Example #36
0
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, IntField, StringField, TextField
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version

LUCENE_TYPES={'i':IntField,'s':StringField,'t':TextField}

 
if __name__ == "__main__":
  lucene.initVM()
  indexDir = SimpleFSDirectory(File("lucene/"))
  writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
  writer = IndexWriter(indexDir, writerConfig)
 
  print "%d docs in index" % writer.numDocs()
  print "Reading lines from sys.stdin..."
  header=[]
  for n, l in enumerate(sys.stdin):
    doc = Document()
    fields = l.rstrip().split("\t")
    for (idx,field) in enumerate(fields):
        if n == 0:
            typechar = field[-1]
            if typechar not in set(['t','s','i']):
                sys.stderr.write("unexpected type char in last character position of header field: %s\n" % (field))
                exit(-1) 
            header.append([field,LUCENE_TYPES[typechar]])
        else:
            (fname,fieldtype) = header[idx]
            if fieldtype is IntField:
Example #37
0
    # http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/IndexFiles
    # .py?view=markup
    json_file = sys.argv[1]
    index_folder = sys.argv[2]

    glog.setLevel(glog.INFO)
    lucene.initVM()
    store = SimpleFSDirectory(Paths.get(index_folder))
    stop_words = CharArraySet(50, True)
    c_analyzer = ClassicAnalyzer(stop_words)
    analyzer = LimitTokenCountAnalyzer(c_analyzer, 1048576)
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    writer = IndexWriter(store, config)

    print('%d docs in index' % writer.numDocs())
    print('Indexing json files...')

    # For text field.
    t1 = FieldType()
    t1.setStored(False)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    with codecs.open(json_file, encoding='utf8') as f:
        for line in tqdm(f):
            line = line.strip()
            try:
                json_doc = json.loads(line)
            except:
                glog.warning('Error json parsing: {}'.format(line))
                continue
Example #38
0
class LuceneSearch():
    """Index and search docs.

    Parameters
    ----------
    index_dir : str
        Directory where the Lucene index of the documents is stored.
    db_path : str
        File path of the SQLite database containing articles from the Wikipedia dump (from DrQA).
    num_search_workers : int (optional), default=8
        Number of workers used to parallelize searching.
    """
    def __init__(self,
                 index_dir: str,
                 db_path: str = None,
                 num_search_workers: int = 8) -> None:

        self.env = lucene.getVMEnv()  # pylint: disable=no-member
        if not self.env:
            self.env = lucene.initVM(
                initialheap='28g',  # pylint: disable=no-member
                maxheap='28g',
                vmargs=['-Djava.awt.headless=true'])

        self.num_search_workers = num_search_workers

        if not os.path.exists(index_dir):
            self.doc_db = DocDB(db_path=db_path)
            logger.info('Creating index at %s', index_dir)
            self._create_index(index_dir)

        fs_dir = MMapDirectory(Paths.get(index_dir))
        self.searcher = IndexSearcher(DirectoryReader.open(fs_dir))
        self.analyzer = StandardAnalyzer()
        self.pool = ThreadPool(processes=num_search_workers)

    def _create_index(self, index_dir: str) -> None:
        """Index documents

        Parameters
        ----------
        index_dir : str
            The directory in which the index will be stored
        """
        os.mkdir(index_dir)

        TITLE_FIELD = FieldType()  # pylint: disable=invalid-name
        TITLE_FIELD.setStored(True)
        TITLE_FIELD.setIndexOptions(IndexOptions.DOCS)

        TEXT_FIELD = FieldType()  # pylint: disable=invalid-name
        TEXT_FIELD.setStored(True)
        TEXT_FIELD.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        fs_dir = MMapDirectory(Paths.get(index_dir))
        writer_config = IndexWriterConfig(StandardAnalyzer())
        writer_config.setRAMBufferSizeMB(16384.0)  # 16 GB RAM buffer
        self.writer = IndexWriter(fs_dir, writer_config)
        logger.info("%d docs in index", self.writer.numDocs())
        logger.info("Indexing documents...")

        doc_ids = self.doc_db.get_doc_ids()
        for doc_id in tqdm(doc_ids, total=len(doc_ids)):
            text = self.doc_db.get_doc_text(doc_id)

            doc = Document()
            doc.add(Field("title", doc_id, TITLE_FIELD))
            doc.add(Field("text", text, TEXT_FIELD))

            self.writer.addDocument(doc)

        logger.info("Indexed %d docs.", self.writer.numDocs())
        self.writer.forceMerge(1)  # to increase search performance
        self.writer.close()

    def _search_multithread(
            self, queries: List[str],
            doc_max: int) -> List[List[Dict[str, Union[float, str]]]]:
        args = [(query, doc_max) for query in queries]
        queries_results = self.pool.starmap(self._search_multithread_part,
                                            args)
        return queries_results

    def _search_multithread_part(
            self, query: str,
            doc_max: int) -> List[Dict[str, Union[float, str]]]:
        if not self.env.isCurrentThreadAttached():
            self.env.attachCurrentThread()

        try:
            query = QueryParser('text',
                                self.analyzer).parse(QueryParser.escape(query))
        except Exception as exception:  # pylint: disable=broad-except
            logger.warning(
                colored(f'{exception}: {query}, use query dummy.', 'yellow'))
            query = QueryParser('text', self.analyzer).parse('dummy')

        query_results = []
        hits = self.searcher.search(query, doc_max)

        for hit in hits.scoreDocs:
            doc = self.searcher.doc(hit.doc)

            query_results.append({
                'score': hit.score,
                'title': doc['title'],
                'text': doc['text']
            })

        if not query_results:
            logger.warning(
                colored(
                    f'WARN: search engine returns no results for query: {query}.',
                    'yellow'))

        return query_results

    def _search_singlethread(
            self, queries: List[str],
            doc_max: int) -> List[List[Dict[str, Union[float, str]]]]:
        queries_result = []
        for query in queries:
            try:
                query = QueryParser('text', self.analyzer).parse(
                    QueryParser.escape(query))
            except Exception as exception:  # pylint: disable=broad-except
                logger.warning(
                    colored(f'{exception}: {query}, use query dummy.',
                            'yellow'))
                query = QueryParser('text', self.analyzer).parse('dummy')

            query_results = []
            hits = self.searcher.search(query, doc_max)

            for hit in hits.scoreDocs:
                doc = self.searcher.doc(hit.doc)

                query_results.append({
                    'score': hit.score,
                    'title': doc['title'],
                    'text': doc['text']
                })

            if not query_results:
                logger.warning(
                    colored(
                        f'WARN: search engine returns no results for query: {query}.',
                        'yellow'))

            queries_result.append(query_results)

        return queries_result

    def search(self,
               query: str,
               doc_max: int = 20) -> List[Dict[str, Union[float, str]]]:
        """Search a given query.

        Parameters
        ----------
        query : str
            Anything you want to search
        doc_max : int
            Maximum number of results to return

        Returns
        -------
        List[Dict[str, Union[float, str]]]
            Search results.
        """
        return self.batch_search([query], doc_max=doc_max)[0]

    def batch_search(
            self,
            queries: List[str],
            doc_max: int = 20) -> List[List[Dict[str, Union[float, str]]]]:
        """
        Search a list of queries.

        Parameters
        ----------
        queries : List[str]
            List of queries to search
        doc_max : int, optional, default=20
            maximum number of docs returned by the search engine.

        Returns
        -------
        List[List[Dict[str, Union[float, str]]]]
            Results returned by the search engine, one list per query.
        """
        if self.num_search_workers > 1:
            result = self._search_multithread(queries, doc_max)
        else:
            result = self._search_singlethread(queries, doc_max)

        return result

    @staticmethod
    def pprint(search_result: List[Dict[str, Union[float, str]]]) -> None:
        """Print the results returned by the doc searcher.

        Parameters
        ----------
        search_result : List[Dict[str, Union[float, str]]]
            Results returned from ranker
        """

        headers = ['Rank', 'Title', 'Text', 'Score']
        table = prettytable.PrettyTable(headers)
        for i, result in enumerate(search_result):
            text, title = result['text'], result['title']
            text = text[:100] + ' ...' if len(text) > 100 else text
            title = title[:30] + ' ...' if len(title) > 30 else title
            table.add_row([i, title, text, '%.5g' % result['score']])
        print('Top Results:')
        print(table)
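
A hypothetical usage sketch for this class; the index and database paths below are placeholders, not values from the original code.

searcher = LuceneSearch(index_dir='data/lucene_index',
                        db_path='data/wikipedia.db',
                        num_search_workers=8)

# Single query: a list of {'score', 'title', 'text'} dicts, best hits first.
results = searcher.search('capital of france', doc_max=5)
LuceneSearch.pprint(results)

# Batched queries reuse the internal thread pool when num_search_workers > 1.
batch_results = searcher.batch_search(['python language', 'apache lucene'],
                                      doc_max=3)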
Example #39
0
class IndexFiles(object):

    def __init__(self, indexDir):
        if not os.path.exists(indexDir):
            os.mkdir(indexDir)

        store = SimpleFSDirectory(File(indexDir))

        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        
        self.writer = IndexWriter(store, config)

    def index(self, file, duplicates):
        exact = [duplicate['duplicate'] for duplicate in duplicates if duplicate['sim'] == 1]
        near = [duplicate['duplicate'] for duplicate in duplicates if duplicate['sim'] < 1]

        with open(file) as file:
            for document in file:
                data = json.loads(document)
                if (data['url'] in exact):
                    continue

                doc = self.createDoc(data['url'], data['html'], data['url'] in near)
                self.writer.addDocument(doc)
                store_outlinks(data['url'], data['outlinks'])

        self.writer.commit()

        return self.writer.numDocs()

    def createDoc(self, url, html, duplicate):
        title, contents = self.parseHtml(url, html)

        doc = Document()
        doc.add(StringField("title", title, Field.Store.YES))
        doc.add(StringField("url", url, Field.Store.YES))
        doc.add(StringField("duplicate", str(duplicate).lower(), Field.Store.YES))

        if len(contents) > 0:
            doc.add(TextField("contents", contents, Field.Store.YES))
        else:
            print "Warning: No content in %s" % url

        return doc

    def close(self):
        self.writer.close()

    def parseHtml(self, url, html):
        soup = BeautifulSoup(html, 'lxml')
        title = self.getTitle(url, soup)
        body = self.getBody(soup)

        return title, body

    def getTitle(self, url, soup):
        if soup.title:
            title = soup.title.get_text().strip()
        elif soup.find("h1"):
            title = " ".join(soup.find("h1").get_text().split())
        else:
            title = url.split("/")[-1]

        return title

    def getBody(self, soup):
        comments = soup.findAll(text=lambda text:isinstance(text, Comment))
        [comment.extract() for comment in comments]
        [style.decompose() for style in soup.find_all('style')]
        [script.decompose() for script in soup.find_all('script')]

        if soup.body:
            return soup.body.get_text(" ", strip=True)
        else:
            return soup.get_text(" ", strip=True)
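
A hypothetical driver for the IndexFiles class above; the index directory, crawl file, and duplicate list are illustrative only.

lucene.initVM()

indexer = IndexFiles('crawl-index')
duplicates = [{'duplicate': 'http://example.com/mirror', 'sim': 1.0},
              {'duplicate': 'http://example.com/near-copy', 'sim': 0.9}]
num_docs = indexer.index('crawl.jl', duplicates)  # one JSON object per line
print "Indexed %d documents" % num_docs
indexer.close()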