Example #1
def create_index(storage, paths):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"

    import os
    for path in paths:
        for filen in os.listdir(path):
            text = sent_tokenize(get_data_from_file(path + filen))
            total_sent = len(text)
            for i in range(0, total_sent, 3):
                doc = Document()
                # take a window of up to 5 sentences on either side of i
                a = max(i - 5, 0)
                sentence = ' '.join(text[a:i + 5])
                doc.add(
                    Field("text", sentence, Field.Store.YES,
                          Field.Index.ANALYZED))
                writer.addDocument(doc)
            print("Done %s" % (path + filen))
            print "Indexed (%d docs in index)" % (writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
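
The snippets in this listing are shown without their import blocks. A plausible preamble for these PyLucene 4.10.x examples is sketched below; `stopwords`, `get_data_from_file`, `get_data_from_text`, `query`, and `generate_docids` are project-level helpers that are not shown here, and `sent_tokenize` is assumed to come from NLTK.

# Assumed preamble for the PyLucene 4.10.x snippets in this listing.
import lucene
from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.analysis.util import CharArraySet
from org.apache.lucene.document import Document, Field
from org.apache.lucene.index import IndexReader, IndexWriter, IndexWriterConfig
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version

from nltk.tokenize import sent_tokenize  # sentence splitter used in Example #1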
Example #2
def wikipedia_indexer(storage, wikipedia_file):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"

    f = open(wikipedia_file)

    for i, line in enumerate(f):
        # each line of the dump is expected to be "title<TAB>article text"
        text = line.strip().decode('utf-8').split('\t')
        title = text[0]
        if 'disambigu' in title or len(text) < 2:
            # skip disambiguation pages and malformed lines
            continue
        text = text[1]
        doc = Document()
        doc.add(Field("num", str(i), Field.Store.YES, Field.Index.NO))
        doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("text", text, Field.Store.NO, Field.Index.ANALYZED))
        writer.addDocument(doc)
        if writer.numDocs() % 1000 == 0:
            print "Indexed (%d docs in index) Last %d" % (writer.numDocs(), i)

    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
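
A minimal retrieval sketch against the index built above (assumed usage, not part of the original function): search the analyzed `title` field and read back the stored `num` and `title` values.

from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.util import Version

def lookup_titles(searcher, analyzer, text, top_k=5):
    # hypothetical helper: find articles whose title matches `text`
    q = QueryParser(Version.LUCENE_4_10_1, "title", analyzer).parse(
        QueryParser.escape(text))
    for hit in searcher.search(q, top_k).scoreDocs:
        stored = searcher.doc(hit.doc)
        print stored.get("num"), stored.get("title")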
Example #3
def irsolver(data_file, index):
    from questions import get_input_data
    lucene.initVM()
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    reader = IndexReader.open(SimpleFSDirectory(File(index)))
    searcher = IndexSearcher(reader)
    pred = []
    mapp = {1: 'A', 2: 'B', 3: 'C', 4: 'D'}

    idx, ques, ans = get_input_data(data_file)
    for acm, (idq, q, a) in enumerate(zip(idx, ques, ans)):
        max_score = -1000000
        best_ans = 'A'
        # score each candidate answer by querying the index with the
        # question plus that answer text; keep the best-scoring option
        for i, ai in enumerate(a):
            sc = query(q, ai, analyzer, searcher)
            print(acm, i, sc)
            if sc > max_score:
                max_score = sc
                best_ans = mapp[i + 1]
        pred.append(best_ans)

    return idx, pred
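
The `query` helper called above is not included in this snippet. A minimal sketch of what such a scorer might look like, assuming a `text` field like the one indexed in Examples #1 and #6:

from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.util import Version

def query(question, answer, analyzer, searcher, top_k=10):
    # hypothetical reconstruction: rank documents against question + answer
    # and use the summed scores of the top hits as the answer's score
    parser = QueryParser(Version.LUCENE_4_10_1, "text", analyzer)
    q = parser.parse(QueryParser.escape(question + " " + answer))
    hits = searcher.search(q, top_k).scoreDocs
    return sum(hit.score for hit in hits)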
Example #4
def get_wiki_docids(data_file, wikipedia_index):
    from questions import get_input_data
    data = get_input_data(data_file)

    lucene.initVM()
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    reader = IndexReader.open(SimpleFSDirectory(File(wikipedia_index)))
    searcher = IndexSearcher(reader)

    generate_docids(data, data_file, analyzer, searcher)
Example #5
    def internal_analyzer(self):
        java_stopwords = [
            "public", "private", "protected", "interface", "abstract",
            "implements", "extends", "null", "new", "switch", "case",
            "default", "synchronized", "do", "if", "else", "break", "continue",
            "this", "assert", "for", "instanceof", "transient", "final",
            "static", "void", "catch", "try", "throws", "throw", "class",
            "finally", "return", "const", "native", "super", "while", "import",
            "package", "true", "false", "enum"
        ]

        all_stopwords = list(
            StandardAnalyzer(Version.LUCENE_CURRENT).getStopwordSet())
        all_stopwords.extend(java_stopwords)

        stopwords = CharArraySet(Version.LUCENE_CURRENT, all_stopwords, True)
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT, stopwords)
        return analyzer
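
A short usage sketch for the analyzer returned above (assumed, not part of the original class): tokenize a code fragment and collect the terms that survive the combined English/Java stop list.

from java.io import StringReader
from org.apache.lucene.analysis.tokenattributes import CharTermAttribute

def tokenize(analyzer, text):
    # run the analyzer over `text` and collect the emitted terms
    stream = analyzer.tokenStream("code", StringReader(text))
    term = stream.addAttribute(CharTermAttribute.class_)
    stream.reset()
    tokens = []
    while stream.incrementToken():
        tokens.append(term.toString())
    stream.end()
    stream.close()
    return tokens

# e.g. tokenize(analyzer, "public static void main") would keep only "main"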
Example #6
def create_index(index):
    indexDir = SimpleFSDirectory(File(index))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"

    f = open('f:/nlp/data/questions/combine.txt')
    for line in f:
        line = get_data_from_text(line.decode('utf-8'))
        doc = Document()
        field = Field("text", line, Field.Store.YES, Field.Index.ANALYZED)
        field.setBoost(2.0)
        doc.add(field)
        writer.addDocument(doc)

    print "Indexed (%d docs in index)" % (writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
Example #7
from org.apache.lucene.analysis.util import CharArraySet

if __name__ == "__main__":

    if len(sys.argv) != 4:
        print sys.argv[0] + ' <documentsDirectory> <stopWords> <indexDirectory>'
        sys.exit(1)

    documentsDirectory = sys.argv[1]
    stopWords = sys.argv[2]
    indexDirectory = sys.argv[3]

    lucene.initVM()

    exclusionSet = CharArraySet(0, True)
    with open(stopWords, 'r') as f:
        # one stop word per line; blank lines are skipped
        for line in f:
            word = line.strip()
            if word:
                exclusionSet.add(word)

    indexDir = SimpleFSDirectory(File(indexDirectory))

    writerConfig = IndexWriterConfig(Version.LUCENE_CURRENT,
                                     SpanishAnalyzer(exclusionSet))

    writer = IndexWriter(indexDir, writerConfig)

    totalDocs = len(glob.glob(documentsDirectory + '/*.xml'))