예제 #1
0
def reindex_all(reader, writer, analyzer):
    for i in xrange(reader.maxDoc()):
        if reader.isDeleted(i): continue
        doc = reader.document(i)
        p = doc.get("path")
        pkid = doc.get('txtorg_id')
        if p is None:
            # No filepath specified, just use original document
            writer.updateDocument(Term("txtorg_id",pkid),doc,analyzer)
        else:
            # if a path field is found, try to read the file it points to and add a contents field
            edited_doc = Document()
            for f in doc.getFields():
                edited_doc.add(Field.cast_(f))

            try:
                inf = open(p)
                contents = unicode(inf.read(), 'UTF-8')
                inf.close()

                if len(contents) > 0:
                    edited_doc.add(Field("contents", contents,
                                         Field.Store.NO,
                                         Field.Index.ANALYZED,
                                         Field.TermVector.YES))
                else:
                    print "warning: no content in %s" % filename
            except:
                print "Could not read file; skipping"
            writer.updateDocument(Term("txtorg_id",pkid),edited_doc,analyzer)
예제 #2
0
def write_metadata(searcher, reader, document_ids, fname):
    """Dump the stored fields of the given documents to a UTF-8 CSV file.

    Each txtorg_id in *document_ids* is looked up in the index (it must
    match exactly one document); the union of all stored field names forms
    the CSV columns, with 'name' and 'path' leading.
    """
    rows = []
    seen_fields = set()

    for txtorg_id in document_ids:
        hits = searcher.search(TermQuery(Term('txtorg_id', txtorg_id)),
                               reader.maxDoc()).scoreDocs
        # txtorg_id is expected to be a unique key in the index.
        assert len(hits) == 1
        doc = searcher.doc(hits[0].doc)

        row = {}
        for raw in doc.getFields():
            fld = Field.cast_(raw)
            row[fld.name()] = fld.stringValue()
        rows.append(row)
        seen_fields.update(row.keys())

    # 'name' and 'path' come first; every other column is alphabetical.
    trailing = sorted(x for x in seen_fields if x not in ['name', 'path'])
    fields = ['name', 'path'] + trailing

    with codecs.open(fname, 'w', encoding='UTF-8') as outf:
        dw = DictUnicodeWriter(outf, fields)

        # Header row: each column labelled with its own name.
        dw.writerow(dict(zip(fields, fields)))

        # Data rows.
        for row in rows:
            dw.writerow(row)
예제 #3
0
    def findWildcard(self, word, field='key', max=10):
        """Run a wildcard query on *field* and return up to *max* hits.

        Each hit becomes a dict.  If self.fields is set, only those field
        names are extracted (via doc.get); otherwise every stored field of
        the document is included.
        """
        hits = self.searcher.search(WildcardQuery(Term(field, word)), None, max)

        # Resolve each ScoreDoc to its stored Document.
        matched = [self.searcher.doc(sd.doc) for sd in hits.scoreDocs]

        results = []
        if self.fields:
            # Restrict output to the configured field names.
            for doc in matched:
                results.append(dict((name, doc.get(name))
                                    for name in self.fields))
        else:
            # No field list configured: emit every stored field.
            for doc in matched:
                record = {}
                for raw in doc.fields():
                    fld = Field.cast_(raw)
                    record[fld.name()] = fld.stringValue()
                results.append(record)
        return results
예제 #4
0
def get_fields_and_values(reader, max_vals = 30):
    """Map each stored field name to a sample of its distinct values.

    Scans every live document in *reader*, collecting at most *max_vals*
    distinct string values per field name.
    """
    collected = defaultdict(set)

    for doc_id in xrange(reader.maxDoc()):
        if reader.isDeleted(doc_id):
            continue
        for raw in reader.document(doc_id).getFields():
            fld = Field.cast_(raw)
            values = collected[fld.name()]
            # Cap the number of sampled values per field.
            if len(values) < max_vals:
                values.add(fld.stringValue())

    return dict(collected)
예제 #5
0
def write_contents(allDicts, searcher, reader, fname, content_field = "contents"):
    all_ids = [d['txtorg_id'] for d in allDicts]

    all_fields = set()
    doc_fields = []
    for txtorg_id in all_ids:
        query = TermQuery(Term('txtorg_id',txtorg_id))
        scoreDocs = searcher.search(query, reader.maxDoc()).scoreDocs
        assert len(scoreDocs) == 1
        scoreDoc = scoreDocs[0]
        doc = searcher.doc(scoreDoc.doc)
        df = {}
        name_path_present = False
        failFlag = False
        for f in doc.getFields():
            field = Field.cast_(f)
            if content_field == "contents" and field.name() == 'path':
                name_path_present = True
                path = doc.get("path").encode('utf-8')
                try:
                    i = codecs.open(path, 'r', encoding='UTF-8')
                    c = i.read()
                    df[content_field] = c
                    i.close()
                except Exception as e:
                    failFlag = True
                    print "Failed for path %s with exception %s" % (path, e)
            elif field.name() in ['txtorg_id', 'name', 'path', content_field]:
                df[field.name()] = field.stringValue()
        
        all_fields = all_fields.union(set(df.keys()))
        doc_fields.append(df)

    fields = ['txtorg_id'] + sorted([x for x in all_fields if x != 'txtorg_id'])
    with codecs.open(fname, 'w', encoding='UTF-8') as outf:
        dw = csv.DictWriter(outf, fields)
        dw.writeheader()
        
        # writing data
        for d in doc_fields:
            dw.writerow(d)

    return failFlag
예제 #6
0
def add_metadata_to_doc(lucenedoc,fieldnames,values):
    edited_doc = Document()
    filepath = lucenedoc.get("path")
    assert filepath is not None

    # Include all original fields that are not in the list of updates
    original_fields = []
    for f in lucenedoc.getFields():
        field = Field.cast_(f)
        if field.name() not in fieldnames:
            original_fields.append(field)

    for field in original_fields:
        edited_doc.add(field)
                
    # Now, add back the unstored "contents" field
    try:
        file = open(filepath)
        contents = unicode(file.read(), 'UTF-8')
        file.close()

        if len(contents) > 0:
            edited_doc.add(Field("contents", contents,
                                 Field.Store.NO,
                                 Field.Index.ANALYZED,
                                 Field.TermVector.YES))
        else:
            print "warning: no content in %s" % filename
    except:
        print "Could not read file; skipping"
        return None

    # Now include new fields
    for idx in range(len(fieldnames)):
        edited_doc.add(Field(fieldnames[idx].lower(),values[idx].lower(),Field.Store.YES,Field.Index.NOT_ANALYZED))

    return edited_doc