示例#1
0
 def run(self, todo):
     if todo:
         counter = 0
         if todo == 'things' or todo == 'all':
             print 'dropping things index'
             solr.delete(queries=solr.Q(content_type="thing"))
             solr.commit()
             print 'reindexing things'
             for t in Thing.objects().all():
                 t.add_to_solr(commit=False)
                 if counter == 100:
                     solr.commit()
                     print " 100 done - at ", t.title
                     counter = 0
                 counter += 1
         if todo == 'collections' or todo == 'all':
             print 'dropping collections index'
             solr.delete(queries=solr.Q(content_type="collection"))
             solr.commit()
             print 'reindexing collections'
         if todo == 'makers' or todo == 'all':
             print 'dropping makers index'
             solr.delete(queries=solr.Q(content_type="maker"))
             solr.commit()
             print 'reindexing makers'
             for m in Maker.objects().all():
                 m.add_to_solr(commit=False)
                 if counter == 100:
                     solr.commit()
                     print " 100 done - at ", m.display_name
                     counter = 0
                 counter += 1
         if todo == 'discussions' or todo == 'all':
             print 'dropping discussions index'
             solr.delete(queries=solr.Q(content_type="thread"))
             solr.commit()
             print 'reindexing discussions'
         if todo == 'pages' or todo == 'all':
             print 'dropping pages index'
             solr.delete(queries=solr.Q(content_type="page"))
             solr.commit()
         if todo == 'uploads' or todo == 'all':
             print 'dropping uploads index'
             solr.delete(queries=solr.Q(content_type="upload"))
             solr.commit()
示例#2
0
 def run(self, todo):
     if todo:
         counter = 0
         if todo == 'things' or todo == 'all':
             print 'dropping things index'
             solr.delete(queries=solr.Q(content_type="thing"))
             solr.commit()
             print 'reindexing things'
             for t in Thing.objects().all():
                 t.add_to_solr(commit=False)
                 if counter == 100:
                     solr.commit()
                     print " 100 done - at ", t.title
                     counter = 0
                 counter += 1
         if todo == 'collections' or todo == 'all':
             print 'dropping collections index'
             solr.delete(queries=solr.Q(content_type="collection"))
             solr.commit()
             print 'reindexing collections'
         if todo == 'makers' or todo == 'all':
             print 'dropping makers index'
             solr.delete(queries=solr.Q(content_type="maker"))
             solr.commit()
             print 'reindexing makers'
             for m in Maker.objects().all():
                 m.add_to_solr(commit=False)
                 if counter == 100:
                     solr.commit()
                     print " 100 done - at ", m.display_name
                     counter = 0
                 counter += 1
         if todo == 'discussions' or todo == 'all':
             print 'dropping discussions index'
             solr.delete(queries=solr.Q(content_type="thread"))
             solr.commit()
             print 'reindexing discussions'
         if todo == 'pages' or todo == 'all':
             print 'dropping pages index'
             solr.delete(queries=solr.Q(content_type="page"))
             solr.commit()
         if todo == 'uploads' or todo == 'all':
             print 'dropping uploads index'
             solr.delete(queries=solr.Q(content_type="upload"))
             solr.commit()
示例#3
0
    def index_upload(self, u, force=False):
        """ Indexes a file upload, if possible; forces the issue, if necessary; update """
        # try to get the first page
        def upload_already_indexed(upload):
            ''' Has the upload already been indexed? Look for page 1 '''
            try:
                p = es.get(index="aaaarg", doc_type="page", id="%s_%s" %
                           (str(upload.id), 1), fields='md5')
                return True
            except:
                return False

        try_path = u.full_path()
        n, e = os.path.splitext(try_path)
        # only handle pdfs
        if not e == '.pdf':
            return False
        # Determine the job
        is_indexed = upload_already_indexed(u)
        needs_extraction = force or not is_indexed
        _illegal_xml_chars_RE = re.compile(
            u'[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFFFE\uFFFF]')
        # Try to extract
        if needs_extraction:
            print "Opening", u.structured_file_name, "for extraction"
            try:
                pages = Pdf(try_path).dump_pages()
                num_pages = len(pages)
            except:
                return False
        else:
            try:
                num_pages = Pdf(try_path).npages
            except:
                return False
        # This is the base document
        t = Thing.objects(files=u)[0]
        body = {
            'md5': u.md5,
            'thing': str(t.id),
            'title': t.title,
            'makers': [str(m.maker.id) for m in t.makers],
            'makers_string': t.format_makers_string(),
            'collections': [str(c.id) for c in Collection.objects.filter(things__thing=t)],
            'page_count': len(pages),
            'page': 1,
        }

        if needs_extraction and pages:
            for page_num, content in pages.iteritems():
                if content:
                    print "Page:", page_num
                    id = "%s_%s" % (str(u.id), page_num)
                    try:
                        content = unicode(content, 'utf-8')
                        content = unidecode(content)
                    except:
                        pass
                    # re.sub(_illegal_xml_chars_RE, '?', content)
                    body['searchable_text'] = content
                    body['page'] = page_num
                    es.index(
                        index="aaaarg",
                        doc_type="page",
                        id=id,
                        body=body)
        elif not needs_extraction:
            print "Updating ", num_pages, "pages - extraction not needed."
            for page_num in range(num_pages):  # 0 index, needs to be corrected
                id = "%s_%s" % (str(u.id), page_num + 1)
                body['page'] = page_num + 1
                es.update(
                    index="aaaarg",
                    doc_type="page",
                    id=id,
                    body={'doc': body})
示例#4
0
    def index_upload(self, u, force=False):
        """ Indexes a file upload, if possible; forces the issue, if necessary; update """

        # try to get the first page
        def upload_already_indexed(upload):
            ''' Has the upload already been indexed? Look for page 1 '''
            try:
                p = es.get(index="aaaarg",
                           doc_type="page",
                           id="%s_%s" % (str(upload.id), 1),
                           fields='md5')
                return True
            except:
                return False

        try_path = u.full_path()
        n, e = os.path.splitext(try_path)
        # only handle pdfs
        if not e == '.pdf':
            return False
        # Determine the job
        is_indexed = upload_already_indexed(u)
        needs_extraction = force or not is_indexed
        _illegal_xml_chars_RE = re.compile(
            u'[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFFFE\uFFFF]')
        # Try to extract
        if needs_extraction:
            print "Opening", u.structured_file_name, "for extraction"
            try:
                pages = Pdf(try_path).dump_pages()
                num_pages = len(pages)
            except:
                return False
        else:
            try:
                num_pages = Pdf(try_path).npages
            except:
                return False
        # This is the base document
        t = Thing.objects(files=u)[0]
        body = {
            'md5':
            u.md5,
            'thing':
            str(t.id),
            'title':
            t.title,
            'makers': [str(m.maker.id) for m in t.makers],
            'makers_string':
            t.format_makers_string(),
            'collections':
            [str(c.id) for c in Collection.objects.filter(things__thing=t)],
            'page_count':
            len(pages),
            'page':
            1,
        }

        if needs_extraction and pages:
            for page_num, content in pages.iteritems():
                if content:
                    print "Page:", page_num
                    id = "%s_%s" % (str(u.id), page_num)
                    try:
                        content = unicode(content, 'utf-8')
                        content = unidecode(content)
                    except:
                        pass
                    # re.sub(_illegal_xml_chars_RE, '?', content)
                    body['searchable_text'] = content
                    body['page'] = page_num
                    es.index(index="aaaarg", doc_type="page", id=id, body=body)
        elif not needs_extraction:
            print "Updating ", num_pages, "pages - extraction not needed."
            for page_num in range(num_pages):  # 0 index, needs to be corrected
                id = "%s_%s" % (str(u.id), page_num + 1)
                body['page'] = page_num + 1
                es.update(index="aaaarg",
                          doc_type="page",
                          id=id,
                          body={'doc': body})