Пример #1
0
    def convert_server(self, document, filename=''):
        """Extract plain text from a document via the Tika JAXRS server.

        The document is sent with an HTTP PUT to the ``tika`` endpoint on
        ``self.config.host``:``self.config.port``, requesting ``text/plain``.

        :param document: content to convert. A string is wrapped in a
            ``StringIO``; any other object is passed to the HTTP client as is.
        :param filename: document name, used in the log message and handed
            to ``clean_extracted_plaintext``.
        :returns: the result of ``clean_extracted_plaintext`` applied to the
            response body.
        :raises TikaConversionError: when the response status is not 200; the
            stripped response body is attached as ``stack_trace``.
        """
        server_root = "http://{0}:{1}".format(self.config.host,
                                              self.config.port)
        endpoint = server_root + '/' + 'tika'
        self.log.info('Converting document with tika JAXRS server: %s' %
                      filename)

        # Plain strings are wrapped in a file-like object before upload.
        payload = (StringIO(document)
                   if isinstance(document, basestring) else document)

        request_options = {
            'data': payload,
            'headers': {'Accept': 'text/plain'},
            'timeout': self.config.timeout,
        }
        response = get_requests_session().put(endpoint, **request_options)

        status = response.status_code
        body = response.content

        # Success path first; everything else is reported as a failure.
        if status == 200:
            return clean_extracted_plaintext(body, filename)

        msg = ("Conversion with Tika JAXRS server failed "
               "with status %s. " % status)
        raise TikaConversionError(msg,
                                  status_code=status,
                                  stack_trace=body.strip())
Пример #2
0
    def convert_server(self, document, filename=''):
        """Convert ``document`` to plain text using the Tika JAXRS server.

        Issues an HTTP PUT against ``http://<host>:<port>/tika`` (host and
        port taken from ``self.config``) with an ``Accept: text/plain``
        header and the timeout from ``self.config.timeout``.

        :param document: the data to upload; strings are wrapped in a
            ``StringIO`` first, other objects are uploaded unchanged.
        :param filename: name of the document; appears in the log line and
            is forwarded to ``clean_extracted_plaintext``.
        :returns: the response body after ``clean_extracted_plaintext``.
        :raises TikaConversionError: if the server does not answer with
            status 200. The status code and the stripped response body
            (as ``stack_trace``) are attached to the exception.
        """
        tika_url = "http://{0}:{1}/tika".format(self.config.host,
                                                self.config.port)
        self.log.info(
            'Converting document with tika JAXRS server: %s' % filename)

        upload = document
        if isinstance(upload, basestring):
            # Give the HTTP client a file-like object rather than a string.
            upload = StringIO(upload)

        accept_plain_text = {'Accept': 'text/plain'}
        request_timeout = self.config.timeout
        http = get_requests_session()
        response = http.put(tika_url,
                            data=upload,
                            headers=accept_plain_text,
                            timeout=request_timeout)

        status = response.status_code
        body = response.content

        if status != 200:
            raise TikaConversionError(
                "Conversion with Tika JAXRS server failed "
                "with status %s. " % status,
                status_code=status,
                stack_trace=body.strip())

        return clean_extracted_plaintext(body, filename)
Пример #3
0
    def convert_local(self, document, filename=''):
        """Extract plain text from a document by running the local Tika jar.

        The document stream is copied into a named temporary file, and a
        ``java -jar`` command built from ``self.java_path`` and
        ``self.jar_path`` is run on that file via ``run_process``.

        :param document: source stream; passed to ``copy_stream`` as the
            input side of the copy.
        :param filename: document name, used in the log message and handed
            to ``clean_extracted_plaintext``.
        :returns: the process's stdout after ``clean_extracted_plaintext``.
        :raises TikaConversionError: when ``run_process`` raises
            ``ProcessError``; the original error message is attached as
            ``stack_trace``.
        """
        self.log.info('Converting document with LOCAL tika: %s' % filename)
        # delete=False keeps the file on disk after close() so the external
        # process can open it by name.
        # NOTE(review): cleanup of this file is presumably done by the
        # handler of the try block below -- confirm.
        temp_file = tempfile.NamedTemporaryFile(delete=False)
        copy_stream(document, temp_file)
        temp_file.close()

        try:
            # NOTE(review): the command is a single space-joined string, so
            # a path containing spaces would presumably be split -- verify
            # how run_process interprets it.
            cmd = ' '.join([self.java_path, '-jar', self.jar_path,
                            '-t', temp_file.name])
            try:
                # stderr is captured but not used in the code that follows.
                stdout, stderr = run_process(cmd)
            except ProcessError, e:
                # Re-raise process failures as the converter's own error type.
                msg = "Conversion with local Tika failed."
                stack_trace = e.message
                raise TikaConversionError(msg, stack_trace=stack_trace)

            text = clean_extracted_plaintext(stdout, filename)
            return text
Пример #4
0
    def convert_local(self, document, filename=''):
        """Convert a document to plain text with a locally executed Tika jar.

        Copies ``document`` into a named temporary file, runs a
        ``java -jar`` command (from ``self.java_path`` / ``self.jar_path``)
        against that file through ``run_process``, and cleans the captured
        stdout.

        :param document: source stream given to ``copy_stream``.
        :param filename: document name; logged and forwarded to
            ``clean_extracted_plaintext``.
        :returns: the result of ``clean_extracted_plaintext`` on the
            process's stdout.
        :raises TikaConversionError: if ``run_process`` raises
            ``ProcessError``; its message is passed along as ``stack_trace``.
        """
        self.log.info('Converting document with LOCAL tika: %s' % filename)
        # The file must survive close() so the child process can read it by
        # name, hence delete=False.
        # NOTE(review): removal of the file is presumably handled where the
        # try block below ends -- confirm.
        temp_file = tempfile.NamedTemporaryFile(delete=False)
        copy_stream(document, temp_file)
        temp_file.close()

        try:
            # NOTE(review): arguments are joined with spaces into one string;
            # paths containing spaces would presumably break -- verify
            # against run_process.
            cmd = ' '.join(
                [self.java_path, '-jar', self.jar_path, '-t', temp_file.name])
            try:
                # Only stdout is used below; stderr is captured and ignored.
                stdout, stderr = run_process(cmd)
            except ProcessError, e:
                # Translate the process failure into TikaConversionError.
                msg = "Conversion with local Tika failed."
                stack_trace = e.message
                raise TikaConversionError(msg, stack_trace=stack_trace)

            text = clean_extracted_plaintext(stdout, filename)
            return text