Exemplo n.º 1
0
    def _single_convert(self, input_file_object):
        """Convert one HTML input file to ``self.final_format``.

        The input file is read as UTF-8, passed through ``remove_tags`` and
        written to an intermediate ``.html`` file inside ``self.tmp_dir``.
        The external converter command is then run on that intermediate file.

        :param input_file_object: object exposing ``get_input_file_path()``;
            a falsy value is treated as a failed conversion.
        :returns: path of the generated file inside ``self.tmp_dir``, or
            ``None`` when no output file was produced.
        """
        if input_file_object:
            input_file_path = input_file_object.get_input_file_path()
            output_file_name = rename_filename_with_extension(
                os.path.basename(input_file_path), self.final_format)

            # Timestamp-derived name for the intermediate, cleaned HTML file.
            intermediate_filename = str(time.time()).replace('.', '') + '.html'
            output_file_path = os.path.join(self.tmp_dir, output_file_name)
            intermediate_path = os.path.join(
                self.tmp_dir, intermediate_filename)

            with codecs.open(input_file_path, "r", "utf-8") as f:
                cleaned_content = remove_tags(f.read())

            # Write with the same codec used for reading. A plain
            # ``open(..., 'w')`` falls back to the default encoding and fails
            # (or mis-encodes) as soon as the text holds non-ASCII characters.
            # NOTE(review): assumes remove_tags returns text, not encoded
            # bytes -- confirm against its implementation.
            with codecs.open(intermediate_path, "w", "utf-8") as w:
                w.write(cleaned_content)

            converter = CONVERTER_LOCATION.format(
                input_file_path=intermediate_path,
                output_file_path=output_file_path)

            self.execute(converter)
            if os.path.isfile(output_file_path):
                return output_file_path
            else:
                self.handle_failed_conversion(input_file_object)

        # Reached for a falsy input as well as for a missing output file.
        log.error('Conversion failed from HTML => PDF')
        return None
Exemplo n.º 2
0
    def _single_convert(self, input_file_object):
        """Convert one HTML input to plain text with ``html2text``.

        The input stream is parsed with BeautifulSoup, a fixed list of
        elements is stripped, and the remaining markup is handed to
        ``html2text`` with links and images ignored. The result is written to
        ``self.tmp_dir`` under the input's base name, renamed for
        ``self.final_format``.

        :param input_file_object: object exposing ``get_input_file_path()``
            and ``get_input_stream()``; a falsy value yields ``None``.
        :returns: path of the written output file, or ``None`` on failure.
        """
        if input_file_object:
            input_file_path = input_file_object.get_input_file_path()
            log.info('Converting {} to TXT'.format(input_file_path))
            html_to_text = html2text.HTML2Text()
            html_to_text.ignore_links = html_to_text.ignore_images = True
            try:
                input_stream = input_file_object.get_input_stream()
            except UnicodeDecodeError as e:
                self.handle_failed_conversion(input_file_object)
                log.error(
                    'Conversion failed from HTML => TXT for {} {}'.format(
                        input_file_path, e)
                )
                return None

            soup = BeautifulSoup(input_stream)
            invalid_attrs = 'href src width height target \
            style color face size script'.split()

            # Calling the soup with a name searches for elements with that
            # tag name; each match is detached from the tree.
            for attr in invalid_attrs:
                for dom_el in soup(attr):
                    dom_el.extract()
            input_stream = unicode(soup)

            try:
                output_stream = html_to_text.handle(input_stream)
            except Exception as e:
                # The original ``except e:`` used ``e`` as the exception
                # class; it is unbound here, so a failure in ``handle``
                # surfaced as a NameError instead of being handled.
                self.handle_failed_conversion(input_file_object)
                log.error(
                    'Conversion failed from HTML => TXT for {} {}'.format(
                        input_file_path, e)
                )
                return None

            output_file_name = rename_filename_with_extension(
                os.path.basename(input_file_object.get_input_file_path()),
                self.final_format)
            output_file_path = os.path.join(self.tmp_dir, output_file_name)
            write_stream(output_file_path, output_stream)

            if os.path.isfile(output_file_path):
                return output_file_path
            else:
                self.handle_failed_conversion(input_file_object)

        # Falsy input or missing output file: explicit failure result.
        return None
Exemplo n.º 3
0
    def _single_convert(self, input_file_object):
        """Run the external converter on one input file.

        The converter command is built from ``CONVERTER_LOCATION`` with the
        input path and ``self.tmp_dir`` as the output directory. The expected
        output is the input's base name, renamed for ``self.final_format``,
        inside ``self.tmp_dir``.

        :param input_file_object: object exposing ``get_input_file_path()``;
            a falsy value yields ``None`` without running anything.
        :returns: path of the produced file, or ``None`` if it is missing.
        """
        if not input_file_object:
            return None

        source_path = input_file_object.get_input_file_path()
        expected_name = rename_filename_with_extension(
            os.path.basename(source_path), self.final_format)
        expected_path = os.path.join(self.tmp_dir, expected_name)

        command = CONVERTER_LOCATION.format(
            input_file_path=source_path,
            output_file_dir=self.tmp_dir,
        )
        self.execute(command)

        # The converter reports nothing back here; success is judged solely
        # by the presence of the expected output file.
        if os.path.isfile(expected_path):
            return expected_path

        self.handle_failed_conversion(input_file_object)
        log.error('Conversion failed from PDF => HTML')
        return None
Exemplo n.º 4
0
    def _single_convert(self, input_file_object):
        """Convert one HTML input to plain text with ``html2text``.

        The input stream is parsed with BeautifulSoup, a fixed list of
        elements is removed, and the remaining markup is passed to
        ``html2text`` (links and images ignored). The text is written to
        ``self.tmp_dir`` under the input's base name, renamed for
        ``self.final_format``.

        :param input_file_object: object exposing ``get_input_file_path()``
            and ``get_input_stream()``; a falsy value yields ``None``.
        :returns: path of the written output file, or ``None`` on failure.
        """
        if input_file_object:
            input_file_path = input_file_object.get_input_file_path()
            log.info('Converting {} to TXT'.format(input_file_path))
            html_to_text = html2text.HTML2Text()
            html_to_text.ignore_links = html_to_text.ignore_images = True
            try:
                input_stream = input_file_object.get_input_stream()
            except UnicodeDecodeError as e:
                self.handle_failed_conversion(input_file_object)
                log.error(
                    'Conversion failed from HTML => TXT for {} {}'.format(
                        input_file_path, e))
                return None

            soup = BeautifulSoup(input_stream)
            invalid_attrs = 'href src width height target \
            style color face size script'.split()

            # ``soup(name)`` looks up elements by tag name; every match is
            # detached from the parsed tree.
            for attr in invalid_attrs:
                for dom_el in soup(attr):
                    dom_el.extract()
            input_stream = unicode(soup)

            try:
                output_stream = html_to_text.handle(input_stream)
            except Exception as e:
                # ``except e:`` referenced an unbound name whenever the first
                # ``try`` succeeded, turning any failure here into a
                # NameError; catch and bind the exception properly instead.
                self.handle_failed_conversion(input_file_object)
                log.error(
                    'Conversion failed from HTML => TXT for {} {}'.format(
                        input_file_path, e))
                return None

            output_file_name = rename_filename_with_extension(
                os.path.basename(input_file_object.get_input_file_path()),
                self.final_format)
            output_file_path = os.path.join(self.tmp_dir, output_file_name)
            write_stream(output_file_path, output_stream)

            if os.path.isfile(output_file_path):
                return output_file_path
            else:
                self.handle_failed_conversion(input_file_object)

        # Falsy input or missing output file: explicit failure result.
        return None
Exemplo n.º 5
0
    def _single_convert(self, input_file_object):
        """Run the external converter on one input file, expecting a PDF.

        The converter is given the input path and ``self.tmp_dir`` as its
        output location. The expected result is the input's base name with a
        ``pdf`` extension inside that directory.

        :param input_file_object: object exposing ``get_input_file_path()``;
            a falsy value is logged as a failed conversion.
        :returns: path of the produced file, or ``None`` if it is missing.
        """
        if input_file_object:
            source_path = input_file_object.get_input_file_path()
            pdf_name = rename_filename_with_extension(
                os.path.basename(source_path), 'pdf')

            target_dir = self.tmp_dir
            command = CONVERTER_LOCATION.format(
                output_file_path=target_dir,
                input_file_path=source_path,
            )
            self.execute(command)

            # Success is judged by whether the expected file now exists.
            pdf_path = os.path.join(target_dir, pdf_name)
            if os.path.isfile(pdf_path):
                return pdf_path
            self.handle_failed_conversion(input_file_object)

        # Reached for a falsy input as well as for a missing output file.
        log.error('Conversion failed from DOC => PDF')
        return None
Exemplo n.º 6
0
    def _single_convert(self, input_file_object):
        """Run the external converter on one input file, expecting a PDF.

        The converter is given the input path and ``TMP_DIR`` as its output
        location. The expected result is the input's base name with a ``pdf``
        extension inside that directory.

        :param input_file_object: object exposing ``get_input_file_path()``;
            a falsy value is logged as a failed conversion.
        :returns: path of the produced file, or ``None`` if it is missing.
        """
        if input_file_object:
            input_file_path = input_file_object.get_input_file_path()
            output_file_name = rename_filename_with_extension(
                os.path.basename(input_file_path), 'pdf')

            output_file_path = TMP_DIR
            converter = CONVERTER_LOCATION.format(
                output_file_path=output_file_path,
                input_file_path=input_file_path)

            self.execute(converter)
            output_file = os.path.join(output_file_path, output_file_name)
            if os.path.isfile(output_file):
                return output_file
            else:
                # Pass the file object, not its path string, matching how the
                # failure handler is invoked by the sibling converters.
                self.handle_failed_conversion(input_file_object)

        # Reached for a falsy input as well as for a missing output file.
        log.error('Conversion failed from RTF => PDF')
        return None
Exemplo n.º 7
0
    def _single_convert(self, input_file_object):
        """Convert one text input to HTML with ``markdown2``.

        The input stream is rendered with ``markdown2.markdown`` and written
        to ``self.tmp_dir`` under the input's base name, renamed for
        ``self.final_format``.

        :param input_file_object: object exposing ``get_input_stream()`` and
            ``get_input_file_path()``; a falsy value is logged as a failure.
        :returns: path of the written output file, or ``None`` on failure.
        """
        if input_file_object:
            input_stream = input_file_object.get_input_stream()
            try:
                output_stream = markdown2.markdown(input_stream)
            except Exception as e:
                # A bare ``except:`` would also swallow KeyboardInterrupt and
                # SystemExit. Report through the logger and the failure
                # handler like the other failure paths, not via ``print``.
                self.handle_failed_conversion(input_file_object)
                log.error(
                    'Conversion failed from TXT => HTML for {} {}'.format(
                        input_file_object.get_input_file_path(), e))
                return None
            output_file_name = rename_filename_with_extension(
                os.path.basename(input_file_object.get_input_file_path()),
                self.final_format)
            output_file_path = os.path.join(self.tmp_dir, output_file_name)
            write_stream(output_file_path, output_stream)
            if os.path.isfile(output_file_path):
                return output_file_path
            else:
                self.handle_failed_conversion(input_file_object)

        # Reached for a falsy input as well as for a missing output file.
        log.error('Conversion failed from TXT => HTML')
        return None
Exemplo n.º 8
0
 def get_remote_location(self):
     """Return the remote path of this document's converted file.

     The path joins the configured ``REMOTE_DUMP_FOLDER``, ``self.doc_id``
     and the instance's filename renamed for ``self.output_format``.
     """
     converted_name = rename_filename_with_extension(
         self.file_instance.filename, self.output_format)
     dump_root = app.config['REMOTE_DUMP_FOLDER']
     return os.path.join(dump_root, self.doc_id, converted_name)