Пример #1
0
def test_lists_with_styles():
    """Convert ``lists_with_styles.docx`` and compare it to the expected HTML.

    The expected markup nests four ordered lists whose ``data-list-type``
    values are decimal, lower-roman, upper-alpha and lower-alpha.
    """
    here = path.abspath(path.dirname(__file__))
    docx_file = path.join(here, '..', 'fixtures', 'lists_with_styles.docx')
    expected_html = '''
    <html>
        <ol data-list-type="decimal">
            <li>AAA</li>
            <li>BBB
                <ol data-list-type="lower-roman">
                    <li>CCC</li>
                    <li>DDD
                        <ol data-list-type="upper-alpha">
                            <li>EEE
                                <ol data-list-type="lower-alpha">
                                    <li>FFF</li>
                                </ol>
                            </li>
                        </ol>
                    </li>
                </ol>
            </li>
        </ol>
    </html>
    '''
    assert_html_equal(convert(docx_file), expected_html)
def docx_converter():
    """Interactively convert every ``.docx`` file under a folder to HTML.

    Runs an endless prompt loop (Python 2 ``raw_input``): it asks for a source
    folder and then for an output folder. When both exist, the source tree is
    walked and each ``.docx`` file is converted with ``convert`` (images are
    routed through ``handle_image``). The markup is written to
    ``<output folder>/<original base name>.html``.

    NOTE(review): the loop never terminates on its own; after a conversion
    run it prompts again. That behaviour is kept from the original.
    """
    while True:
        docx_path = raw_input('Please enter path to folder containing .docx files:')
        if not os.path.exists(docx_path):
            print('Please enter a valid path.')
            continue

        write_path = raw_input('Please enter path to write html files to (conversion begins automatically): ')
        if not os.path.exists(write_path):
            print('Please enter a valid path.')
            continue

        for subdir, _dirs, files in os.walk(docx_path):
            for filename in files:
                base_name, ext = os.path.splitext(filename)
                if ext.lower() != '.docx':
                    continue

                # os.walk reports file names relative to `subdir`, so the
                # path must be built from it; joining with the root folder
                # pointed at non-existent files for anything in a subfolder.
                file_path = os.path.join(subdir, filename)
                html = convert(file_path, image_handler=handle_image)
                print('Converting: ' + filename)

                # Same base name as the .docx original, with an .html
                # extension so the result is recognisable as HTML.
                html_file_path = os.path.join(write_path, base_name + '.html')
                with open(html_file_path, 'w') as html_file:
                    html_file.write(html)
Пример #3
0
def test_tables_in_lists():
    """Convert ``tables_in_lists.docx`` and compare it to the expected HTML.

    The expected markup has a table embedded inside the second list item.
    """
    here = path.abspath(path.dirname(__file__))
    docx_file = path.join(here, '..', 'fixtures', 'tables_in_lists.docx')
    expected_html = '''
    <html>
        <ol data-list-type="decimal">
            <li>AAA</li>
            <li>BBB<br />
                <table>
                    <tr>
                        <td>CCC</td>
                        <td>DDD</td>
                    </tr>
                    <tr>
                        <td>EEE</td>
                        <td>FFF</td>
                    </tr>
                </table>
            </li>
            <li>GGG</li>
        </ol>
    </html>
    '''
    assert_html_equal(convert(docx_file), expected_html)
Пример #4
0
def test_bigger_font_size_to_header():
    """Show when p tags are turned into h tags based on font size.

    Skipped (``SkipTest``) when ``DETECT_FONT_SIZE`` is falsy. In the expected
    markup only the stand-alone paragraph becomes an ``<h2>``; the "bigger"
    text inside the list and the table stays as it is.
    """
    if not DETECT_FONT_SIZE:
        raise SkipTest('Font size detection is disabled.')

    here = path.abspath(path.dirname(__file__))
    docx_file = path.join(
        here, '..', 'fixtures', 'bigger_font_size_to_header.docx')
    expected_html = '''
    <html>
        <p>Paragraphs:</p>
        <h2>Header</h2>
        <p>paragraph 1</p>
        <p>Lists:</p>
        <ol data-list-type="decimal">
            <li>bigger</li>
            <li>smaller</li>
        </ol>
        <p>Tables:</p>
        <table>
            <tr>
                <td>bigger</td>
                <td>smaller</td>
            </tr>
        </table>
    </html>
    '''
    assert_html_equal(convert(docx_file), expected_html)
Пример #5
0
def test_shift_enter():
    """Convert ``shift_enter.docx`` and check the ``<br />`` tags it yields.

    Only ``convert`` is exercised here (no ``clean_html`` step) so that the
    first break tag is still present in the compared output.
    """
    here = path.abspath(path.dirname(__file__))
    docx_file = path.join(here, '..', 'fixtures', 'shift_enter.docx')
    expected_html = '''
    <html>
        <p>AAA<br />BBB</p>
        <p>CCC</p>
        <ol data-list-type="decimal">
            <li>DDD<br />EEE</li>
            <li>FFF</li>
        </ol>
        <table>
            <tr>
                <td>GGG<br />HHH</td>
                <td>III<br />JJJ</td>
            </tr>
            <tr>
                <td>KKK</td>
                <td>LLL</td>
            </tr>
        </table>
    </html>
    '''
    assert_html_equal(convert(docx_file), expected_html)
Пример #6
0
def test_convert_p_to_h():
    """Show when a p tag is turned into an h tag based on bold/italics.

    In the expected markup the first two paragraphs become ``<h2>`` elements,
    while bold/italic text inside list items and table cells keeps its
    ``<strong>``/``<em>`` tags.
    """
    here = path.abspath(path.dirname(__file__))
    docx_file = path.join(here, '..', 'fixtures', 'convert_p_to_h.docx')
    expected_html = '''
    <html>
        <h2>AAA</h2>
        <h2>BBB</h2>
        <p>CCC</p>
        <ol data-list-type="decimal">
            <li><strong>DDD</strong></li>
            <li><em>EEE</em></li>
            <li>FFF</li>
        </ol>
        <table>
            <tr>
                <td><strong>GGG</strong></td>
                <td><em>HHH</em></td>
            </tr>
            <tr>
                <td>III</td>
                <td>JJJ</td>
            </tr>
        </table>
    </html>
    '''
    assert_html_equal(convert(docx_file), expected_html)
Пример #7
0
def test_fall_back():
    """Check that ``convert`` returns the ``fall_back`` result for a .doc path.

    ``convert`` is called with ``test.doc``, a stub ``fall_back`` callable and
    ``_converter``; the stub's return value must come back unchanged.
    """
    def stub_fall_back(*args, **kwargs):
        return 'success'

    result = convert(
        'test.doc',
        fall_back=stub_fall_back,
        converter=_converter,
    )
    assert result == 'success'
Пример #8
0
def docxView(request):
    """Render a stored ``PM_Files`` document as HTML.

    The file is selected by the ``f`` query parameter (a primary key).
    ``docx`` files are converted with ``docx2html.convert``; embedded images
    are copied into ``MEDIA_ROOT`` and referenced under ``/protected``.
    ``xlsx`` files are passed to ``excelToHtml``. Any other case (no ``f``,
    an unknown or non-numeric id, another file type) leaves ``html`` as None.

    Returns an ``HttpResponse`` built from the resulting ``html`` value.
    """
    # TODO: rewrite this whole view properly.
    from shutil import copyfile
    from docx2html import convert

    def handle_image(image_id, relationship_dict):
        image_path = relationship_dict[image_id]
        # Copy the image next to the other media files so it can be served.
        _, filename = os.path.split(image_path)
        destination_path = os.path.join(MEDIA_ROOT, filename)
        copyfile(image_path, destination_path)

        # Return the `src` attribute to be used in the img tag
        return '/protected%s%s' % (MEDIA_URL, filename)

    fp = request.GET.get('f', None)
    html = None
    pm_file = None
    if fp:
        # Only the lookup is guarded: a non-numeric `f` (ValueError from
        # int()) is treated like an unknown id instead of crashing the view,
        # while errors raised by the converters still propagate.
        try:
            pm_file = PM_Files.objects.get(pk=int(fp))
        except (ValueError, PM_Files.DoesNotExist):
            pm_file = None

    if pm_file is not None:
        if pm_file.type == 'docx':
            html = convert(str(pm_file.file.path),
                           image_handler=handle_image)
        elif pm_file.type == 'xlsx':
            html = excelToHtml(str(pm_file.file.path))

    # NOTE(review): `html` is still None when nothing was converted; confirm
    # that the body HttpResponse produces for None is what clients expect.
    return HttpResponse(html)
Пример #9
0
def test_nested_tables():
    """Convert ``nested_tables.docx`` and compare it to the expected HTML.

    The expected markup has a 2x2 table nested in the last cell of the outer
    table, followed by a ``<br />``.
    """
    here = path.abspath(path.dirname(__file__))
    docx_file = path.join(here, '..', 'fixtures', 'nested_tables.docx')
    # TODO: find out why the br tag after the nested table is there.
    expected_html = '''
    <html>
        <table>
          <tr>
            <td>AAA</td>
            <td>BBB</td>
          </tr>
          <tr>
            <td>CCC</td>
            <td>
              <table>
                <tr>
                  <td>DDD</td>
                  <td>EEE</td>
                </tr>
                <tr>
                  <td>FFF</td>
                  <td>GGG</td>
                </tr>
              </table>
              <br />
            </td>
          </tr>
        </table>
    </html>
    '''
    assert_html_equal(convert(docx_file), expected_html)
Пример #10
0
def test_list_to_header():
    """Convert ``list_to_header.docx`` and compare it to the expected HTML.

    List item ``GGG`` is upper roman in the Word document; it stays a list
    item in the expected output to show that only top-level upper romans get
    converted to headers.
    """
    here = path.abspath(path.dirname(__file__))
    docx_file = path.join(here, '..', 'fixtures', 'list_to_header.docx')
    expected_html = '''
    <html>
        <h2>AAA</h2>
        <ol data-list-type="decimal">
            <li>BBB</li>
        </ol>
        <h2>CCC</h2>
        <ol data-list-type="decimal">
            <li>DDD</li>
        </ol>
        <h2>EEE</h2>
        <ol data-list-type="decimal">
            <li>FFF
                <ol data-list-type="upper-roman">
                    <li>GGG</li>
                </ol>
            </li>
        </ol>
    </html>
    '''
    assert_html_equal(convert(docx_file), expected_html)
Пример #11
0
def test_extract_html():
    """Convert ``simple.docx`` and compare it to the expected HTML.

    The expected markup contains one paragraph, one three-item ordered list
    and one 2x2 table.
    """
    here = path.abspath(path.dirname(__file__))
    docx_file = path.join(here, '..', 'fixtures', 'simple.docx')
    expected_html = '''
    <html>
        <p>
          Simple text
        </p>
        <ol data-list-type="decimal">
          <li>one</li>
          <li>two</li>
          <li>three</li>
        </ol>
        <table>
          <tr>
            <td>Cell1</td>
            <td>Cell2</td>
          </tr>
          <tr>
            <td>Cell3</td>
            <td>cell4</td>
          </tr>
        </table>
    </html>
    '''
    assert_html_equal(convert(docx_file), expected_html)
Пример #12
0
def test_nested_table_rowspan():
    """Convert ``nested_table_rowspan.docx`` and compare it to the expected HTML.

    The expected markup has a ``colspan`` cell in the outer table and a
    ``rowspan`` cell in the nested table.
    """
    here = path.abspath(path.dirname(__file__))
    docx_file = path.join(here, '..', 'fixtures', 'nested_table_rowspan.docx')
    expected_html = '''
    <html>
        <table>
            <tr>
                <td colspan="2">AAA</td>
            </tr>
            <tr>
                <td>BBB</td>
                <td>
                    <table>
                        <tr>
                            <td rowspan="2">CCC</td>
                            <td>DDD</td>
                        </tr>
                        <tr>
                            <td>EEE</td>
                        </tr>
                    </table>
                    <br />
                </td>
            </tr>
        </table>
    </html>
    '''
    assert_html_equal(convert(docx_file), expected_html)
Пример #13
0
def test_has_title():
    """Convert ``has_title.docx`` and expect a single ``Text`` paragraph."""
    here = path.abspath(path.dirname(__file__))
    docx_file = path.join(here, '..', 'fixtures', 'has_title.docx')
    expected_html = '''<html><p>Text</p></html>'''
    assert_html_equal(convert(docx_file), expected_html)
Пример #14
0
def test_unicode():
    """Convert ``greek_alphabet.docx`` and check that a result is returned.

    Only asserts that ``convert`` does not return None.
    """
    here = path.abspath(path.dirname(__file__))
    docx_file = path.join(here, '..', 'fixtures', 'greek_alphabet.docx')
    converted = convert(docx_file)
    assert converted is not None
Пример #15
0
def convert_doc_to_html(input, outdir):  # LibreOffice
    """Convert a .docx file to an HTML string and return it unescaped.

    The document at ``input`` is converted with docx2html's ``convert``.
    According to the original author, docx2html escapes Chinese characters
    as HTML entities, so the result is unescaped before it is returned.
    The original code discarded the unescaped string and returned nothing.

    ``outdir`` is only echoed in the progress message. It appears to be a
    leftover from an earlier LibreOffice (``soffice --convert-to``) approach.

    Returns:
        The converted markup with HTML entities turned back into characters.
    """
    print(input, '-->', outdir)

    # Turn the .docx file into an HTML string with docx2html.
    markup = convert(input)

    # html.unescape is the Python 3 replacement for HTMLParser().unescape,
    # which no longer exists on Python 3.9+. Fall back to the parser method
    # on interpreters that do not provide it.
    try:
        from html import unescape
    except ImportError:
        unescape = HTMLParser.HTMLParser().unescape

    return unescape(markup)
Пример #16
0
def test_special_chars():
    """Convert ``special_chars.docx`` and check how ``& < >`` are escaped.

    The expected markup escapes the characters both in the text and inside
    the hyperlink's ``href``.
    """
    here = path.abspath(path.dirname(__file__))
    docx_file = path.join(here, '..', 'fixtures', 'special_chars.docx')
    expected_html = '''
    <html><p>&amp; &lt; &gt; <a href="https://www.google.com/?test=1&amp;more=2">link</a></p></html>'''  # noqa
    assert_html_equal(convert(docx_file), expected_html)
Пример #17
0
def test_inline_tags():
    """Convert ``inline_tags.docx`` and compare the inline markup.

    The expected sentence contains ``<strong>``, ``<em>`` and ``<a>`` tags;
    the word "underline" is expected inside ``<strong>`` as well.
    """
    here = path.abspath(path.dirname(__file__))
    docx_file = path.join(here, '..', 'fixtures', 'inline_tags.docx')
    expected_html = '''
    <html><p>This sentence has some <strong>bold</strong>, some <em>italics</em> and some <strong>underline</strong>, as well as a <a href="http://www.google.com/">hyperlink</a>.</p></html>'''  # noqa
    assert_html_equal(convert(docx_file), expected_html)
Пример #18
0
def readdoc(sfp, dfp):
    """Convert the .docx file at ``sfp`` to HTML and write it to ``dfp``.

    The markup returned by ``convert`` is unescaped (HTML entities turned
    back into characters) and written to ``dfp`` as UTF-8. Both paths and
    two progress messages are printed along the way.

    Args:
        sfp: Path of the source .docx file.
        dfp: Path of the HTML file to write.
    """
    # The original called a non-existent `enescape` method on an HTMLParser
    # instance; html.unescape is the standard-library function for this.
    from html import unescape

    print(sfp)
    print(dfp)
    docn = convert(sfp)
    htmltemp = unescape(docn)
    print('读取docx文件成功')
    with open(dfp, 'w', encoding='utf-8') as f:
        f.write(htmltemp)
        print('写入docx文件成功')
    # TODO: the HTML generated from the Word file still needs to be tuned to
    # the customer's requirements.
Пример #19
0
def test_html_files(patch_zip_handler, patch_read):
    """Check that .html and .htm paths are returned as read.

    ``patch_zip_handler`` is configured to raise if it is ever called and
    ``patch_read`` to return ``'test'``. Both are presumably mocks injected
    by patch decorators that are not part of this snippet.
    """
    def fail_if_called(*args, **kwargs):
        raise AssertionError('Should not have called get_zip_file_handler')

    def fake_read(*args, **kwargs):
        return 'test'

    patch_zip_handler.side_effect = fail_if_called
    patch_read.side_effect = fake_read

    # First an html file, then an htm file.
    for file_path in ('test.html', 'test.htm'):
        html = convert(file_path)
        assert html == 'test'
Пример #20
0
def test_table_col_row_span():
    """Convert ``table_col_row_span.docx`` and compare it to the expected HTML.

    The expected markup contains two tables that use ``colspan`` and
    ``rowspan``, including one cell carrying both attributes.
    """
    here = path.abspath(path.dirname(__file__))
    docx_file = path.join(here, '..', 'fixtures', 'table_col_row_span.docx')
    expected_html = '''
    <html>
      <table>
        <tr>
          <td colspan="2">AAA</td>
        </tr>
        <tr>
          <td rowspan="2">BBB</td>
          <td>CCC</td>
        </tr>
        <tr>
          <td>DDD</td>
        </tr>
        <tr>
          <td>EEE</td>
          <td rowspan="2">FFF</td>
        </tr>
        <tr>
          <td>GGG</td>
        </tr>
      </table>
      <table>
        <tr>
          <td>1</td>
          <td>2</td>
          <td>3</td>
          <td>4</td>
        </tr>
        <tr>
          <td>5</td>
          <td colspan="2" rowspan="2">6</td>
          <td>7</td>
        </tr>
        <tr>
          <td>8</td>
          <td>9</td>
        </tr>
        <tr>
          <td>10</td>
          <td>11</td>
          <td>12</td>
          <td>13</td>
        </tr>
      </table>
    </html>
    '''
    assert_html_equal(convert(docx_file), expected_html)
Пример #21
0
def test_track_changes_on():
    """Convert ``track_changes_on.docx`` and expect one plain paragraph."""
    here = path.abspath(path.dirname(__file__))
    docx_file = path.join(here, '..', 'fixtures', 'track_changes_on.docx')
    expected_html = '''
    <html><p>This was some content.</p></html>
    '''
    assert_html_equal(convert(docx_file), expected_html)
Пример #22
0
 def save_editor(self, request, pk=None, *args, **kwargs):
     """Save an edited document back into encrypted storage.

     Steps:
       1. Decrypt the stored file into a temporary plaintext copy under
          ``MEDIA_ROOT``, convert it to HTML with ``convert`` (kept on
          ``self.org_data``) and delete the plaintext copy again.
       2. If that HTML differs from ``request.data``: write a version log
          entry, save ``request.data['data']`` as ``<stem>.html`` in the
          owner's directory, convert it back to the original format with
          ``unoconv``, encrypt the result and remove the intermediates.

     Always returns ``Response({"hai": 'hai'})``.

     NOTE(review): the encryption password is hard-coded and the WHERE
     clause is built by concatenating ``pk``; both are kept because the
     helpers' interfaces are not visible here, but they deserve a follow-up.
     NOTE(review): ``self.org_data`` is compared with the whole
     ``request.data`` object, not with ``request.data['data']`` - confirm
     that this is intended.
     """
     # Local import: unoconv used to be started through a shell string.
     import subprocess

     file_res = File.objects.get(id=pk)
     stored_name = file_res.modified_file_name

     # The owner's directory was looked up with an identical query at every
     # use; resolve it once.
     owner_id = crud.get(self.table, "*", 'where id=' + pk)[0].get('owner_id')
     owner_dir = settings.MEDIA_ROOT + str(owner_id)
     plain_path = settings.MEDIA_ROOT + stored_name

     decrypt_file(
         owner_dir + '/' + base64.b16encode(stored_name),
         plain_path,
         '123')
     try:
         self.org_data = convert(settings.MEDIA_ROOT +
                                 FileSerializer(File.objects.get(
                                     id=pk)).data.get('modified_file_name'))
     finally:
         # Never leave the decrypted copy behind, even if conversion fails.
         os.remove(plain_path)

     if self.org_data != request.data:
         # Activity log
         param = {'field': 'file_id', 'file_id': pk, 'label': 'version'}
         track_fields = {
             c.can_read: c.read,
             c.can_write: c.write,
             c.can_delete: c.delete
         }
         request_data = {'user_id': request.user.id}
         log_view = LogView()
         log_view.generate_log(request_data, param, "", track_fields)

         html_path = owner_dir + '/' + stored_name.split('.')[0] + '.html'
         with open(html_path, 'w') as f:
             f.write(request.data['data'].encode())

         os.chdir(owner_dir)
         # Argument list instead of a shell string, so unusual characters in
         # the file name cannot be interpreted by a shell.
         subprocess.call([
             'unoconv',
             '--format=' + file_res.name.split('.')[-1],
             html_path,
         ])
         time.sleep(3)
         os.remove(html_path)

         cwd = os.getcwd()
         encrypt_file(
             cwd + '/' + stored_name,
             cwd + '/' + base64.b16encode(stored_name), '123')
         os.remove(cwd + '/' + stored_name)
     return Response({"hai": 'hai'})
Пример #23
0
def main():
    """Command-line entry point: parse a .docx file, or a .pdf via .docx.

    Usage: ``filename.py <mypresentation.pdf | mypresentation.docx> <Method#>``

    * With a single argument, ``DocxToPdf`` is called on it and the program
      then exits with the usage message (as before).
    * With no argument, the program exits with the usage message.
    * Otherwise a ``.docx`` source is converted to HTML directly. Any other
      source is looked up in the ``pdf`` folder next to this file, converted
      to .docx with LibreOffice into the ``docx`` folder, and that .docx is
      converted to HTML.

    The HTML is passed to ``DataComputer``; method ``1`` runs ``Method1`` and
    any other value runs ``Method2``.
    """
    # sys.argv[1] only exists when an argument was actually supplied.
    if len(sys.argv) == 2:
        DocxToPdf(sys.argv[1])
    if len(sys.argv) < 3:
        sys.exit(
            "Usage: filename.py mypresentation.pdf / mypresentation.docx Method#"
        )
    src = sys.argv[1]
    parse_method = sys.argv[2]

    if os.path.splitext(src)[1].lower() == '.docx':
        docx_src = src
    else:
        basedir = os.path.dirname(os.path.realpath(__file__))
        pdfdir = os.path.normpath(basedir + '/pdf')
        docxdir = os.path.normpath(basedir + '/docx')
        abspath_pdf = os.path.normpath(os.path.join(pdfdir, src))
        # Argument list (no shell), so spaces or quotes in the path cannot
        # break or alter the command.
        subprocess.call([
            '/usr/bin/soffice',
            '--infilter=writer_pdf_import',
            '--convert-to', 'docx',
            abspath_pdf,
            '--outdir', docxdir,
        ])
        # Kept from the original; presumably gives LibreOffice time to
        # finish writing the output file.
        time.sleep(5)
        # soffice writes <base name>.docx into the output directory.
        stem = os.path.splitext(os.path.basename(src))[0]
        docx_src = os.path.join(docxdir, stem + '.docx')

    html = convert(docx_src)
    Var_E = DataComputer(html)
    # Command-line arguments are strings; comparing with the integer 1 was
    # never true, so Method1 could not be selected.
    if parse_method == '1':
        Method1(html, Var_E)
    else:
        Method2(html, Var_E)
Пример #24
0
def test_upper_alpha_all_bold():
    """Convert ``upper_alpha_all_bold.docx`` and expect three ``<h2>`` tags."""
    here = path.abspath(path.dirname(__file__))
    docx_file = path.join(here, '..', 'fixtures', 'upper_alpha_all_bold.docx')
    expected_html = '''
    <html>
        <h2>AAA</h2>
        <h2>BBB</h2>
        <h2>CCC</h2>
    </html>
    '''
    assert_html_equal(convert(docx_file), expected_html)
Пример #25
0
def test_headers_with_full_line_styles():
    """Show that a fully bold/italic natural header loses that styling.

    In the expected markup the first two headers carry no inline tags, while
    the partially styled third header keeps its ``<strong>``/``<em>`` tags.
    """
    here = path.abspath(path.dirname(__file__))
    docx_file = path.join(
        here, '..', 'fixtures', 'headers_with_full_line_styles.docx')
    expected_html = '''
    <html>
        <h2>AAA</h2>
        <h2>BBB</h2>
        <h2><strong>C</strong><em>C</em>C</h2>
    </html>
    '''
    assert_html_equal(convert(docx_file), expected_html)
Пример #26
0
def test_simple_list():
    """Convert ``simple_lists.docx``: one ordered and one unordered list."""
    here = path.abspath(path.dirname(__file__))
    docx_file = path.join(here, '..', 'fixtures', 'simple_lists.docx')
    expected_html = '''
    <html>
        <ol data-list-type="decimal">
            <li>One</li>
        </ol>
        <ul>
            <li>two</li>
        </ul>
    </html>
    '''
    assert_html_equal(convert(docx_file), expected_html)
Пример #27
0
def test_split_headers():
    """Convert a temp copy of ``split_header.docx`` with a stub image handler.

    The expected markup is two ``<h2>`` headers with a paragraph in between.
    """
    filename = 'split_header.docx'
    here = path.abspath(path.dirname(__file__))
    fixture_path = path.join(here, '..', 'fixtures', filename)

    # preserve_images must be true for the image not to be removed. That is
    # handled in build_import; here it has to be set to True manually.
    tmp_docx_path, _unused = _copy_file_to_tmp_dir(fixture_path, filename)

    def stub_image_handler(*args, **kwargs):
        return 'test'

    expected_html = '''
    <html><h2>AAA</h2><p>BBB</p><h2>CCC</h2></html>
    '''
    assert_html_equal(
        convert(tmp_docx_path, image_handler=stub_image_handler),
        expected_html,
    )
Пример #28
0
def test_has_image():
    """Convert a temp copy of ``has_image.docx`` without an image handler.

    The expected ``<img>`` ``src`` points at ``word/media/image1.gif`` below
    the second value returned by ``_copy_file_to_tmp_dir``.
    """
    filename = 'has_image.docx'
    here = path.abspath(path.dirname(__file__))
    fixture_path = path.join(here, '..', 'fixtures', filename)

    # preserve_images must be true for the image not to be removed. That is
    # handled in build_import; here it has to be set to True manually.
    tmp_docx_path, dp = _copy_file_to_tmp_dir(fixture_path, filename)

    expected_html = '''
    <html>
    <p>AAA<img src="%s/word/media/image1.gif" height="55" width="260" /></p>
    </html>
    ''' % dp
    assert_html_equal(convert(tmp_docx_path), expected_html)
Пример #29
0
def test_nested_list():
    """Convert ``nested_lists.docx`` and compare it to the expected HTML.

    The expected markup holds two ordered lists and one unordered list, each
    with nested sub-lists.
    """
    here = path.abspath(path.dirname(__file__))
    docx_file = path.join(here, '..', 'fixtures', 'nested_lists.docx')
    expected_html = '''
    <html>
        <ol data-list-type="decimal">
            <li>one</li>
            <li>two</li>
            <li>three
                <ol data-list-type="decimal">
                    <li>AAA</li>
                    <li>BBB</li>
                    <li>CCC
                        <ol data-list-type="decimal">
                            <li>alpha</li>
                        </ol>
                    </li>
                </ol>
            </li>
            <li>four</li>
        </ol>
        <ol data-list-type="decimal">
            <li>xxx
                <ol data-list-type="decimal">
                    <li>yyy</li>
                </ol>
            </li>
        </ol>
        <ul>
            <li>www
                <ul>
                    <li>zzz</li>
                </ul>
            </li>
        </ul>
    </html>
    '''
    assert_html_equal(convert(docx_file), expected_html)
Пример #30
0
def test_has_image_using_image_handler():
    """Convert a temp copy of ``has_image.docx`` with a stub image handler.

    The value returned by the handler (``'test'``) is expected as the
    ``src`` of the generated ``<img>`` tag.
    """
    filename = 'has_image.docx'
    here = path.abspath(path.dirname(__file__))
    fixture_path = path.join(here, '..', 'fixtures', filename)

    # preserve_images must be true for the image not to be removed. That is
    # handled in build_import; here it has to be set to True manually.
    tmp_docx_path, _unused = _copy_file_to_tmp_dir(fixture_path, filename)

    def stub_image_handler(*args, **kwargs):
        return 'test'

    expected_html = '''

    <html><p>AAA<img src="test" height="55" width="260" /></p></html>
    '''
    assert_html_equal(
        convert(tmp_docx_path, image_handler=stub_image_handler),
        expected_html,
    )
Пример #31
0
def test_fake_headings_by_length():
    """Show that turning p tags into h tags has a length limit.

    A paragraph that would otherwise become a header is left alone when it
    has more than seven words; the expected markup keeps the long bold
    sentence as a ``<p>``.
    """
    here = path.abspath(path.dirname(__file__))
    docx_file = path.join(
        here, '..', 'fixtures', 'fake_headings_by_length.docx')
    expected_html = '''
    <html>
        <h2>Heading.</h2>
        <h2>Still a heading.</h2>
        <p>
        <strong>This is not a heading because it is too many words.</strong>
        </p>
    </html>
    '''
    assert_html_equal(convert(docx_file), expected_html)
Пример #32
0
def course_syllabus(request, slug):
	"""Handle upload of a course syllabus (.docx) and store its HTML form.

	On a POST with a valid formset, each form is saved for the requesting
	user and course, the uploaded file is fetched from S3 into BASE_DIR,
	converted with ``convert`` and the markup is stored on the Syllabus
	row. A JSON body ``{"response": true}`` is returned on success and
	``{"response": false}`` when the upload name does not contain '.docx'.

	NOTE(review): no response is returned on the visible paths for
	non-POST requests or an invalid formset - possibly handled in code
	that is not part of this snippet.
	NOTE(review): the three AWS assignments below have no right-hand
	side (only a placeholder comment), so the block does not parse as
	written; the real values were presumably redacted.
	"""
	course = Course.objects.get(slug=slug)
	syllabus = Syllabus.objects.filter(course=course,user=request.user)
	SyllabusFormset = modelformset_factory(Syllabus, form=SyllabusForm, extra=1, max_num=1)
	formset_syl = SyllabusFormset(request.POST or None, request.FILES or None, queryset=syllabus)
	# Two levels above this file; downloaded S3 objects are written below it.
	BASE_DIR = os.path.dirname(os.path.dirname(__file__))
	
	# Debug output (Python 2 print statement).
	print tempfile.gettempdir()
	if request.method == 'POST':
		if formset_syl.is_valid():
			for form in formset_syl:
				form1 = form.save(commit=False)

				form1.user = request.user
				form1.path = request.get_full_path()
				form1.course = course
				# Substring test on the upload name; anything without
				# '.docx' in it is rejected with a JSON "false" response.
				if '.docx' in str(form1.syllabus):
					form1.save()
				else:
					data = {}
					data['response'] = False
					new_data = json.dumps(data)
					return HttpResponse(new_data, content_type='application/json')
				AWS_KEY = #key
				AWS_SECRET = #secret
				aws_connection = S3Connection(AWS_KEY, AWS_SECRET)
				obj = Syllabus.objects.get(course=course,user=request.user)
				bucket_name = #bucket
				# Download the stored upload so it can be converted locally.
				key = aws_connection.get_bucket(bucket_name).get_key('media/' + str(obj.syllabus))
				res = key.get_contents_to_filename(BASE_DIR +'/'+key.name)
				
				html = convert(BASE_DIR +'/'+key.name)
				obj.html = html
				obj.save()

				response_data = {}
			    
				response_data['response'] = True
				new_data = json.dumps(response_data)
			        
				# Returns from inside the loop, i.e. after the first form.
				return HttpResponse(new_data, content_type='application/json')
Пример #33
0
def test_headers():
    """Convert ``headers.docx`` and check the heading level mapping.

    In the expected markup "H1" becomes ``<h2>``, each following level is
    shifted by one, and everything from "H5" on is an ``<h6>``.
    """
    here = path.abspath(path.dirname(__file__))
    docx_file = path.join(here, '..', 'fixtures', 'headers.docx')
    expected_html = '''
    <html>
        <h2>This is an H1</h2>
        <h3>This is an H2</h3>
        <h4>This is an H3</h4>
        <h5>This is an H4</h5>
        <h6>This is an H5</h6>
        <h6>This is an H6</h6>
        <h6>This is an H7</h6>
        <h6>This is an H8</h6>
        <h6>This is an H9</h6>
        <h6>This is an H10</h6>
    </html>
    '''
    assert_html_equal(convert(docx_file), expected_html)
Пример #34
0
def conv_core(docx_filename_path,docx_filename,op_mode):
    """Convert one .docx file into a templated HTML page.

    If ``docx_filename_path`` is not an existing file, an error is printed
    and ``convert_engine()`` is called. Otherwise the document is converted
    with ``convert``, its images are copied to ``html/images/screenshots/``,
    the markup is post-processed with a series of string replacements,
    merged into ``template.html`` and written to
    ``html/<docx_filename>.html``. Finally ``docx/word`` is removed and a
    summary is printed.

    Args:
        docx_filename_path: Path of the .docx file to convert.
        docx_filename: Name used for the output page, the navigation link
            and the image file prefix.
        op_mode: When truthy, wait for the user (``raw_input``, Python 2)
            before returning.
    """
    separator = '******************************************************'

    if not os.path.isfile(docx_filename_path):
        print(separator)
        print('ERROR:')
        print('-File not found in path %s' % docx_filename_path)
        print('-Also make sure that the file extension is ".docx"')
        convert_engine()
        return

    def handle_image(image_id, relationship_dict):
        image_path = relationship_dict[image_id]
        # Copy the image into the screenshots folder, prefixing its name
        # with the document name instead of "image".
        _, filename = os.path.split(image_path)
        filename = filename.replace('image', '%s_' % docx_filename)
        destination_path = os.path.join('html/images/screenshots/', filename)
        copyfile(image_path, destination_path)

        # Return the `src` attribute to be used in the img tag
        return 'images/screenshots/%s' % filename

    html = convert(docx_filename_path, image_handler=handle_image)
    html = html.replace('<html>', '').replace('</html>', '').replace('<p>Group type</p>', '')
    # Only the first paragraph is promoted to a heading (count=1).
    html = html.replace('<p>', '<h3>', 1).replace('</p>', '</h3>', 1).replace('&#160;', '&nbsp;')
    html = html.replace('<ol', '<ul').replace('</ol>', '</ul>').replace('data-list-type="decimal"', 'class="number-list"')
    html = html.replace('&#8220;', '&ldquo;').replace('<table>', '<table border="0" cellpadding="0">')
    html = html.replace('<img', '<img class="img-responsive"').replace('height="157" width="624" />', '/>')

    active_link = '<li class="active-topic"><span><a href="%(1)s.html">%(2)s</a></span></li>' % {"1": docx_filename, "2": docx_filename}
    with open('template.html', 'r') as myfile:
        data = myfile.read()
    data = data.replace('<!--conv_active_link-->', active_link).replace('<!--conv_content-->', html)

    # Context manager so the output file is closed even if the write fails.
    with open("html/%s.html" % docx_filename, "wb") as fo:
        fo.write(data)

    # Presumably the scratch directory left behind by the conversion.
    shutil.rmtree('docx/word')
    print(separator)
    progressbar('Converting:', 'Complete')
    print(separator)
    print('TASK COMPLETED SUCCESSFULLY :)')
    print('-Output path :html/%s.html' % docx_filename)
    if op_mode:
        raw_input()
Пример #35
0
from docx2html import convert

# Convert the Word document to HTML and save the markup to a text file.
html = convert('./Geit (utkast).docx')
# 'rw+' is not a valid open() mode; the file is only written, so open it
# for writing and let the context manager close it.
with open('testfile.txt', 'w') as myfile:
    myfile.write(html)