Python gdocs_to_cnxml示例

编程语言: Python

命名空间/包名称: gdocs2cnxml

方法/功能: gdocs_to_cnxml

hotexamples.com的示例: 2

Python gdocs_to_cnxml - 共找到 2 个示例。以下是从开源项目中提取的、评分最高的 gdocs2cnxml.gdocs_to_cnxml 真实 Python 代码示例。您可以为示例评分，帮助我们提高示例质量。

示例#1

显示文件

文件： testbed_gdocs.py 项目： yingjin/oerpub.rhaptoslabs.html_gdocs2cnxml

def main():
    """Run the Google Docs testbed: fetch, transform and validate documents.

    Reads document URLs from the testbed input file (lines starting with
    ``#`` are treated as comments). For every URL that contains a Google
    Docs document ID it:

    1. fetches the document HTML through the client returned by
       ``getAuthorizedGoogleDocsClient()``,
    2. writes the HTML to ``<id>.htm`` in a per-document output directory,
    3. converts it with ``gdocs_to_cnxml`` (image download enabled) and
       writes the returned images and the CNXML (``<id>.xml``),
    4. validates the CNXML with ``jing_validate_file`` (log in ``<id>.log``)
       unless the first command line argument is ``-noval``.

    Exits with status 1 when ``java_installed()`` reports that Java is
    missing.
    """
    # Make sure Java is installed (needed for Jing).
    if not java_installed():
        print("ERROR: Could not find Java. Please keep sure that Java is installed and available.")
        sys.exit(1)

    # The command line does not change while we run, so decide once.
    skip_validation = len(sys.argv) > 1 and sys.argv[1] == '-noval'

    # Start from an empty testbed output folder.
    delete_all_contents_of_folder(TESTBED_OUTPUT_DIR)
    # Log in to gdocs and get a client object.
    gd_client = getAuthorizedGoogleDocsClient()

    # File with public GDocs document URLs (<- the testbed for GDocs).
    urls_path = os.path.join(TESTBED_INPUT_DIR, TESTBED_INPUT_URLS_FILE)
    with open(urls_path) as url_file:
        for line in url_file:
            # Strip the line ending: '[^/]+' in the pattern below also
            # matches '\n', so an unstripped URL that ends right after the
            # ID would yield a document ID with a trailing newline.
            url = line.strip()
            if url.startswith('#'):   # ignore comments
                continue

            # Only handle lines that really are gdocs document URLs.
            match_doc_id = re.match(r'^.*docs\.google\.com/document/d/([^/]+).*$', url)
            if not match_doc_id:
                continue
            doc_id = match_doc_id.group(1)

            # Create a sub directory named like the ID.
            doc_output_dir = os.path.join(TESTBED_OUTPUT_DIR, doc_id)
            try:
                os.mkdir(doc_output_dir)
            except OSError:
                pass    # ignored on purpose: the directory may already exist

            doc_key = 'document:' + doc_id
            print_status('Getting ' + doc_key)

            # Ask the API for the content URL instead of trusting the URL
            # from the input file, then download the HTML.
            gd_entry = gd_client.GetDoc(doc_key)
            gd_entry_url = gd_entry.content.src
            html = gd_client.get_file_content(gd_entry_url)     # requires a URL

            # Write testbed source HTML output.
            html_filename = os.path.join(doc_output_dir, doc_id + '.htm')
            with open(html_filename, 'w') as html_file:
                html_file.write(html)

            print_status('Transforming and get images from %s' % doc_key)

            # Transformation; also returns the images keyed by file name.
            cnxml, objects = gdocs_to_cnxml(html, bDownloadImages=True)

            # Write testbed images.
            for image_name, image in objects.iteritems():
                image_path = os.path.join(doc_output_dir, image_name)
                with open(image_path, 'wb') as image_file:  # binary, important!
                    image_file.write(image)

            # Write testbed CNXML output.
            cnxml_filename = os.path.join(doc_output_dir, doc_id + '.xml')
            with open(cnxml_filename, 'w') as cnxml_file:
                cnxml_file.write(cnxml)

            # Validate CNXML output with Jing Relax NG.
            if skip_validation:
                print_status('Validation skipped')
            else:
                print_status('Validating %s' % doc_key)
                jing_log_filename = os.path.join(doc_output_dir, doc_id + '.log')
                jing_validate_file(cnxml_filename, jing_log_filename)

    print_status('Finished!')

示例#2

显示文件

文件： testbed_gdocs_simple_http.py 项目： oerpub/oerpub.rhaptoslabs.html_gdocs2cnxml

def main():
    """Run the Google Docs testbed using plain HTTP downloads.

    Reads document URLs from the testbed input file (lines starting with
    ``#`` are treated as comments). For every URL that contains a Google
    Docs document ID it:

    1. downloads the HTML export and the Kix export of the document with
       ``httplib2`` (redirects are not followed),
    2. writes the HTML to ``<id>.htm`` in a per-document output directory,
    3. converts it with ``gdocs_to_cnxml`` (passing the Kix content, image
       download enabled) and writes the returned images and the CNXML
       (``<id>.xml``),
    4. validates the CNXML with ``jing_validate_file`` (log in ``<id>.log``)
       unless the first command line argument is ``-noval``.

    A document whose HTML or Kix download raises ``HttpError`` is reported
    and skipped. Exits with status 1 when ``java_installed()`` reports that
    Java is missing.
    """
    # Make sure Java is installed (needed for Jing).
    if not java_installed():
        print("ERROR: Could not find Java. Please keep sure that Java is installed and available.")
        sys.exit(1)

    # The command line does not change while we run, so decide once.
    skip_validation = len(sys.argv) > 1 and sys.argv[1] == '-noval'

    # Start from an empty testbed output folder.
    delete_all_contents_of_folder(TESTBED_OUTPUT_DIR)

    # File with public GDocs document URLs (<- the testbed for GDocs).
    urls_path = os.path.join(TESTBED_INPUT_DIR, TESTBED_INPUT_URLS_FILE)
    with open(urls_path) as url_file:
        for line in url_file:
            # Strip the line ending: '[^/]+' in the pattern below also
            # matches '\n', so an unstripped URL that ends right after the
            # ID would yield a document ID with a trailing newline.
            url = line.strip()
            if url.startswith('#'):   # ignore comments
                continue

            # Only handle lines that really are gdocs document URLs.
            match_doc_id = re.match(r'^.*docs\.google\.com/document/d/([^/]+).*$', url)
            if not match_doc_id:
                continue
            doc_id = match_doc_id.group(1)

            # Create a sub directory named like the ID.
            doc_output_dir = os.path.join(TESTBED_OUTPUT_DIR, doc_id)
            try:
                os.mkdir(doc_output_dir)
            except OSError:
                pass    # ignored on purpose: the directory may already exist

            doc_key = 'document:' + doc_id
            print_status('Getting ' + doc_key)

            # Get the Google Doc by fetching the exports directly.
            # NOTE(review): HttpError is not defined in this block and is not
            # one of httplib2's own exception names - confirm where it is
            # imported from and that http.request() can actually raise it.
            # NOTE(review): with follow_redirects disabled the response status
            # is never inspected, so a redirect body would be processed as
            # document content - confirm this is intended.
            http = httplib2.Http()
            http.follow_redirects = False

            plain_html_url = 'https://docs.google.com/document/d/%s/export?format=html&confirm=no_antivirus' % doc_id
            print_status('URL: ' + plain_html_url)
            try:
                resp, html = http.request(plain_html_url)
            except HttpError:
                print("Error: Failed to download Google Docs HTML")
                # Without the HTML there is nothing to transform; carrying on
                # would use an unbound (or the previous document's) 'html'.
                continue

            kix_url = 'https://docs.google.com/feeds/download/documents/export/Export?id=%s&exportFormat=kix' % doc_id
            print_status('URL: ' + kix_url)
            try:
                resp, kix = http.request(kix_url)
            except HttpError:
                print("Error: Failed to download Google Docs Kix")
                # Same reasoning as above for 'kix'.
                continue

            # Write testbed source HTML output.
            html_filename = os.path.join(doc_output_dir, doc_id + '.htm')
            with open(html_filename, 'w') as html_file:
                html_file.write(html)

            print_status('Transforming and get images from %s' % doc_key)

            # Transformation; also returns the images keyed by file name.
            cnxml, objects = gdocs_to_cnxml(html, kixcontent=kix, bDownloadImages=True)

            # Write testbed images.
            for image_name, image in objects.iteritems():
                image_path = os.path.join(doc_output_dir, image_name)
                with open(image_path, 'wb') as image_file:  # binary, important!
                    image_file.write(image)

            # Write testbed CNXML output.
            cnxml_filename = os.path.join(doc_output_dir, doc_id + '.xml')
            with open(cnxml_filename, 'w') as cnxml_file:
                cnxml_file.write(cnxml)

            # Validate CNXML output with Jing Relax NG.
            if skip_validation:
                print_status('Validation skipped')
            else:
                print_status('Validating %s' % doc_key)
                jing_log_filename = os.path.join(doc_output_dir, doc_id + '.log')
                jing_validate_file(cnxml_filename, jing_log_filename)

    print_status('Finished!!!')