def _process_gdocs_resource(klass, save_dir, html):
    """Convert Google Docs HTML to CNXML, store it and validate it.

    The HTML is transformed with image downloading enabled, the resulting
    CNXML is cleaned, written to ``save_dir`` together with the downloaded
    objects, and finally validated.

    ``klass`` is accepted but not used by this function.

    Returns the fixed label ``"Google Document"``.
    """
    # Transformation also yields the objects (images) that were fetched.
    raw_cnxml, media = gdocs_to_cnxml(html, bDownloadImages=True)
    cleaned = clean_cnxml(raw_cnxml)

    # Persist first, validate afterwards (same order as before).
    save_cnxml(save_dir, cleaned, media.items())
    validate_cnxml(cleaned)

    return "Google Document"
    def process_gdocs_resource(self, save_dir, gdocs_resource_id, gdocs_access_token=None):
        """Fetch a Google Docs document, convert it to CNXML and save it.

        ``gdocs_resource_id`` identifies the document to fetch.  When
        ``gdocs_access_token`` is given (and truthy) it is wrapped in an
        AuthSub token and used for both the entry lookup and the content
        download; otherwise ``None`` is passed as the token.

        Returns a tuple ``(title, "Google Document")`` where ``title`` is
        the text of the fetched entry's title.
        """
        # Log in to gdocs and get a client object.
        client = getAuthorizedGoogleDocsClient()

        # Build an AuthSub token from the access-token string, if any.
        if gdocs_access_token:
            token = gdata.gauth.AuthSubToken(gdocs_access_token)
        else:
            token = None

        # Look up the entry, then download the document body it points at.
        entry = client.GetDoc(gdocs_resource_id, None, token)
        html = client.get_file_content(entry.content.src, token)

        # Transformation (downloads images as well).
        raw_cnxml, media = gdocs_to_cnxml(html, bDownloadImages=True)
        cleaned = clean_cnxml(raw_cnxml)
        save_cnxml(save_dir, cleaned, media.items())
        validate_cnxml(cleaned)

        # Return the title and filename.  An older comment warns that
        # returning this filename might break processing multiple tabs in
        # parallel unless it gets offloaded onto the form again.
        return (entry.title.text, "Google Document")
示例#3
0
 def _process_gdocs_resource(klass, save_dir, html, kix=None):
     """Convert Google Docs HTML (plus optional kix content) to CNXML.

     ``kix`` is forwarded to the converter as ``kixcontent``.  The HTML is
     transformed with image downloading enabled, the CNXML is cleaned,
     written to ``save_dir`` together with the downloaded objects, and
     then validated.

     ``klass`` is accepted but not used by this function.

     Returns the fixed label ``"Google Document"``.
     """
     conversion_options = {'kixcontent': kix, 'bDownloadImages': True}
     raw_cnxml, media = gdocs_to_cnxml(html, **conversion_options)

     cleaned = clean_cnxml(raw_cnxml)

     # Persist first, validate afterwards (same order as before).
     save_cnxml(save_dir, cleaned, media.items())
     validate_cnxml(cleaned)

     return "Google Document"
示例#4
0
def gdoc2html(request):
    """Convert posted Google Docs HTML into textbook or aloha-ready HTML.

    POST parameters read from ``request.POST``:

    * ``html`` -- required; the Google Docs HTML to convert.
    * ``textbook_html`` -- optional; ``'1'`` selects the structured
      (textbook) HTML, any other value selects the aloha-ready HTML.
      Defaults to ``True`` when absent.
    * ``copy_images`` -- optional; ``'1'`` makes the converter download
      images.  Defaults to ``False`` when absent.

    Returns ``None`` when ``html`` is missing.  Otherwise returns a dict
    with the keys ``"html"`` (the converted markup, or ``""`` if the
    conversion reported an error), ``"textbook_html"`` and
    ``"copy_images"`` (the effective boolean flags).
    """
    session = request.session
    post = request.POST

    # grab inputs
    if 'html' not in post:
        return
    html = post['html']

    # Flags are compared by value: ``is '1'`` tested object identity and
    # only worked when the interpreter happened to reuse the same string
    # object for the posted value.
    if 'textbook_html' in post:
        textbook_html = post['textbook_html'] == '1'
    else:
        textbook_html = True

    if 'copy_images' in post:
        copy_images = post['copy_images'] == '1'
    else:
        copy_images = False

    # be anonymous
    session['login'] = AnonymousSession()

    # setup work directory: save_dir = transform_dir + user_subdir_name
    # NOTE(review): none of these three values is used below; the lookup
    # and the create_save_dir() call are kept so that a missing setting
    # still raises and the directory is still created as before.
    transform_dir = request.registry.settings['transform_dir']
    user_subdir_name, save_dir = create_save_dir(request)

    # allow cross domain access
    request.response.headers.add('Access-Control-Allow-Origin', '*')

    # convert gdoc html to cnxml to textbook (aka structured) html or
    # aloha-ready html
    cnxml, objects = gdocs_to_cnxml(html, bDownloadImages=copy_images)
    cnxml = clean_cnxml(cnxml)
    title = None
    metadata = None
    alohareadyhtml, structuredhtml, conversion_error = update_html(
        cnxml, title, metadata)

    if conversion_error is not None:
        # Conversion failed: hand back an empty document.
        html = ""
    elif textbook_html:
        html = structuredhtml
    else:
        html = alohareadyhtml

    return {
        "html": html,
        "textbook_html": textbook_html,
        "copy_images": copy_images,
    }
示例#5
0
    def test_gdocs(self):
        """Convert the gdocs test documents to CNXML and diff the results.

        Documents come from two places: every ``.doc`` file in
        ``test_folder_name + 'doc/'`` is uploaded to gdocs, and every
        document URL listed in ``./test_files/gdocs/test_files`` (lines
        starting with ``#`` are skipped) is added if that file can be
        opened.  Each document is downloaded as HTML, converted, cleaned
        and validated, written to ``<name>.tmp`` and compared with
        ``<name>.cnxml`` using ``diff``.  Differences go to
        ``<name>.diff``; error output from ``diff`` goes to ``<name>.err``.
        Results are only printed, nothing is asserted.
        """
        gdocs_dir = './test_files/gdocs/'

        # Document ids listed in the optional URL file.
        url_rids = []
        try:
            fp = open('./test_files/gdocs/test_files')
        except IOError:
            print('No gdocs test file')
        else:
            with fp:
                for line in fp:
                    # Drop the line ending first: ``[^/]+`` would otherwise
                    # capture the newline into the id when the URL has
                    # nothing after the document id.
                    url = line.rstrip()
                    if not url or url[0] == '#':
                        continue
                    match_doc_id = re.match(r'^.*docs\.google\.com/document/d/([^/]+).*$', url)
                    if match_doc_id:
                        url_rids.append('document:'+match_doc_id.group(1))

        doc_files = [f for f in os.listdir(test_folder_name+'doc/')
                     if os.path.splitext(f)[1] == '.doc']

        # (resource id, base filename) pairs.  Keeping the name next to the
        # id means a failed upload can no longer shift the remaining ids
        # onto the wrong reference files.
        targets = []
        for d in doc_files:
            just_filename = os.path.splitext(os.path.basename(d))[0]
            try:
                rid = upload_doc(test_folder_name+'doc/'+d, 'application/msword', just_filename)
            except KeyboardInterrupt:
                exit()
            except Exception:
                print('Error uploading '+just_filename+' to gdocs')
            else:
                targets.append((rid, just_filename))

        # Documents from the URL file are named after their document id.
        targets.extend((rid, rid[9:]) for rid in url_rids)

        for doc_rid, filename in targets:
            valid_filename = gdocs_dir+filename+'.cnxml'
            output_filename = gdocs_dir+filename+'.tmp'
            diff_filename = gdocs_dir+filename+'.diff'
            err_filename = gdocs_dir+filename+'.err'

            # rid[9:] strips the leading 'document:' prefix.
            gdoc_url = construct_url(doc_rid[9:])
            rid, original_title = get_gdoc(gdoc_url, './test_files/gdocs')
            html_filename = gdocs_dir+rid[9:]+'.htm'
            with open(html_filename, 'r') as html_file:
                html = html_file.read()
            cnxml, objects = gdocs_to_cnxml(html, bDownloadImages=True)
            cnxml = clean_cnxml(cnxml)
            validate_cnxml(cnxml)

            with open(output_filename, 'w') as output:
                output.write(cnxml)
            remove_ids(output_filename)
            os.remove(html_filename)

            # stderr has to be piped as well, otherwise communicate()
            # returns None for it and the error branch can never run.
            process = subprocess.Popen(['diff', valid_filename, output_filename], shell=False,
                                       stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            diff_out, diff_err = process.communicate()

            # The pipes deliver raw bytes, hence the binary write mode.
            if diff_out:
                with open(diff_filename, 'wb') as diff_output:
                    diff_output.write(diff_out)
                print('Differences in the testing of gdoc '+filename+', information on those differences has been placed in '+diff_filename)
            elif diff_err:
                with open(err_filename, 'wb') as err_output:
                    err_output.write(diff_err)
                print('Error(s) occurred while attempting to test for differences in CNXML output of gdoc '+filename+', information on these errors are in '+err_filename)
from utils import clean_cnxml, escape_system
from test_conversion import validate_cnxml, remove_ids

# Regenerate the reference file 'valid.cnxml' from a single Google Doc.
url = 'https://docs.google.com/document/d/1tiZR1fhBl3ZQ_UaQ5sRDA3gSs_7LjgtTITkBAGjuTpI/edit'
#url='https://docs.google.com/document/d/1Gw9j1J-_d5YQoq6SIc3Az2hiVlwtvVcJkXfYKDR_zBM/edit'

# Extract the document id from the URL and build the resource id from it.
match_doc_id = re.match(r'^.*docs\.google\.com/document/d/([^/]+).*$', url)
rid = 'document:' + match_doc_id.group(1)

print(rid)
# rid[9:] strips the leading 'document:' prefix again.
filename = rid[9:]
valid_filename = 'valid.cnxml'
gdoc_url = construct_url(rid[9:])
print(gdoc_url)
# NOTE(review): gdoc_url is only printed; the download below is given the
# original url -- confirm that this is intended.
rid, original_title = get_gdoc(url, './')
html_filename = './' + rid[9:] + '.htm'
with open(html_filename, 'r') as html_file:
    html = html_file.read()

cnxml, objects = gdocs_to_cnxml(html, bDownloadImages=True)
cnxml = clean_cnxml(cnxml)
validate_cnxml(cnxml)

# The context manager closes the file even if the write fails.
with open(valid_filename, 'w') as output:
    output.write(cnxml)
示例#7
0
    def test_gdocs(self):
        """Convert the gdocs test documents to CNXML and diff the results.

        Documents come from two places: every ``.doc`` file in
        ``test_folder_name + 'doc/'`` is uploaded to gdocs, and every
        document URL listed in ``./test_files/gdocs/test_files`` (lines
        starting with ``#`` are skipped) is added if that file can be
        opened.  Each document is downloaded as HTML, converted, cleaned
        and validated, written to ``<name>.tmp`` and compared with
        ``<name>.cnxml`` using ``diff``.  Differences go to
        ``<name>.diff``; error output from ``diff`` goes to ``<name>.err``.
        Results are only printed, nothing is asserted.
        """
        gdocs_dir = './test_files/gdocs/'

        # Document ids listed in the optional URL file.
        url_rids = []
        try:
            fp = open('./test_files/gdocs/test_files')
        except IOError:
            print('No gdocs test file')
        else:
            with fp:
                for line in fp:
                    # Drop the line ending first: ``[^/]+`` would otherwise
                    # capture the newline into the id when the URL has
                    # nothing after the document id.
                    url = line.rstrip()
                    if not url or url[0] == '#':
                        continue
                    match_doc_id = re.match(
                        r'^.*docs\.google\.com/document/d/([^/]+).*$', url)
                    if match_doc_id:
                        url_rids.append('document:' + match_doc_id.group(1))

        doc_files = [
            f for f in os.listdir(test_folder_name + 'doc/')
            if os.path.splitext(f)[1] == '.doc'
        ]

        # (resource id, base filename) pairs.  Keeping the name next to the
        # id means a failed upload can no longer shift the remaining ids
        # onto the wrong reference files.
        targets = []
        for d in doc_files:
            just_filename = os.path.splitext(os.path.basename(d))[0]
            try:
                rid = upload_doc(test_folder_name + 'doc/' + d,
                                 'application/msword', just_filename)
            except KeyboardInterrupt:
                exit()
            except Exception:
                print('Error uploading ' + just_filename + ' to gdocs')
            else:
                targets.append((rid, just_filename))

        # Documents from the URL file are named after their document id.
        targets.extend((rid, rid[9:]) for rid in url_rids)

        for doc_rid, filename in targets:
            valid_filename = gdocs_dir + filename + '.cnxml'
            output_filename = gdocs_dir + filename + '.tmp'
            diff_filename = gdocs_dir + filename + '.diff'
            err_filename = gdocs_dir + filename + '.err'

            # rid[9:] strips the leading 'document:' prefix.
            gdoc_url = construct_url(doc_rid[9:])
            rid, original_title = get_gdoc(gdoc_url, './test_files/gdocs')
            html_filename = gdocs_dir + rid[9:] + '.htm'
            with open(html_filename, 'r') as html_file:
                html = html_file.read()
            cnxml, objects = gdocs_to_cnxml(html, bDownloadImages=True)
            cnxml = clean_cnxml(cnxml)
            validate_cnxml(cnxml)

            with open(output_filename, 'w') as output:
                output.write(cnxml)
            remove_ids(output_filename)
            os.remove(html_filename)

            # stderr has to be piped as well, otherwise communicate()
            # returns None for it and the error branch can never run.
            process = subprocess.Popen(
                ['diff', valid_filename, output_filename],
                shell=False,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)
            diff_out, diff_err = process.communicate()

            # The pipes deliver raw bytes, hence the binary write mode.
            if diff_out:
                with open(diff_filename, 'wb') as diff_output:
                    diff_output.write(diff_out)
                print(
                    'Differences in the testing of gdoc ' + filename +
                    ', information on those differences has been placed in ' +
                    diff_filename)
            elif diff_err:
                with open(err_filename, 'wb') as err_output:
                    err_output.write(diff_err)
                print(
                    'Error(s) occurred while attempting to test for differences in CNXML output of gdoc '
                    + filename + ', information on these errors are in ' +
                    err_filename)
# Regenerate the reference .cnxml file for every resource id in ``rids``.
# ``rids``, ``doc_files`` and ``count`` are expected to be set up before
# this loop runs.
for rid in rids:
    print(rid)
    # The first len(doc_files) ids are named after the matching .doc file;
    # any further ids are named after the id itself (without the leading
    # 'document:' prefix, which is what rid[9:] strips).
    if count < len(doc_files):
        filename = os.path.basename(doc_files[count])
        filename, ext = os.path.splitext(filename)
    else:
        filename = rid[9:]

    valid_filename = "./test_files/gdocs/" + filename + ".cnxml"

    gdoc_url = construct_url(rid[9:])
    rid, original_title = get_gdoc(gdoc_url, "./test_files/gdocs")
    html_filename = "./test_files/gdocs/" + rid[9:] + ".htm"
    with open(html_filename, "r") as html_file:
        html = html_file.read()

    cnxml, objects = gdocs_to_cnxml(html, bDownloadImages=True)
    cnxml = clean_cnxml(cnxml)
    validate_cnxml(cnxml)

    # The context manager closes the file even if the write fails.
    with open(valid_filename, "w") as output:
        output.write(cnxml)
    remove_ids(valid_filename)
    count += 1
    # The downloaded HTML is only an intermediate file.
    os.remove(html_filename)