def get_gdoc(gdoc_url, output_folder):
    """Download a Google Docs document as HTML into ``output_folder``.

    The document id is extracted from ``gdoc_url``, the matching entry is
    fetched through the authorized Google Docs client, and the entry's
    content is written to ``<output_folder>/<doc_id>.htm``.

    Args:
        gdoc_url: URL containing ``docs.google.com/document/d/<id>``.
            Leading/trailing whitespace (for example the newline left on
            a line read from a file) is ignored.
        output_folder: Directory the HTML file is written to. It is
            created when it does not exist yet.

    Returns:
        A ``(resource_id, title)`` tuple taken from the fetched entry.

    Raises:
        OSError: If ``output_folder`` cannot be created and does not
            already exist as a directory.

    If no document id can be found in ``gdoc_url``, an error message is
    printed and the interpreter is terminated through ``quit()``.
    """
    gd_client = getAuthorizedGoogleDocsClient()

    # ``[^/]+`` also matches newlines, so an unstripped URL that ends right
    # after the id would yield an id with a trailing newline in it.
    gdoc_url = gdoc_url.strip()
    match_doc_id = re.match(
        r'^.*docs\.google\.com/document/d/([^/]+).*$', gdoc_url)
    if not match_doc_id:
        print('Error matching doc id in get_gdoc')
        quit()
    doc_id = match_doc_id.group(1)

    doc_output_dir = output_folder
    try:
        os.mkdir(doc_output_dir)
    except OSError:
        # An already existing directory is fine; anything else (permissions,
        # missing parent, a plain file in the way) is a real error.
        if not os.path.isdir(doc_output_dir):
            raise

    # Get the Google Docs entry for this document.
    doc_key = 'document:' + doc_id
    gd_entry = gd_client.get_resource_by_id(doc_key)

    # Should be the same as the input URL, but better to ask the API for it.
    gd_entry_url = gd_entry.content.src
    html = gd_client._get_content(gd_entry_url)  # requires a URL

    html_filename = os.path.join(doc_output_dir, doc_id + '.htm')
    with open(html_filename, 'w') as html_file:
        html_file.write(html)

    return (gd_entry.resource_id.text, gd_entry.title.text)
def main():
    """Split every testbed Google Doc into sections and upload each one.

    For every non-comment, non-blank line of the testbed URL file the
    document is downloaded with ``get_gdoc``, its HTML is joined into a
    single line, ``divide_sections`` splits it, each section is written to
    ``./gdoc_output/<id>_sect_<n>.htm`` and uploaded with ``upload_doc``
    under the title ``"<original title>, Section <n>"``.

    Exits with status 1 when Java is not available (needed for Jing).
    An upload failure is reported and the remaining sections are still
    processed; Ctrl-C during an upload terminates the script.
    """
    output_dir = './gdoc_output'
    # ``get_gdoc`` in this file requests the key 'document:<id>' and names
    # the downloaded file '<id>.htm'; the returned resource id is presumably
    # that same key, so the prefix is cut off to get the bare id.
    # NOTE(review): confirm the resource id always carries this prefix.
    id_prefix_len = len('document:')

    # Make sure Java is installed (needed for Jing).
    if not java_installed():
        print("ERROR: Could not find Java. Please keep sure that Java is "
              "installed and available.")
        exit(1)

    # Delete the contents of the testbed output folder.
    delete_all_contents_of_folder(output_dir)

    # Log in to gdocs. The returned client is not used here; the call is
    # kept because it may have side effects (presumably authentication).
    getAuthorizedGoogleDocsClient()

    # File with public GDocs document URLs (the testbed for GDocs).
    url_list_path = os.path.join(TESTBED_INPUT_DIR, TESTBED_INPUT_URLS_FILE)
    with open(url_list_path) as url_file:
        for url in url_file:
            url = url.strip()
            # Skip blank lines and comments; a blank line would otherwise
            # make get_gdoc abort the whole run.
            if not url or url.startswith('#'):
                continue

            rid, original_title = get_gdoc(url, output_dir)
            doc_id = rid[id_prefix_len:]

            # Join the document into one line by dropping every newline.
            with open(os.path.join(output_dir, doc_id + '.htm'), 'r') as html_file:
                inline_html = html_file.read().replace('\n', '')

            sections = divide_sections(inline_html)

            # Write all section files first, then upload them.
            section_paths = []
            for sect_num, sect in enumerate(sections):
                section_path = os.path.join(
                    output_dir, doc_id + '_sect_' + str(sect_num) + '.htm')
                with open(section_path, 'w') as section_file:
                    section_file.write(sect)
                section_paths.append(section_path)

            for sect_num, section_path in enumerate(section_paths):
                try:
                    upload_doc(section_path, 'text/html',
                               original_title + ', Section ' + str(sect_num))
                except KeyboardInterrupt:
                    exit()
                except Exception:
                    print('Error uploading section ' + str(sect_num) +
                          '. Are there images in there?')
                # NOTE(review): the section file is removed after every
                # upload attempt, successful or not — confirm this matches
                # the intended cleanup.
                os.remove(section_path)

    print_status('Finished!')
def upload_doc(filename, handle, new_title):
    """Upload a local file to Google Docs as a new document.

    Args:
        filename: Path of the file to upload.
        handle: Forwarded as the second argument of
            ``MediaSource.SetFileHandle``. Despite its name, the caller in
            this file passes a MIME type string (``'text/html'``) here.
        new_title: Title given to the created document.

    Returns:
        The ``resource_id`` text of the created document.
    """
    gd_client = getAuthorizedGoogleDocsClient()
    new_resource = gdata.docs.data.Resource(type='document', title=new_title)

    media = gdata.data.MediaSource()
    media.SetFileHandle(filename, handle)
    try:
        created = gd_client.CreateResource(new_resource, media=media)
    finally:
        # SetFileHandle opens ``filename`` and keeps the open file on the
        # media source. Close it so the caller can delete the file right
        # after the upload and no descriptor is leaked per section.
        open_file = getattr(media, 'file_handle', None)
        if open_file is not None:
            open_file.close()

    return created.resource_id.text
def upload_doc(filename, handle, new_title):
    """Upload a local file to Google Docs as a new document.

    Args:
        filename: Path of the file to upload.
        handle: Forwarded as the second argument of
            ``MediaSource.SetFileHandle``. Despite its name, the caller in
            this file passes a MIME type string (``'text/html'``) here.
        new_title: Title given to the created document.

    Returns:
        The ``resource_id`` text of the created document.
    """
    gd_client = getAuthorizedGoogleDocsClient()
    new_resource = gdata.docs.data.Resource(type='document', title=new_title)

    media = gdata.data.MediaSource()
    media.SetFileHandle(filename, handle)
    try:
        created = gd_client.CreateResource(new_resource, media=media)
    finally:
        # SetFileHandle opens ``filename`` and keeps the open file on the
        # media source. Close it so the caller can delete the file right
        # after the upload and no descriptor is leaked per section.
        open_file = getattr(media, 'file_handle', None)
        if open_file is not None:
            open_file.close()

    return created.resource_id.text
def get_gdoc(gdoc_url, output_folder):
    """Download a Google Docs document as HTML into ``output_folder``.

    The document id is extracted from ``gdoc_url``, the matching entry is
    fetched through the authorized Google Docs client, and the entry's
    content is written to ``<output_folder>/<doc_id>.htm``.

    Args:
        gdoc_url: URL containing ``docs.google.com/document/d/<id>``.
            Leading/trailing whitespace (for example the newline left on
            a line read from a file) is ignored.
        output_folder: Directory the HTML file is written to. It is
            created when it does not exist yet.

    Returns:
        A ``(resource_id, title)`` tuple taken from the fetched entry.

    Raises:
        OSError: If ``output_folder`` cannot be created and does not
            already exist as a directory.

    If no document id can be found in ``gdoc_url``, an error message is
    printed and the interpreter is terminated through ``quit()``.
    """
    gd_client = getAuthorizedGoogleDocsClient()

    # ``[^/]+`` also matches newlines, so an unstripped URL that ends right
    # after the id would yield an id with a trailing newline in it.
    gdoc_url = gdoc_url.strip()
    match_doc_id = re.match(
        r'^.*docs\.google\.com/document/d/([^/]+).*$', gdoc_url)
    if not match_doc_id:
        print('Error matching doc id in get_gdoc')
        quit()
    doc_id = match_doc_id.group(1)

    doc_output_dir = output_folder
    try:
        os.mkdir(doc_output_dir)
    except OSError:
        # An already existing directory is fine; anything else (permissions,
        # missing parent, a plain file in the way) is a real error.
        if not os.path.isdir(doc_output_dir):
            raise

    # Get the Google Docs entry for this document.
    doc_key = 'document:' + doc_id
    gd_entry = gd_client.get_resource_by_id(doc_key)

    # Should be the same as the input URL, but better to ask the API for it.
    gd_entry_url = gd_entry.content.src
    html = gd_client._get_content(gd_entry_url)  # requires a URL

    html_filename = os.path.join(doc_output_dir, doc_id + '.htm')
    with open(html_filename, 'w') as html_file:
        html_file.write(html)

    return (gd_entry.resource_id.text, gd_entry.title.text)
def main():
    """Split every testbed Google Doc into sections and upload each one.

    For every non-comment, non-blank line of the testbed URL file the
    document is downloaded with ``get_gdoc``, its HTML is joined into a
    single line, ``divide_sections`` splits it, each section is written to
    ``./gdoc_output/<id>_sect_<n>.htm`` and uploaded with ``upload_doc``
    under the title ``"<original title>, Section <n>"``.

    Exits with status 1 when Java is not available (needed for Jing).
    An upload failure is reported and the remaining sections are still
    processed; Ctrl-C during an upload terminates the script.
    """
    output_dir = './gdoc_output'
    # ``get_gdoc`` in this file requests the key 'document:<id>' and names
    # the downloaded file '<id>.htm'; the returned resource id is presumably
    # that same key, so the prefix is cut off to get the bare id.
    # NOTE(review): confirm the resource id always carries this prefix.
    id_prefix_len = len('document:')

    # Make sure Java is installed (needed for Jing).
    if not java_installed():
        print("ERROR: Could not find Java. Please keep sure that Java is "
              "installed and available.")
        exit(1)

    # Delete the contents of the testbed output folder.
    delete_all_contents_of_folder(output_dir)

    # Log in to gdocs. The returned client is not used here; the call is
    # kept because it may have side effects (presumably authentication).
    getAuthorizedGoogleDocsClient()

    # File with public GDocs document URLs (the testbed for GDocs).
    url_list_path = os.path.join(TESTBED_INPUT_DIR, TESTBED_INPUT_URLS_FILE)
    with open(url_list_path) as url_file:
        for url in url_file:
            url = url.strip()
            # Skip blank lines and comments; a blank line would otherwise
            # make get_gdoc abort the whole run.
            if not url or url.startswith('#'):
                continue

            rid, original_title = get_gdoc(url, output_dir)
            doc_id = rid[id_prefix_len:]

            # Join the document into one line by dropping every newline.
            with open(os.path.join(output_dir, doc_id + '.htm'), 'r') as html_file:
                inline_html = html_file.read().replace('\n', '')

            sections = divide_sections(inline_html)

            # Write all section files first, then upload them.
            section_paths = []
            for sect_num, sect in enumerate(sections):
                section_path = os.path.join(
                    output_dir, doc_id + '_sect_' + str(sect_num) + '.htm')
                with open(section_path, 'w') as section_file:
                    section_file.write(sect)
                section_paths.append(section_path)

            for sect_num, section_path in enumerate(section_paths):
                try:
                    upload_doc(section_path, 'text/html',
                               original_title + ', Section ' + str(sect_num))
                except KeyboardInterrupt:
                    exit()
                except Exception:
                    print('Error uploading section ' + str(sect_num) +
                          '. Are there images in there?')
                # NOTE(review): the section file is removed after every
                # upload attempt, successful or not — confirm this matches
                # the intended cleanup.
                os.remove(section_path)

    print_status('Finished!')