def getClient(self):
    """Returns a spreadsheet client if there's an Internet connection.

    Returns None if no connection.
    """
    client = SpreadsheetsService()
    try:
        client.GetWorksheetsFeed(self.spreadsheet_key, visibility='public',
                                 projection='basic')
    except gaierror:
        client = None
    return client
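
# A minimal sketch of how getClient above is assumed to be used. The method
# expects an enclosing object that carries spreadsheet_key, plus gdata's
# SpreadsheetsService and socket.gaierror; the PublicSpreadsheet wrapper and
# the placeholder key below are hypothetical, purely for illustration.
from socket import gaierror

from gdata.spreadsheet.service import SpreadsheetsService


class PublicSpreadsheet(object):
    """Hypothetical wrapper illustrating where getClient would live."""

    def __init__(self, spreadsheet_key):
        self.spreadsheet_key = spreadsheet_key

    # Bind the function defined above as an instance method.
    getClient = getClient

# client = PublicSpreadsheet('<spreadsheet-key>').getClient()
# if client is None:
#     print('No Internet connection; spreadsheet is unreachable.')
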
def get_cos_contributors():
    '''Returns a list of contributors from a Google spreadsheet specified by
    spreadsheet ID.
    '''
    client = SpreadsheetsService()
    feed = client.GetWorksheetsFeed(DOC_KEY, visibility='public',
                                    wksht_id='1')
    sheet_key = feed.id.text.rsplit('/', 1)[1]
    list_of_rows = client.GetListFeed(DOC_KEY, sheet_key,
                                      visibility='public').entry
    print('There are {} rows on the COS spreadsheet.'.format(
        len(list_of_rows)))
    cos_users = []
    for row in list_of_rows:
        row_dict = row.custom
        cos_users.append(row_dict[COL_ROW_NAME].text)
    return cos_users
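
# get_cos_contributors relies on module-level names that are not shown in the
# snippet above. A hedged sketch of what they might look like: both constant
# values are placeholders, and the import path is the classic gdata one.
from gdata.spreadsheet.service import SpreadsheetsService

DOC_KEY = '<google-spreadsheet-key>'  # key of the COS spreadsheet (assumed)
COL_ROW_NAME = 'name'                 # list-feed key of the name column (assumed)

# Example call:
# contributors = get_cos_contributors()
# print('First contributor: {}'.format(contributors[0]))
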
class GoogleSpreadsheetReader(object):
    """Directs a Google Spreadsheets service client to obtain spreadsheet cells."""

    def __init__(self, options):
        """Init the Google Spreadsheets service client."""
        self.options = options
        self.service = GoogleSpreadsheetsService()
        if not self.options.google_email:
            print "Warning: Google account email not provided."
        if not self.options.google_password:
            print "Warning: Google account password not provided."
        self.service.email = self.options.google_email
        self.service.password = self.options.google_password
        self.service.ProgrammaticLogin()
        if not self.options.google_spreadsheet_key:
            print "Warning: Google spreadsheet key not provided."

    def get_cells(self, sheet_index=0):
        """Returns a dict of cell data keyed by cell coordinate (row, col)."""
        cells = {}
        spreadsheet_key = self.options.google_spreadsheet_key
        sheets_feed = self.service.GetWorksheetsFeed(spreadsheet_key)
        sheet_id = sheets_feed.entry[sheet_index].id.text.split('/')[-1]
        cells_feed = self.service.GetCellsFeed(spreadsheet_key, sheet_id)
        for entry in cells_feed.entry:
            try:
                row_id = entry.cell.row
                col_id = entry.cell.col
                data = entry.content.text
            except Exception as inst:
                msg = "Couldn't read cell feed entry: %s" % inst
                msg += "\n%s" % entry
                raise Exception(msg)
            try:
                row_id = int(row_id)
                col_id = int(col_id)
            except (TypeError, ValueError):
                continue
            cells[(row_id, col_id)] = data
        return cells
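
# GoogleSpreadsheetReader expects an options object carrying the account
# credentials and spreadsheet key (in the original this is likely an
# optparse/argparse namespace), and GoogleSpreadsheetsService is presumably an
# alias for gdata's SpreadsheetsService. A hedged usage sketch with a stand-in
# options container; credentials and key below are placeholders.
from collections import namedtuple

from gdata.spreadsheet.service import SpreadsheetsService as GoogleSpreadsheetsService

ReaderOptions = namedtuple('ReaderOptions',
                           ['google_email', 'google_password',
                            'google_spreadsheet_key'])

# opts = ReaderOptions('user@example.com', 'app-password', '<spreadsheet-key>')
# reader = GoogleSpreadsheetReader(opts)
# cells = reader.get_cells(sheet_index=0)
# print(cells.get((1, 1)))  # value of the top-left cell, if present
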
def load_google_spreadsheet(spreadsheet_key, worksheet_name=None):
    """Downloads and exports a Google Spreadsheet in TSV format.

    Returns a string containing the spreadsheet contents in TSV format (e.g.
    for writing out to a file or parsing).

    The first line is assumed to be the spreadsheet header (i.e. containing
    column names), which can optionally be followed by one or more comment
    lines (starting with '#'). Only the first cell of a comment line will be
    parsed (to keep exported spreadsheets consistent with QIIME mapping
    files' comments). The (optional) comments section is then followed by the
    spreadsheet data.

    Some of this code is based on the following websites, as well as the
    gdata.spreadsheet.text_db module:
        http://www.payne.org/index.php/Reading_Google_Spreadsheets_in_Python
        http://stackoverflow.com/a/12031835

    Arguments:
        spreadsheet_key - the key used to identify the spreadsheet (a
            string). Can either be a key or a URL containing the key
        worksheet_name - the name of the worksheet to load data from (a
            string). If not supplied, will use first worksheet in the
            spreadsheet
    """
    spreadsheet_key = _extract_spreadsheet_key_from_url(spreadsheet_key)
    gd_client = SpreadsheetsService()

    try:
        worksheets_feed = gd_client.GetWorksheetsFeed(spreadsheet_key,
                                                      visibility='public',
                                                      projection='basic')
    except gaierror:
        raise GoogleSpreadsheetConnectionError("Could not establish "
                                               "connection with server. Do "
                                               "you have an active Internet "
                                               "connection?")

    if len(worksheets_feed.entry) < 1:
        raise GoogleSpreadsheetError("The Google Spreadsheet with key '%s' "
                                     "does not have any worksheets "
                                     "associated with it." % spreadsheet_key)

    # Find worksheet that will be exported. If a name has not been provided,
    # use the first worksheet.
    worksheet = None
    if worksheet_name is not None:
        for sheet in worksheets_feed.entry:
            if sheet.title.text == worksheet_name:
                worksheet = sheet

        if worksheet is None:
            raise GoogleSpreadsheetError("The worksheet name '%s' could not "
                                         "be found in the Google Spreadsheet "
                                         "with key '%s'."
                                         % (worksheet_name, spreadsheet_key))
    else:
        # Choose the first one.
        worksheet = worksheets_feed.entry[0]

    # Extract the ID of the worksheet.
    worksheet_id = worksheet.id.text.split('/')[-1]

    # Now that we have a spreadsheet key and worksheet ID, we can read the
    # data. First get the headers (first row). We need this in order to grab
    # the rest of the actual data in the correct order (it is returned
    # unordered).
    headers = _get_spreadsheet_headers(gd_client, spreadsheet_key,
                                       worksheet_id)
    if len(headers) < 1:
        raise GoogleSpreadsheetError("Could not load spreadsheet header (it "
                                     "appears to be empty). Is your Google "
                                     "Spreadsheet with key '%s' empty?"
                                     % spreadsheet_key)

    # Loop through the rest of the rows and build up a list of data (in the
    # same row/col order found in the spreadsheet).
    spreadsheet_lines = _export_spreadsheet(gd_client, spreadsheet_key,
                                            worksheet_id, headers)

    out_lines = StringIO()
    tsv_writer = writer(out_lines, delimiter='\t', lineterminator='\n')
    tsv_writer.writerows(spreadsheet_lines)

    return out_lines.getvalue()
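
# A short usage sketch for load_google_spreadsheet. The key, worksheet name,
# and output path are placeholders; the function also depends on module-level
# helpers (_extract_spreadsheet_key_from_url, _get_spreadsheet_headers,
# _export_spreadsheet), exceptions, and imports (csv.writer, StringIO,
# socket.gaierror, gdata's SpreadsheetsService) defined elsewhere in its
# module.
def dump_spreadsheet_to_tsv(key, worksheet=None, out_path='mapping.tsv'):
    """Fetch a public Google Spreadsheet and write it out as a TSV file."""
    tsv_text = load_google_spreadsheet(key, worksheet_name=worksheet)
    with open(out_path, 'w') as out_file:
        out_file.write(tsv_text)

# dump_spreadsheet_to_tsv('<spreadsheet-key-or-url>', worksheet='Sheet1')
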
class SheetQuerier:
    key = "0AkGO8UqErL6idDhYYjg1ZXlORnRaM3ZhTks4Z3FrYlE"
    worksheets = {"DVD": "movietitle", "Blu-ray": "movietitle"}
    cache = "spreadsheet_cache.p"
    cache_timeout = 60 * 60 * 24 * 7
    prefixes = ["The", "A"]
    whitespace_pattern = re.compile('[^a-zA-Z0-9 ]+', re.UNICODE)

    def __init__(self, force_reload=False):
        try:
            pickle_date, self.data = pickle.load(open(self.cache, "rb"))
            if DEBUG:
                print "Loaded cache successfully"
            if force_reload or time.time() - pickle_date > self.cache_timeout:
                if DEBUG:
                    if force_reload:
                        print "Cache update forced."
                    else:
                        print "Cache too old, reloading."
                self.reload_data()
                pickle.dump((time.time(), self.data), open(self.cache, "wb"))
        except:
            if DEBUG:
                print "Problem loading cache, reloading."
            self.reload_data()
            pickle.dump((time.time(), self.data), open(self.cache, "wb"))

    def reload_data(self):
        self.client = SpreadsheetsService()
        feed = self.client.GetWorksheetsFeed(self.key, visibility='public',
                                             projection='basic')
        self.data = {}
        for entry in feed.entry:
            if entry.title.text in self.worksheets.keys():
                bad_rows, total_rows = self.process_sheet(
                    entry.id.text.split("/")[-1],
                    self.worksheets[entry.title.text])
                print "Skipped %d / %d rows in sheet \"%s\"" % (
                    bad_rows, total_rows, entry.title.text)
            elif DEBUG:
                print "Skipped sheet \"%s\"" % entry.title.text

    def process_sheet(self, sheet_key, movie_row_key,
                      type_row_key="forcedsubtitletype"):
        if DEBUG:
            print "Document: %s" % self.key
            print "Sheet: %s" % sheet_key
        rows = self.client.GetListFeed(self.key, sheet_key,
                                       visibility='public',
                                       projection='values').entry
        bad_rows = 0
        for row in rows:
            try:
                self.data[SheetQuerier.clean_title(
                    row.custom[movie_row_key].text.strip()
                )] = row.custom[type_row_key].text.strip()
            except:
                bad_rows += 1
        return bad_rows, len(rows)

    def query_exact(self, title):
        query = SheetQuerier.clean_title(title)
        if query in self.data:
            return self.data[query]
        else:
            return False

    @staticmethod
    def clean_title(title):
        # Move prefixes
        for prefix in SheetQuerier.prefixes:
            if title.endswith(prefix):
                title = "%s %s" % (prefix, title[:-1 * len(prefix) - 2])
                break
        # Strip all non-alphanumeric characters
        title = SheetQuerier.whitespace_pattern.sub('', title)
        # Return the lowercase version
        return title.lower()
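
# A hedged usage sketch for SheetQuerier. The class above already assumes
# module-level imports (re, time, pickle, gdata's SpreadsheetsService) and a
# DEBUG flag defined elsewhere; the title passed below is purely illustrative.
def lookup_forced_subtitle_type(title):
    """Return the forced-subtitle type for a title, or None if not listed."""
    querier = SheetQuerier(force_reload=False)
    result = querier.query_exact(title)
    return None if result is False else result

# print(lookup_forced_subtitle_type('The Matrix'))
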
class GDataClient(object):
    gd_client = None
    gd_cur_ss_key = None
    gd_cur_ws_id = None

    def __init__(self, uname, pswd, document_name=None, worksheet_name=None):
        # Connect to Google
        self.gd_client = SpreadsheetsService(email=uname, password=pswd,
                                             source=GDATA_SOURCE)
        self.gd_client.ProgrammaticLogin()
        if document_name is not None:
            self.set_document(document_name)
            if worksheet_name is not None:
                self.set_worksheet(worksheet_name)

    def __ss_check(self):
        '''Make sure a spreadsheet has been set before we try to do anything
        with worksheets.
        '''
        if self.gd_cur_ss_key is None:
            raise GDError('Must set spreadsheet before accessing worksheets!')

    def __header_to_key(self, hdr_string):
        '''Google Sheets column headers are used as keys to row info
        dictionaries, but first most non-alphanumeric characters are removed
        and the letters are lower-cased.
        '''
        return ''.join(re.findall(r'[a-z\-0-9\.]+', hdr_string.lower()))

    def set_document(self, docname):
        '''Set the current spreadsheet document given a title.'''
        q = DocumentQuery(params={'title': docname, 'title-exact': 'true'})
        ss_feed = self.gd_client.GetSpreadsheetsFeed(query=q)
        if len(ss_feed.entry) != 1:
            raise GDError('{} spreadsheets match the given name "{}" '
                          '(expected exactly one)!'
                          .format(len(ss_feed.entry), docname))
        self.gd_cur_ss_key = ss_feed.entry[0].id.text.rsplit('/', 1)[1]

    def list_documents(self):
        '''List all spreadsheet documents available.'''
        feed = self.gd_client.GetSpreadsheetsFeed(DocumentQuery())
        return [en.title.text for en in feed.entry]

    def list_worksheets(self):
        '''List all worksheets in the current spreadsheet document.'''
        self.__ss_check()
        ws_feed = self.gd_client.GetWorksheetsFeed(self.gd_cur_ss_key)
        return [en.title.text for en in ws_feed.entry]

    def set_worksheet(self, sheetname):
        '''Set the current worksheet within the current spreadsheet document.'''
        self.__ss_check()
        q = DocumentQuery(params={'title': sheetname, 'title-exact': 'true'})
        ws_feed = self.gd_client.GetWorksheetsFeed(self.gd_cur_ss_key, query=q)
        if len(ws_feed.entry) != 1:
            raise GDError('{} worksheets match the given name "{}" '
                          '(expected exactly one)!'
                          .format(len(ws_feed.entry), sheetname))
        self.gd_cur_ws_id = ws_feed.entry[0].id.text.rsplit('/', 1)[1]

    def add_worksheet(self, title, rows, cols, overwrite=False):
        '''Add a worksheet to the current spreadsheet document (if it does not
        exist) and switch the current sheet to the (new) worksheet.
        '''
        self.__ss_check()
        # First, check to see if the worksheet already exists
        q = DocumentQuery(params={'title': title, 'title-exact': 'true'})
        ws_feed = self.gd_client.GetWorksheetsFeed(self.gd_cur_ss_key, query=q)
        ws_found = None
        if len(ws_feed.entry) > 0:
            if overwrite:
                if not self.gd_client.DeleteWorksheet(ws_feed.entry[0]):
                    raise GDError('Failed to delete existing worksheet named '
                                  '{} to overwrite!'.format(title))
            else:
                ws_found = ws_feed.entry[0]
        is_new = ws_found is None
        if is_new:
            ws_found = self.gd_client.AddWorksheet(title, rows, cols,
                                                   self.gd_cur_ss_key)
        self.gd_cur_ws_id = ws_found.id.text.rsplit('/', 1)[1]
        return is_new

    def set_headers(self, header_names):
        '''Set the current worksheet headers given a list.
        NOTE: header list length must match columns in sheet!
        '''
        self.__ss_check()
        ws = self.gd_client.GetWorksheetsFeed(self.gd_cur_ss_key,
                                              self.gd_cur_ws_id)
        if int(ws.col_count.text) != len(header_names):
            raise GDError('Number of headers ({}) does not match columns in '
                          'spreadsheet ({})!'
                          .format(len(header_names), int(ws.col_count.text)))
        query = CellQuery(params={'min-row': '1', 'max-row': '1',
                                  'min-col': '1',
                                  'max-col': str(len(header_names))})
        cells = self.gd_client.GetCellsFeed(self.gd_cur_ss_key,
                                            self.gd_cur_ws_id, query=query)
        if len(cells.entry) == 0:
            query = CellQuery(params={'min-row': '1', 'max-row': '1',
                                      'min-col': '1',
                                      'max-col': str(len(header_names)),
                                      'return-empty': 'true'})
            cells = self.gd_client.GetCellsFeed(self.gd_cur_ss_key,
                                                self.gd_cur_ws_id, query=query)
        batchRequest = SpreadsheetsCellsFeed()
        for idx, val in enumerate(header_names):
            cells.entry[idx].cell.inputValue = val
            batchRequest.AddUpdate(cells.entry[idx])
        updated = self.gd_client.ExecuteBatch(batchRequest,
                                              cells.GetBatchLink().href)

    def delete_rows(self, row_list):
        '''Given a list of row numbers, delete the corresponding rows from the
        current sheet.
        @param row_list List of integers in 0:num_rows
        '''
        if len(row_list) == 0:
            return
        self.__ss_check()
        list_feed = self.gd_client.GetListFeed(self.gd_cur_ss_key,
                                               self.gd_cur_ws_id)
        ordered_list = sorted(row_list, reverse=True)
        if ordered_list[0] > int(list_feed.total_results.text):
            raise GDError('Tried to delete row {} but highest row number in '
                          'sheet is {}!'
                          .format(ordered_list[0],
                                  list_feed.total_results.text))
        for row in ordered_list:
            if not self.gd_client.DeleteRow(list_feed.entry[row]):
                raise GDError('Failed to delete row {} (partway through '
                              'list: {})'.format(row, ordered_list))

    def insert_rows(self, info_dict_list):
        '''Add a group of rows based on a list of dictionaries.
        @param info_dict_list List of dictionaries with keys for any headers
            with non-blank info in the new row (all missing keys will have
            blank data).
        '''
        self.__ss_check()
        # Check and make sure that none of the input dictionaries contains
        # keys not in the header row of the sheet.
        hdr_list = [self.__header_to_key(hdr) for hdr in self.row_as_list(1)]
        bad_row = next((dd for dd in info_dict_list
                        if len(set(dd.keys()) - set(hdr_list)) > 0), None)
        if bad_row is not None:
            raise GDError('Failed to insert row {} because it contains keys '
                          'not in spreadsheet headers ({})!'
                          .format(bad_row, hdr_list))
        list_feed = self.gd_client.GetListFeed(self.gd_cur_ss_key,
                                               self.gd_cur_ws_id)
        for inf in info_dict_list:
            self.gd_client.InsertRow(inf, self.gd_cur_ss_key,
                                     self.gd_cur_ws_id)

    def column_as_list(self, column, with_header=False):
        '''Read just a single column into a list of strings. Ignore the first
        row by default because it's the column header.
        '''
        self.__ss_check()
        if with_header:
            minrow = 1
        else:
            minrow = 2
        ws = self.gd_client.GetWorksheetsFeed(self.gd_cur_ss_key,
                                              self.gd_cur_ws_id)
        if int(ws.row_count.text) < minrow:
            return []
        q = CellQuery(params={'min-row': str(minrow), 'min-col': str(column),
                              'max-col': str(column)})
        cells = self.gd_client.GetCellsFeed(self.gd_cur_ss_key,
                                            self.gd_cur_ws_id, query=q)
        return [cellentry.cell.text for cellentry in cells.entry]

    def row_as_list(self, row):
        '''Read just a single row into a list of strings.
        NOTE: this is indexed by cell, so row 1 is the header row!
        '''
        self.__ss_check()
        mincol = 1
        ws = self.gd_client.GetWorksheetsFeed(self.gd_cur_ss_key,
                                              self.gd_cur_ws_id)
        if int(ws.col_count.text) < mincol:
            return []
        q = CellQuery(params={'min-row': str(row), 'min-col': str(mincol),
                              'max-row': str(row)})
        cells = self.gd_client.GetCellsFeed(self.gd_cur_ss_key,
                                            self.gd_cur_ws_id, query=q)
        return [cellentry.cell.text for cellentry in cells.entry]

    def read_to_list(self, num_lines=None):
        """Read the sheet into a list of dictionaries of strings, one per
        row, keyed by column header (plus 'rowname' for the title column).
        """
        self.__ss_check()
        if num_lines is not None:
            q = DocumentQuery(params={'max-results': '%d' % num_lines})
            list_feed = self.gd_client.GetListFeed(self.gd_cur_ss_key,
                                                   self.gd_cur_ws_id, query=q)
        else:
            list_feed = self.gd_client.GetListFeed(self.gd_cur_ss_key,
                                                   self.gd_cur_ws_id)
        string_list = []
        for i, entry in enumerate(list_feed.entry):
            row = {key: entry.custom[key].text for key in entry.custom}
            row['rowname'] = entry.title.text
            string_list.append(row)
        return string_list

    def read_to_dict(self, key_column_name, row_start=None, row_num=None):
        """Read the sheet into a dictionary with keys given by the named
        column.
        NOTE: redundant key values are collected and returned as an error
        string (rather than raising).
        """
        self.__ss_check()
        # Use this for testing to limit number of results handled:
        params = {}
        if row_start is not None:
            params['start-index'] = '%d' % row_start
        if row_num is not None:
            params['max-results'] = '%d' % row_num
        if len(params) != 0:
            q = ListQuery(params=params)
            list_feed = self.gd_client.GetListFeed(self.gd_cur_ss_key,
                                                   self.gd_cur_ws_id, query=q)
        else:
            list_feed = self.gd_client.GetListFeed(self.gd_cur_ss_key,
                                                   self.gd_cur_ws_id)
        multi_rows = []
        string_dict = {}
        for i, entry in enumerate(list_feed.entry):
            # If we get the title column, ignore it.
            if (entry.custom[key_column_name].text.replace(' ', '').lower()
                    == key_column_name):
                continue
            row = {key: entry.custom[key].text for key in entry.custom
                   if key != key_column_name}
            row['rowname'] = entry.title.text
            key_name = entry.custom[key_column_name].text
            if key_name in string_dict:
                multi_rows.append(key_name)
            string_dict[key_name] = row
        if len(multi_rows) > 0:
            errors = ('read_to_dict -- Column {} contains multiple rows with '
                      'each of the following values: {}'
                      .format(key_column_name, multi_rows))
            # raise GDError(errors)
        else:
            errors = None
        return string_dict, errors
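
# A hedged end-to-end sketch of driving GDataClient. The credentials,
# document and worksheet names, and row contents are placeholders, and
# GDATA_SOURCE / GDError are assumed to be defined alongside the class, as
# the code above implies.
def sync_example():
    """Open a named document/worksheet, append a row, and read it back."""
    gd = GDataClient('user@example.com', 'app-password',
                     document_name='Inventory',   # hypothetical document
                     worksheet_name='Items')      # hypothetical worksheet
    # Keys must match the sheet's column headers after the list-feed
    # normalization (lower-cased, punctuation stripped), e.g. a "Part Number"
    # header becomes the key 'partnumber'.
    gd.insert_rows([{'partnumber': 'A-100', 'quantity': '3'}])
    return gd.read_to_dict('partnumber')

# rows, errors = sync_example()
# if errors:
#     print(errors)
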