def process_lines(self, lines):
    """
    Parse the HTML input and return the rows of the selected table.

    The input lines are joined with newlines and handed to BeautifulSoup.
    If ``self.html`` has a ``'parser'`` entry it is passed as the backend
    parser; otherwise no explicit parser argument is given. Every
    ``<table>`` element is offered to ``html.identify_table`` together
    with its 1-based position in the document, and the first one accepted
    is used. Its attributes are stored in ``self.html['attrs']``.

    Returns a list with one ``html.SoupString`` per ``<tr>`` element found
    in the selected table.

    Raises ``core.OptionalTableImportError`` if BeautifulSoup cannot be
    imported, and ``core.InconsistentTableError`` if no table is accepted.
    """
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        raise core.OptionalTableImportError(
            'BeautifulSoup must be installed to read HTML tables')

    markup = '\n'.join(lines)
    if 'parser' in self.html:
        # use a custom backend parser
        soup = BeautifulSoup(markup, self.html['parser'])
    else:
        soup = BeautifulSoup(markup)

    # Lazily test each candidate in document order; stop at the first match.
    table = next(
        (candidate
         for position, candidate in enumerate(soup.find_all('table'), 1)
         if html.identify_table(candidate, self.html, position)),
        None)

    if table is None:
        wanted = self.html['table_id']
        if isinstance(wanted, int):
            err_descr = 'number {0}'.format(wanted)
        else:
            err_descr = "id '{0}'".format(wanted)
        raise core.InconsistentTableError(
            'ERROR: HTML table {0} not found'.format(err_descr))

    self.html['attrs'] = table.attrs

    # Wrap every row of the chosen table for the downstream splitters.
    return [html.SoupString(row) for row in table.find_all('tr')]
def __init__(self, tap_url="http://gaia.esac.esa.int/tap-server/tap/g10_smc", table_name=None):
    """Query a TAP service for its table metadata and set up one table.

    :param tap_url: url of the TAP service. It may carry a ``tap+`` prefix
        (which is removed) and, when ``table_name`` is None, may be followed
        by the table name after the last ``tap/``.
    :param table_name: name of the table to use. When None, it is inferred
        from the part of ``tap_url`` after the last ``tap/``.
    :raises ValueError: if no table name is given and none can be inferred
        from the url, or if the service metadata contains no tables.
    :raises KeyError: if the selected table is not among the tables listed
        by the service.
    """
    logger.debug("tap url: %r", tap_url)
    self.tap_url = tap_url
    self.table_name = table_name
    if table_name is None:  # let us try to infer the table name
        # a url ending in "tap" or "tap/" really did not provide a table name
        if not tap_url.endswith(("tap", "tap/")):
            index = tap_url.rfind("tap/")
            if index != -1:
                self.tap_url = tap_url[:index + 4]
                self.table_name = tap_url[index + 4:]
                logger.debug("inferred url is %s, and table name is %s", self.tap_url, self.table_name)
    if self.table_name is None:
        # fail with a clear message instead of a TypeError when building self.path
        raise ValueError("no table name given and none could be inferred from url %r" % (tap_url,))

    if self.tap_url.startswith("tap+"):
        # remove tap+ part from tap+http(s), only keep http(s) part
        self.tap_url = self.tap_url[len("tap+"):]
    # Drop trailing slashes so the url joins below contain a single "/";
    # otherwise self.path could not be parsed back by the inference above.
    self.tap_url = self.tap_url.rstrip("/")

    import requests
    super(DatasetTap, self).__init__(self.table_name)
    self.req = requests.request("get", self.tap_url + "/tables/")
    # use the resolved name: the table_name argument is None when it was inferred
    self.path = "tap+" + self.tap_url + "/" + self.table_name

    from bs4 import BeautifulSoup
    tables = BeautifulSoup(self.req.content, 'xml')

    # maps table name -> (size, [(name, type, ucd, unit, description), ...])
    self.tap_tables = collections.OrderedDict()
    for table in tables.find_all("table"):
        listed_name = unicode(table.find("name").string)
        listed_size = int(table["esatapplus:size"])
        logger.debug("tap table %r ", listed_name)
        columns = []
        for column in table.find_all("column"):
            column_name = unicode(column.find("name").string)
            column_type = unicode(column.dataType.string)
            # these child elements may be absent, in which case None is stored
            ucd = column.ucd.string if column.ucd else None
            unit = column.unit.string if column.unit else None
            description = column.description.string if column.description else None
            columns.append((column_name, column_type, ucd, unit, description))
        self.tap_tables[listed_name] = (listed_size, columns)

    if not self.tap_tables:
        raise ValueError("no tables or wrong url")
    for name, (table_size, columns) in self.tap_tables.items():
        logger.debug("table %s has length %d", name, table_size)

    if self.table_name not in self.tap_tables:
        raise KeyError("table %r not found at %s, available tables: %s"
                       % (self.table_name, self.tap_url, ", ".join(self.tap_tables)))
    self._full_length, self._tap_columns = self.tap_tables[self.table_name]
    self._length = self._full_length
    logger.debug("selected table table %s has length %d", self.table_name, self._full_length)

    for column_name, column_type, ucd, unit, description in self._tap_columns:
        logger.debug(" column %s has type %s and ucd %s, unit %s and description %s",
                     column_name, column_type, ucd, unit, description)
        if column_type in self.type_map:
            self.column_names.append(column_name)
            if ucd:
                self.ucds[column_name] = ucd
            if unit:
                self.units[column_name] = unit
            if description:
                self.descriptions[column_name] = description
            self.columns[column_name] = self.TapColumn(self, column_name, column_type, ucd)
        else:
            # only types present in self.type_map get a column
            logger.warning(" type of column %s is not supported, it will be skipped", column_name)