Пример #1
0
    def process_lines(self, lines):
        """
        Convert the given input into a list of SoupString rows
        for further processing.
        """

        try:
            from bs4 import BeautifulSoup
        except ImportError:
            raise core.OptionalTableImportError(
                'BeautifulSoup must be '
                'installed to read HTML tables')

        if 'parser' not in self.html:
            soup = BeautifulSoup('\n'.join(lines))
        else:  # use a custom backend parser
            soup = BeautifulSoup('\n'.join(lines), self.html['parser'])
        tables = soup.find_all('table')
        for i, possible_table in enumerate(tables):
            if html.identify_table(possible_table, self.html, i + 1):
                table = possible_table  # Find the correct table
                break
        else:
            if isinstance(self.html['table_id'], int):
                err_descr = 'number {0}'.format(self.html['table_id'])
            else:
                err_descr = "id '{0}'".format(self.html['table_id'])
            raise core.InconsistentTableError(
                'ERROR: HTML table {0} not found'.format(err_descr))

        self.html['attrs'] = table.attrs
        # Get all table rows
        soup_list = [html.SoupString(x) for x in table.find_all('tr')]

        return soup_list
Пример #2
0
	def __init__(self, tap_url="http://gaia.esac.esa.int/tap-server/tap/g10_smc", table_name=None):
		logger.debug("tap url: %r", tap_url)
		self.tap_url = tap_url
		self.table_name = table_name
		if table_name is None: # let us try to infer the table name
			if tap_url.endswith("tap") or tap_url.endswith("tap/"):
				pass # this mean we really didn't provide one
			else:
				index = tap_url.rfind("tap/")
				if index != -1:
					self.tap_url, self.table_name = tap_url[:index+4], self.tap_url[index+4:]
					logger.debug("inferred url is %s, and table name is %s", self.tap_url, self.table_name)

		if self.tap_url.startswith("tap+"): # remove tap+ part from tap+http(s), only keep http(s) part
			self.tap_url = self.tap_url[len("tap+"):]
		import requests
		super(DatasetTap, self).__init__(self.table_name)
		self.req = requests.request("get", self.tap_url+"/tables/")
		self.path = "tap+" +self.tap_url + "/" + table_name

		#print dir(self.req)
		from bs4 import BeautifulSoup
			#self.soup = BeautifulSoup(req.response)
		tables = BeautifulSoup(self.req.content, 'xml')
		self.tap_tables = collections.OrderedDict()
		for table in tables.find_all("table"):
			#print table.find("name").string, table.description.string, table["gaiatap:size"]
			table_name = unicode(table.find("name").string)
			table_size = int(table["esatapplus:size"])
			#print table_name, table_size
			logger.debug("tap table %r ", table_name)
			columns = []
			for column in table.find_all("column"):
				column_name = unicode(column.find("name").string)
				column_type = unicode(column.dataType.string)
				ucd = column.ucd.string if column.ucd else None
				unit = column.unit.string if column.unit else None
				description = column.description.string if column.description else None
				#print "\t", column_name, column_type, ucd
				#types.add()
				columns.append((column_name, column_type, ucd, unit, description))
			self.tap_tables[table_name] = (table_size, columns)
		if not self.tap_tables:
			raise ValueError("no tables or wrong url")
		for name, (table_size, columns) in self.tap_tables.items():
			logger.debug("table %s has length %d", name, table_size)
		self._full_length, self._tap_columns = self.tap_tables[self.table_name]
		self._length = self._full_length
		logger.debug("selected table table %s has length %d", self.table_name, self._full_length)
		#self.column_names = []
		#self.columns = collections.OrderedDict()
		for column_name, column_type, ucd, unit, description in self._tap_columns:
			logger.debug("  column %s has type %s and ucd %s, unit %s and description %s", column_name, column_type, ucd, unit, description)
			if column_type in self.type_map.keys():
				self.column_names.append(column_name)
				if ucd:
					self.ucds[column_name] = ucd
				if unit:
					self.units[column_name] = unit
				if description:
					self.descriptions[column_name] = description
				self.columns[column_name] = self.TapColumn(self, column_name, column_type, ucd)
			else:
				logger.warning("  type of column %s is not supported, it will be skipped", column_name)
Пример #3
0
	def __init__(self, tap_url="http://gaia.esac.esa.int/tap-server/tap/g10_smc", table_name=None):
		logger.debug("tap url: %r", tap_url)
		self.tap_url = tap_url
		self.table_name = table_name
		if table_name is None: # let us try to infer the table name
			if tap_url.endswith("tap") or tap_url.endswith("tap/"):
				pass # this mean we really didn't provide one
			else:
				index = tap_url.rfind("tap/")
				if index != -1:
					self.tap_url, self.table_name = tap_url[:index+4], self.tap_url[index+4:]
					logger.debug("inferred url is %s, and table name is %s", self.tap_url, self.table_name)

		if self.tap_url.startswith("tap+"): # remove tap+ part from tap+http(s), only keep http(s) part
			self.tap_url = self.tap_url[len("tap+"):]
		import requests
		super(DatasetTap, self).__init__(self.table_name)
		self.req = requests.request("get", self.tap_url+"/tables/")
		self.path = "tap+" +self.tap_url + "/" + table_name

		#print dir(self.req)
		from bs4 import BeautifulSoup
			#self.soup = BeautifulSoup(req.response)
		tables = BeautifulSoup(self.req.content, 'xml')
		self.tap_tables = collections.OrderedDict()
		for table in tables.find_all("table"):
			#print table.find("name").string, table.description.string, table["gaiatap:size"]
			table_name = unicode(table.find("name").string)
			table_size = int(table["esatapplus:size"])
			#print table_name, table_size
			logger.debug("tap table %r ", table_name)
			columns = []
			for column in table.find_all("column"):
				column_name = unicode(column.find("name").string)
				column_type = unicode(column.dataType.string)
				ucd = column.ucd.string if column.ucd else None
				unit = column.unit.string if column.unit else None
				description = column.description.string if column.description else None
				#print "\t", column_name, column_type, ucd
				#types.add()
				columns.append((column_name, column_type, ucd, unit, description))
			self.tap_tables[table_name] = (table_size, columns)
		if not self.tap_tables:
			raise ValueError("no tables or wrong url")
		for name, (table_size, columns) in self.tap_tables.items():
			logger.debug("table %s has length %d", name, table_size)
		self._full_length, self._tap_columns = self.tap_tables[self.table_name]
		self._length = self._full_length
		logger.debug("selected table table %s has length %d", self.table_name, self._full_length)
		#self.column_names = []
		#self.columns = collections.OrderedDict()
		for column_name, column_type, ucd, unit, description in self._tap_columns:
			logger.debug("  column %s has type %s and ucd %s, unit %s and description %s", column_name, column_type, ucd, unit, description)
			if column_type in self.type_map.keys():
				self.column_names.append(column_name)
				if ucd:
					self.ucds[column_name] = ucd
				if unit:
					self.units[column_name] = unit
				if description:
					self.descriptions[column_name] = description
				self.columns[column_name] = self.TapColumn(self, column_name, column_type, ucd)
			else:
				logger.warning("  type of column %s is not supported, it will be skipped", column_name)