def extract_rows(self, *args, **kwargs):
    """
    Row data extraction for extract_tabular.

    The CSS selector is applied to ``self.tree`` and the matched elements are
    paired, in order, with ``table_headers`` to fill one row dictionary.
    Matches beyond the number of headers are ignored; headers without a
    matching element receive ``default``.

    :param result: Dictionary the extracted values are written into (updated in place)
    :param selector: The CSS selector expression
    :param table_headers: Header names, one per selected element
    :param attr: The attribute to be extracted from each element ("text" if omitted)
    :param connector: String connector for the text fragments of an element
    :param default: Value used when a header has no matching element
    :param verbosity: Progress messages are printed when greater than 1
    :return: A list containing the single row dictionary
    :raises Exception: if a TypeError occurs while selecting or pairing the data
    """
    result = kwargs.get('result', {})
    selector = kwargs.get('selector', '')
    table_headers = kwargs.get('table_headers', [])
    attr = kwargs.get('attr', 'text')
    connector = kwargs.get('connector', '')
    default = kwargs.get('default', '')
    verbosity = kwargs.get('verbosity', 0)

    result_list = []
    # Marks a header that has no matching element, so that it can be told
    # apart from a real element in the non-text branch below.
    missing = object()
    try:
        sel = cssselect.CSSSelector(selector)
        values = sel(self.tree)
        # Pad up to the number of headers; zip() then drops surplus matches.
        # A negative pad count simply yields an empty list.
        cells = list(values) + [missing] * (len(table_headers) - len(values))
        for head, val in zip(table_headers, cells):
            if verbosity > 1:
                print("\nExtracting", head, "attribute", sep=' ', end='')
            if attr == "text":
                try:
                    content = connector.join(
                        [make_ascii(x).strip() for x in val.itertext()])
                except Exception:
                    # Best effort: anything without usable text (including a
                    # missing cell) falls back to the default value.
                    content = default
                content = content.replace("\n", " ").strip()
            elif val is missing:
                # Previously this path called .get() on the fill value and
                # crashed with AttributeError.
                content = default
            else:
                content = val.get(attr)
                if attr in ["href", "src"]:
                    content = urljoin(self.url, content)
            result[head] = content
        result_list.append(result)
    except TypeError:
        # str() keeps the message usable when the selector is not a string;
        # plain concatenation would raise a second TypeError here.
        raise Exception("Selector expression string to be provided. Got " + str(selector))
    return result_list
def extract_rows(self, result=None, selector='', table_headers=[], attr='', connector='',
                 default='', verbosity=0, *args, **kwargs):
    """
    Row data extraction for extract_tabular.

    The elements returned by ``self.get_tree_tag(selector)`` are paired, in
    order, with ``table_headers`` to fill one row dictionary. Matches beyond
    the number of headers are ignored; headers without a matching element
    receive ``default``.

    :param result: Dictionary the extracted values are written into (updated \
    in place). A new dictionary is created for each call when omitted.
    :param selector: The selector expression
    :param table_headers: Header names, one per selected element (never modified)
    :param attr: The attribute to be extracted from each element
    :param connector: String connector for the text fragments of an element
    :param default: Value used when a header has no matching element
    :param verbosity: Progress messages are printed when greater than 1
    :return: A list containing the single row dictionary
    :raises Exception: if the selector is invalid, or a TypeError occurs while \
    selecting or pairing the data
    """
    # A ``{}`` default would be shared between calls and, because it is
    # written to and returned, every call would leak into the previous results.
    if result is None:
        result = {}

    result_list = []
    # Marks a header that has no matching element, so that it can be told
    # apart from a real element in the non-text branch below.
    missing = object()
    try:
        values = self.get_tree_tag(selector)
        # Pad up to the number of headers; zip() then drops surplus matches.
        # A negative pad count simply yields an empty list.
        cells = list(values) + [missing] * (len(table_headers) - len(values))
        for head, val in zip(table_headers, cells):
            if verbosity > 1:
                print("\nExtracting", head, "attribute", sep=' ', end='')
            if attr.lower() == "text":
                try:
                    content = connector.join(
                        [make_ascii(x).strip() for x in val.itertext()])
                except Exception:
                    # Best effort: anything without usable text (including a
                    # missing cell) falls back to the default value.
                    content = default
                content = content.replace("\n", " ").strip()
            elif val is missing:
                # Previously this path called .get() on the fill value and
                # crashed with AttributeError.
                content = default
            else:
                content = val.get(attr)
                if attr in ["href", "src"]:
                    content = urljoin(self.url, content)
            result[head] = content
        result_list.append(result)
    except XPathError:
        raise Exception("Invalid %s selector - %s" % (self.__selector_type__, selector))
    except TypeError:
        # str() keeps the message usable when the selector is not a string;
        # plain concatenation would raise a second TypeError here.
        raise Exception("Selector expression string to be provided. Got " + str(selector))
    return result_list
def extract_columns(self, *args, **kwargs): """ Column data extraction for extract_tabular """ result_list = [] result = kwargs.get('result', {}) try: if type(kwargs.get('selector', '')) in [str, unicode]: selectors = [kwargs.get('selector', '')] elif type(kwargs.get('selector', '')) == list: selectors = kwargs.get('selector', '') else: raise Exception( "Use a list of selector expressions for the various columns" ) from itertools import izip, count pairs = izip(kwargs.get('table_headers', []), selectors) columns = {} for head, selector in pairs: columns[head] = self.tree.xpath(selector) try: for i in count(start=0): r = result.copy() for head in columns.keys(): if kwargs.get('verbosity', 0) > 1: print("\nExtracting", head, "attribute", sep=' ', end='') col = columns[head][i] if kwargs.get('attr', 'text') == "text": try: content = kwargs.get('connector', '').join([ make_ascii(x).strip() for x in col.itertext() ]) except Exception: content = kwargs.get('default', '') content = content.replace("\n", " ").strip() else: content = col.get(kwargs.get('attr', 'text')) if kwargs.get('attr', 'text') in ["href", "src"]: content = urljoin(self.url, content) r[head] = content result_list.append(r) except IndexError: pass except XPathError: raise Exception("Invalid XPath selector " + selector) except TypeError: raise Exception("Selector expression string to be provided. Got " + selector) return result_list
def extract_content(self, selector='', attr='', default='', connector='', *args, **kwargs):
    """
    Method for performing the content extraction for the particular selector type.

    If the selector is "url", the URL of the current web page is returned.
    Otherwise, the selector expression is used to fetch a single tag through
    ``self.get_tree_tag``, and the requested attribute ("text", "href", etc.)
    is read from it. For "text", the text fragments of the tag are joined
    with ``connector``. If the attribute is "href" or "src", the value is
    joined with the page URL to convert a relative path into an absolute path.

    If the selector does not fetch any content, the default value is returned.
    If no default value is specified, an exception is raised.

    :param selector: The XPath expression
    :param attr: The attribute to be extracted from the selected tag
    :param default: The default value to be used if the selector does not return any data
    :param connector: String connector for list of data returned for a particular selector
    :return: The extracted content
    :raises Exception: if nothing is selected and no default is given, or if \
    the selector expression is invalid
    """
    try:
        if selector.lower() == "url":
            return self.url
        # Evaluated before the lookup so that argument errors surface in the
        # same order as before.
        wants_text = attr.lower() == "text"
        tag = self.get_tree_tag(selector=selector, get_one=True)
        if wants_text:
            content = connector.join([make_ascii(x).strip() for x in tag.itertext()])
            content = content.replace("\n", " ").strip()
        else:
            content = tag.get(attr)
            if attr in ["href", "src"]:
                content = urljoin(self.url, content)
        return content
    except IndexError:
        # Compare by value: an identity test against a string literal depends
        # on interpreter string interning and is not a reliable emptiness check.
        if default != "":
            return default
        raise Exception("There is no content for the %s selector - %s" % (self.__selector_type__, selector))
    except XPathError:
        raise Exception("Invalid %s selector - %s" % (self.__selector_type__, selector))
def extract_content(self, *args, **kwargs):
    """
    Method for performing the content extraction for the given CSS selector.

    The cssselect library is used to handle CSS selector expressions: the
    expression is wrapped in ``cssselect.CSSSelector`` and applied to the
    element tree corresponding to the fetched web page.

    If the selector is "url", the URL of the current web page is returned.
    Otherwise, the first tag matched by the selector is used, and the
    requested attribute ("text", "href", etc.) is read from it. For "text",
    the text fragments of the tag are joined with ``connector``. If the
    attribute is "href" or "src", the value is joined with the page URL to
    convert a relative path into an absolute path.

    If the selector does not fetch any content, the default value is returned.
    If no default value is specified, an exception is raised.

    :param selector: The CSS selector expression
    :param attr: The attribute to be extracted from the selected tag
    :param default: The default value to be used if the selector does not return any data
    :param connector: String connector for the text fragments of the selected tag
    :return: The extracted content
    :raises Exception: if nothing is selected and no default is given
    """
    selector = kwargs.get('selector', '')
    attr = kwargs.get('attr', '')
    default = kwargs.get('default', '')
    connector = kwargs.get('connector', '')
    try:
        if selector == "url":
            return self.url
        sel = cssselect.CSSSelector(selector)
        # Both branches need the first match; IndexError means no match.
        tag = sel(self.tree)[0]
        if attr == "text":
            content = connector.join([make_ascii(x).strip() for x in tag.itertext()])
            content = content.replace("\n", " ").strip()
        else:
            content = tag.get(attr)
            if attr in ["href", "src"]:
                content = urljoin(self.url, content)
        return content
    except IndexError:
        # Compare by value: an identity test against a string literal depends
        # on interpreter string interning and is not a reliable emptiness check.
        if default != "":
            return default
        raise Exception("There is no content for the selector " + selector)
def extract_rows(self, result=None, selector='', table_headers=[], attr='', connector='',
                 default='', verbosity=0, *args, **kwargs):
    """
    Row data extraction for extract_tabular.

    The elements returned by ``self.get_tree_tag(selector)`` are paired, in
    order, with ``table_headers`` to fill one row dictionary. Matches beyond
    the number of headers are ignored; headers without a matching element
    receive ``default``.

    :param result: Dictionary the extracted values are written into (updated \
    in place). A new dictionary is created for each call when omitted.
    :param selector: The selector expression
    :param table_headers: Header names, one per selected element (never modified)
    :param attr: The attribute to be extracted from each element
    :param connector: String connector for the text fragments of an element
    :param default: Value used when a header has no matching element
    :param verbosity: Progress messages are printed when greater than 1
    :return: A list containing the single row dictionary
    :raises Exception: if the selector is invalid, or a TypeError occurs while \
    selecting or pairing the data
    """
    # A ``{}`` default would be shared between calls and, because it is
    # written to and returned, every call would leak into the previous results.
    if result is None:
        result = {}

    result_list = []
    # Marks a header that has no matching element, so that it can be told
    # apart from a real element in the non-text branch below.
    missing = object()
    try:
        values = self.get_tree_tag(selector)
        # Pad up to the number of headers; zip() then drops surplus matches.
        # A negative pad count simply yields an empty list.
        cells = list(values) + [missing] * (len(table_headers) - len(values))
        for head, val in zip(table_headers, cells):
            if verbosity > 1:
                print("\nExtracting", head, "attribute", sep=' ', end='')
            if attr.lower() == "text":
                try:
                    content = connector.join(
                        [make_ascii(x).strip() for x in val.itertext()])
                except Exception:
                    # Best effort: anything without usable text (including a
                    # missing cell) falls back to the default value.
                    content = default
                content = content.replace("\n", " ").strip()
            elif val is missing:
                # Previously this path called .get() on the fill value and
                # crashed with AttributeError.
                content = default
            else:
                content = val.get(attr)
                if attr in ["href", "src"]:
                    content = urljoin(self.url, content)
            result[head] = content
        result_list.append(result)
    except XPathError:
        raise Exception("Invalid %s selector - %s" % (self.__selector_type__, selector))
    except TypeError:
        # str() keeps the message usable when the selector is not a string;
        # plain concatenation would raise a second TypeError here.
        raise Exception("Selector expression string to be provided. Got " + str(selector))
    return result_list
def extract_columns(self, *args, **kwargs):
    """
    Column data extraction for extract_tabular.

    Each header in ``table_headers`` is paired with one CSS selector. The
    elements selected for every header form a column, and one dictionary is
    produced per row index. Extraction stops at the first row index for which
    some column has no element, so the shortest column decides the row count.

    :param result: Dictionary copied as the starting point of every row
    :param selector: A CSS selector expression, or a list of expressions (one per header)
    :param table_headers: Header names, one per column
    :param attr: The attribute to be extracted from each element ("text" if omitted)
    :param connector: String connector for the text fragments of an element
    :param default: Value used when the text of an element cannot be extracted
    :param verbosity: Progress messages are printed when greater than 1
    :return: A list of row dictionaries
    :raises Exception: if the selector is neither a string nor a list, or a \
    TypeError occurs while selecting the data
    """
    from itertools import count

    try:
        text_types = (str, unicode)  # Python 2
    except NameError:
        text_types = (str,)  # Python 3 has no separate unicode type

    result = kwargs.get('result', {})
    # Bound up front so the error handler can always report it; the loop
    # below rebinds it to the expression currently being evaluated.
    selector = kwargs.get('selector', '')
    table_headers = kwargs.get('table_headers', [])
    attr = kwargs.get('attr', 'text')
    connector = kwargs.get('connector', '')
    default = kwargs.get('default', '')
    verbosity = kwargs.get('verbosity', 0)

    result_list = []
    try:
        if isinstance(selector, text_types):
            selectors = [selector]
        elif isinstance(selector, list):
            selectors = selector
        else:
            raise Exception("Use a list of selector expressions for the various columns")
        columns = {}
        for head, selector in zip(table_headers, selectors):
            sel = cssselect.CSSSelector(selector)
            columns[head] = sel(self.tree)
        # Without any column nothing can ever raise IndexError, so the row
        # loop would never terminate; skip it entirely in that case.
        if columns:
            try:
                for i in count():
                    r = result.copy()
                    for head in columns:
                        if verbosity > 1:
                            print("\nExtracting", head, "attribute", sep=' ', end='')
                        # IndexError here means a column is exhausted.
                        col = columns[head][i]
                        if attr == "text":
                            try:
                                content = connector.join(
                                    [make_ascii(x).strip() for x in col.itertext()])
                            except Exception:
                                # Best effort: fall back to the default value.
                                content = default
                            content = content.replace("\n", " ").strip()
                        else:
                            content = col.get(attr)
                            if attr in ["href", "src"]:
                                content = urljoin(self.url, content)
                        r[head] = content
                    result_list.append(r)
            except IndexError:
                pass
    except TypeError:
        # str() keeps the message usable when the selector is not a string;
        # plain concatenation would raise a second TypeError here.
        raise Exception("Selector expression string to be provided. Got " + str(selector))
    return result_list
def extract_columns(self, result={}, selector='', table_headers=[], attr='', connector='', default='', verbosity=0, *args, **kwargs): """ Column data extraction for extract_tabular """ result_list = [] try: if type(selector) in [str, unicode]: selectors = [selector] elif type(selector) == list: selectors = selector[:] else: raise Exception("Use a list of selector expressions for the various columns") from itertools import izip, count pairs = izip(table_headers, selectors) columns = {} for head, selector in pairs: columns[head] = self.get_tree_tag(selector) try: for i in count(start=0): r = result.copy() for head in columns.keys(): if verbosity > 1: print("\nExtracting", head, "attribute", sep=' ', end='') col = columns[head][i] if attr == "text": try: content = connector.join([make_ascii(x).strip() for x in col.itertext()]) except Exception: content = default content = content.replace("\n", " ").strip() else: content = col.get(attr) if attr in ["href", "src"]: content = urljoin(self.url, content) r[head] = content result_list.append(r) except IndexError: pass except XPathError: raise Exception("Invalid %s selector - %s" % (self.__selector_type__, selector)) except TypeError: raise Exception("Selector expression string to be provided. Got " + selector) return result_list
def extract_content(self, *args, **kwargs):
    """
    Method for performing the content extraction for the given CSS selector.

    The cssselect library is used to handle CSS selector expressions: the
    expression is wrapped in ``cssselect.CSSSelector`` and applied to the
    element tree corresponding to the fetched web page.

    If the selector is "url", the URL of the current web page is returned.
    Otherwise, the first tag matched by the selector is used, and the
    requested attribute ("text", "href", etc.) is read from it. For "text",
    the text fragments of the tag are joined with ``connector``. If the
    attribute is "href" or "src", the value is joined with the page URL to
    convert a relative path into an absolute path.

    If the selector does not fetch any content, the default value is returned.
    If no default value is specified, an exception is raised.

    :param selector: The CSS selector expression
    :param attr: The attribute to be extracted from the selected tag
    :param default: The default value to be used if the selector does not return any data
    :param connector: String connector for the text fragments of the selected tag
    :return: The extracted content
    :raises Exception: if nothing is selected and no default is given
    """
    selector = kwargs.get('selector', '')
    attr = kwargs.get('attr', '')
    default = kwargs.get('default', '')
    connector = kwargs.get('connector', '')
    try:
        if selector == "url":
            return self.url
        sel = cssselect.CSSSelector(selector)
        # Both branches need the first match; IndexError means no match.
        tag = sel(self.tree)[0]
        if attr == "text":
            content = connector.join([make_ascii(x).strip() for x in tag.itertext()])
            content = content.replace("\n", " ").strip()
        else:
            content = tag.get(attr)
            if attr in ["href", "src"]:
                content = urljoin(self.url, content)
        return content
    except IndexError:
        # Compare by value: an identity test against a string literal depends
        # on interpreter string interning and is not a reliable emptiness check.
        if default != "":
            return default
        raise Exception("There is no content for the selector " + selector)
def extract_rows(self, *args, **kwargs):
    """
    Row data extraction for extract_tabular.

    The XPath expression is evaluated on ``self.tree`` and the matched
    elements are paired, in order, with ``table_headers`` to fill one row
    dictionary. Matches beyond the number of headers are ignored; headers
    without a matching element receive ``default``.

    :param result: Dictionary the extracted values are written into (updated in place)
    :param selector: The XPath expression
    :param table_headers: Header names, one per selected element
    :param attr: The attribute to be extracted from each element ("text" if omitted)
    :param connector: String connector for the text fragments of an element
    :param default: Value used when a header has no matching element
    :param verbosity: Progress messages are printed when greater than 1
    :return: A list containing the single row dictionary
    :raises Exception: if the XPath expression is invalid, or a TypeError \
    occurs while selecting or pairing the data
    """
    result = kwargs.get('result', {})
    selector = kwargs.get('selector', '')
    table_headers = kwargs.get('table_headers', [])
    attr = kwargs.get('attr', 'text')
    connector = kwargs.get('connector', '')
    default = kwargs.get('default', '')
    verbosity = kwargs.get('verbosity', 0)

    result_list = []
    # Marks a header that has no matching element, so that it can be told
    # apart from a real element in the non-text branch below.
    missing = object()
    try:
        values = self.tree.xpath(selector)
        # Pad up to the number of headers; zip() then drops surplus matches.
        # A negative pad count simply yields an empty list.
        cells = list(values) + [missing] * (len(table_headers) - len(values))
        for head, val in zip(table_headers, cells):
            if verbosity > 1:
                print("\nExtracting", head, "attribute", sep=' ', end='')
            if attr == "text":
                try:
                    content = connector.join(
                        [make_ascii(x).strip() for x in val.itertext()])
                except Exception:
                    # Best effort: anything without usable text (including a
                    # missing cell) falls back to the default value.
                    content = default
                content = content.replace("\n", " ").strip()
            elif val is missing:
                # Previously this path called .get() on the fill value and
                # crashed with AttributeError.
                content = default
            else:
                content = val.get(attr)
                if attr in ["href", "src"]:
                    content = urljoin(self.url, content)
            result[head] = content
        result_list.append(result)
    except XPathError:
        raise Exception("Invalid XPath selector " + str(selector))
    except TypeError:
        # str() keeps the message usable when the selector is not a string;
        # plain concatenation would raise a second TypeError here.
        raise Exception("Selector expression string to be provided. Got " + str(selector))
    return result_list
def extract_content(self, selector='', attr='', default='', connector='', *args, **kwargs):
    """
    Method for performing the content extraction for the particular selector type.

    If the selector is "url", the URL of the current web page is returned.
    Otherwise, the selector expression is used to fetch a single tag through
    ``self.get_tree_tag``, and the requested attribute ("text", "href", etc.)
    is read from it. For "text", the text fragments of the tag are joined
    with ``connector``. If the attribute is "href" or "src", the value is
    joined with the page URL to convert a relative path into an absolute path.

    If the selector does not fetch any content, the default value is returned.
    If no default value is specified, an exception is raised.

    :param selector: The XPath expression
    :param attr: The attribute to be extracted from the selected tag
    :param default: The default value to be used if the selector does not return any data
    :param connector: String connector for list of data returned for a particular selector
    :return: The extracted content
    :raises Exception: if nothing is selected and no default is given, or if \
    the selector expression is invalid
    """
    try:
        if selector.lower() == "url":
            return self.url
        # Evaluated before the lookup so that argument errors surface in the
        # same order as before.
        wants_text = attr.lower() == "text"
        tag = self.get_tree_tag(selector=selector, get_one=True)
        if wants_text:
            content = connector.join([make_ascii(x).strip() for x in tag.itertext()])
            content = content.replace("\n", " ").strip()
        else:
            content = tag.get(attr)
            if attr in ["href", "src"]:
                content = urljoin(self.url, content)
        return content
    except IndexError:
        # Compare by value: an identity test against a string literal depends
        # on interpreter string interning and is not a reliable emptiness check.
        if default != "":
            return default
        raise Exception("There is no content for the %s selector - %s" % (self.__selector_type__, selector))
    except XPathError:
        raise Exception("Invalid %s selector - %s" % (self.__selector_type__, selector))
def extract_columns(self, result={}, selector='', table_headers=[], attr='', connector='', default='', verbosity=0, *args, **kwargs): """ Column data extraction for extract_tabular """ result_list = [] try: if type(selector) in [str, unicode]: selectors = [selector] elif type(selector) == list: selectors = selector[:] else: raise Exception( "Use a list of selector expressions for the various columns" ) from itertools import izip, count pairs = izip(table_headers, selectors) columns = {} for head, selector in pairs: columns[head] = self.get_tree_tag(selector) try: for i in count(start=0): r = result.copy() for head in columns.keys(): if verbosity > 1: print("\nExtracting", head, "attribute", sep=' ', end='') col = columns[head][i] if attr == "text": try: content = connector.join([ make_ascii(x).strip() for x in col.itertext() ]) except Exception: content = default content = content.replace("\n", " ").strip() else: content = col.get(attr) if attr in ["href", "src"]: content = urljoin(self.url, content) r[head] = content result_list.append(r) except IndexError: pass except XPathError: raise Exception("Invalid %s selector - %s" % (self.__selector_type__, selector)) except TypeError: raise Exception("Selector expression string to be provided. Got " + selector) return result_list