def __init__(self, identity, text, url, harvest_details):
    self.text = text
    self.identity = identity
    self.url = url
    self.harvest_details = harvest_details

    # parse
    self.parser = Parser(text)
    self.parse()
def process_response(self, data):
    '''
    data here is just the content from the cleaned result set

    strip punctuation (modified version)
    tokenize
    strip stopwords
    '''
    def _strip_punctuation(text, simple_pattern=r'[;|>+:=#@%<?(){}`\'"]'):
        text = re.sub(simple_pattern, ' ', text)
        return text.replace("/", ' ')

    content = data['content']

    if self.include_structure:
        # include the xml tags, etc
        # note: this uses a different punctuation set
        bow = _strip_punctuation(content)
        # so this runs without error in ipy but not here.
        # TODO: fix that
        words = tokenize(bow)
        words = remove_stopwords(words)
        return words
    else:
        # pull out the text only
        parser = Parser(content)
        all_text = parser.find_nodes()

        # collapse to just text and attributes.text values
        # (note the comma in get('text', ''): without it the two string
        # literals concatenate and missing keys come back as None)
        bow = ' '.join([a.get('text', '') for a in all_text])
        # each node carries a list of attribute dicts; flatten that
        # list before pulling the text values
        atts = [att for a in all_text for att in a.get('attributes', [])]
        bow += ' ' + ' '.join([att.get('text', '') for att in atts])

        bow = remove_punctuation(bow)
        words = tokenize_text(bow)
        words = remove_stopwords(words)

        if len(words) < self.minimum_wordcount:
            return ''
        return ' '.join(words)
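# A minimal standalone sketch of the strip-punctuation/tokenize/de-stopword
# flow above, using simplified stand-ins for tokenize_text and
# remove_stopwords (the real helpers live elsewhere in this codebase):
import re

def _demo_pipeline(content, stopwords=('the', 'a', 'of')):
    # same punctuation pattern as _strip_punctuation above
    bow = re.sub(r'[;|>+:=#@%<?(){}`\'"]', ' ', content).replace('/', ' ')
    words = bow.split()  # stand-in tokenizer
    return ' '.join(w for w in words if w.lower() not in stopwords)

# _demo_pipeline('the <title>ocean/data</title>') -> 'title ocean data title'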
def create(self, doc):
    # the sha (generated from url if not in doc)
    self.source_url_sha = doc.get('url_hash')
    # TODO: verify that the pipeline output contains this
    self.source_url = doc.get('url')

    # this should come from the cleaned output of
    # the pipeline so that we don't need to keep
    # dealing with really junky strings
    # TODO: switch to pipeline clean content
    # cleaned_content = self._clean(doc.get('raw_content', ''))
    # try:
    #     parser = Parser(cleaned_content)
    #     cleaned_content = etree.tostring(parser.xml, pretty_print=True)
    #     fmt = 'xml'
    # except Exception as ex:
    #     print 'xml error', ex
    #     try:
    #         clean_json = json.loads(cleaned_content)
    #         fmt = 'json'
    #     except:
    #         fmt = 'unknown'
    fmt = doc.get('response_datatype', 'unknown')
    cleaned_content = doc.get('content')

    if fmt == 'xml':
        parser = Parser(cleaned_content.encode('utf-8'))
        try:
            self.namespaces = parser.namespaces
        except Exception as ex:
            print 'namespace error', ex
        try:
            self.schemas = self._pull_schemas(parser.xml)
        except Exception as ex:
            print 'schema error', ex
            traceback.print_exc()

    self.format = fmt
    self.cleaned_content = cleaned_content
    self.raw_content = doc.get('raw_content', '')
    self.raw_content_md5 = doc.get('digest', '')
    self.initial_harvest_date = doc.get('tstamp')
    self.host = doc.get('host', '')
    self.inlinks = doc.get('inlinks', [])
    self.outlinks = doc.get('outlinks', [])

    headers = doc.get('response_headers', [])
    self.headers = dict(
        (k.strip(), v.strip())
        for k, v in (h.split(':', 1) for h in headers)
    )
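# The response_headers handling above turns raw 'Key: Value' strings into a
# dict; a quick standalone check of that generator expression (note that a
# header line with no colon would raise ValueError on unpacking):
headers = ['Content-Type: text/xml; charset=utf-8', 'Server : Apache']
parsed = dict((k.strip(), v.strip())
              for k, v in (h.split(':', 1) for h in headers))
# parsed == {'Content-Type': 'text/xml; charset=utf-8', 'Server': 'Apache'}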
def __init__(self, yaml_files, source_content, source_url, **options):
    '''
    **options:
        parser: Parser from source_content
        ignore_case: bool
    '''
    self.yaml_files = yaml_files
    self.source_content = source_content
    self.source_url = source_url

    self.yaml = import_yaml_configs(self.yaml_files)
    self.parser = Parser(source_content)
def create(self, doc):
    # the sha (generated from url if not in doc)
    self.source_url_sha = doc.get('url_hash')
    # TODO: verify that the pipeline output contains this
    self.source_url = doc.get('url')

    fmt = doc.get('response_datatype', 'unknown')
    cleaned_content = doc.get('content')

    if fmt == 'xml':
        parser = Parser(cleaned_content.encode('utf-8'))
        if parser.xml is None:
            print self.source_url_sha, self.source_url
            fmt = 'xml;unparsed'

        # try:
        #     self.namespaces = parser.namespaces
        # except Exception as ex:
        #     print 'namespace error', ex
        # try:
        #     self.schemas = self._pull_schemas(parser.xml)
        # except Exception as ex:
        #     print 'schema error', ex
        #     traceback.print_exc()

    self.format = fmt
    self.cleaned_content = cleaned_content
    self.raw_content = doc.get('raw_content', '')
    self.raw_content_md5 = doc.get('digest', '')
    self.initial_harvest_date = doc.get('tstamp')
    self.host = doc.get('host', '')
    self.inlinks = doc.get('inlinks', [])
    self.outlinks = doc.get('outlinks', [])

    headers = doc.get('response_headers', [])
    self.headers = dict(
        (k.strip(), v.strip())
        for k, v in (h.split(':', 1) for h in headers)
    )

    self.response_identity = next(iter(doc.get('identity', [])), {})
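# next(iter(sequence), default) is the 'first element or default' idiom used
# for response_identity above; it never raises on an empty list:
print next(iter([{'protocol': 'OAI-PMH'}]), {})  # {'protocol': 'OAI-PMH'}
print next(iter([]), {})                         # {}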
def process_response(self, data):
    # do the response processing
    content = data['content'].encode('unicode_escape')
    parser = Parser(content)
    return parser.to_string()
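# encode('unicode_escape') rewrites non-ASCII characters as backslash
# escapes so the parser only ever sees ASCII bytes, e.g.:
print u'caf\xe9 <r/>'.encode('unicode_escape')  # caf\xe9 <r/>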
def _load_xml(self):
    self.parser = Parser(self._response)
def _instantiate(self, identity, response, url, harvest_details):
    '''
    set up the router
    '''
    # TODO: add a filter for known protocol with type
    #       data and bail
    identity = next(iter(identity), {})
    protocol = identity.get('protocol', '')

    if protocol and protocol in self.optional_params.get(
            'ignore_protocols', []):
        return None

    # remap for the reader names without "Reader"
    _remap = {
        "OAI-PMH": "OaiPmh",
        "OGC": "Ogc",
        "UNIDATA": "Thredds",
        "ISO": "Iso",
        "RDF": "Rdf",
        "FGDC": "FgdcItem"
    }

    # this is bad naming, but if the protocol value is
    # a key, get the value otherwise just hang onto it
    protocol = _remap.get(protocol, protocol)

    # we're going to pull the class name trick for the set
    # that is straightforward and hope not many are wonky
    protocol = 'Xml' if self.optional_params.get('parse_as_xml', False) \
        and not protocol else protocol

    # see if it's a loaded object
    try:
        reader_class = getattr(sys.modules[__name__], protocol + 'Reader')
    except AttributeError:
        # if it's not, we can't parse anyway
        return None

    if reader_class.__name__ in [
            'OpenSearchReader',
            'OaiPmhReader',
            'ThreddsReader',
            'IsoReader',
            'XmlReader',
            'OgcReader',
            'RdfReader']:
        # it's the standard processor init
        return reader_class(identity, response, url, harvest_details)
    elif reader_class.__name__ == 'FgdcItemReader':
        # TODO: this is baked into the others,
        #       we should take care of that
        parser = Parser(response)
        # TODO: don't forget to handle the harvest date
        return reader_class(parser.xml, url, harvest_details)

    return None
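# The getattr(sys.modules[__name__], name) lookup above resolves a reader
# class whose name is built at runtime; a minimal sketch with a hypothetical
# DemoReader standing in for the real reader classes:
import sys

class DemoReader(object):
    def __init__(self, identity, response, url, harvest_details):
        self.identity = identity

reader_class = getattr(sys.modules[__name__], 'Demo' + 'Reader')
reader = reader_class({}, '<xml/>', 'http://example.com', {})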
def __init__(self, text, handle_html=False, include_html_hrefs=False):
    self.text = text
    self.parser = Parser(text)
    self.handle_html = handle_html
    self.include_html_hrefs = include_html_hrefs