def deduplicate_sentences(raws):
    """Remove consecutive duplicate sentences from a period-delimited string."""
    sentences = raws.split(u'.')
    sentences = [normalize_space(sentence) for sentence in sentences
                 if len(normalize_space(sentence)) > 0]
    good_sentences = []
    prev_sentence = u''
    for sentence in sentences:
        if prev_sentence == sentence:
            # exact duplicate of the previous sentence: drop it
            pass
        elif len(prev_sentence) > 0 and len(prev_sentence) < len(sentence):
            # if this sentence merely extends the previous one, keep only the
            # longer version
            if sentence[:len(prev_sentence)] == prev_sentence:
                good_sentences = good_sentences[:-1]
            good_sentences.append(sentence)
        else:
            good_sentences.append(sentence)
        prev_sentence = sentence
    return u'. '.join(good_sentences)
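# Illustrative behavior (a doctest-style sketch, not from the original source):
# exact repeats collapse, and a sentence that merely extends its predecessor
# replaces it. Note that the trailing full stop is not restored.
#
#   >>> deduplicate_sentences(u'New issue. New issue. Table of contents follows.')
#   u'New issue. Table of contents follows'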
def _get_language(self, *args):
    logger = logging.getLogger(sys._getframe().f_code.co_name)
    chunks = [chunk for chunk in args if chunk is not None]
    s = normalize_space(u' '.join(chunks))
    logger.debug('s: "{0}"'.format(s.encode('utf-8')))
    if s != u'':
        language = LANGUAGE_IDENTIFIER.classify(s)
        logger.debug(repr(language))
        if language[1] >= LANGID_THRESHOLD:
            return language[0]
    return None
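# Illustrative call (sketch), assuming LANGUAGE_IDENTIFIER wraps a
# langid.py-style classifier whose classify() returns a
# (language_code, probability) tuple; only results at or above
# LANGID_THRESHOLD (0.95) are trusted:
#
#   >>> LANGUAGE_IDENTIFIER.classify(u'Revue des études anciennes')
#   ('fr', 0.99)   # approximate score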
def _parse_peeps(self, rx_list, content_text):
    cooked = []
    raw = u''
    for rx in rx_list:
        m = rx.search(content_text)
        if m:
            raw = m.groups()[-1]
            break
    if len(raw) > 0:
        if u',' in raw:
            cracked = raw.split(u',')
        else:
            cracked = [raw]
        for chunk in cracked:
            if u' and ' in chunk:
                cooked.extend(chunk.split(u' and '))
            else:
                cooked.append(chunk)
        cooked = [normalize_space(peep) for peep in cooked
                  if len(normalize_space(peep)) > 0]
    return cooked
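# Illustrative parse (sketch; the regex and the names are hypothetical). From
# inside the class, a byline regex that captures the trailing name list is
# split on commas and on u' and ':
#
#   >>> rx = re.compile(r'^edited by (.*)$')
#   >>> self._parse_peeps([rx], u'edited by John Smith, Jane Doe and Wei Chen')
#   [u'John Smith', u'Jane Doe', u'Wei Chen']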
def deduplicate_lines(raws):
    """Remove consecutive duplicate lines from a newline-delimited string."""
    prev_line = u''
    good_lines = []
    lines = raws.split(u'\n')
    lines = [normalize_space(line) for line in lines
             if normalize_space(line) != u'']
    for line in lines:
        # compare on a lower-cased, punctuation-stripped "canary" of the line
        canary = RX_CANARY.sub(u'', line.lower())
        if canary != u'':
            prev_length = len(prev_line)
            if prev_line != u'' and prev_length < len(canary):
                toucan = unicode(canary[:prev_length])
            else:
                toucan = u''
            if prev_line == u'':
                good_lines.append(line)
            elif toucan == prev_line:
                # current line extends the previous one: replace it
                good_lines = good_lines[:-1]
                good_lines.append(line)
            elif canary != prev_line:
                good_lines.append(line)
            else:
                # duplicate of the previous line: drop it
                pass
        else:
            good_lines.append(line)
        prev_line = canary
    return normalize_space(u' '.join(good_lines))
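# Illustrative behavior (sketch): because comparison happens on the canary,
# repeats that differ only in case, spacing, or punctuation are still dropped,
# and the surviving lines are joined with single spaces:
#
#   >>> deduplicate_lines(u'AWOL Index\nawol index!\nSecond line')
#   u'AWOL Index Second line'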
def clean_string(raw):
    prepped = normalize_space(raw)
    if prepped == u'':
        return u''
    chopped = prepped.split(u'.')
    if len(chopped) > 2:
        cooked = u'.'.join(chopped[:2])
        i = 2
        # an earlier version truncated here:
        # while i < len(chopped) and len(cooked) < 40:  # why truncation?
        while i < len(chopped):
            cooked = cooked + u'.' + chopped[i]
            i += 1
    else:
        cooked = prepped
    # pairs are stripped when they wrap the whole string; single characters
    # are stripped from either end
    junk = [
        (u'(', u')'),
        (u'[', u']'),
        (u'{', u'}'),
        (u'"', u'"'),
        (u"'", u"'"),
        (u'<', u'>'),
        (u'«', u'»'),
        (u'‘', u'’'),
        (u'‚', u'‛'),
        (u'“', u'”'),
        (u'‟', u'„'),
        (u'‹', u'›'),
        (u'〟', u'"'),
        u'\\',
        u'/',
        u'|',
        u',',
        u';',
        u'-',
        u'.',
        u'_',
    ]
    for j in junk:
        try:
            if len(j) == 2:
                cooked = cooked[1:-1] if cooked[0] == j[0] and cooked[-1] == j[1] else cooked
            else:
                cooked = cooked[1:] if cooked[0] == j[0] else cooked
                cooked = cooked[:-1] if cooked[-1] == j[0] else cooked
            if cooked[0:4] == u'and ':
                cooked = cooked[4:]
        except IndexError:
            pass
        else:
            cooked = cooked.strip()
    return cooked
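# Illustrative behavior (sketch): one matched pair of wrapping delimiters is
# stripped, as is a leading u'and ':
#
#   >>> clean_string(u'(The AWOL Index)')
#   u'The AWOL Index'
#   >>> clean_string(u'and Open Access Journals')
#   u'Open Access Journals'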
def ukey(raw):
    """Reduce a string (or list of strings) to an order-insensitive key."""
    raw_type = type(raw)
    if raw_type == list:
        uraw = u' '.join([unicode(chunk) for chunk in raw])
    elif raw_type == str:
        uraw = unicode(raw)  # coerce to unicode for consistent handling
    elif raw_type == unicode:
        uraw = raw
    else:
        raise TypeError(
            u'ukey does not support arguments of type {0}'.format(raw_type))
    cooked = normalize_space(uraw)
    cooked = RX_PUNCTSTRIP.sub(u'', cooked)
    cooked = cooked.lower().split()
    cooked = list(set(cooked))
    cooked = u''.join(cooked)
    return cooked
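# Illustrative behavior (sketch, assuming RX_PUNCTSTRIP strips punctuation):
# keys are case-, order-, and punctuation-insensitive, so differently arranged
# titles compare equal. Equal word sets yield identical keys because two equal
# sets iterate in the same order within a single interpreter run:
#
#   >>> ukey(u'AWOL: The Ancient World Online') == ukey(u'the ancient world online - AWOL')
#   True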
def _nodesplain(self, node, gloss=u'', include_source=False):
    """Provide copious information about this XML node."""
    template = u"""
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    >>> NODESPLANATION <<<
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    type: {node_type}
    name: {name}
    xpath: /{xpath}
    attributes: {attributes}
    text: {text}
    gloss: {gloss}
    source: {source}
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
"""
    name = node.name
    try:
        text = normalize_space(u' '.join(
            [string for string in node.stripped_strings]))
    except AttributeError:
        text = u'None'
    try:
        attributes = pprint.pformat(node.attrs)
    except AttributeError:
        attributes = u'None'
    # reconstruct an XPath by counting same-named preceding siblings at each
    # level of the ancestor chain
    count = str(1 + len([t for t in node.previous_siblings if t.name == name]))
    path = ['{name}[{count}]'.format(name=name, count=count)]
    for parent in node.parents:
        if type(parent) != NavigableString:
            parent_name = parent.name
            count = str(1 + len(
                [t for t in parent.previous_siblings if t.name == parent_name]))
            path = ['{name}[{count}]'.format(name=parent_name, count=count)] + path
    root = [p for p in node.parents][1]
    params = {
        'node_type': type(node),
        'name': name,
        'xpath': '/'.join(path),
        'attributes': attributes,
        'text': text,
        'gloss': gloss,
        'source': root.prettify() if include_source else u''
    }
    return template.format(**params)
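# Typical use is diagnostic (sketch): dump the explanation to the log while
# debugging a parser, e.g.
#
#   logger.debug(self._nodesplain(node, gloss=u'unexpected anchor'))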
def _clean_keywords(self, raw_tags):
    tags = list(set(raw_tags))
    keywords = []
    for tag in tags:
        if tag == u'':
            pass
        elif u',' in tag:
            keywords.extend(tag.split(u','))
        else:
            keywords.append(tag)
    keywords = sorted(
        [normalize_space(kw) for kw in list(set(keywords))],
        key=lambda s: s.lower())
    for tag in keywords:
        if tag == tag.upper():
            pass
        elif tag.lower() in TITLE_SUBSTRING_TAGS.keys():
            pass
        elif tag != tag.lower():
            raise ValueError(
                u'keyword "{0}" lacks an appropriate entry in '
                u'awol_title_strings.csv'.format(tag))
    return list(set(keywords))
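# Illustrative behavior (sketch; tag values are hypothetical): comma-packed
# tags are split apart and deduplicated. A mixed-case tag with no entry in
# awol_title_strings.csv (via TITLE_SUBSTRING_TAGS) raises ValueError rather
# than passing through. The return order is not guaranteed, hence sorted():
#
#   >>> sorted(self._clean_keywords([u'epigraphy, papyrology', u'epigraphy']))
#   [u'epigraphy', u'papyrology']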
from isaw.awol.normalize_space import normalize_space
from isaw.awol.resource import Resource

PATH_CURRENT = os.path.dirname(os.path.abspath(__file__))

# Build a dictionary of format {<colon prefix>: <list of cols 2, 3, and 4>}
colon_prefix_csv = pkg_resources.resource_stream(
    'isaw.awol', 'awol_colon_prefixes.csv')
dreader = unicodecsv.DictReader(
    colon_prefix_csv,
    fieldnames=['col_pre', 'omit_post', 'strip_title', 'mul_res'],
    delimiter=',',
    quotechar='"')
COLON_PREFIXES = dict()
for row in dreader:
    COLON_PREFIXES.update({
        normalize_space(row['col_pre']).lower():
            [row['omit_post'], row['strip_title'], row['mul_res']]
    })
del dreader

DOMAINS_TO_IGNORE = ['draft.blogger.com']
DOMAINS_SECONDARY = ['ancientworldonline.blogspot.com']
LANGID_THRESHOLD = 0.95
RX_CANARY = re.compile(r'[\.,:!\"“„\;\-\s]+', re.IGNORECASE)
RX_NUMERICISH = re.compile(r'^a?n?d?\s*[\.,:!\"“„\;\-\s\d\(\)\[\]]+$', re.IGNORECASE)
RX_MATCH_DOMAIN = re.compile(r'^https?:\/\/([^/#]+)')
RX_IDENTIFIERS = {
    'issn': {
        'electronic': [
            re.compile(
                r'(electronic|e-|e‒|e–|e—|e|online|on-line|digital)([\s:]*issn[^\d]*[\dX-‒–—]{4}[-‒–—\s]?[\dX]{4})',
def digdigdig(this_node, first_node, stop_tags, skip_first_anchor, previous_urls):
    """Recursively harvest description text until a stop tag is reached."""
    node_type = type(this_node)
    node_name = this_node.name
    try:
        node_url = this_node.get('href')
    except AttributeError:
        node_url = ''
    if node_url is None:
        node_url = ''
    if '/' in node_url:
        # normalize directory-index URLs to their parent path
        chunks = node_url.split('/')
        if chunks[-1] in ['index.html', 'index.php', '', None]:
            node_url = '/'.join(chunks[:-1])
    results = []
    if (
        this_node != first_node
        and node_name in stop_tags
        and (
            node_name != 'a'
            or (
                'a' in stop_tags
                and node_name == 'a'
                and (
                    (skip_first_anchor and len(previous_urls) == 0)
                    or (
                        not skip_first_anchor
                        and len(previous_urls) > 0
                        and node_url != previous_urls[-1]
                    )
                )
            )
        )
    ):
        return (True, results)
    if node_name == 'a':
        previous_urls.append(node_url)
    try:
        previous_text = normalize_space(this_node.previous_sibling.get_text())
    except AttributeError:
        previous_text = u''
    try:
        previous_last = previous_text[-1]
    except IndexError:
        previous_last = previous_text
    if node_name == 'br' and previous_last != u'.':
        # treat an unterminated line break as a sentence boundary
        results.append(u'. ')
    if node_type == NavigableString:
        results.append(unicode(this_node))
    else:
        try:
            descendants = this_node.descendants
        except AttributeError:
            pass
        else:
            if descendants is not None:
                for child in this_node.children:
                    stop, child_results = digdigdig(
                        child, first_node, stop_tags, skip_first_anchor,
                        previous_urls)
                    results.extend(child_results)
                    if stop:
                        return (stop, results)
    return (False, results)
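# Illustrative use (sketch; the fragment is hypothetical): harvest text from a
# small tree, stopping only at the given stop tags; the <br> not preceded by a
# period becomes a sentence break:
#
#   >>> from bs4 import BeautifulSoup
#   >>> frag = BeautifulSoup(u'<p>An open access journal<br/>Published in Vienna</p>', 'lxml')
#   >>> digdigdig(frag.p, frag.p, ['ul'], False, [])
#   (False, [u'An open access journal', u'. ', u'Published in Vienna'])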
def _get_description(self, context=None, title=u''):
    if context is None:
        c = self.content
        soup = c['soup']
        first_node = soup.body.contents[0]
        skip_first_anchor = True
    else:
        first_node = context
        skip_first_anchor = False

    def skiptomalou(first_node, stop_tags, skip_first_anchor):
        """Walk first_node and its following siblings, harvesting text with
        digdigdig() until a stop condition is met."""
        previous_urls = []
        stop, desc_lines = digdigdig(
            first_node, first_node, stop_tags, skip_first_anchor,
            previous_urls)
        node = first_node
        while True:
            node = node.next_sibling
            if node is None:
                break
            try:
                node_name = node.name
            except AttributeError:
                node_name = type(node)
            try:
                node_url = node.get('href')
            except AttributeError:
                node_url = ''
            if node_url is None:
                node_url = ''
            if '/' in node_url:
                chunks = node_url.split('/')
                if chunks[-1] in ['index.html', 'index.php', '', None]:
                    node_url = '/'.join(chunks[:-1])
            if (
                node_name in stop_tags
                and (
                    node_name != 'a'
                    or (
                        'a' in stop_tags
                        and node_name == 'a'
                        and (
                            (not skip_first_anchor
                             and len(previous_urls) == 0)
                            or (
                                skip_first_anchor
                                and len(previous_urls) > 0
                                and node_url != previous_urls[-1]
                            )
                        )
                    )
                )
            ):
                break
            if node_name == 'a':
                previous_urls.append(node_url)
            stop, results = digdigdig(
                node, first_node, stop_tags, skip_first_anchor,
                previous_urls)
            desc_lines.extend(results)
            if stop:
                break
        return desc_lines

    stop_tags = ['a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div', 'ol', 'ul',
                 'dl', 'dt', 'li', 'table']
    desc_lines = skiptomalou(first_node, stop_tags, skip_first_anchor)
    stop_tags = ['a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    if len(desc_lines) == 0:
        # nothing harvested: retry with the shorter stop list and no anchor skip
        desc_lines = skiptomalou(first_node, stop_tags, False)
    elif ukey(desc_lines) == ukey(title):
        # harvested text merely repeats the title: retry with the shorter stop list
        desc_lines = skiptomalou(first_node, stop_tags, skip_first_anchor)
    if len(desc_lines) == 0:
        desc_text = None
    else:
        desc_text = deduplicate_lines(u'\n'.join(desc_lines))
        if len(desc_text) == 0:
            desc_text = None
        else:
            desc_text = desc_text.replace(u'%IMAGEREPLACED%', u'').strip()
            desc_text = RX_PUNCT_FIX.sub(r'\1', desc_text)
            desc_text = deduplicate_sentences(desc_text)
            desc_text = RX_PUNCT_DEDUPE.sub(r'\1', desc_text)
            desc_text = normalize_space(desc_text)
            if len(desc_text) == 0:
                desc_text = None
            elif desc_text[-1] != u'.':
                desc_text += u'.'
    return desc_text
def _get_resources(self, article):
    """Override basic resource extraction."""
    if article.url == u'http://ancientworldonline.blogspot.com/2015/05/universitat-wien-theses-and.html':
        resources = []
        relatives = []
        soup = article.soup
        people = soup.find_all('span', 'person_name')
        for person in people:
            a = person.find_next_sibling('a')
            # description runs from the node after the anchor's <br> up to
            # (but not including) the next person_name span, if any
            foo = a.find_next_sibling('br').next_sibling
            try:
                bar = a.find_next_sibling('span', 'person_name').previous_sibling
            except AttributeError:
                bar = None
            description = u''
            while foo is not None and foo != bar:
                if type(foo) == NavigableString:
                    description += u'{0} '.format(clean_string(unicode(foo)))
                else:
                    description += u'{0} '.format(clean_string(foo.get_text()))
                foo = foo.next_sibling
            if description.strip() == u'':
                description = None
            else:
                description = normalize_space(u'. '.join(
                    [chunk.strip() for chunk in description.split(u'.')]))
            # the nearest preceding "subjects" anchor names a related resource;
            # this page is assumed always to provide one, so that rr is bound
            # before it is used below
            foosball = a.find_all_previous('a')
            foosball = [f for f in foosball if 'subjects' in f.get('href')]
            if len(foosball) > 0:
                f = foosball[0]
                params = {
                    'domain': domain_from_url(f.get('href')),
                    'keywords': self._parse_keywords(
                        resource_title=clean_string(f.get_text())),
                    'languages': self._get_language(clean_string(f.get_text())),
                    'title': clean_string(f.get_text()),
                    'url': f.get('href')
                }
                rr = self._make_resource(**params)
                self._set_provenance(rr, article)
                relatives.append(rr)
            params = {
                'authors': [clean_string(person.get_text())],
                'description': description,
                'domain': domain_from_url(a.get('href')),
                'keywords': self._parse_keywords(
                    post_title=rr.title,
                    resource_title=clean_string(a.get_text())),
                'languages': self._get_language(clean_string(a.get_text())),
                'title': clean_string(a.get_text()),
                'url': a.get('href'),
                'year': clean_string(unicode(person.next_sibling)),
            }
            resource = self._make_resource(**params)
            resource.related_resources.append(rr.package())
            self._set_provenance(resource, article)
            resources.append(resource)
        relative_urls = list(set([r.url for r in relatives]))
        unique_relatives = []
        for rurl in relative_urls:
            unique_relatives.append(
                [r for r in relatives if r.url == rurl][0])
        return resources + unique_relatives
    else:
        return AwolDomainParser._get_resources(self, article)
def _load_atom(self, atom_file_name):
    """Open atom file and parse for basic info.

    We attempt to set the following attributes on the class:

     * id (string): tag id for this atom entry
     * title (unicode): title of the original blog post
     * url (string): url for the original blog post
     * categories (list of dictionaries) with the following keys:
        * 'vocabulary' (string): captures "scheme" from the entry categories
        * 'term' (string): verbatim from the entry categories
     * content (unicode): normalized unicode string containing everything
       that was in the entry content (see normalization comments below)
     * soup (bs4 BeautifulSoup object): html-parsed version of content

    All strings are space-normalized (i.e., all contiguous spans of
    whitespace are collapsed to a single space and the resulting string is
    stripped of leading and trailing whitespace). All unicode strings (title
    and content) are converted to Normalization Form "C" (canonical
    composition).
    """
    logger = logging.getLogger(sys._getframe().f_code.co_name)
    with open(atom_file_name, 'r') as file_object:
        self.doc = exml.parse(file_object)
    self.root = self.doc.getroot()
    root = self.root
    self.id = root.find('{http://www.w3.org/2005/Atom}id').text.strip()

    # title of blog post should be same as title of atom entry
    raw_title = root.find('{http://www.w3.org/2005/Atom}title').text
    if raw_title is None:
        msg = 'could not extract blog post title for article with id: "{0}"'.format(self.id)
        raise RuntimeWarning(msg)
    self.title = purify_text(
        normalize_space(unicodedata.normalize('NFC', unicode(raw_title))))

    # get url of blog post (html alternate)
    try:
        raw_url = root.xpath(
            "//*[local-name()='link' and @rel='alternate']")[0].get('href')
    except IndexError:
        msg = 'could not extract blog post URL for article with id: "{0}"'.format(self.id)
        raise RuntimeError(msg)
    else:
        try:
            raw_url = normalize_space(
                unicodedata.normalize('NFC', unicode(raw_url)))
        except TypeError:
            msg = 'could not normalize blog post URL for article with id: "{0}"'.format(self.id)
            raise RuntimeError(msg)
        else:
            if urls.valid(raw_url):
                self.url = raw_url
            else:
                msg = 'invalid blog post URL ({0}) for article with id: "{1}"'.format(raw_url, self.id)
                raise RuntimeError(msg)

    # capture categories as vocabulary terms
    self.categories = [
        {
            'vocabulary': c.get('scheme'),
            'term': normalize_space(
                unicodedata.normalize('NFC', unicode(c.get('term'))))
        }
        for c in root.findall('{http://www.w3.org/2005/Atom}category')
    ]

    # extract content, normalize, and parse as HTML for later use
    raw_content = root.find('{http://www.w3.org/2005/Atom}content').text
    soup = BeautifulSoup(
        raw_content, 'lxml')  # mainly to convert character entities to unicode
    soup_content = unicode(soup)
    del soup
    content = unicodedata.normalize('NFC', soup_content)
    del soup_content
    content = normalize_space(content)
    content = purify_html(content)  # get rid of all manner of evil, stupid stuff
    self.content = content
    try:
        html = exml.fromstring(content, XML_PARSER)
    except XMLSyntaxError:
        msg = ('XMLSyntaxError while trying to parse content of {0}; trying '
               'html5lib parser with BeautifulSoup and then lxml parser with '
               'recover=True'.format(atom_file_name))
        logger.warning(msg)
        soup = BeautifulSoup(raw_content, 'html5lib')
        soup_content = unicode(soup)
        del soup
        content = unicodedata.normalize('NFC', soup_content)
        del soup_content
        content = normalize_space(content)
        content = purify_html(content)
        self.content = content
        try:
            html = exml.fromstring(content, XML_PARSER_LENIENT)
        except XMLSyntaxError:
            msg = ('XMLSyntaxError while trying to re-parse content of {0} '
                   'using html5lib parser with BeautifulSoup'.format(atom_file_name))
            logger.error(msg)
            logger.error(content)
            sys.exit(-1000)
    transform = exml.XSLT(XSL_CLEANUP)
    clean_html = transform(html)
    self.soup = BeautifulSoup(exml.tostring(clean_html), 'lxml')
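# Typical use after parsing (sketch; Article is a stand-in for whatever class
# hosts these methods, and the file name is hypothetical):
#
#   article = Article()
#   article._load_atom('entry-1234.atom')
#   print(article.title)
#   print(article.url)
#   article.soup.find_all('a')   # parsed, cleaned HTML content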