def _strip_scales(self, text): # '1:100:000' scale_pttn = ur"(1:[\d]{0,}(,[\d]{3}){1,})" m = match(text, scale_pttn) if m: return '' return text
def _verify_urn(self, potential_urn): # not much point to this but we're just going to leave it pttn = re.compile( ur'^([a-z0-9.#]{0,}:([a-z0-9][a-z0-9.-]{0,31}):[a-z0-9A-Z_()+,-.:=@;$!*\'%/?#\[\]]+)', re.IGNORECASE ) m = match(potential_urn, pttn) return potential_urn if m else ''
def _verify_url(self, potential_url): # ugh if 'mailto:' in potential_url: return '' # is it a urn? pttn = re.compile( ur'^([a-z0-9]{0,}:[a-z0-9][a-z0-9-]{0,31}:[a-z0-9()+,\-.:=@;$_!*\'%/?#\[\]]+)', re.IGNORECASE ) urn_m = match(potential_url, pttn) if urn_m: return '' # does it even have a scheme? try: u = uparse(potential_url, rule='URI') if not u.get('scheme'): # will consider the leading blob of a URN # a scheme even if we don't so this is not # 100% reliable return '' parts = potential_url.split(':', 1) if not parts[1].startswith('/') or len(parts) < 2: # so if it's our false urn scheme, we # say it has to be :// or it's not a url # and it has to have a split return '' except: return '' pttn = re.compile(ur"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))""", re.IGNORECASE) m = match(potential_url, pttn) if not m: return '' return potential_url
def _extract_identifiers(self, text): # make sure it's not a date first text = self._strip_dates(text) if not text: yield '', '' for pattern_type, pattern in _pattern_set: m = match(text, pattern) if not m: continue # if the pattern type doesn't match the # verification ie urn is not actually urn # bounce (this is mostly for urls) urn_verified = self._verify_urn(m) is_urn = urn_verified != '' if urn_verified: yield 'urn', m if pattern_type == 'urn' and not is_urn: continue # NOTE: this should be a bit of dead twig code url_verified = self._verify_url(m) is_url = url_verified.strip() != '' if is_url and not is_urn: if 'dx.doi.org' in m: t = 'doi' elif 'hdl.handle.net' in m: t = 'hdl' else: t = 'url' yield t, m if pattern_type == 'url' and not is_url: continue m = self._tidy_text(m) if not m: continue yield pattern_type, m