def get_response_from_cache(self, service, raw_request, channel_item, channel_params, wsgi_environ,
    _loads=loads, _CachedResponse=_CachedResponse, _HashCtx=_HashCtx, _sha256=sha256,
    split_re=regex_compile('........?').findall):
    """ Returns a cached response for the incoming request or None if there is nothing cached for it.
    By default, an incoming request's hash is calculated by sha256 over a concatenation of:
      * WSGI REQUEST_METHOD   # E.g. GET or POST
      * WSGI PATH_INFO        # E.g. /my/api
      * sorted(zato.http.GET) # E.g. ?foo=123&bar=456 (query string aka channel_params)
      * payload bytes         # E.g. '{"customer_id":"123"}' - a string object, before parsing
    Note that the query string is sorted, which means that ?foo=123&bar=456 is equal to ?bar=456&foo=123,
    that is, the order of parameters in the query string does not matter.
    """
    if service.get_request_hash:
        hash_value = service.get_request_hash(_HashCtx(raw_request, channel_item, channel_params, wsgi_environ))
    else:
        query_string = str(sorted(channel_params.items()))
        data = '%s%s%s%s' % (wsgi_environ['REQUEST_METHOD'], wsgi_environ['PATH_INFO'], query_string, raw_request)
        hash_value = _sha256(data).hexdigest()
        hash_value = '-'.join(split_re(hash_value))

    # No matter whether the hash value is the default one or comes from the service, always prefix it with the channel's type and ID
    cache_key = 'http-channel-%s-%s' % (channel_item['id'], hash_value)

    # We have the key so now we can check if there is any matching response already stored in the cache
    response = self.server.get_from_cache(channel_item['cache_type'], channel_item['cache_name'], cache_key)

    # If there is any response, we can now load it into the format that our callers expect
    if response:
        response = _loads(response)
        response = _CachedResponse(response['payload'], response['content_type'], response['headers'],
            response['status_code'])

    return cache_key, response
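# A minimal, hedged sketch of the default hashing path documented above.
# `build_default_cache_key` is a hypothetical stand-alone helper, not part of
# the source: it reproduces the concatenation and the cache-key prefix, but it
# encodes `data` explicitly (the original, Python 2-style code does not) and
# omits the dash-grouping of the digest.
from hashlib import sha256

def build_default_cache_key(channel_id, wsgi_environ, channel_params, raw_request):
    query_string = str(sorted(channel_params.items()))
    data = '%s%s%s%s' % (
        wsgi_environ['REQUEST_METHOD'], wsgi_environ['PATH_INFO'],
        query_string, raw_request)
    hash_value = sha256(data.encode('utf-8')).hexdigest()
    return 'http-channel-%s-%s' % (channel_id, hash_value)

# Example: GET /my/api?foo=123&bar=456 and GET /my/api?bar=456&foo=123
# produce the same key because the query string is sorted before hashing.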
def lists(self, pattern: str=None) -> List['WikiList']:
    """Return a list of WikiList objects.

    :param pattern: The starting pattern for list items.
        Return all types of lists (ol, ul, and dl) if pattern is None.
        If pattern is not None, it will be passed to the regex engine,
        so remember to escape the `*` character. Examples:

            - `\#` means top-level ordered lists
            - `\#\*` means unordered lists inside an ordered one
            - Currently definition lists are not well supported, but you can
              use `[:;]` as their pattern.

        Tips and tricks:

            Be careful when using the following patterns as they will
            probably cause malfunction in the `sublists` method of the
            resultant List. (However, don't worry about them if you are
            not going to use the `sublists` method.)

            - Use `\*+` as a pattern and nested unordered lists will be
              treated as flat.
            - Use `\*\s*` as a pattern to rstrip `items` of the list.

        Although the pattern parameter is optional, specifying it can
        improve the performance.
    """
    lists = []
    lststr = self._lststr
    type_to_spans = self._type_to_spans
    spans = type_to_spans.setdefault('WikiList', [])
    spans_append = spans.append
    span_tuple_to_span_get = {(s[0], s[1]): s for s in spans}.get
    patterns = ('\#', '\*', '[:;]') if pattern is None \
        else (pattern,)  # type: Tuple[str, ...]
    for pattern in patterns:
        list_regex = regex_compile(
            LIST_PATTERN_FORMAT.replace(b'{pattern}', pattern.encode()),
            MULTILINE,
        )
        ss = self._span[0]
        for m in list_regex.finditer(self._shadow):
            ms, me = m.span()
            span = [ss + ms, ss + me]
            old_span = span_tuple_to_span_get((span[0], span[1]))
            if old_span is None:
                spans_append(span)
            else:
                span = old_span
            lists.append(
                WikiList(
                    lststr, pattern, m, type_to_spans, span, 'WikiList'
                )
            )
    return lists
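# Hedged usage sketch for the lists() method above; assumes the package's
# public `parse` entry point and an illustrative piece of wikitext.
import wikitextparser as wtp

parsed = wtp.parse('# first\n# second\n#* nested item\n')
ordered = parsed.lists(pattern=r'\#')   # top-level ordered lists only
nested = parsed.lists(pattern=r'\#\*')  # unordered lists inside an ordered one
everything = parsed.lists()             # ol, ul, and dl when pattern is None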
class Piston(object): OPTION_PRIORITY_ORDER = ( ("phone_number", PhoneNumberMixin.PHONE_NUMBER_QUERY), ("ip_address", MaxmindMixin.IP_ADDRESS_QUERY), ("latitude_longitude", NominatimMixin.REVERSE_GEOCODE_QUERY), ("longitude_latitude", NominatimMixin.REVERSE_GEOCODE_QUERY), ("latitude", NominatimMixin.REVERSE_GEOCODE_QUERY), ("global", NominatimMixin.TYPICAL_GEOCODE_QUERY), ("subglobal", NominatimMixin.TYPICAL_GEOCODE_QUERY), ("local", NominatimMixin.TYPICAL_GEOCODE_QUERY), ("sublocal", NominatimMixin.TYPICAL_GEOCODE_QUERY), ("postcode", NominatimMixin.TYPICAL_GEOCODE_QUERY), ) LEGAL_CONFIGURATION_OPTIONS = tuple( option for option, _ in OPTION_PRIORITY_ORDER) + ("longitude", "unknown") @property def state(self): return (self.HITS.value, self.MISS.value, self.CONT.value, self.CODE.value, self.FUZZ.value, self.NULL.value, self.FAIL.value) @property def processed(self): return self.__processed.value @classmethod def spark(cls, directory='/', client=Ellipsis, configuration=None, nominatim_host=None, **kwargs): country_geocode = None region_geocode = None phone_geocode = None if configuration is None: if nominatim_host is None: raise ValueError("Cannot run without a configuration and a " "known Nominatim host address!") for dirname, _dirpath, filenames in walk(directory): if country_geocode is None and 'cgeo.json.xz' in filenames: country_geocode = join(dirname, 'cgeo.json.xz') if region_geocode is None and 'rgeo.json.xz' in filenames: region_geocode = join(dirname, 'rgeo.json.xz') if phone_geocode is None and 'pgeo.json.xz' in filenames: phone_geocode = join(dirname, 'pgeo.json.xz') if country_geocode and region_geocode and phone_geocode: break else: nominatim_host = nominatim_host or configuration.getNominatimURI() country_geocode = configuration.getNominatimCountryGeoJSON() region_geocode = configuration.getNominatimRegionGeoJSON() phone_geocode = configuration.getNominatimPhoneGeoJSON() nominatim_host = urlparse(nominatim_host) nominatim_host = "%s://%s/nominatim/" % (nominatim_host.scheme or 'http', nominatim_host.netloc or nominatim_host.path) if country_geocode and region_geocode: configuration = { '_country_geocode': country_geocode, '_region_geocode': region_geocode, 'verbose': ENV.get(ENV.VERBOSE, as_type=int) > 2 } if phone_geocode: configuration['_phone_geocode'] = phone_geocode kwargs.update(configuration) return cls(client, nominatim_host, **kwargs) raise ValueError("Cannot initialize geo.engine.Piston without " "country and region geocode mappings.") @classmethod def generate_field_mapping(cls, config): mapping = {} try: gindex = config.params.geo_index except: gindex = {} for index_type in cls.LEGAL_CONFIGURATION_OPTIONS: for index_field in gindex.get(index_type, ()): mapping[index_field] = index_type return mapping NS = regex_compile( r'[\p{script=Han}\p{script=Tibetan}\p{script=Lao}' r'\p{script=Thai}\p{script=Khmer}]', regex_U) NS = frozenset(NS.findall(u''.join(unichr(i) for i in xrange(maxunicode)))) # Hardcoded geo-string replacement values. 
HC = {} HC.update({ # Chinese main provinces province: province + u'\u7701' for province in { u'\u6cb3\u5317', u'\u5c71\u897f', u'\u8fbd\u5b81', u'\u5409\u6797', u'\u9ed1\u9f99\u6c5f', u'\u6c5f\u82cf', u'\u6d59\u6c5f', u'\u5b89\u5fbd', u'\u798f\u5efa', u'\u6c5f\u897f', u'\u6cb3\u5357', u'\u5c71\u4e1c', u'\u6e56\u5317', u'\u6e56\u5357', u'\u5e7f\u4e1c', u'\u6d77\u5357', u'\u56db\u5ddd', u'\u8d35\u5dde', u'\u4e91\u5357', u'\u7518\u8083', u'\u9752\u6d77', u'\u53f0\u6e7e', } }) HC[u'\u9655\u897f'] = HC[u'\u9655\u897f\u7701'] = "Shaanxi" # HATE HC[u'\u5c71\u897f'] = HC[u'\u5c71\u897f\u7701'] = "Shanxi" # HAATE HC.update({ # Chinese major cities city: city + u'\u5e02' for city in { u'\u5317\u4eac', u'\u5929\u6d25', u'\u4e0a\u6d77', u'\u91cd\u5e86', } }) HC.update({ # Chinese autonomous regions auto: auto + u'\u81ea\u6cbb\u533a' for auto in { u'\u5167\u8499\u53e4', u'\u5e7f\u897f\u58ee', u'\u897f\u85cf', u'\u5b81\u590f\u56de\u65cf', u'\u65b0\u7586\u7ef4\u543e\u5c14', } }) HC.update({ # Chinese "Special" administrative regions spec: spec + u'\u7279\u522b\u884c\u653f\u533a' for spec in { # u'\u9999\u6e2f', # This is Hong Kong u'\u6fb3\u95e8', } }) def remap_documents(self, document, mapping): information = {field_type: [] for field_type in mapping.values()} for field_name, field_type in mapping.iteritems(): information[field_type].append(document.get(field_name)) return self.remap_information(information) def remap_information(self, information): for field_type in information.iterkeys(): information[field_type] = normalize( 'NFKC', u' '.join( filter(None, (p.strip() if isinstance(p, basestring) else u'' for p in information[field_type])))) if self.NS.intersection(information[field_type]): information[field_type] = self.tokenizer( 'zh', information[field_type]) information[field_type] = self.HC.get(information[field_type], information[field_type]) for field_type, search_type in self.OPTION_PRIORITY_ORDER: geo_lookup = getattr(self, '_' + field_type)(information) if isinstance(geo_lookup, dict): return search_type, geo_lookup return NominatimMixin.TYPICAL_GEOCODE_QUERY, self._unknown(information) LL = regex_compile(r'[^\+\-\.0-9]+') def _latitude_longitude(self, information): try: information['latitude'], information['longitude'] = map( float, filter(None, self.LL.split(information['latitude_longitude']))[:2]) return self._latitude(information) except: return None def _longitude_latitude(self, information): try: information['longitude'], information['latitude'] = map( float, filter(None, self.LL.split(information['longitude_latitude']))[:2]) return self._latitude(information) except: return None def _latitude(self, information): try: lat = '%010.5f' % float(information['latitude']) lon = '%010.5f' % float(information['longitude']) return {'lat': lat, 'lon': lon, 'orig': '%s, %s' % (lat, lon)} except: return None IA = regex_compile(r'[^0-9a-fA-F\:\.]', regex_U) def _ip_address(self, information): try: addresses = filter( None, sum([ self.IA.split(ia) for ia in information['ip_address'].split() ], [])) return {'ip_address': addresses, 'orig': ' / '.join(addresses)} except: return None def _phone_number(self, information): try: return {'phone_number': information['phone_number'].split()} except: return None def __clean_carry_and_extra(self, carry=None, extra=None): if isinstance(carry, Mapping): carry = dict(carry) else: carry = {} if isinstance(extra, basestring): extra = (extra, ) elif isinstance(extra, Container): extra = tuple(extra) else: extra = tuple() return carry, extra CN = 
regex_compile(r'([\p{L}\p{N}]\P{Z}*[\p{L}\p{N}],?)', regex_U) def _global(self, information, carry=None, extra=None): try: carry, extra = self.__clean_carry_and_extra(carry, extra) global_ = u' '.join(match.group() for match in self.CN.finditer( information.get('global', u''))).strip() if global_: carry['country'] = global_ extra = (global_, ) + extra return self._subglobal(information, carry, extra) except: return None def _subglobal(self, information, carry=None, extra=None): try: carry, extra = self.__clean_carry_and_extra(carry, extra) subglobal_ = u' '.join(match.group() for match in self.CN.finditer( information.get('subglobal', u''))).strip() if subglobal_: carry['state'] = subglobal_ extra = (subglobal_, ) + extra return self._local(information, carry, extra) except: return None def _local(self, information, carry=None, extra=None): try: carry, extra = self.__clean_carry_and_extra(carry, extra) local_ = u' '.join(match.group() for match in self.CN.finditer( information.get('local', u''))).strip() if local_: carry['city'] = local_ extra = (local_, ) + extra return self._sublocal(information, carry, extra) except: return None def _sublocal(self, information, carry=None, extra=None): try: carry, extra = self.__clean_carry_and_extra(carry, extra) sublocal_ = u' '.join(match.group() for match in self.CN.finditer( information.get('sublocal', u''))).strip() if sublocal_: carry['street'] = sublocal_ extra = (sublocal_, ) + extra return self._postcode(information, carry, extra) except: return None def _postcode(self, information, carry=None, extra=None): try: carry, extra = self.__clean_carry_and_extra(carry, extra) postcode = u' '.join(match.group() for match in self.CN.finditer( information.get('postcode', u''))).strip() if postcode: carry['postalcode'] = postcode return self._unknown(information, carry, extra) except: return None def _unknown(self, information, carry=None, extra=None): try: carry, extra = self.__clean_carry_and_extra(carry, extra) unknown = u' '.join(match.group() for match in self.CN.finditer( information.get('unknown', u''))).strip() if unknown: extra = (unknown, ) + extra if extra: carry['q'] = u', '.join( OrderedDict.fromkeys(filter(None, extra))) carry['orig'] = carry['q'] if any(carry.values()): return carry else: return None except: return None @staticmethod def spawn_session(namespace, concurrency=4): session = Session() session.mount(prefix=namespace, adapter=HTTPAdapter(pool_connections=concurrency, pool_maxsize=concurrency * 2, max_retries=0, pool_block=False)) return session def __init__(self, client, nominatim_host, **kwargs): if isinstance(client, MongoClient): self.__client = client elif client is Ellipsis: warn("Ignoring invalid client -- cannot run jobs!", RuntimeWarning) else: raise TypeError("Nominatim must be started with a MongoClient!") self.__ns = kwargs['nominatim_host'] = nominatim_host warn("Geocoding against %s." 
% self.__ns, UserWarning) self.__map = CentroidUpdateHelper(**kwargs) self.__used = set() self.__cache = CacheDictionary(maxsize=kwargs.get('maxsize', 100000), weakref=False) self.__thread = kwargs.get('concurrent', 4) self.__session = self.spawn_session(namespace=self.__ns, concurrency=self.__thread) self.__processed = Value('i', 0, lock=False) self.__sleep = Value('f', 0.0, lock=True) self.tokenizer = LanguageTokenizer(concurrent=True) self.concurrent = kwargs.get('concurrent', 4) def session_fetch_function(self, url, **kwargs): return self.__session.get(url, timeout=5.0, **kwargs).content def restore_from_cache(self, subdomain, limit=None): if isinstance(limit, int): limit = int( max(min(limit, self.__cache.maxsize), CacheDictionary.CACHE_SIZE_MIN // 100)) else: limit = self.__cache.maxsize self.__cache = CacheDictionary(maxsize=self.__cache.maxsize, weakref=False) for doc in self.__client[subdomain][MC.CACHE_COL].find().sort( RO.LAST, DESCENDING).limit(limit): if 'value' in doc and doc['value']: search_query = doc.pop(RO.OBJECT_ID) self.__cache[CacheDictionary.gen_cache_key( search_query)] = doc['value'] def update_mongo_cache(self, subdomain): action = time() * 1000.0 bulk = self.__client[subdomain][ MC.CACHE_COL].initialize_unordered_bulk_op() for key in self.__used: if not key: continue _id = CacheDictionary.restore_cache_key(key) val = self.__cache.quiet_get(key) if val: bulk.find({ RO.OBJECT_ID: _id }).upsert().update({'$set': { 'value': val, RO.LAST: action }}) bulk.execute() def process(self, config, subdomain=None, pool_size=4, verbose=False): for _ in self.iterprocess(config, subdomain, pool_size, verbose): pass def _report_status_oneline(self, locked, runtime): stdout.write( "[% 9.3f] %d hits / %d misses / %d calls <--> " "%d coded (nom) / %d coded (idf) / %d empty <--> " "%04.2f sleep / %03.2f codes / %d total ~~ %d in iterlock.\r" % (time() - runtime, self.HITS.value, self.MISS.value, self.CONT.value, self.CODE.value, self.FUZZ.value, self.NULL.value, self.__sleep.value, float(self.HITS.value + self.MISS.value) / (time() - runtime), self.HITS.value + self.MISS.value + self.FAIL.value, locked)) stdout.flush() def _report_status_compact(self, locked, runtime): stdout.write( "[% 9.3f] %d in iterlock (%d processed, %.3f per second)\n" " Cache: %d hits / %d misses\n" " Result: %d results -> %d codified\n" " Network: %d calls (sleeping for %04.2f seconds)\n" " Timestamp: %s\n" % ( time() - runtime, locked, self.HITS.value + self.MISS.value + self.FAIL.value, float(self.HITS.value + self.MISS.value) / (time() - runtime), self.HITS.value, self.MISS.value, self.CODE.value, self.FUZZ.value, self.CONT.value, self.__sleep.value, ctime(), )) stdout.flush() def report_status(self, locked, runtime): return self._report_status_compact(locked, runtime) def iterprocess(self, config, subdomain=None, pool_size=4, verbose=False): if verbose: runtime = time() self.__processed = Value('i', 0, lock=True) self.HITS = Value('i', 0, lock=True) self.MISS = Value('i', 0, lock=True) self.CONT = Value('i', 0, lock=True) self.CODE = Value('i', 0, lock=True) self.FUZZ = Value('i', 0, lock=True) self.NULL = Value('i', 0, lock=True) self.FAIL = Value('i', 0, lock=True) subdomain = config['mongo_db'] if subdomain is None else subdomain if subdomain is Ellipsis: pass elif subdomain: if verbose: stdout.write( "[% 9.3f] Beginning retrieval of mongo cache...\n" % (time() - runtime)) self.restore_from_cache(subdomain=subdomain, limit=config.meta.counts.total) if verbose: stdout.write("[% 9.3f] Retrieval of mongo 
cache complete.\n" % (time() - runtime)) self._config = self.generate_field_mapping(config) pool = ThreadPool(min(max(int(pool_size), 1), self.concurrent)) bulk = self.__client[config.mongo_db][ config.mongo_table].initialize_unordered_bulk_op() locked = LockedIterator( self.__client[config.mongo_db][config.mongo_table].find( {}, projection={field: 1 for field in self._config}), lock_past=self.concurrent * 2150) if verbose: self.report_status(len(locked), runtime) last = time() for _id, geo, err in pool.imap_unordered(self.__process, locked): # for _id, geo, err in imap(self.__process, locked): locked -= 1 self.__processed.value += 1 if verbose and (time() - last) > 5.0: self.report_status(len(locked), runtime) last = time() if not _id or err is not None: stdout.write('\n%s\n' % err) yield err continue bulk.find({RO.OBJECT_ID: _id}).update_one({'$set': {DF.geo: geo}}) yield None if verbose: self.report_status(len(locked), runtime) stdout.write("\n[% 9.3f] Geocoding complete.\n" % (time() - runtime)) pool.close() pool.join() if verbose: stdout.flush() stdout.write("[% 9.3f] Subthreads joined.\n" % (time() - runtime)) bulk.execute() if verbose: stdout.write("[% 9.3f] Bulk insertion of results complete.\n" % (time() - runtime)) if subdomain is Ellipsis: pass elif subdomain: self.update_mongo_cache(subdomain=subdomain) if verbose: stdout.write( "[% 9.3f] Bulk update of mongo cache complete.\n" % (time() - runtime)) yield None def iterprocess_streaming(self, docs, model, subdomain=None, pool_size=4, verbose=False): if verbose: runtime = time() self.__processed = Value('i', 0, lock=True) self.HITS = Value('i', 0, lock=True) self.MISS = Value('i', 0, lock=True) self.CONT = Value('i', 0, lock=True) self.CODE = Value('i', 0, lock=True) self.FUZZ = Value('i', 0, lock=True) self.NULL = Value('i', 0, lock=True) self.FAIL = Value('i', 0, lock=True) if subdomain: if verbose: stdout.write( "[% 9.3f] Beginning retrieval of mongo cache...\n" % (time() - runtime)) self.restore_from_cache(subdomain=subdomain, limit=10000) if verbose: stdout.write("[% 9.3f] Retrieval of mongo cache complete.\n" % (time() - runtime)) self._config = self.generate_field_mapping(model) pool = ThreadPool(min(max(int(pool_size), 1), self.concurrent)) #bulk = self.__client[config.mongo_db][ # config.mongo_table].initialize_unordered_bulk_op() locked = LockedIterator(docs, lock_past=self.concurrent * 2150) if verbose: self.report_status(len(locked), runtime) last = time() for _id, geo, err in pool.imap_unordered(self.__process, locked): # for _id, geo, err in imap(self.__process, locked): locked -= 1 self.__processed.value += 1 if verbose and (time() - last) > 5.0: self.report_status(len(locked), runtime) last = time() if not _id or err is not None: stdout.write('\n%s\n' % err) #yield err continue yield (_id, geo) if verbose: self.report_status(len(locked), runtime) stdout.write("\n[% 9.3f] Geocoding complete.\n" % (time() - runtime)) pool.close() pool.join() if verbose: stdout.flush() stdout.write("[% 9.3f] Subthreads joined.\n" % (time() - runtime)) def fire(self, query, type_=None): if type_ is None: type_, query = self.remap_information(query) return Stroke(host=self.__ns, query=query, assoc=self.__map, cache=self.__cache, search_type=type_, verbose=True, debug=False, _fetch_function=self.session_fetch_function, _catch_exceptions=(ConnectionError, Timeout), _sleep=self.__sleep) def __process(self, dictionary): try: _id = dictionary[RO.OBJECT_ID] type_, query = self.remap_documents(dictionary, self._config) if isinstance(query, 
dict): geocode = self.fire(query, type_) if geocode.call_was_cached: with self.HITS.get_lock(): self.HITS.value += 1 self.__used.add(geocode.cache_key) else: with self.MISS.get_lock(): self.MISS.value += 1 with self.CONT.get_lock(): self.CONT.value += geocode.calls if geocode.result: result = geocode.result[0] else: result = {} if result.get('id') > 0 or result.get('full'): with self.CODE.get_lock(): self.CODE.value += 1 if result.get('code', {}).get('country'): with self.FUZZ.get_lock(): self.FUZZ.value += 1 else: with self.NULL.get_lock(): self.NULL.value += 1 if query.get('orig'): result['orig'] = query['orig'] else: with self.FAIL.get_lock(): self.FAIL.value += 1 result = {} return _id, result, None except: with self.FAIL.get_lock(): self.FAIL.value += 1 return None, {}, format_exc()
from datetime import date from urllib.parse import urlparse from regex import compile as regex_compile from requests import ConnectionError as RequestsConnectionError from lib.commons import dict_to_sfn_cit_ref from lib.urls import ( urls_sfn_cit_ref, url2dict, get_home_title, get_html, find_authors, find_journal, find_site_name, find_title, ContentTypeError, ContentLengthError, StatusCodeError, TITLE_TAG ) URL_FULLMATCH = regex_compile( r'https?+://web(?:-beta)?+\.archive\.org/(?:web/)?+' r'(\d{4})(\d{2})(\d{2})\d{6}(?>cs_|i(?>d_|m_)|js_)?+/(http.*)' ).fullmatch def waybackmachine_sfn_cit_ref( archive_url: str, date_format: str = '%Y-%m-%d' ) -> tuple: """Create the response namedtuple.""" m = URL_FULLMATCH(archive_url) if not m: # Could not parse the archive_url. Treat as an ordinary URL. return urls_sfn_cit_ref(archive_url, date_format) archive_year, archive_month, archive_day, original_url = \ m.groups() original_dict = {} thread = Thread(
"""Define the ExternalLink class.""" from typing import Optional from regex import compile as regex_compile from ._spans import VALID_EXTLINK_CHARS from ._wikitext import SubWikiText URL_MATCH = regex_compile(VALID_EXTLINK_CHARS).match class ExternalLink(SubWikiText): """Create a new ExternalLink object.""" @property def url(self) -> str: """Return the url.""" if self[0] == '[': return self[1:URL_MATCH(self._ext_link_shadow, 1).end()] return self.string @url.setter def url(self, newurl: str) -> None: """Set a new url.""" if self[0] == '[': self[1:len('[' + self.url)] = newurl else: self[0:len(self.url)] = newurl @property def text(self) -> Optional[str]:
"""Define the Argument class.""" from typing import Dict, List, MutableSequence, Optional, Union from regex import compile as regex_compile, MULTILINE, DOTALL from ._wikitext import SubWikiText, SECTION_HEADING ARG_SHADOW_FULLMATCH = regex_compile( rb'[|:](?<pre_eq>(?:[^=]*+(?:' + SECTION_HEADING + rb'\n)?+)*+)(?:\Z|(?<eq>=)(?<post_eq>.*+))', MULTILINE | DOTALL).fullmatch class Argument(SubWikiText): """Create a new Argument Object. Note that in MediaWiki documentation `arguments` are (also) called parameters. In this module the convention is: {{{parameter}}}, {{template|argument}}. See https://www.mediawiki.org/wiki/Help:Templates for more information. """ __slots__ = '_shadow_match_cache', '_parent' def __init__( self, string: Union[str, MutableSequence[str]], _type_to_spans: Optional[Dict[str, List[List[int]]]] = None, _span: Optional[List[int]] = None, _type: Optional[Union[str, int]] = None, _parent: 'SubWikiTextWithArgs' = None, ):
"""All things that are specifically related to adinebook website""" from collections import defaultdict from logging import getLogger from typing import Optional from langid import classify from regex import compile as regex_compile from requests import RequestException from mechanicalsoup import StatefulBrowser from lib.commons import first_last, dict_to_sfn_cit_ref, request, USER_AGENT,\ LANG ISBN_SEARCH = regex_compile(r'ISBN: </b> ([-\d]++)').search DATE_SEARCH = regex_compile( r'تاریخ نشر:</b>(?<year>\d\d)/(?<month>\d\d)/(?<day>\d\d)').search PUBLISHER_SEARCH = regex_compile( r'Publisher_ctl00_NameLabel" class="linkk">(.*?)</span>').search VOLUME_SEARCH = regex_compile(r'\bجلد (\d+)').search TITLE_SEARCH = regex_compile(r'BookTitle" class="h4">([^<]++)').search AUTHORS_FINDALL = regex_compile( r'rptAuthor_ctl\d\d_NameLabel" class="linkk">([^>:]++):([^<]++)<').findall LOCATION_SEARCH = regex_compile(r'محل نشر:</b>([^<]++)<').search def ketabir_scr(url: str, date_format='%Y-%m-%d') -> tuple: """Return the response namedtuple.""" dictionary = url2dictionary(url) dictionary['date_format'] = date_format if 'language' not in dictionary:
from typing import Optional from langid import classify from regex import compile as regex_compile, DOTALL from isbnlib import info as isbn_info from config import LANG from lib.ketabir import url2dictionary as ketabir_url2dictionary from lib.ketabir import isbn2url as ketabir_isbn2url from lib.bibtex import parse as bibtex_parse from lib.commons import dict_to_sfn_cit_ref, request, ISBN13_SEARCH, \ ISBN10_SEARCH from lib.ris import ris_parse OTTOBIB_SEARCH = regex_compile( '<textarea[^>]*+>(.*?)</textarea>', DOTALL, ).search RM_DASH_SPACE = str.maketrans('', '', '- ') class IsbnError(Exception): """Raise when bibliographic information is not available.""" pass def isbn_scr(isbn_container_str: str, pure: bool = False, date_format: str = '%Y-%m-%d') -> tuple: """Create the response namedtuple."""
# TAG_CONTENTS = r'(?<contents>(?>(?!{TAG}).)*?)'.format( # TAG=TAG.format(**locals()) # ) # TAG_FINDITER = regex_compile( # TAG.format(**locals()), flags=DOTALL | VERBOSE # ).finditer # Note that the following regex won't check for nested tags TAG_FULLMATCH = regex_compile( rb''' # Note that the start group does not include the > character <''' + ASCII_TAG_NAME + ATTR_PATTERN + rb'''* # Todo: Possessive? # After the attributes, or after the tag name if there are no attributes, # there may be one or more space characters. This is sometimes required but # ignored here. (?<attr_insert>) [''' + SPACE_CHARS + rb''']*+ (?> (?<self_closing>/\s*>) |>(?<contents>.*?)''' + END_TAG_PATTERN.replace(rb'{name}', rb'(?<end_name>(?P=name))') + rb'''|> # only start; no end tag )''', DOTALL | VERBOSE, ).fullmatch class SubWikiTextWithAttrs(SubWikiText): """Define a class for SubWikiText objects that have attributes. Any class that is going to inherit from SubWikiTextWithAttrs should provide
"""Define the Comment class.""" from typing import Dict, List, MutableSequence, Optional, Union from regex import MULTILINE, compile as regex_compile from ._wikitext import SubWikiText from ._spans import COMMENT_PATTERN COMMA_COMMENT = "'(?>" + COMMENT_PATTERN + ")*+" COMMENT_COMMA = "(?>" + COMMENT_PATTERN + ")*+'" BOLD_FULLMATCH = regex_compile( COMMA_COMMENT * 2 + "'(.*?)(?>'" + COMMENT_COMMA * 2 + "|$)", MULTILINE).fullmatch ITALIC_FULLMATCH = regex_compile(COMMA_COMMENT + "'(.*?)(?>'" + COMMENT_COMMA + "|$)").fullmatch ITALIC_NOEND_FULLMATCH = regex_compile(COMMA_COMMENT + "'(.*)").fullmatch class Comment(SubWikiText): __slots__ = () @property def contents(self) -> str: """Return contents of this comment.""" return self(4, -3) @property def comments(self) -> List['Comment']: return []
# jB_TO_NUM contains entries for both ی and ي jB_TO_NUM = { 'فروردین': 1, 'اردیبهشت': 2, 'خرداد': 3, 'تیر': 4, 'مرداد': 5, 'شهریور': 6, 'مهر': 7, 'آبان': 8, 'آذر': 9, 'دی': 10, 'بهمن': 11, 'اسفند': 12} DOUBLE_DIGIT_SEARCH = regex_compile(r'\d\d').search # Date patterns: # January|February... B = ( r''' (?<B>(?:J(?:anuary|u(?:ne|ly)) | February | Ma(?:rch|y) | A(?:pril|ugust) | (?:(?:(?:Sept|Nov|Dec)em)|Octo)ber))
RIS_FULLMATCH = regex_compile( r''' (?: # this group matches any line ^ (?> A[U\d]\ {2}-\ (?<author>.++) |DA\ {2}-\ \d++/(?<month>\d++).*+ |EP\ {2}-\ (?<end_page>.++) |IS\ {2}-\ (?<issue>.++) |J[FA]\ {2}-\ (?<journal>.++) |LA\ {2}-\ (?<language>.++) |P(?> B\ {2}-\ (?<publisher>.++) |Y\ {2}-\ (?<year>\d++).*+ ) |S(?> N\ {2}-\ (?<isbn>\S*+).*+ |P\ {2}-\ (?<start_page>.++) ) |T(?> [1I]\ {2}-\ (?<title>.++) |3\ {2}-\ (?<series>.++) |Y\ {2}-\ (?<type>.++) ) |UR\ {2}-\ (?<url>.++) |VL\ {2}-\ (?<volume>.++) |Y1\ {2}-\ (?<year>\d++).*+ # any other line |[^\n]*+ ) \n )* ''', VERBOSE | MULTILINE, ).fullmatch
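# A small, hedged illustration of how RIS_FULLMATCH above could be applied;
# the sample record is made up. With the `regex` module, repeated captures of
# a named group are all kept, so .captures('author') returns every AU line.
sample = (
    'TY  - JOUR\n'
    'AU  - Doe, Jane\n'
    'AU  - Roe, Richard\n'
    'T1  - An Example Title\n'
    'PY  - 2020///\n'
    'ER  - \n'
)
m = RIS_FULLMATCH(sample)
if m:
    authors = m.captures('author')  # ['Doe, Jane', 'Roe, Richard']
    title = m['title']              # 'An Example Title'
    year = m['year']                # '2020'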
"""Codes required to create English Wikipedia citation templates.""" from datetime import date as datetime_date from functools import partial from collections import defaultdict from logging import getLogger from regex import compile as regex_compile from lib.language import TO_TWO_LETTER_CODE # Includes ShortDOIs (See: http://shortdoi.org/) and # https://www.crossref.org/display-guidelines/ DOI_URL_MATCH = regex_compile(r'https?://(dx\.)?doi\.org/').match refless = partial(regex_compile( r'( \| ref=({{.*?}}|harv))(?P<repl> \| |}})' ).sub, r'\g<repl>') TYPE_TO_CITE = { # BibTex types. Descriptions are from # http://ctan.um.ac.ir/biblio/bibtex/base/btxdoc.pdf # A part of a book, which may be a chapter (or section or whatever) and/or # a range of pages. 'inbook': 'book', # A work that is printed and bound, but without a named publisher or # sponsoring institution. # Note: Yadkard does not currently support the `howpublished` option. 'booklet': 'book',
from langid import classify from regex import compile as regex_compile, DOTALL from config import LANG from lib.ketabir import url2dictionary as ketabir_url2dictionary from lib.ketabir import isbn2url as ketabir_isbn2url from lib.bibtex import parse as bibtex_parse from lib.commons import dict_to_sfn_cit_ref, request # , Name from lib.ris import parse as ris_parse # original regex from: # https://www.debuggex.com/r/0Npla56ipD5aeTr9 # https://www.debuggex.com/r/2s3Wld3CVCR1wKoZ ISBN_10OR13_SEARCH = regex_compile( r'97[89]([ -]?+)(?=\d{1,5}\1?+\d{1,7}\1?+\d{1,6}\1?+\d)(?:\d\1*){9}\d' r'|(?=\d{1,5}([ -]?+)\d{1,7}\1?+\d{1,6}\1?+\d)(?:\d\1*+){9}[\dX]' ).search ISBN10_SEARCH = regex_compile( r'(?=\d{1,5}([ -]?+)\d{1,7}\1?+\d{1,6}\1?+\d)(?:\d\1*+){9}[\dX]' ).search ISBN13_SEARCH = regex_compile( r'97[89]([ -]?+)(?=\d{1,5}\1?+\d{1,7}\1?+\d{1,6}\1?+\d)(?:\d\1*+){9}\d' ).search # original regex from: http://stackoverflow.com/a/14260708/2705757 # ISBN_REGEX = regex_compile( # r'(?=[-0-9 ]{17}|[-0-9X ]{13}|[0-9X]{10})(?:97[89][- ]?)' # r'?[0-9]{1,5}[- ]?(?:[0-9]+[- ]?){2}[0-9X]'
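# Hedged usage sketch of the ISBN regexes above, using the well-known example
# ISBNs; group 0 of each match is the hyphenated identifier as it appears.
assert ISBN13_SEARCH('ISBN 978-3-16-148410-0')[0] == '978-3-16-148410-0'
assert ISBN10_SEARCH('ISBN 0-306-40615-2')[0] == '0-306-40615-2'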
from regex import compile as regex_compile, VERBOSE, IGNORECASE from requests import Response as RequestsResponse from requests.exceptions import RequestException from lib.commons import ( find_any_date, dict_to_sfn_cit_ref, ANYDATE_PATTERN, request) from lib.urls_authors import find_authors, CONTENT_ATTR MAX_RESPONSE_LENGTH = 2000000 # https://stackoverflow.com/questions/3458217/how-to-use-regular-expression-to-match-the-charset-string-in-html CHARSET = regex_compile( rb''' <meta(?!\s*+(?>name|value)\s*+=)[^>]*?charset\s*+=[\s"']*+([^\s"'/>]*) ''', IGNORECASE | VERBOSE, ).search TITLE_META_NAME_OR_PROP = r''' (?>name|property)=(?<q>["\']) (?>citation_title|title|Headline|og:title) (?P=q) ''' TITLE_SEARCH = regex_compile( r'<meta\s++(?:' + TITLE_META_NAME_OR_PROP + r'\s++' + CONTENT_ATTR + '|' + CONTENT_ATTR + r'\s++' + TITLE_META_NAME_OR_PROP + ')' '|'
CAPTION_MATCH = regex_compile( r""" # Everything until the caption line (?P<preattrs> # Start of table {\| (?: (?: (?!\n\s*+\|) [\s\S] )*? ) # Start of caption line \n\s*+\|\+ ) # Optional caption attrs (?: (?P<attrs>[^\n|]*+) (?:\|) (?!\|) )? (?P<caption>.*?) # End of caption line (?: \n| \|\| ) """, VERBOSE ).match T = TypeVar('T')
"""Define the Section class.""" from regex import compile as regex_compile from ._wikitext import SubWikiText HEADER_MATCH = regex_compile(rb'(={1,6})([^\n]+?)\1[ \t]*(\n|\Z)').match class Section(SubWikiText): """Section class is used to represent page sections.""" _header_match_cache = (None, None) @property def _header_match(self): cached_match, cached_shadow = self._header_match_cache shadow = self._shadow if cached_shadow == shadow: return cached_match m = HEADER_MATCH(shadow) self._header_match_cache = m, shadow return m @property def level(self) -> int: """The level of this section. getter: Return level which as an int in range(1,7) or 0 for the lead section. setter: Change the level.
def test_end_tag_patterns(): assert regex_compile(END_TAG_PATTERN.replace( b'{name}', b'p')).search(b'</p>').groupdict() == { 'end_tag': b'</p>' }
RIS_FULLMATCH = regex_compile( r''' (?: # this group matches any line ^ (?> A[U\d]\ {2}-\ (?<author>.++) |DA\ {2}-\ \d++/(?<month>\d++).*+ |EP\ {2}-\ (?<end_page>.++) |IS\ {2}-\ (?<issue>.++) |J[FA]\ {2}-\ (?<journal>.++) |LA\ {2}-\ (?<language>.++) |P(?> B\ {2}-\ (?<publisher>.++) |Y\ {2}-\ (?<year>\d++).*+ ) |S(?> N\ {2}-\ (?<isbn>.++) |P\ {2}-\ (?<start_page>.++) ) |T(?> [1I]\ {2}-\ (?<title>.++) |3\ {2}-\ (?<series>.++) |Y\ {2}-\ (?<type>.++) ) |UR\ {2}-\ (?<url>.++) |VL\ {2}-\ (?<volume>.++) |Y1\ {2}-\ (?<year>\d++).*+ # any other line |[^\n]*+ ) \n )* ''', VERBOSE | MULTILINE, ).fullmatch
#! /usr/bin/python # -*- coding: utf-8 -*- """Test urls_authors.BYLINE_PATTERN.""" from regex import compile as regex_compile, VERBOSE, IGNORECASE from unittest import main, expectedFailure, TestCase from lib.urls_authors import byline_to_names, BYLINE_PATTERN BYLINE_PATTERN_REGEX = regex_compile( '^' + BYLINE_PATTERN + '$', IGNORECASE | VERBOSE ) class RegexTest(TestCase): """BYLINE_PATTERN should pass the following tests.""" def test_one_author(self): """http://www.defense.gov/News/NewsArticle.aspx?ID=18509""" text = 'By Jim Garamone' self.assertRegex(text, BYLINE_PATTERN_REGEX) def test_cap_names_joined_by_and(self): """Test two authors with and. Example: https://www.eff.org/deeplinks/2014/06/
#! /usr/bin/python # -*- coding: utf-8 -*- """Test urls_authors.BYLINE_PATTERN.""" from regex import compile as regex_compile, VERBOSE, IGNORECASE import unittest from src.urls_authors import byline_to_names, BYLINE_PATTERN BYLINE_PATTERN_REGEX = regex_compile('^' + BYLINE_PATTERN + '$', IGNORECASE | VERBOSE) class RegexTest(unittest.TestCase): """BYLINE_PATTERN should pass the following tests.""" def test_one_author(self): """http://www.defense.gov/News/NewsArticle.aspx?ID=18509""" text = 'By Jim Garamone' self.assertRegex(text, BYLINE_PATTERN_REGEX) def test_cap_names_joined_by_and(self): """Test two authors with and. Example: https://www.eff.org/deeplinks/2014/06/ sudan-tech-sanctions-harm-innovation-development-us-government-and- corporations-must-act Note the two consecutive spaces. """
# -*- coding: utf-8 -*- """Codes specifically related to PubMed inputs.""" from collections import defaultdict from config import NCBI_API_KEY, NCBI_EMAIL, NCBI_TOOL from datetime import datetime from logging import getLogger from threading import Thread from regex import compile as regex_compile from lib.commons import dict_to_sfn_cit_ref, b_TO_NUM, request from lib.doi import get_crossref_dict NON_DIGITS_SUB = regex_compile(r'[^\d]').sub NCBI_URL = ( 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?' 'api_key=' + NCBI_API_KEY + '&retmode=json&tool=' + NCBI_TOOL + '&email=' + NCBI_EMAIL) PUBMED_URL = NCBI_URL + '&db=pubmed&id=' PMC_URL = NCBI_URL + '&db=pmc&id=' class NCBIError(Exception): pass def pmid_sfn_cit_ref(pmid: str, date_format='%Y-%m-%d') -> tuple:
"""All things that are specifically related to adinebook website""" from collections import defaultdict from logging import getLogger from typing import Optional from langid import classify from regex import compile as regex_compile from requests import RequestException from mechanicalsoup import StatefulBrowser from lib.commons import first_last, dict_to_sfn_cit_ref, request, USER_AGENT,\ LANG ISBN_SEARCH = regex_compile(r'ISBN: </b> ([-\d]++)').search DATE_SEARCH = regex_compile( r'تاریخ نشر:</b>(?<year>\d\d)/(?<month>\d\d)/(?<day>\d\d)').search PUBLISHER_SEARCH = regex_compile( r'Publisher_ctl00_NameLabel" class="linkk">(.*?)</span>').search VOLUME_SEARCH = regex_compile(r'\bجلد (\d+)').search TITLE_SEARCH = regex_compile(r'BookTitle" class="h4">([^<]++)').search AUTHORS_FINDALL = regex_compile( r'rptAuthor_ctl\d\d_NameLabel" class="linkk">([^>:]++):([^<]++)<').findall LOCATION_SEARCH = regex_compile(r'محل نشر:</b>([^<]++)<').search def ketabir_sfn_cit_ref(url: str, date_format='%Y-%m-%d') -> tuple: """Return the response namedtuple.""" dictionary = url2dictionary(url) dictionary['date_format'] = date_format
# Todo: can the tags method be implemented using a TAG_FINDITER? Will # that be more performant? # TAG_FINDITER should not find any tag containing other tags. # TAG_CONTENTS = r'(?<contents>(?>(?!{TAG}).)*?)'.format( # TAG=TAG.format(**locals()) # ) # TAG_FINDITER = regex_compile( # TAG.format(**locals()), flags=DOTALL | VERBOSE # ).finditer # Note that the following regex won't check for nested tags TAG_FULLMATCH = regex_compile( rb''' <(?<name>[A-Za-z0-9]++)''' + ATTRS_PATTERN + rb''' [''' + SPACE_CHARS + rb''']*+ (?> (?<self_closing>/\s*>) |>(?<contents>.*)''' + END_TAG_PATTERN.replace( rb'{name}', rb'(?<end_name>[A-Za-z0-9]++)') + # noqa rb'''|> # only start; no end tag )''', DOTALL | VERBOSE).fullmatch class SubWikiTextWithAttrs(SubWikiText): """Define a class for SubWikiText objects that have attributes. Any class that is going to inherit from SubWikiTextWithAttrs should provide _attrs_match property. Note that matching should be done on shadow. It's usually a good idea to cache the _attrs_match property. """
class NominatimMixin(object): NS = regex_compile(r'[\p{script=Han}\p{script=Tibetan}\p{script=Lao}' r'\p{script=Thai}\p{script=Khmer}]', regex_U) NS = frozenset(NS.findall(u''.join(unichr(i) for i in xrange(maxunicode)))) # String geocode and reverse geocode operations provided by Nominatim TYPICAL_GEOCODE_QUERY = "geocode" TYPICAL_GEOCODE_SCRIPT = "search.php?" ATTEMPT_GEOCODE_ADJUST = regex_compile(r'[^\p{L}\p{N}\p{M},]', regex_U) CONSIDERATION_PRIORITY = ( 'street', 'postalcode', 'county', 'city', 'state', 'country') CONSIDERATION_ATTEMPTS = tuple( map(frozenset, ( ('country', 'state', 'city', 'street'), ('country', 'state', 'city', 'county'), ('country', 'postalcode'), ('country', 'state', 'city',), ('state', 'city'), ('country', 'state'), ('country', 'city'), ('state', 'county'), ('country', 'street'), ('state', 'street'), ('city', 'street'), ('county', 'street'), ('postalcode', 'street'), ('country',), ('state',), ('postalcode',), ('city',), ('street',), ))) BLACKLIST_PHRASES = frozenset([ "other", "n/a", "none", "unknown", "nowhere", "null", u'\u6d77\u5916', u'\u5176\u4ed6', u'\u5176\u5b83']) __slots__ = () # This class simply stores methods, no __dict__ needed. @staticmethod def __urlencode_query(params): return urlencode(sorted([(k.encode('utf-8'), v.encode('utf-8')) for k, v in params.items()])) def run_geocode(self, query, errors): body = self.fxn(self._ns + query) try: return loads(body.strip()) except: if "DB Error" in body: self.calls -= 1 wait = self._sleep.value * 10 if self._debug: stdout.write( "\nDetected PostgreSQL database " "error. Sleeping for %.2f seconds." "\nBad Request: %s\n" % (wait, query)) if "DB Error" in errors: self.calls += 1 errors.remove("DB Error") else: errors.add("DB Error") sleep(wait) elif "Internal Server Error" in body: if self._debug: stdout.write( "\nDetected Nominatim internal error." 
"\nBad Request: %s\n" % query) sleep(self._sleep.value) else: if self._debug: stdout.write( "\nEncountered unknown error.\n%s\n%s" "\nBad Request: %s\n" % (body, format_exc(), query)) sleep(self._sleep.value) raise NominatimResponseError( query, body, "Response was not legal JSON.") @classmethod def get_geocode(cls, query, attempt=0, juggle=False): for value in query.itervalues(): substrings = list(filter(None, value.lower().strip().split())) if len(substrings) > 2: continue for substring in substrings: for ignore in cls.BLACKLIST_PHRASES: if substring.startswith(ignore): return None params = {field: query[field] for field in cls.CONSIDERATION_PRIORITY if field in query and query[field]} if params: for keyset in cls.CONSIDERATION_ATTEMPTS: if keyset.difference(params): continue if attempt > 0: attempt -= 1 else: return {field: params[field] for field in keyset} if any(cls.NS.intersection(value) for value in query.itervalues()): left2right = 0 else: left2right = 1 if attempt > 0: if juggle: attempt += 1 if attempt % 2 == left2right: cut = slice(int(attempt // 2), None, None) else: cut = slice(None, -int(attempt // 2), None) else: if left2right: cut = slice(int(attempt), None, None) else: cut = slice(None, -int(attempt), None) else: cut = slice(None, None, None) if 'q' not in query or not query['q'].strip(): query['q'] = u', '.join( query[field].strip() for field in cls.CONSIDERATION_PRIORITY if field in query and query[field].strip()) query_input = query['q'] if u',' in query_input: split_input = u','.join(query_input.split(u',')[cut]).split() else: split_input = cls.ATTEMPT_GEOCODE_ADJUST.split(query_input)[cut] if split_input: return {"q": u' '.join(split_input).rstrip(u',')} else: return None def res_geocode(self, query, errors): query.update(self.arguments) return self.run_geocode(self.TYPICAL_GEOCODE_SCRIPT + self.__urlencode_query(query), errors) REVERSE_GEOCODE_QUERY = "reverse" REVERSE_GEOCODE_SCRIPT = "reverse.php?" @classmethod def get_reverse(cls, query, attempt=0): # For an explanation of recorded accuracy see: # https://en.wikipedia.org/wiki/Decimal_degrees#Precision if attempt == 0: return {"lat": query["lat"], "lon": query["lon"]} else: return None def res_reverse(self, query, errors): query.update(self.arguments) return self.run_geocode(self.REVERSE_GEOCODE_SCRIPT + self.__urlencode_query(query), errors)
PM_PF_TL_FINDITER = regex_compile( rb'\{\{' rb'(?>' # param rb'\{(?>[^{}]*+|}(?!})|{(?!{))*+\}\}\}()' rb'|' # parser function rb'\s*+' # generated pattern: _config.regex_pattern(_config._parser_functions) # with \#[^{}\s:]++ added manually. rb'(?>\#[^{}\s:]++|u(?>rlencode|c(?:first)?+)|s(?>ubst|afesubst)|raw|p(?>l' rb'ural|ad(?>right|left))|nse?+|msg(?:nw)?+|l(?>ocalurl|c(?:first)?+)|int|' rb'g(?>rammar|ender)|f(?>ullurl|ormatnum|ilepath)|canonicalurl|anchorencod' rb'e|TALK(?>SPACEE?+|PAGENAMEE?+)|SUB(?>PAGENAMEE?+|JECT(?>SPACEE?+|PAGENA' rb'MEE?+))|R(?>OOTPAGENAMEE?+|EVISION(?>YEAR|USER|TIMESTAMP|MONTH1?+|ID|DA' rb'Y2?+))|P(?>ROTECTION(?>LEVEL|EXPIRY)|AGE(?>SI(?>ZE|N(?>N(?>S|AMESPACE)|' rb'CAT(?:EGORY)?+))|NAMEE?+|ID))|N(?>UM(?>INGROUP|BER(?>OF(?>VIEWS|USERS|P' rb'AGES|FILES|EDITS|A(?>RTICLES|DMINS|CTIVEUSERS))|INGROUP))|AMESPACE(?>NU' rb'MBER|E)?+)|FULLPAGENAMEE?+|D(?>ISPLAYTITLE|EFAULT(?>SORT(?:KEY)?+|CATEG' rb'ORYSORT))|CASCADINGSOURCES|BASEPAGENAMEE?+|ARTICLE(?>SPACEE?+|PAGENAMEE' rb'?+))' # end of generated part rb':(?>[^{}]*+|}(?!})|{(?!{))*+\}\}()' rb'|' # invalid template name rb'[\s_]*+' # invalid name rb'(?:\|(?>[^{}]++|{(?!{)|}(?!}))*+)?+' # args rb'\}\}()' rb'|' # template rb'\s*+' + VALID_TITLE_CHARS_PATTERN + # template name rb'\s*+' rb'(?:\|(?>[^{}]++|{(?!{)|}(?!}))*+)?+' # args rb'\}\}' rb')').finditer
,?\ ++and\ {NAME_PATTERN}( ,\ {NAME_PATTERN}( ,\ {NAME_PATTERN} | ,?\ ++and\ {NAME_PATTERN} )? | ,?\ ++and\ {NAME_PATTERN}( ,\ {NAME_PATTERN} | ,?\ ++and\ {NAME_PATTERN} )? )? )?\s* '''.format_map(locals()) BYLINE_PATTERN_SEARCH = regex_compile(BYLINE_PATTERN, VERBOSE | IGNORECASE) NORMALIZE_ANDS = regex_compile(r'\s++and\s++', IGNORECASE).sub NORMALIZE_COMMA_SPACES = regex_compile(r'\s*+,\s++', IGNORECASE).sub BY_PREFIX = regex_compile( r''' ^(?: (?> [^b]++ | (?<!\b)b |b(?!y) )*+ \bby\s++ )? ([^\r\n]++)
"""Define the ExternalLink class.""" from typing import Optional, List from regex import compile as regex_compile from ._wikitext import SubWikiText, BRACKET_EXTERNAL_LINK_URL URL_MATCH = regex_compile(BRACKET_EXTERNAL_LINK_URL).match class ExternalLink(SubWikiText): __slots__ = () @property def url(self) -> str: """URL of the current ExternalLink object. getter: Return the URL. setter: Set a new value for URL. Convert add brackets for bare external links. """ if self(0) == '[': return self(1, URL_MATCH(self._ext_link_shadow, 1).end()) return self.string @url.setter def url(self, newurl: str) -> None: if self(0) == '[': self[1:len('[' + self.url)] = newurl
import logging from threading import Thread from datetime import date from urllib.parse import urlparse from regex import compile as regex_compile from requests import ConnectionError as RequestsConnectionError from lib.commons import dict_to_sfn_cit_ref from lib.urls import (urls_scr, url2dict, get_home_title, get_html, find_authors, find_journal, find_site_name, find_title, ContentTypeError, ContentLengthError, StatusCodeError, TITLE_TAG) URL_FULLMATCH = regex_compile( r'https?+://web(?:-beta)?+\.archive\.org/(?:web/)?+' r'(\d{4})(\d{2})(\d{2})\d{6}(?>cs_|i(?>d_|m_)|js_)?+/(http.*)').fullmatch def waybackmachine_scr(archive_url: str, date_format: str = '%Y-%m-%d') -> tuple: """Create the response namedtuple.""" m = URL_FULLMATCH(archive_url) if not m: # Could not parse the archive_url. Treat as an ordinary URL. return urls_scr(archive_url, date_format) archive_year, archive_month, archive_day, original_url = \ m.groups() original_dict = {} thread = Thread(target=original_url2dict, args=(original_url, original_dict))
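# Hedged illustration of the URL_FULLMATCH groups above; the snapshot URL is
# illustrative.
m = URL_FULLMATCH(
    'https://web.archive.org/web/20180101123456/http://example.com/page')
if m:
    archive_year, archive_month, archive_day, original_url = m.groups()
    # ('2018', '01', '01', 'http://example.com/page')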
jB_TO_NUM = { 'فروردین': 1, 'اردیبهشت': 2, 'خرداد': 3, 'تیر': 4, 'مرداد': 5, 'شهریور': 6, 'مهر': 7, 'آبان': 8, 'آذر': 9, 'دی': 10, 'بهمن': 11, 'اسفند': 12 } DOUBLE_DIGIT_SEARCH = regex_compile(r'\d\d').search # Date patterns: # January|February... B = (r''' (?<B>(?:J(?:anuary|u(?:ne|ly)) | February | Ma(?:rch|y) | A(?:pril|ugust) | (?:(?:(?:Sept|Nov|Dec)em)|Octo)ber)) ''')
#! /usr/bin/python
# -*- coding: utf-8 -*-

"""Codes specifically related to Noormags website."""

from threading import Thread

from regex import compile as regex_compile

from lib.commons import dict_to_sfn_cit_ref, request
from lib.bibtex import parse as bibtex_parse
from lib.ris import parse as ris_parse


BIBTEX_ARTICLE_ID_SEARCH = regex_compile(r'(?<=/citation/bibtex/)\d+').search
RIS_ARTICLE_ID_SEARCH = regex_compile(r'(?<=/citation/ris/)\d+').search


def noormags_sfn_cit_ref(url: str, date_format: str = '%Y-%m-%d') -> tuple:
    """Create the response namedtuple."""
    ris_collection = {}
    ris_thread = Thread(target=ris_fetcher_thread, args=(url, ris_collection))
    ris_thread.start()
    dictionary = bibtex_parse(get_bibtex(url))
    dictionary['date_format'] = date_format
    # The language parameter needs to be taken from the RIS data;
    # other information is more accurate in the bibtex data.
    # For example: http://www.noormags.ir/view/fa/articlepage/104040
    # "IS - 1" is wrong in RIS, but "number = { 45 }," is correct in bibtex.
    ris_thread.join()
    dictionary.update(ris_collection)
"""Define the functions required for parsing wikitext into spans.""" from typing import Dict, List, Callable, Any, Optional from regex import VERBOSE, IGNORECASE from regex import compile as regex_compile # According to https://www.mediawiki.org/wiki/Manual:$wgLegalTitleChars # illegal title characters are: r'[]{}|#<>[\u0000-\u0020]' VALID_TITLE_CHARS_PATTERN = rb'[^\x00-\x1f\|\{\}\[\]<>\n]++' # Templates TEMPLATE_FINDITER = regex_compile( rb'\{\{\s*+' # name + VALID_TITLE_CHARS_PATTERN + rb''' \s*+ (?>\|[^{}]*+)?+ # optional args \}\}''', VERBOSE, ).finditer INVALID_TL_NAME_FINDITER = regex_compile( rb''' \{\{ [\s_]*+ # invalid name (?>\|[^{}]*)?+ # optional args \}\} ''', VERBOSE, ).finditer # Parameters PARAMETER_FINDITER = regex_compile(
from ._spans import ATTRS_MATCH from ._tag import SubWikiTextWithAttrs from ._wikitext import WS CAPTION_MATCH = regex_compile( rb""" # Everything until the caption line (?P<preattrs> # Start of table {\| (?: (?: (?!\n\s*+\|) [\s\S] )*? ) # Start of caption line \n\s*+\|\+ ) # Optional caption attrs (?: (?P<attrs>[^\n|]*+) \|(?!\|) )? (?P<caption>.*?) (?:\n[\|\!]|\|\|) """, DOTALL | VERBOSE).match T = TypeVar('T') HEAD_DIGITS = regex_compile(rb'\s*+\d+').match
from regex import VERBOSE, IGNORECASE from regex import compile as regex_compile from ._config import (_parsable_tag_extensions, regex_pattern, _unparsable_tag_extensions, _bare_external_link_schemes, _parser_functions, _HTML_TAG_NAME) # According to https://www.mediawiki.org/wiki/Manual:$wgLegalTitleChars # illegal title characters are: r'[]{}|#<>[\u0000-\u0020]' VALID_TITLE_CHARS_PATTERN = rb'[^\x00-\x1f\|\{\}\[\]<>\n]++' # Parameters # Parser functions # According to https://www.mediawiki.org/wiki/Help:Magic_words # See also: # https://translatewiki.net/wiki/MediaWiki:Sp-translate-data-MagicWords/fa PARAMS_FINDITER = regex_compile( rb'\{\{\{(?>[^{}]*+|}(?!})|{(?!{))*+\}\}\}').finditer PF_TL_FINDITER = regex_compile(rb'\{\{' rb'(?>' # parser function rb'\s*+' rb'(?>\#[^{}\s:]++|' + regex_pattern(_parser_functions).encode()[3:] + # end of generated part rb':(?>[^{}]*+|}(?!})|{(?!{))*+\}\}()' rb'|' # invalid template name rb'[\s_]*+' # invalid name rb'(?:\|(?>[^{}]++|{(?!{)|}(?!}))*+)?+' # args rb'\}\}()' rb'|' # template
from langid import classify from regex import compile as regex_compile, DOTALL from config import LANG from lib.ketabir import url2dictionary as ketabir_url2dictionary from lib.ketabir import isbn2url as ketabir_isbn2url from lib.bibtex import parse as bibtex_parse from lib.commons import dict_to_sfn_cit_ref, request # , Name from lib.ris import parse as ris_parse # original regex from: # https://www.debuggex.com/r/0Npla56ipD5aeTr9 # https://www.debuggex.com/r/2s3Wld3CVCR1wKoZ ISBN_10OR13_SEARCH = regex_compile( r'97[89]([ -]?+)(?=\d{1,5}\1?+\d{1,7}\1?+\d{1,6}\1?+\d)(?:\d\1*){9}\d' r'|(?=\d{1,5}([ -]?+)\d{1,7}\1?+\d{1,6}\1?+\d)(?:\d\1*+){9}[\dX]').search ISBN10_SEARCH = regex_compile( r'(?=\d{1,5}([ -]?+)\d{1,7}\1?+\d{1,6}\1?+\d)(?:\d\1*+){9}[\dX]').search ISBN13_SEARCH = regex_compile( r'97[89]([ -]?+)(?=\d{1,5}\1?+\d{1,7}\1?+\d{1,6}\1?+\d)(?:\d\1*+){9}\d' ).search # original regex from: http://stackoverflow.com/a/14260708/2705757 # ISBN_REGEX = regex_compile( # r'(?=[-0-9 ]{17}|[-0-9X ]{13}|[0-9X]{10})(?:97[89][- ]?)' # r'?[0-9]{1,5}[- ]?(?:[0-9]+[- ]?){2}[0-9X]' # )
from urllib.parse import unquote from html import unescape from langid import classify from regex import compile as regex_compile, VERBOSE from lib.commons import dict_to_sfn_cit_ref, request from config import LANG # The regex is from: # http://stackoverflow.com/questions/27910/finding-a-doi-in-a-document-or-page DOI_SEARCH = regex_compile( r''' \b( 10\.[0-9]{4,}+ (?:\.[0-9]++)*+ /[^"&\'\s]++ )\b ''', VERBOSE, ).search def doi_scr(doi_or_url, pure=False, date_format='%Y-%m-%d') -> tuple: """Return the response namedtuple.""" if pure: doi = doi_or_url else: # unescape '&', '<', and '>' in doi_or_url # decode percent encodings decoded_url = unquote(unescape(doi_or_url)) doi = DOI_SEARCH(decoded_url)[1]
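# Hedged usage sketch of DOI_SEARCH above, using the DOI handbook's example
# identifier; group 1 holds the extracted DOI.
decoded = unquote(unescape('https://doi.org/10.1000/182'))
example_doi = DOI_SEARCH(decoded)[1]  # '10.1000/182'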
VALID_EXTLINK_CHARS, BARE_EXTLINK_SCHEMES_PATTERN, ) # External links (comment inclusive) BRACKET_EXTERNALLINK_PATTERN = ( rb'\[(?>//|' + BARE_EXTLINK_SCHEMES_PATTERN + rb')' + VALID_EXTLINK_CHARS + rb'\ *+[^\]\n]*+\]' ) BARE_EXTERNALLINK_PATTERN = ( rb'(?>' + BARE_EXTLINK_SCHEMES_PATTERN + rb')' + VALID_EXTLINK_CHARS ) EXTERNALLINK_FINDITER = regex_compile( rb'(?:' + BARE_EXTERNALLINK_PATTERN + rb'|' + BRACKET_EXTERNALLINK_PATTERN + rb')', IGNORECASE, ).finditer # Sections SECTIONS_FULLMATCH = regex_compile( rb''' (?<section>.*?) (?<section> ^(?<eq>={1,6})[^\n]+?(?P=eq)[ \t]*+$ # header .*? )* # todo: why can't be made possessive? ''', DOTALL | MULTILINE | VERBOSE, ).fullmatch
"""Define the Comment class.""" from typing import Dict, List, MutableSequence, Optional, Union from regex import compile as regex_compile from ._wikitext import SubWikiText from ._spans import COMMENT_PATTERN COMMA_COMMENT = "'(?>" + COMMENT_PATTERN + ")*+" COMMENT_COMMA = "(?>" + COMMENT_PATTERN + ")*+'" BOLD_FULLMATCH = regex_compile( COMMA_COMMENT * 2 + "'(.*)'" + COMMENT_COMMA * 2).fullmatch ITALIC_FULLMATCH = regex_compile( COMMA_COMMENT + "'(.*)'" + COMMENT_COMMA).fullmatch ITALIC_NOEND_FULLMATCH = regex_compile( COMMA_COMMENT + "'(.*)").fullmatch class Comment(SubWikiText): __slots__ = () @property def contents(self) -> str: """Return contents of this comment.""" return self(4, -3) @property def comments(self) -> List['Comment']: return []
def tags(self, name=None) -> List['Tag']: """Return all tags with the given name.""" lststr = self._lststr type_to_spans = self._type_to_spans if name: if name in _tag_extensions: string = lststr[0] return [ Tag(lststr, type_to_spans, span, 'ExtensionTag') for span in type_to_spans['ExtensionTag'] if string.startswith('<' + name, span[0]) ] tags = [] # type: List['Tag'] tags_append = tags.append else: # There is no name, add all extension tags. Before using shadow. tags = [ Tag(lststr, type_to_spans, span, 'ExtensionTag') for span in type_to_spans['ExtensionTag'] ] tags_append = tags.append # Get the left-most start tag, match it to right-most end tag # and so on. ss = self._span[0] shadow = self._shadow if name: # There is a name but it is not in TAG_EXTENSIONS. reversed_start_matches = reversed([m for m in regex_compile( START_TAG_PATTERN.replace( rb'{name}', rb'(?P<name>' + name.encode() + rb')' ) ).finditer(shadow)]) end_search = regex_compile(END_TAG_PATTERN .replace( b'{name}', name.encode() )).search else: reversed_start_matches = reversed( [m for m in START_TAG_FINDITER(shadow)] ) shadow_copy = shadow[:] spans = type_to_spans.setdefault('Tag', []) span_tuple_to_span_get = {(s[0], s[1]): s for s in spans}.get spans_append = spans.append for start_match in reversed_start_matches: if start_match['self_closing']: # Don't look for the end tag s, e = start_match.span() span = [ss + s, ss + e] else: # look for the end-tag if name: # the end_search is already available # noinspection PyUnboundLocalVariable end_match = end_search(shadow_copy, start_match.end()) else: # build end_search according to start tag name end_match = search( END_TAG_PATTERN.replace( b'{name}', start_match['name'] ), shadow_copy, ) if end_match: s, e = end_match.span() shadow_copy[s:e] = b'_' * (e - s) span = [ss + start_match.start(), ss + e] else: # Assume start-only tag. s, e = start_match.span() span = [ss + s, ss + e] old_span = span_tuple_to_span_get((span[0], span[1])) if old_span is None: spans_append(span) else: span = old_span tags_append(Tag(lststr, type_to_spans, span, 'Tag')) return tags
from pytest import mark from regex import compile as regex_compile, VERBOSE, IGNORECASE from lib.urls_authors import byline_to_names, BYLINE_PATTERN, \ BYLINE_TAG_FINDITER BYLINE_PATTERN_REGEX = regex_compile( fr'^{BYLINE_PATTERN}$', IGNORECASE | VERBOSE) def test_byline_pattern_one_author(): """http://www.defense.gov/News/NewsArticle.aspx?ID=18509""" assert BYLINE_PATTERN_REGEX.search('By Jim Garamone') def test_byline_pattern_cap_names_joined_by_and(): """Test two authors with and. Example: https://www.eff.org/deeplinks/2014/06/ sudan-tech-sanctions-harm-innovation-development-us-government-and- corporations-must-act Note the two consecutive spaces. """ assert BYLINE_PATTERN_REGEX.search('By Kimberly Carlson and Jillian York') def test_byline_pattern_four_authors():
#! /usr/bin/python
# -*- coding: utf-8 -*-

"""Codes specifically related to Noorlib website."""

from regex import compile as regex_compile

from lib.commons import dict_to_sfn_cit_ref, request
from lib.bibtex import parse as bibtex_parse


BIBTEX_ARTICLE_ID_SEARCH = regex_compile(
    r'(?<=CitationHandler\.ashx\?id=)\d+').search
RIS_ARTICLE_ID_SEARCH = regex_compile(r'(?<=RIS&id=)\d+').search


def noorlib_sfn_cit_ref(url: str, date_format: str = '%Y-%m-%d') -> tuple:
    """Create the response namedtuple."""
    dictionary = bibtex_parse(get_bibtex(url))
    dictionary['date_format'] = date_format
    # risr = get_ris(url)[1]
    # dictionary = risr.parse(ris)[1]
    return dict_to_sfn_cit_ref(dictionary)


def get_bibtex(noorlib_url):
    """Get BibTeX file content from a noorlib url. Return it as a string."""
    pagetext = request(noorlib_url).text
    article_id = BIBTEX_ARTICLE_ID_SEARCH(pagetext)[0]
    url = 'http://www.noorlib.ir/View/HttpHandler/CitationHandler.ashx?id=' +\
        article_id + '&format=BibTex'