def fetch_doggos():
    response = fetch('https://dog.ceo/api/breeds/list/all')
    breed_images = []
    for breed in response['message'].keys():
        url = f'https://dog.ceo/api/breed/{breed}/images/random'
        breed_images.append(fetch(url))
    return breed_images
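
# For reference, dog.ceo's list endpoint returns JSON shaped roughly like
# {"message": {"bulldog": ["boston", "english"], ...}, "status": "success"},
# so fetch_doggos() issues one extra request per breed. A hypothetical local
# check, assuming the fetch() helper returns parsed JSON:
#
#     images = fetch_doggos()
#     print(len(images), "random images, one per breed")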
def lambda_handler(event, context):
    for record in event['Records']:
        callback_url = record['messageAttributes']['callback_url']['stringValue']
        wd_page_id = record['messageAttributes']['page_id']['stringValue']
        wikidot_site = record['messageAttributes']['wikidot_site']['stringValue']

        data = {
            'pageId': wd_page_id,
            'moduleName': 'forum/ForumCommentsListModule'
        }
        haystack = helpers.fetch(data, wikidot_site)
        try:
            thread_id = re.search(r'(?:forumThreadId = )(\d*)', haystack).group(1)
        except AttributeError:
            # re.search() returns None here; this only really fails on a
            # deleted page.
            # TODO: Make SCUTTLE handle this.
            return False

        payload = {"wd_page_id": wd_page_id, "wd_thread_id": int(thread_id)}
        output = json.dumps(payload)

        # Send everything to SCUTTLE.
        headers = {
            "Authorization": "Bearer " + config.scuttle_token,
            "Content-Type": "application/json"
        }
        r = requests.put(callback_url + '/2stacks/page/thread',
                         data=output, headers=headers)
    return {'job': 'complete'}
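
# The handlers in this module are driven by SQS; a minimal, hypothetical test
# event matching the attributes read above looks like this:
sample_event = {
    'Records': [{
        'messageAttributes': {
            'callback_url': {'stringValue': 'https://scuttle.example/api'},
            'page_id': {'stringValue': '12345678'},
            'wikidot_site': {'stringValue': 'scp-wiki'}
        }
    }]
}
# lambda_handler(sample_event, None)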
def get_thread_page(thread: int, page: int, wikidot_site: str):
    data = {
        't': thread,
        'moduleName': 'forum/ForumViewThreadPostsModule',
        'pageNo': page
    }
    return helpers.fetch(data, wikidot_site)
def lambda_handler(event, context):
    for record in event['Records']:
        callback_url = record['messageAttributes']['callback_url']['stringValue']
        wd_page_id = record['messageAttributes']['page_id']['stringValue']
        wikidot_site = record['messageAttributes']['wikidot_site']['stringValue']
        logger.info(wd_page_id)

        data = {
            'pageId': wd_page_id,
            'moduleName': 'pagerate/WhoRatedPageModule'
        }
        try:
            haystack = helpers.fetch(data, wikidot_site)
        except Exception:
            # It gone.
            return {'job': 'article_deleted'}

        votes = re.findall(r'(?:#777">\n)(?:\s*)([12345+-])', haystack)
        user_ids = re.findall(r'(?:u=)([^\)]*)', haystack)
        usernames = re.findall(r'(?:alt=")([^"]*)', haystack)
        logger.info(str(len(votes)) + " votes found")

        if len(votes) > 0:
            innerpayload = {}
            for row in range(len(user_ids)):
                innerpayload[row] = {
                    'user_id': user_ids[row],
                    'username': usernames[row],
                    'vote': votes[row]
                }
            payload = {"wd_page_id": wd_page_id, "votes": innerpayload}
            output = json.dumps(payload)

            # Send everything to SCUTTLE.
            headers = {
                "Authorization": "Bearer " + config.scuttle_token,
                "Content-Type": "application/json"
            }
            r = requests.put(callback_url + '/2stacks/page/votes',
                             data=output, headers=headers)
            if r.status_code == 500:
                logger.info('500:')
                logger.info(r.text)
    return {'job': 'complete'}
def lambda_handler(event, context):
    for record in event['Records']:
        callback_url = record['messageAttributes']['callback_url']['stringValue']
        wd_revision_id = record['messageAttributes']['revision_id']['stringValue']
        wikidot_site = record['messageAttributes']['wikidot_site']['stringValue']
        wd_url = record['messageAttributes']['wikidot_url']['stringValue']

        data = {
            'revision_id': wd_revision_id,
            'moduleName': 'history/PageSourceModule'
        }
        haystack = helpers.fetch(data, wd_url)
        if haystack is None:
            return {'revision': 'deleted'}

        content = re.search(r'(?:<div class="page-source">)(.*)(?:<\/div>$)',
                            haystack, re.DOTALL).group(1)
        payload = {
            "wd_revision_id": str(wd_revision_id),
            "content": content
        }
        output = json.dumps(payload)

        # Send everything to SCUTTLE.
        headers = {
            "Authorization": "Bearer " + config.scuttle_token,
            "Content-Type": "application/json"
        }
        r = requests.put(callback_url + '/2stacks/revision/content',
                         data=output, headers=headers)
        # Compare with != here; `is not 200` only ever worked by accident of
        # small-int interning.
        if r.status_code != 200:
            raise ValueError("SCUTTLE isn't well. Returned " + str(r.status_code))
    return {'job': 'complete'}
def lambda_handler(event, context):
    for record in event['Records']:
        # We receive a payload from SCUTTLE with a wiki and the most recent
        # slug we have for it.
        callback_url = record['messageAttributes']['callback_url']['stringValue']
        wikidot_site = record['messageAttributes']['wikidot_site']['stringValue']
        wd_url = record['messageAttributes']['wikidot_url']['stringValue']
        slug = record['messageAttributes']['page_slug']['stringValue']

        # Get the 20 most recent pages.
        data = {
            'order': 'dateCreatedDesc',
            'moduleName': 'list/WikiPagesModule',
            'limit': 20,
            'preview': True
        }
        haystack = helpers.fetch(data, wd_url)

        # Get the slugs back.
        slugs = re.findall(r'(?:<a href="\/)([^"]*)', haystack)

        # If the most recent page slug matches the one SCUTTLE sent us, it
        # already knows about it; terminate.
        if slugs[0] == slug:
            return {'job': 'complete'}
        else:
            # Otherwise, let's get a stub together for SCUTTLE.
            pass
def lambda_handler(event, context):
    for record in event['Records']:
        callback_url = record['messageAttributes']['callback_url']['stringValue']
        wd_thread_id = record['messageAttributes']['thread_id']['stringValue']
        wikidot_site = record['messageAttributes']['wikidot_site']['stringValue']

        data = {'t': wd_thread_id, 'moduleName': 'forum/ForumViewThreadModule'}
        haystack = helpers.fetch(data, wikidot_site)

        # Do some stuff with the base thread.
        try:
            soup = BeautifulSoup(haystack, 'html.parser')
        except TypeError:
            # NoneType: it gone.
            return False

        # Send this to SCUTTLE.
        titleblock = soup.find("div", {"class": "forum-breadcrumbs"})
        forum = int(re.search(r'(?:\/forum\/c-)(\d*)', str(titleblock)).group(1))
        title = re.search(r'(?:» (?!<))(.*)', str(titleblock)).group(1)
        descriptionblock = soup.find("div", {"class": "description-block well"})

        # Get the subtitle, which is a surprising amount of effort: the
        # heading in front of it is localized per wiki.
        subtitle_heads = {
            'scp-ru': 'Кратко:',             # SCP-RU
            'lafundacionscp': 'Resumen:',    # SCP-ES
            'fondationscp': 'Résumé:',       # SCP-FR
            'scp-wiki-de': 'Beschreibung:',  # SCP-DE
        }
        # SCP-EN and English-speaking wikis use "Summary:".
        head = subtitle_heads.get(wikidot_site, 'Summary:')
        subtitle = re.findall(
            r'(?:<\/div>)(?:\s*<div class="head">' + head +
            r'<\/div>){0,1}([\s\S]*)(?:<\/div>)',
            str(descriptionblock), re.MULTILINE)
        subtitle = ''.join(subtitle)
        # Newlines and tabs are artifacts of scraping HTML and not valid in
        # subtitles.
        subtitle = subtitle.replace('\n', '').replace('\t', '')
        if len(subtitle) == 0:
            subtitle = None

        # Get the creation timestamp for convenience in sorting later.
        created_timestamp = int(
            re.search(r'(?:odate time_)(\d*)', str(descriptionblock)).group(1))

        # Get the OP of the thread. This is Wikidot for a per-page discussion
        # thread or a user id otherwise.
        attribution = descriptionblock.find("span", {"class": "printuser"})
        if attribution.string == "Wikidot":
            op_user_id = 0
            op_username = "******"
        else:
            try:
                op_user_id = int(
                    re.search(r'(?:userInfo\()(\d*)', str(attribution)).group(1))
                op_username = attribution.text
            except AttributeError:
                try:
                    # Deleted accounts.
                    op_user_id = int(
                        re.search(r'(?:data-id=")(\d*)', str(attribution)).group(1))
                    op_username = "******" + str(op_user_id) + ")"
                except AttributeError:
                    try:
                        # Anonymous accounts.
                        op_user_id = 0
                        op_username = "******" + str(
                            re.search(
                                '(?:anonymousUserInfo\(\')([\d\.]*)(?:\'\); return false;\"><)',
                                str(attribution)).group(1))
                    except AttributeError:
                        # Guest accounts.
                        op_user_id = 0
                        op_username = str(
                            re.search(r'(?:</a>)([^<]*)', str(attribution)).group(1))

        # What we should have back is HTML laying out a page of forum
        # comments. First, let's determine if there are multiple pages.
        try:
            maxpages = int(
                re.search(r'(?:<span class="pager-no">page \d* of )(\d*)',
                          haystack).group(1))
        except AttributeError:
            # NoneType means the pager is absent, meaning there's only one
            # page of comments. This is okay.
            maxpages = 1

        # Let's handle things the same way for one page or many. I'm too lazy
        # to not just increment this range by one to make it work.
        for page in range(maxpages):
            actualpage = page + 1
            innerpayload = {}
            haystack = get_thread_page(thread=wd_thread_id,
                                       page=actualpage,
                                       wikidot_site=wikidot_site)
            soup = BeautifulSoup(haystack.replace("\\", "")[2:], 'html.parser')
            posts = soup.find_all("div", id=re.compile("(fpc-)"))
            for idx, post in enumerate(posts):
                wd_post_id = int(
                    re.search(r'(?:<div class="post" id="post-)(\d*)',
                              str(post)).group(1))
                subject = re.search(
                    r'(?:<div class="title" id="post-title-\d*">\s*)([^\n]*)',
                    str(post)).group(1)
                # On a blank subject this returns as "</div>".
                if subject == "</div>":
                    subject = None
                try:
                    username = re.search(
                        r'(?:return false;">)([^<]*)(?:<\/a><\/span>,)',
                        str(post)).group(1)
                    wd_user_id = int(
                        re.search(
                            r'(?:www\.wikidot\.com\/userkarma.php\?u=)([^\)]*)',
                            str(post)).group(1))
                except AttributeError:
                    # NoneType: deleted user.
                    try:
                        wd_user_id = int(
                            re.search(r'(?:data-id=")(\d*)', str(post)).group(1))
                        username = "******" + str(wd_user_id)
                    except AttributeError:
                        # NoneType: anonymous user!
                        try:
                            wd_user_id = 0
                            username = "******" + str(
                                re.search(
                                    '(?:anonymousUserInfo\(\\\')([\d\.]*)',
                                    str(post)).group(1)) + ")"
                        except AttributeError:
                            # One last NoneType: GUEST user, holy crap.
                            try:
                                username = re.search(
                                    r'(?:alt=""/></a>)([^>]*)(?:</span>,)',
                                    str(post)).group(1)
                                wd_user_id = 0
                            except AttributeError:
                                # This is getting ridiculous. More guest
                                # account types.
                                try:
                                    username = re.search(
                                        r'(?:&default=http:\/\/www.wikidot.com/common--images/avatars/default/a16.png&size=16"\/><\/a>)([^>]*)(?:<\/span>,)',
                                        str(post)).group(1)
                                    wd_user_id = 0
                                except AttributeError:
                                    # Guest with a URL in their name.
                                    wd_user_id = 0
                                    tempusername = re.search(
                                        r'(?:rel="nofollow">)([^<]*)(?:<\/a> \(guest\))',
                                        str(post)).group(1)
                                    username = tempusername + " (guest"
                post_created_at = int(
                    re.search(r'(?:<span class="odate time_)([^\s]*)',
                              str(post)).group(1))
                content = post.find("div", {"class": "content"})
                body = ''.join(str(item) for item in content.contents)
                # Wikidot pads the text with a \n on both sides, which the
                # author didn't write.
                body = body[1:-1]
                try:
                    if post.parent['id'] == 'thread-container-posts':
                        # Top-level response.
                        parent = 0
                    else:
                        # 'id' will look like fpc-12345678; take a slice of
                        # the string.
                        parent = int(post.parent['id'][4:])
                except KeyError:
                    # We're at the root.
                    parent = 0
                changespresent = post.find("div", {"class": "revisions"})
                if changespresent is not None:
                    # This post was edited; send along a list of revisions and
                    # let those get picked up in a different routine. We're
                    # guaranteed at least two entries in here.
                    changes = re.findall(r'(?:showRevision\(event, )(\d*)',
                                         str(changespresent))
                else:
                    changes = False
                innerpayload[idx] = {
                    "wd_post_id": wd_post_id,
                    "wd_user_id": wd_user_id,
                    "parent_id": parent,
                    "subject": subject,
                    "username": username,
                    "timestamp": post_created_at,
                    "changes": changes,
                    "text": body
                }

            # While we could wait and send one big payload, that's a risky
            # proposition on threads with lots of posts, so let's not. Wrap
            # this page's payload and send it; SCUTTLE can sort out posts it
            # already has.
            outerpayload = {
                "wd_thread_id": int(wd_thread_id),
                "wd_forum_id": forum,
                "wd_user_id": op_user_id,
                "wd_username": op_username,
                "title": title,
                "subtitle": subtitle,
                "created_at": created_timestamp,
                "posts": innerpayload
            }

            # Send everything to SCUTTLE.
            output = json.dumps(outerpayload)
            headers = {
                "Authorization": "Bearer " + config.scuttle_token,
                "Content-Type": "application/json"
            }
            r = requests.put(callback_url + '/2stacks/thread/posts',
                             data=output, headers=headers)
    return {"job": "complete"}
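
# For reference, each PUT to /2stacks/thread/posts carries a body shaped like
# this (all values here are hypothetical):
example_thread_payload = {
    "wd_thread_id": 1234567,
    "wd_forum_id": 89,
    "wd_user_id": 123456,
    "wd_username": "ExampleAuthor",
    "title": "Example Thread",
    "subtitle": None,
    "created_at": 1500000000,
    "posts": {
        0: {
            "wd_post_id": 7654321,
            "wd_user_id": 123456,
            "parent_id": 0,
            "subject": None,
            "username": "ExampleAuthor",
            "timestamp": 1500000100,
            "changes": False,
            "text": "<p>An example comment.</p>"
        }
    }
}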
def lambda_handler(event, context):
    for record in event['Records']:
        callback_url = record['messageAttributes']['callback_url']['stringValue']
        user_id = record['messageAttributes']['user_id']['stringValue']
        wikidot_site = record['messageAttributes']['wikidot_site']['stringValue']

        # Get the basic info from Wikidot.
        data = {"user_id": user_id, 'moduleName': 'users/UserInfoWinModule'}
        response = helpers.fetch(data, wikidot_site)

        # Believe it or not, the next two patterns look for two different
        # things. Thanks, Wikidot. Both labels are localized per wiki.
        registration_labels = {
            'scp-ru': 'Wikidot.com с:',           # SCP-RU
            'lafundacionscp': 'desde:',           # SCP-ES
            'scp-pt-br': 'desde:',                # SCP-PT
            'fondationscp': 'depuis:',            # SCP-FR
            'scp-wiki-de': 'seit:',               # SCP-DE
            'scp-pl': 'Wikidot.com od:',          # SCP-PL
            'fondazionescp': 'Wikidot dal:',      # SCP-IT
            'scp-wiki-cn': '使用者始于:',           # SCP-CN
            'scpko': 'Wikidot.com 사용자 시작:',    # SCP-KO
        }
        # SCP-EN, English-speaking wikis, and a few translated sites
        # (-UA, -CS, ...) use "since:".
        label = registration_labels.get(wikidot_site, 'since:')
        wd_registration_timestamp = re.search(
            '(?:' + label + r')(?:\D*)(\d*)', response).group(1)

        membership_labels = {
            'scp-ru': 'сайта: с',                 # SCP-RU
            'lafundacionscp': ': desde',          # SCP-ES
            'scp-pt-br': ': desde',               # SCP-PT
            'fondationscp': ': depuis :',         # SCP-FR
            'scp-wiki-de': 'Site: seit',          # SCP-DE
            'scp-ukrainian': 'сайту: з',          # SCP-UA
            'scp-cs': 'Stránky: od',              # SCP-CS
            'scp-th': 'เป็นสมาชิกตั้งแต่',            # SCP-TH
            'scp-pl': 'projektu: od',             # SCP-PL
            'fondazionescp': 'sito: dal',         # SCP-IT
            'scp-wiki-cn': '本站成员:始于',          # SCP-CN
            'scpko': '이 사이트의 회원 시작 시간:',     # SCP-KO
        }
        # SCP-EN and English-speaking wikis use ": since".
        label = membership_labels.get(wikidot_site, ': since')
        try:
            wiki_membership_timestamp = re.search(
                '(?:' + label + r')(?:\D*)(\d*)', response).group(1)
        except AttributeError:
            # Altogether possible this user is no longer a member. We'll send
            # a boolean false.
            wiki_membership_timestamp = False

        username = re.search(r'(?:<h1>)(.*)(?:<\/h1>)', response).group(1)

        # Download the user's avatar as a file object.
        r_avatar = requests.get('http://www.wikidot.com/avatar.php?userid=' +
                                user_id)
        avatar = r_avatar.content  # Bytes-like object here.

        # Upload the avatar to S3.
        s3 = boto3.client('s3')
        upload = s3.put_object(Bucket="scuttle-s3",
                               Body=avatar,
                               Key="avatars/wikidot/" + str(user_id))

        # Give SCUTTLE back the data requested and a link to the file.
        payload = {
            "wd_user_id": user_id,
            "username": username,
            "wd_user_since": wd_registration_timestamp,
            "avatar_path": "https://cdn.scpfoundation.wiki/avatars/wikidot/" + user_id,
            "wiki_member_since": wiki_membership_timestamp
        }

        # Send everything to SCUTTLE.
        headers = {
            "Authorization": "Bearer " + config.scuttle_token,
            "Content-Type": "application/json"
        }
        j = json.dumps(payload)
        r = requests.put(callback_url + '/2stacks/user/metadata',
                         data=j, headers=headers)
    return {'job': 'complete'}
def wsdl_parse(self, url, cache=False):
    "Parse Web Service Description v1.1"
    log.debug("wsdl url: %s" % url)
    # Try to load a previously parsed wsdl:
    force_download = False
    if cache:
        # make an md5 hash of the url for caching...
        filename_pkl = "%s.pkl" % hashlib.md5(url).hexdigest()
        if isinstance(cache, basestring):
            filename_pkl = os.path.join(cache, filename_pkl)
        if os.path.exists(filename_pkl):
            log.debug("Unpickle file %s" % (filename_pkl, ))
            f = open(filename_pkl, "rb")  # binary mode: pickles aren't text
            pkl = pickle.load(f)
            f.close()
            # sanity check:
            if pkl['version'][:-1] != __version__.split(" ")[0][:-1] \
                    or pkl['url'] != url:
                import warnings
                warnings.warn('version or url mismatch! discarding cached wsdl',
                              RuntimeWarning)
                log.debug('Version: %s %s' % (pkl['version'], __version__))
                log.debug('URL: %s %s' % (pkl['url'], url))
                force_download = True
            else:
                self.namespace = pkl['namespace']
                self.documentation = pkl['documentation']
                return pkl['services']

    soap_ns = {
        "http://schemas.xmlsoap.org/wsdl/soap/": 'soap11',
        "http://schemas.xmlsoap.org/wsdl/soap12/": 'soap12',
    }
    wsdl_uri = "http://schemas.xmlsoap.org/wsdl/"
    xsd_uri = "http://www.w3.org/2001/XMLSchema"
    xsi_uri = "http://www.w3.org/2001/XMLSchema-instance"

    get_local_name = lambda s: s and str((':' in s) and s.split(':')[1] or s)
    get_namespace_prefix = lambda s: s and str((':' in s) and s.split(':')[0] or None)

    # always return a unicode object:
    REVERSE_TYPE_MAP[u'string'] = unicode

    # Open the uri and read the xml:
    xml = fetch(url, self.http, cache, force_download, self.wsdl_basedir)
    # Parse the WSDL XML:
    wsdl = SimpleXMLElement(xml, namespace=wsdl_uri)

    # detect soap prefix and uri (xmlns attributes of <definitions>)
    xsd_ns = None
    soap_uris = {}
    for k, v in wsdl[:]:
        if v in soap_ns and k.startswith("xmlns:"):
            soap_uris[get_local_name(k)] = v
        if v == xsd_uri and k.startswith("xmlns:"):
            xsd_ns = get_local_name(k)

    # Extract useful data:
    self.namespace = wsdl['targetNamespace']
    self.documentation = unicode(wsdl('documentation', error=False) or '')

    services = {}
    bindings = {}            # binding_name: binding
    operations = {}          # operation_name: operation
    port_type_bindings = {}  # port_type_name: binding
    messages = {}            # message: element
    elements = {}            # element: type def

    for service in wsdl.service:
        service_name = service['name']
        if not service_name:
            continue  # empty service?
        log.debug("Processing service %s" % service_name)
        serv = services.setdefault(service_name, {'ports': {}})
        serv['documentation'] = service['documentation'] or ''
        for port in service.port:
            binding_name = get_local_name(port['binding'])
            operations[binding_name] = {}
            address = port('address', ns=soap_uris.values(), error=False)
            location = address and address['location'] or None
            soap_uri = address and soap_uris.get(address.get_prefix())
            soap_ver = soap_uri and soap_ns.get(soap_uri)
            bindings[binding_name] = {
                'name': binding_name,
                'service_name': service_name,
                'location': location,
                'soap_uri': soap_uri,
                'soap_ver': soap_ver,
            }
            serv['ports'][port['name']] = bindings[binding_name]

    for binding in wsdl.binding:
        binding_name = binding['name']
        soap_binding = binding('binding', ns=soap_uris.values(), error=False)
        transport = soap_binding and soap_binding['transport'] or None
        port_type_name = get_local_name(binding['type'])
        bindings[binding_name].update({
            'port_type_name': port_type_name,
            'transport': transport,
            'operations': {},
        })
        if port_type_name not in port_type_bindings:
            port_type_bindings[port_type_name] = []
        port_type_bindings[port_type_name].append(bindings[binding_name])
        for operation in binding.operation:
            op_name = operation['name']
            op = operation('operation', ns=soap_uris.values(), error=False)
            action = op and op['soapAction']
            d = operations[binding_name].setdefault(op_name, {})
            bindings[binding_name]['operations'][op_name] = d
            d.update({'name': op_name})
            d['parts'] = {}
            # input and/or output can be absent!
            input = operation('input', error=False)
            body = input and input('body', ns=soap_uris.values(), error=False)
            d['parts']['input_body'] = body and body['parts'] or None
            output = operation('output', error=False)
            body = output and output('body', ns=soap_uris.values(), error=False)
            d['parts']['output_body'] = body and body['parts'] or None
            header = input and input('header', ns=soap_uris.values(), error=False)
            d['parts']['input_header'] = header and {
                'message': header['message'],
                'part': header['part']
            } or None
            header = output and output('header', ns=soap_uris.values(), error=False)
            d['parts']['output_header'] = header and {
                'message': header['message'],
                'part': header['part']
            } or None
            if action:
                d["action"] = action

    # check the axis2 namespace at the schema types attributes
    self.namespace = dict(wsdl.types("schema", ns=xsd_uri)[:]).get(
        'targetNamespace', self.namespace)

    imported_schemas = {}

    # process the current wsdl schema:
    for schema in wsdl.types("schema", ns=xsd_uri):
        preprocess_schema(schema, imported_schemas, elements, xsd_uri,
                          self.__soap_server, self.http, cache,
                          force_download, self.wsdl_basedir)

    postprocess_element(elements)

    for message in wsdl.message:
        log.debug("Processing message %s" % message['name'])
        for part in message('part', error=False) or []:
            element = {}
            element_name = part['element']
            if not element_name:
                # some implementations (axis) use type instead
                element_name = part['type']
            type_ns = get_namespace_prefix(element_name)
            type_uri = wsdl.get_namespace_uri(type_ns)
            if type_uri == xsd_uri:
                element_name = get_local_name(element_name)
                fn = REVERSE_TYPE_MAP.get(unicode(element_name), None)
                element = {part['name']: fn}
                # emulate a true Element (complexType)
                messages.setdefault((message['name'], None), {
                    message['name']: OrderedDict()
                }).values()[0].update(element)
            else:
                element_name = get_local_name(element_name)
                fn = elements.get(make_key(element_name, 'element'))
                if not fn:
                    # some axis servers use complexType for part messages
                    fn = elements.get(make_key(element_name, 'complexType'))
                    element = {message['name']: {part['name']: fn}}
                else:
                    element = {element_name: fn}
                messages[(message['name'], part['name'])] = element

    for port_type in wsdl.portType:
        port_type_name = port_type['name']
        log.debug("Processing port type %s" % port_type_name)
        for binding in port_type_bindings[port_type_name]:
            for operation in port_type.operation:
                op_name = operation['name']
                op = operations[str(binding['name'])][op_name]
                op['documentation'] = unicode(
                    operation('documentation', error=False) or '')
                if binding['soap_ver']:
                    # TODO: separate operation_binding from operation (non-SOAP?)
                    if operation("input", error=False):
                        input_msg = get_local_name(operation.input['message'])
                        input_header = op['parts'].get('input_header')
                        if input_header:
                            header_msg = get_local_name(input_header.get('message'))
                            header_part = get_local_name(input_header.get('part'))
                            # warning: some implementations use a separate message!
                            header = get_message(messages,
                                                 header_msg or input_msg,
                                                 header_part)
                        else:
                            # not enough info to search for the header message:
                            header = None
                        op['input'] = get_message(messages, input_msg,
                                                  op['parts'].get('input_body'))
                        op['header'] = header
                    else:
                        op['input'] = None
                        op['header'] = None
                    if operation("output", error=False):
                        output_msg = get_local_name(operation.output['message'])
                        op['output'] = get_message(messages, output_msg,
                                                   op['parts'].get('output_body'))
                    else:
                        op['output'] = None

    # dump the full service/port/operation map
    # log.debug(pprint.pformat(services))

    # Save the parsed wsdl (cache):
    if cache:
        f = open(filename_pkl, "wb")
        pkl = {
            'version': __version__.split(" ")[0],
            'url': url,
            'namespace': self.namespace,
            'documentation': self.documentation,
            'services': services,
        }
        pickle.dump(pkl, f)
        f.close()

    return services
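
# wsdl_parse() is normally invoked indirectly. A sketch of typical use,
# assuming this is pysimplesoap's SoapClient (the URL is hypothetical):
#
#     from pysimplesoap.client import SoapClient
#     client = SoapClient(wsdl="http://example.com/service?wsdl", cache="/tmp")
#
# Passing a directory as `cache` stores the parsed WSDL there as a .pkl file,
# which is what the pickle load/save branches above implement.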
def lambda_handler(event, context):
    for record in event['Records']:
        callback_url = record['messageAttributes']['callback_url']['stringValue']
        wd_page_id = record['messageAttributes']['page_id']['stringValue']
        wikidot_site = record['messageAttributes']['wikidot_site']['stringValue']

        data = {
            'page_id': wd_page_id,
            'moduleName': 'history/PageRevisionListModule',
            'perpage': 99999
        }
        haystack = helpers.fetch(data, wikidot_site)
        if haystack is None:
            # Page was deleted before the task fired.
            return False

        revision_ids = re.findall(r'(?:<tr id="revision-row-)(\d*)', haystack)
        revision_numbers = re.findall(r'(?:<td>)(\d*)(?:.<\/td>)', haystack)
        usernames = re.findall(r'(?:alt=")([^"]*)', haystack)
        user_ids = re.findall(
            '((?:userInfo\()([^\)]*)(?:\); return false;" )|(?:data-id=")(\d*)|(?:UserInfo\(\\\')([\d\|\.]*)(?:\\\'\); return false;\" ><))',
            haystack)
        timestamps = re.findall(r'(?:<span class="odate time_)([^ ]*)', haystack)
        # The revision type can be empty! Old tag actions didn't have an
        # associated revision type. The unicode points in here, if we need
        # them later, are Thai (0E00-0E7F).
        revision_type = re.findall(
            '((?:<span class="spantip" title="(?:[\D \/])*">)(\w)(?:<\/span>)|(?:<td>)(?:\\n\\t\\t\\t\\t\\t \\t\\t\\t \\t\\t\\t \\t\\t\\t \\t \\n\\t\\t \\t \\n\\t\\t \\t \\n\\t\\t<)(\/)(?:td>))',
            haystack)
        comments = re.findall(r'(?:<td style="font-size: 90%">)([^<]*)', haystack)

        # Clean up the match tuples we made for user_ids and revision_type:
        # drop the non-matching full-pattern group, then flatten each tuple
        # to one string.
        user_ids = [''.join(user[1:]) for user in user_ids]
        revision_type = [''.join(revision[1:]) for revision in revision_type]

        innerpayload = {}
        for row in range(len(revision_ids)):
            # We need to handle some edge cases for deleted and anonymous
            # users.
            if len(usernames[row]) == 0:
                # This can be either a deleted or an anonymous account.
                if "." in user_ids[row]:
                    # Anonymous account (the ID is an IP address).
                    usernames[row] = "Anonymous User (" + str(user_ids[row]) + ")"
                    user_ids[row] = 0
                else:
                    # Deleted account.
                    usernames[row] = "Deleted Account (" + str(user_ids[row]) + ")"
            if revision_type[row] == "/":
                revision_type[row] = "A"
            innerpayload[row] = {
                'revision_id': revision_ids[row],
                'username': usernames[row],
                'user_id': user_ids[row],
                'timestamp': timestamps[row],
                'revision_type': revision_type[row],
                'revision_number': revision_numbers[row],
                'comments': comments[row]
            }

        payload = {"wd_page_id": wd_page_id, "revisions": innerpayload}
        output = json.dumps(payload)

        # Send everything to SCUTTLE.
        headers = {
            "Authorization": "Bearer " + config.scuttle_token,
            "Content-Type": "application/json"
        }
        r = requests.put(callback_url + '/2stacks/page/revisions',
                         data=output, headers=headers)
    return {'job': 'complete'}
def get_latest_release_tag():
    fetch()
    tags = run("git tag").split('\n')
    release_tags = sorted(ReleaseTag.parse(t) for t in tags
                          if release_tag_pattern.match(t))
    return str(release_tags[-1])
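
# ReleaseTag and release_tag_pattern aren't defined in this excerpt; a
# minimal sketch of what they might look like, assuming release tags of the
# hypothetical form "release-<major>.<minor>":
#
#     import re
#     from functools import total_ordering
#
#     release_tag_pattern = re.compile(r"^release-(\d+)\.(\d+)$")
#
#     @total_ordering
#     class ReleaseTag:
#         def __init__(self, major, minor):
#             self.major, self.minor = major, minor
#
#         @classmethod
#         def parse(cls, tag):
#             major, minor = release_tag_pattern.match(tag).groups()
#             return cls(int(major), int(minor))
#
#         def __eq__(self, other):
#             return (self.major, self.minor) == (other.major, other.minor)
#
#         def __lt__(self, other):
#             return (self.major, self.minor) < (other.major, other.minor)
#
#         def __str__(self):
#             return "release-%d.%d" % (self.major, self.minor)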
def lambda_handler(event, context):
    for record in event['Records']:
        callback_url = record['messageAttributes']['callback_url']['stringValue']
        forum_id = record['messageAttributes']['forum_id']['stringValue']
        wikidot_site = record['messageAttributes']['wikidot_site']['stringValue']

        page_no = 1
        data = {
            'c': forum_id,
            'p': page_no,
            'moduleName': 'forum/ForumViewCategoryModule'
        }
        haystack = helpers.fetch(data, wikidot_site)
        try:
            threads = re.findall(
                r'(?:\n\t\t\t\t\t\t\t\t\t\t\t\t<a href="\/forum\/t-)([^\/]*)',
                haystack)
            # The pager label is localized per wiki; some -INT sites didn't
            # have it translated (-RU, -UA, -CN, ...), so SCP-EN's "page 1 of"
            # is the fallback. Whatever the language, the pattern technically
            # returns 2 indistinguishable objects because Wikidot.
            pager_labels = {
                'fondationscp': 'page 1 de ',     # SCP-FR
                'scp-wiki-de': 'Seite 1 von ',    # SCP-DE
                'scp-pl': 'strona 1 z ',          # SCP-PL
                'scp-pt-br': 'página 1 do ',      # SCP-PT
                'fondazionescp': 'pagina 1 di ',  # SCP-IT
                'scpko': '페이지: 1 / ',            # SCP-KO
            }
            label = pager_labels.get(wikidot_site, 'page 1 of ')
            pages = re.findall(
                '(?:<span class="pager-no">' + label + r')(\d*)', haystack)
        except TypeError:
            # helpers.fetch returned None; this only really fails on a
            # deleted forum.
            # TODO: Make SCUTTLE handle this.
            return False

        payload = {"wd_forum_id": forum_id, "threads": threads}
        output = json.dumps(payload)

        # Send everything to SCUTTLE.
        headers = {
            "Authorization": "Bearer " + config.scuttle_token,
            "Content-Type": "application/json"
        }
        r = requests.put(callback_url + '/2stacks/forum/threads',
                         data=output, headers=headers)

        if not pages:  # The Pythonic™ way of checking if a list is empty.
            return {'job': 'complete'}

        # Note this re-fetches page 1; SCUTTLE can sort out threads it
        # already has.
        for page_no in range(1, int(pages[0]) + 1):
            data = {
                'c': forum_id,
                'p': page_no,
                'moduleName': 'forum/ForumViewCategoryModule'
            }
            haystack = helpers.fetch(data, wikidot_site)
            try:
                threads = re.findall(
                    r'(?:\n\t\t\t\t\t\t\t\t\t\t\t\t<a href="\/forum\/t-)([^\/]*)',
                    haystack)
            except TypeError:
                # This only really fails on a deleted forum.
                # TODO: Make SCUTTLE handle this.
                return False
            payload = {"wd_forum_id": forum_id, "threads": threads}
            output = json.dumps(payload)
            # Send everything to SCUTTLE.
            r = requests.put(callback_url + '/2stacks/forum/threads',
                             data=output, headers=headers)
    return {'job': 'complete'}
problems = tag_tickets(tickets, new_tag)
if problems:
    print("Error updating YouTrack:")
    for problem in problems:
        print(problem)
    print("")


if __name__ == "__main__":
    args = docopt(__doc__)
    test_run = args["--test-run"]
    if not (git_is_clean() or test_run):
        print("Git status reports as not clean; aborting making release")
    else:
        fetch()
        latest_tag = get_latest_release_tag()
        print("The latest release was " + latest_tag)
        branches_and_tickets = check_tickets(latest_tag)
        new_tag = get_new_tag(latest_tag)
        print("* Writing release log")
        release_message = make_release_message(new_tag, branches_and_tickets)
        write_release_log(release_message)
        commit_and_tag()
        update_youtrack(branches_and_tickets, test_run)
        print("""---------------------------------------------------------------
Completed successfully. No changes have been pushed, so please review and then