Example #1
def fetch_doggos():
    response = fetch('https://dog.ceo/api/breeds/list/all')
    breed_images = []
    for breed in response['message'].keys():
        url = f'https://dog.ceo/api/breed/{breed}/images/random'
        breed_images.append(fetch(url))
    return breed_images
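
`fetch` here isn't a built-in; the snippet assumes a small helper that GETs a URL and returns the decoded JSON. A minimal sketch of such a helper built on requests — the name and exact behaviour are assumptions, not part of the original source:

import requests

def fetch(url):
    # Hypothetical helper assumed by fetch_doggos above: GET the URL
    # and return the parsed JSON body.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return response.json()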
Example #2
def lambda_handler(event, context):
    for record in event['Records']:
        callback_url = record['messageAttributes']['callback_url'][
            'stringValue']
        wd_page_id = record['messageAttributes']['page_id']['stringValue']
        wikidot_site = record['messageAttributes']['wikidot_site'][
            'stringValue']

        data = {
            'pageId': wd_page_id,
            'moduleName': 'forum/ForumCommentsListModule'
        }
        haystack = helpers.fetch(data, wikidot_site)
        # logger.info(haystack)
        try:
            thread_id = re.search(r'(?:forumThreadId = )(\d*)',
                                  haystack).group(1)
        except (TypeError, AttributeError):  # This only really fails on a deleted page.
            # TODO Make scuttle handle this.
            return False

        payload = {"wd_page_id": wd_page_id, "wd_thread_id": int(thread_id)}
        output = json.dumps(payload)

        #  Send everything to SCUTTLE
        headers = {
            "Authorization": "Bearer " + config.scuttle_token,
            "Content-Type": "application/json"
        }
        r = requests.put(callback_url + '/2stacks/page/thread',
                         data=output,
                         headers=headers)

    return {'job': 'complete'}
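
This handler (like the later ones) consumes Lambda's SQS event shape: a `Records` list whose items carry `messageAttributes` with `stringValue` fields. A sketch of a test event for invoking it locally — all values are placeholders:

sample_event = {
    'Records': [{
        'messageAttributes': {
            'callback_url': {'stringValue': 'https://scuttle.example/api'},
            'page_id': {'stringValue': '12345678'},
            'wikidot_site': {'stringValue': 'scp-wiki'},
        }
    }]
}

lambda_handler(sample_event, None)  # context is not used by the handler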
Example #3
def get_thread_page(thread: int, page: int, wikidot_site: str):
    data = {
        't': thread,
        'moduleName': 'forum/ForumViewThreadPostsModule',
        'pageNo': page
    }
    return helpers.fetch(data, wikidot_site)
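
`helpers.fetch` itself isn't shown in these snippets. Judging by the call sites, it POSTs the module parameters to Wikidot's AJAX connector for the given site and returns the HTML `body` of the JSON response. A sketch under that assumption — the token handling mirrors Wikidot's double-submit `wikidot_token7` scheme, but the real helper may differ:

import requests

def fetch(data, wikidot_site):
    # Assumed behaviour: Wikidot modules are served by
    # ajax-module-connector.php, and wikidot_token7 must be sent both
    # as a cookie and as a form field.
    token = '123456'
    response = requests.post(
        'http://{}.wikidot.com/ajax-module-connector.php'.format(wikidot_site),
        data=dict(data, wikidot_token7=token),
        cookies={'wikidot_token7': token},
        timeout=30)
    response.raise_for_status()
    return response.json().get('body')  # the HTML "haystack" the callers parse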
Example #4
def lambda_handler(event, context):
    for record in event['Records']:
        callback_url = record['messageAttributes']['callback_url'][
            'stringValue']
        wd_page_id = record['messageAttributes']['page_id']['stringValue']
        wikidot_site = record['messageAttributes']['wikidot_site'][
            'stringValue']

        logger.info(wd_page_id)
        data = {
            'pageId': wd_page_id,
            'moduleName': 'pagerate/WhoRatedPageModule'
        }
        try:
            haystack = helpers.fetch(data, wikidot_site)
        except Exception:  # It gone: the page was deleted before we fetched it.
            return {'job': 'article_deleted'}
        votes = re.findall(r'(?:#777\">\n)(?:\s*)([12345+-])', haystack)
        user_ids = re.findall(r'(?:u=)([^\)]*)', haystack)
        usernames = re.findall(r'(?:alt=\")([^\"]*)', haystack)

        logger.info(str(len(votes)) + " votes found")

        if votes:
            innerpayload = {}
            for row in range(len(user_ids)):
                innerpayload[row] = {
                    'user_id': user_ids[row],
                    'username': usernames[row],
                    'vote': votes[row]
                }
            payload = {"wd_page_id": wd_page_id, "votes": innerpayload}
            output = json.dumps(payload)

            #  Send everything to SCUTTLE
            headers = {
                "Authorization": "Bearer " + config.scuttle_token,
                "Content-Type": "application/json"
            }
            r = requests.put(callback_url + '/2stacks/page/votes',
                             data=output,
                             headers=headers)
            if r.status_code == 500:
                logger.info('500:')
                logger.info(r.text)

    return {'job': 'complete'}
Example #5
def lambda_handler(event, context):
    for record in event['Records']:
        # logger.info(record['messageAttributes'])
        callback_url = record['messageAttributes']['callback_url'][
            'stringValue']
        wd_revision_id = record['messageAttributes']['revision_id'][
            'stringValue']
        wikidot_site = record['messageAttributes']['wikidot_site'][
            'stringValue']
        wd_url = record['messageAttributes']['wikidot_url']['stringValue']

        # logger.info(wd_revision_id)
        # logger.info(wikidot_site)

        data = {
            'revision_id': wd_revision_id,
            'moduleName': 'history/PageSourceModule'
        }
        haystack = helpers.fetch(data, wd_url)
        if haystack is None:
            return {'revision': 'deleted'}
        else:
            # logger.info('haystack:')
            # logger.info(haystack)
            content = re.search(
                r'(?:<div class="page-source">)(.*)(?:<\/div>$)', haystack,
                re.DOTALL).group(1)
            payload = {
                "wd_revision_id": str(wd_revision_id),
                "content": content
            }
            output = json.dumps(payload)
            # logger.info("got output")
            #  Send everything to SCUTTLE
            headers = {
                "Authorization": "Bearer " + config.scuttle_token,
                "Content-Type": "application/json"
            }
            r = requests.put(callback_url + '/2stacks/revision/content',
                             data=output,
                             headers=headers)
            if r.status_code != 200:
                raise ValueError("SCUTTLE isn't well. Returned " +
                                 str(r.status_code))
    return {'job': 'complete'}
Example #6
def lambda_handler(event, context):
    for record in event['Records']:
        # We receive a payload from SCUTTLE with a wiki and the most recent slug we have for it.
        callback_url = record['messageAttributes']['callback_url']['stringValue']
        wikidot_site = record['messageAttributes']['wikidot_site']['stringValue']
        wd_url = record['messageAttributes']['wikidot_url']['stringValue']
        slug = record['messageAttributes']['page_slug']['stringValue']
        
        # Get the 20 most recent pages.
        data = {'order': 'dateCreatedDesc', 'moduleName': 'list/WikiPagesModule', 'limit': 20, 'preview': True}
        haystack = helpers.fetch(data, wd_url)
        
        # Get the slugs back.
        slugs = re.findall(r'(?:<a href="\/)([^"]*)', haystack)
        
        # If the most recent page slug matches the one SCUTTLE sent us, it already knows about this page; terminate.
        if slugs[0] == slug:
            return {'job': 'complete'}
        else:
            # Otherwise, let's get a stub together for SCUTTLE.
            pass
Example #7
def lambda_handler(event, context):
    for record in event['Records']:
        callback_url = record['messageAttributes']['callback_url'][
            'stringValue']
        wd_thread_id = record['messageAttributes']['thread_id']['stringValue']
        wikidot_site = record['messageAttributes']['wikidot_site'][
            'stringValue']

        data = {'t': wd_thread_id, 'moduleName': 'forum/ForumViewThreadModule'}
        haystack = helpers.fetch(data, wikidot_site)

        # Do some stuff with the base thread.
        try:
            soup = BeautifulSoup(haystack, 'html.parser')
        except TypeError:  # NoneType, it gone.
            return False  # Send this to SCUTTLE.
        titleblock = soup.find("div", {"class": "forum-breadcrumbs"})
        forum = int(
            re.search(r'(?:\/forum\/c-)(\d*)', str(titleblock)).group(1))
        title = re.search(r'(?:» (?!<))(.*)', str(titleblock)).group(1)
        descriptionblock = soup.find("div",
                                     {"class": "description-block well"})
        # Get the subtitle, which is a surprising amount of effort: the
        # "summary" label is localized per wiki.
        summary_labels = {
            'scp-ru': 'Кратко:',             # SCP-RU
            'lafundacionscp': 'Resumen:',    # SCP-ES
            'fondationscp': 'Résumé:',       # SCP-FR
            'scp-wiki-de': 'Beschreibung:',  # SCP-DE
        }
        # SCP-EN and other English-speaking wikis use the default.
        label = summary_labels.get(wikidot_site, 'Summary:')
        subtitle = re.findall(
            r'(?:<\/div>)(?:\s*<div class="head">' + re.escape(label) +
            r'<\/div>){0,1}([\s\S]*)(?:<\/div>)',
            str(descriptionblock), re.MULTILINE)
        subtitle = ''.join(subtitle)
        # Strip newlines and tabs; these are artifacts of scraping HTML and not valid in subtitles.
        subtitle = subtitle.replace('\n', '').replace('\t', '')
        if not subtitle:
            subtitle = None

        # Get the creation timestamp for convenience in sorting later.
        created_timestamp = int(
            re.search(r'(?:odate time_)(\d*)', str(descriptionblock)).group(1))

        # Get the OP of the thread. This is Wikidot for a per-page discussion thread or a user id otherwise.
        attribution = descriptionblock.find("span", {"class": "printuser"})
        # logger.info(attribution)
        if attribution.string == "Wikidot":
            op_user_id = 0
            op_username = "Wikidot"
        else:
            try:
                op_user_id = int(
                    re.search(r'(?:userInfo\()(\d*)',
                              str(attribution)).group(1))
                op_username = attribution.text
            except AttributeError:
                try:
                    # Deleted accounts
                    op_user_id = int(
                        re.search(r'(?:data-id=\")(\d*)',
                                  str(attribution)).group(1))
                    op_username = "Deleted Account (" + str(op_user_id) + ")"
                except AttributeError:
                    try:
                        # Anonymous accounts
                        op_user_id = 0
                        op_username = "Anonymous User (" + str(
                            re.search(
                                '(?:anonymousUserInfo\(\')([\d\.]*)(?:\'\); return false;\"><)',
                                str(attribution)).group(1)) + ")"
                    except AttributeError:
                        # Guest accounts
                        op_user_id = 0
                        op_username = str(
                            re.search(r'(?:</a>)([^<]*)',
                                      str(attribution)).group(1))

        # What we should have back is HTML laying out a page of forum comments.
        # logger.info('haystack returned:')
        # logger.info(haystack)

        # First, let's determine if there are multiple pages.
        try:
            maxpages = int(
                re.search(r'(?:<span class="pager-no">page \d* of )(\d*)',
                          haystack).group(1))
        except AttributeError:  # No pager means there's only one page of comments. This is okay.
            maxpages = 1

        # Let's handle things the same way for one page or many.
        for actualpage in range(1, maxpages + 1):
            innerpayload = {}
            haystack = get_thread_page(thread=wd_thread_id,
                                       page=actualpage,
                                       wikidot_site=wikidot_site)
            soup = BeautifulSoup(haystack.replace("\\", "")[2:], 'html.parser')
            posts = soup.find_all("div", id=re.compile("(fpc-)"))
            # logger.info('posts:')
            # logger.info(len(posts))
            for idx, post in enumerate(posts):
                wd_post_id = int(
                    re.search(r'(?:<div class="post" id="post-)(\d*)',
                              str(post)).group(1))
                subject = re.search(
                    r'(?:<div class="title" id="post-title-\d*">\s*)([^\n]*)',
                    str(post)).group(1)
                # On a blank subject this returns as "</div>"
                if subject == "</div>":
                    subject = None
                try:
                    username = re.search(
                        r'(?:return false;">)([^<]*)(?:<\/a><\/span>,)',
                        str(post)).group(1)
                    wd_user_id = int(
                        re.search(
                            r'(?:www\.wikidot\.com\/userkarma.php\?u=)([^\)]*)',
                            str(post)).group(1))
                except AttributeError:  # NoneType, deleted user.
                    try:
                        wd_user_id = int(
                            re.search(r'(?:data-id=")(\d*)',
                                      str(post)).group(1))
                        username = "Deleted Account (" + str(wd_user_id) + ")"
                    except AttributeError:  # NoneType, anonymous user!
                        try:
                            wd_user_id = 0
                            username = "Anonymous User (" + str(
                                re.search(
                                    '(?:anonymousUserInfo\(\\\')([\d\.]*)',
                                    str(post)).group(1)) + ")"
                        except AttributeError:  # One last NoneType: a guest user.
                            try:
                                username = re.search(
                                    r'(?:alt=""/></a>)([^>]*)(?:</span>,)',
                                    str(post)).group(1)
                                wd_user_id = 0
                            except AttributeError:  # This is getting ridiculous. More guest account types.
                                try:
                                    username = re.search(
                                        r'(?:&amp;default=http:\/\/www.wikidot.com/common--images/avatars/default/a16.png&amp;size=16"\/><\/a>)([^>]*)(?:<\/span>,)',
                                        str(post)).group(1)
                                    wd_user_id = 0
                                except AttributeError:
                                    # Guest with a URL in their name
                                    wd_user_id = 0
                                    tempusername = re.search(
                                        r'(?:rel=\"nofollow\">)([^<]*)(?:<\/a> \(guest\))',
                                        str(post)).group(1)
                                    username = tempusername + " (guest)"
                post_created_at = int(
                    re.search(r'(?:<span class="odate time_)([^\s]*)',
                              str(post)).group(1))

                content = post.find("div", {"class": "content"})
                body = ''.join(str(item) for item in content.contents)
                # Wikidot pads the text with a \n on both sides, which the author didn't write.
                body = body[1:-1]
                try:
                    if post.parent['id'] == 'thread-container-posts':
                        # Top-level response
                        parent = 0
                    else:
                        # 'id' will look like fpc-12345678, take a slice of the string
                        # logger.info('parent:' + post.parent['id'])
                        parent = int(post.parent['id'][4:])
                except KeyError:  # We're at the root.
                    parent = 0
                changespresent = post.find("div", {"class": "revisions"})
                if changespresent is not None:
                    # This post was edited, send along a list of revisions and let those get picked up in a different routine.
                    # We're guaranteed at least two entries in here.
                    changes = re.findall(r'(?:showRevision\(event, )(\d*)',
                                         str(changespresent))

                else:
                    changes = False

                innerpayload[idx] = {
                    "wd_post_id": wd_post_id,
                    "wd_user_id": wd_user_id,
                    "parent_id": parent,
                    "subject": subject,
                    "username": username,
                    "timestamp": post_created_at,
                    "changes": changes,
                    "text": body
                }
            # While we could wait and send one big payload, that's a risky proposition on threads with lots of posts so let's not.

            # Wrap the payload and send it, SCUTTLE can sort out posts it already has.
            outerpayload = {
                "wd_thread_id": int(wd_thread_id),
                "wd_forum_id": forum,
                "wd_user_id": op_user_id,
                "wd_username": op_username,
                "title": title,
                "subtitle": subtitle,
                "created_at": created_timestamp,
                "posts": innerpayload
            }

            #  Send everything to SCUTTLE
            output = json.dumps(outerpayload)
            headers = {
                "Authorization": "Bearer " + config.scuttle_token,
                "Content-Type": "application/json"
            }
            r = requests.put(callback_url + '/2stacks/thread/posts',
                             data=output,
                             headers=headers)

    return {"job": "complete"}
Example #8
def lambda_handler(event, context):
    for record in event['Records']:
        callback_url = record['messageAttributes']['callback_url'][
            'stringValue']
        user_id = record['messageAttributes']['user_id']['stringValue']
        wikidot_site = record['messageAttributes']['wikidot_site'][
            'stringValue']

        # Get the basic info from wikidot.
        data = {"user_id": user_id, 'moduleName': 'users/UserInfoWinModule'}
        response = helpers.fetch(data, wikidot_site)

        # Believe it or not, the next two patterns look for two different things. Thanks Wikidot.
        # logger.info(response)
        # The "Wikidot user since" label is localized per wiki.
        registration_labels = {
            'scp-ru': 'Wikidot.com с:',           # SCP-RU
            'lafundacionscp': 'desde:',           # SCP-ES
            'scp-pt-br': 'desde:',                # SCP-PT
            'fondationscp': 'depuis:',            # SCP-FR
            'scp-wiki-de': 'seit:',               # SCP-DE
            'scp-pl': 'Wikidot.com od:',          # SCP-PL
            'fondazionescp': 'Wikidot dal:',      # SCP-IT
            'scp-wiki-cn': '使用者始于:',          # SCP-CN
            'scpko': 'Wikidot.com 사용자 시작:',  # SCP-KO
        }
        # The default covers SCP-EN, English-speaking wikis, and a few translated sites (-UA, -CS, ...).
        label = registration_labels.get(wikidot_site, 'since:')
        wd_registration_timestamp = re.search(
            '(?:' + re.escape(label) + r')(?:\D*)(\d*)', response).group(1)

        try:
            # Same deal: the "member of this site since" label is localized per wiki.
            membership_labels = {
                'scp-ru': 'сайта: с',                  # SCP-RU
                'lafundacionscp': ': desde',           # SCP-ES
                'scp-pt-br': ': desde',                # SCP-PT
                'fondationscp': ': depuis :',          # SCP-FR
                'scp-wiki-de': 'Site: seit',           # SCP-DE
                'scp-ukrainian': 'сайту: з',           # SCP-UA
                'scp-cs': 'Stránky: od',               # SCP-CS
                'scp-th': 'เป็นสมาชิกตั้งแต่',              # SCP-TH
                'scp-pl': 'projektu: od',              # SCP-PL
                'fondazionescp': 'sito: dal',          # SCP-IT
                'scp-wiki-cn': '本站成员:始于',         # SCP-CN
                'scpko': '이 사이트의 회원 시작 시간:',  # SCP-KO
            }
            # The default covers SCP-EN and English-speaking wikis.
            label = membership_labels.get(wikidot_site, ': since')
            wiki_membership_timestamp = re.search(
                '(?:' + re.escape(label) + r')(?:\D*)(\d*)', response).group(1)
        except AttributeError:
            # Altogether possible this user is no longer a member. We'll send a boolean false.
            wiki_membership_timestamp = False

        username = re.search(r'(?:<h1>)(.*)(?:<\/h1>)', response).group(1)

        # Download the user's avatar as a file object.
        r_avatar = requests.get('http://www.wikidot.com/avatar.php?userid=' +
                                user_id)
        avatar = r_avatar.content  # Bytes-like object here.

        # Upload the avatar to s3
        s3 = boto3.client('s3')
        upload = s3.put_object(Bucket="scuttle-s3",
                               Body=avatar,
                               Key="avatars/wikidot/" + str(user_id))
        # Give SCUTTLE back the data requested and a link to the file.
        payload = {
            "wd_user_id": user_id,
            "username": username,
            "wd_user_since": wd_registration_timestamp,
            "avatar_path":
            "https://cdn.scpfoundation.wiki/avatars/wikidot/" + user_id,
            "wiki_member_since": wiki_membership_timestamp
        }

        #  Send everything to SCUTTLE
        headers = {
            "Authorization": "Bearer " + config.scuttle_token,
            "Content-Type": "application/json"
        }
        j = json.dumps(payload)
        r = requests.put(callback_url + '/2stacks/user/metadata',
                         data=j,
                         headers=headers)

    return {'job': 'complete'}
Example #9
    def wsdl_parse(self, url, cache=False):
        "Parse Web Service Description v1.1"

        log.debug("wsdl url: %s" % url)
        # Try to load a previously parsed wsdl:
        force_download = False
        if cache:
            # make md5 hash of the url for caching...
            filename_pkl = "%s.pkl" % hashlib.md5(url).hexdigest()
            if isinstance(cache, basestring):
                filename_pkl = os.path.join(cache, filename_pkl)
            if os.path.exists(filename_pkl):
                log.debug("Unpickle file %s" % (filename_pkl, ))
                f = open(filename_pkl, "rb")  # pickle data is binary
                pkl = pickle.load(f)
                f.close()
                # sanity check:
                if pkl['version'][:-1] != __version__.split(
                        " ")[0][:-1] or pkl['url'] != url:
                    import warnings
                    warnings.warn(
                        'version or url mismatch! discarding cached wsdl',
                        RuntimeWarning)
                    log.debug('Version: %s %s' % (pkl['version'], __version__))
                    log.debug('URL: %s %s' % (pkl['url'], url))
                    force_download = True
                else:
                    self.namespace = pkl['namespace']
                    self.documentation = pkl['documentation']
                    return pkl['services']

        soap_ns = {
            "http://schemas.xmlsoap.org/wsdl/soap/": 'soap11',
            "http://schemas.xmlsoap.org/wsdl/soap12/": 'soap12',
        }
        wsdl_uri = "http://schemas.xmlsoap.org/wsdl/"
        xsd_uri = "http://www.w3.org/2001/XMLSchema"
        xsi_uri = "http://www.w3.org/2001/XMLSchema-instance"

        get_local_name = lambda s: s and str(
            (':' in s) and s.split(':')[1] or s)
        get_namespace_prefix = lambda s: s and str(
            (':' in s) and s.split(':')[0] or None)

        # always return a unicode object:
        REVERSE_TYPE_MAP[u'string'] = unicode

        # Open uri and read xml:
        xml = fetch(url, self.http, cache, force_download, self.wsdl_basedir)
        # Parse WSDL XML:
        wsdl = SimpleXMLElement(xml, namespace=wsdl_uri)

        # detect soap prefix and uri (xmlns attributes of <definitions>)
        xsd_ns = None
        soap_uris = {}
        for k, v in wsdl[:]:
            if v in soap_ns and k.startswith("xmlns:"):
                soap_uris[get_local_name(k)] = v
            if v == xsd_uri and k.startswith("xmlns:"):
                xsd_ns = get_local_name(k)

        # Extract useful data:
        self.namespace = wsdl['targetNamespace']
        self.documentation = unicode(wsdl('documentation', error=False) or '')

        services = {}
        bindings = {}  # binding_name: binding
        operations = {}  # operation_name: operation
        port_type_bindings = {}  # port_type_name: binding
        messages = {}  # message: element
        elements = {}  # element: type def

        for service in wsdl.service:
            service_name = service['name']
            if not service_name:
                continue  # empty service?
            log.debug("Processing service %s" % service_name)
            serv = services.setdefault(service_name, {'ports': {}})
            serv['documentation'] = service['documentation'] or ''
            for port in service.port:
                binding_name = get_local_name(port['binding'])
                operations[binding_name] = {}
                address = port('address', ns=soap_uris.values(), error=False)
                location = address and address['location'] or None
                soap_uri = address and soap_uris.get(address.get_prefix())
                soap_ver = soap_uri and soap_ns.get(soap_uri)
                bindings[binding_name] = {
                    'name': binding_name,
                    'service_name': service_name,
                    'location': location,
                    'soap_uri': soap_uri,
                    'soap_ver': soap_ver,
                }
                serv['ports'][port['name']] = bindings[binding_name]

        for binding in wsdl.binding:
            binding_name = binding['name']
            soap_binding = binding('binding',
                                   ns=soap_uris.values(),
                                   error=False)
            transport = soap_binding and soap_binding['transport'] or None
            port_type_name = get_local_name(binding['type'])
            bindings[binding_name].update({
                'port_type_name': port_type_name,
                'transport': transport,
                'operations': {},
            })
            if port_type_name not in port_type_bindings:
                port_type_bindings[port_type_name] = []
            port_type_bindings[port_type_name].append(bindings[binding_name])
            for operation in binding.operation:
                op_name = operation['name']
                op = operation('operation', ns=soap_uris.values(), error=False)
                action = op and op['soapAction']
                d = operations[binding_name].setdefault(op_name, {})
                bindings[binding_name]['operations'][op_name] = d
                d.update({'name': op_name})
                d['parts'] = {}
                # input and/or output may not be present!
                input = operation('input', error=False)
                body = input and input(
                    'body', ns=soap_uris.values(), error=False)
                d['parts']['input_body'] = body and body['parts'] or None
                output = operation('output', error=False)
                body = output and output(
                    'body', ns=soap_uris.values(), error=False)
                d['parts']['output_body'] = body and body['parts'] or None
                header = input and input(
                    'header', ns=soap_uris.values(), error=False)
                d['parts']['input_header'] = header and {
                    'message': header['message'],
                    'part': header['part']
                } or None
                header = output and output(
                    'header', ns=soap_uris.values(), error=False)
                d['parts']['output_header'] = header and {
                    'message': header['message'],
                    'part': header['part']
                } or None
                if action:
                    d["action"] = action

        # check axis2 namespace at schema types attributes
        self.namespace = dict(wsdl.types("schema", ns=xsd_uri)[:]).get(
            'targetNamespace', self.namespace)

        imported_schemas = {}

        # process current wsdl schema:
        for schema in wsdl.types("schema", ns=xsd_uri):
            preprocess_schema(schema, imported_schemas, elements, xsd_uri,
                              self.__soap_server, self.http, cache,
                              force_download, self.wsdl_basedir)

        postprocess_element(elements)

        for message in wsdl.message:
            log.debug("Processing message %s" % message['name'])
            for part in message('part', error=False) or []:
                element = {}
                element_name = part['element']
                if not element_name:
                    # some implementations (axis) use type instead
                    element_name = part['type']
                type_ns = get_namespace_prefix(element_name)
                type_uri = wsdl.get_namespace_uri(type_ns)
                if type_uri == xsd_uri:
                    element_name = get_local_name(element_name)
                    fn = REVERSE_TYPE_MAP.get(unicode(element_name), None)
                    element = {part['name']: fn}
                    # emulate a true Element (complexType)
                    messages.setdefault((message['name'], None), {
                        message['name']: OrderedDict()
                    }).values()[0].update(element)
                else:
                    element_name = get_local_name(element_name)
                    fn = elements.get(make_key(element_name, 'element'))
                    if not fn:
                        # some axis servers use complexType for part messages
                        fn = elements.get(make_key(element_name,
                                                   'complexType'))
                        element = {message['name']: {part['name']: fn}}
                    else:
                        element = {element_name: fn}
                    messages[(message['name'], part['name'])] = element

        for port_type in wsdl.portType:
            port_type_name = port_type['name']
            log.debug("Processing port type %s" % port_type_name)

            for binding in port_type_bindings[port_type_name]:
                for operation in port_type.operation:
                    op_name = operation['name']
                    op = operations[str(binding['name'])][op_name]
                    op['documentation'] = unicode(
                        operation('documentation', error=False) or '')
                    if binding['soap_ver']:
                        # TODO: separate operation_binding from operation (non-SOAP?)
                        if operation("input", error=False):
                            input_msg = get_local_name(
                                operation.input['message'])
                            input_header = op['parts'].get('input_header')
                            if input_header:
                                header_msg = get_local_name(
                                    input_header.get('message'))
                                header_part = get_local_name(
                                    input_header.get('part'))
                                # warning: some implementations use a separate message!
                                header = get_message(messages, header_msg
                                                     or input_msg, header_part)
                            else:
                                header = None  # not enough info to search for the header message
                            op['input'] = get_message(
                                messages, input_msg,
                                op['parts'].get('input_body'))
                            op['header'] = header
                        else:
                            op['input'] = None
                            op['header'] = None
                        if operation("output", error=False):
                            output_msg = get_local_name(
                                operation.output['message'])
                            op['output'] = get_message(
                                messages, output_msg,
                                op['parts'].get('output_body'))
                        else:
                            op['output'] = None

        # dump the full service/port/operation map
        #log.debug(pprint.pformat(services))

        # Save parsed wsdl (cache)
        if cache:
            f = open(filename_pkl, "wb")
            pkl = {
                'version': __version__.split(" ")[0],
                'url': url,
                'namespace': self.namespace,
                'documentation': self.documentation,
                'services': services,
            }
            pickle.dump(pkl, f)
            f.close()

        return services
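
This method matches the WSDL introspection code in pysimplesoap (Python 2 era, hence `unicode` and `basestring`). Assuming that context, a short usage sketch — the URL is a placeholder, and passing an existing directory as `cache` is what exercises the pickle save/load path above:

from pysimplesoap.client import SoapClient

wsdl_url = "http://example.com/service?wsdl"  # placeholder endpoint
client = SoapClient(wsdl=wsdl_url, cache="/tmp/wsdl-cache")
# A second parse hits the pickle written on the first call, provided the
# library version and URL still match the cached values.
services = client.wsdl_parse(wsdl_url, cache="/tmp/wsdl-cache")
print(services.keys())  # service names -> ports -> operations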
Example #10
def lambda_handler(event, context):
    for record in event['Records']:
        callback_url = record['messageAttributes']['callback_url'][
            'stringValue']
        wd_page_id = record['messageAttributes']['page_id']['stringValue']
        wikidot_site = record['messageAttributes']['wikidot_site'][
            'stringValue']
        # logger.info(wikidot_site)
        # logger.info(wd_page_id)

        data = {
            'page_id': wd_page_id,
            'moduleName': 'history/PageRevisionListModule',
            'perpage': 99999
        }
        haystack = helpers.fetch(data, wikidot_site)
        if haystack is None:  # Page was deleted before the task fired.
            return False
        revision_ids = re.findall(r'(?:<tr id="revision-row-)(\d*)', haystack)
        revision_numbers = re.findall(r'(?:<td>)(\d*)(?:.<\/td>)', haystack)
        usernames = re.findall(r'(?:alt=")([^"]*)', haystack)
        user_ids = re.findall(
            '((?:userInfo\()([^\)]*)(?:\); return false;"  )|(?:data-id=")(\d*)|(?:UserInfo\(\\\')([\d\|\.]*)(?:\\\'\); return false;\" ><))',
            haystack)
        timestamps = re.findall(r'(?:<span class="odate time_)([^ ]*)',
                                haystack)
        # The revision type can be empty! Old tag actions didn't have an associated revision type.
        # The unicode points in here, if we need them later, are Thai (0E00-0E7F).
        revision_type = re.findall(
            '((?:<span class="spantip" title="(?:[\D \/])*">)(\w)(?:<\/span>)|(?:<td>)(?:\\n\\t\\t\\t\\t\\t \\t\\t\\t \\t\\t\\t \\t\\t\\t \\t  \\n\\t\\t \\t  \\n\\t\\t \\t \\n\\t\\t<)(\/)(?:td>))',
            haystack)
        comments = re.findall(r'(?:<td style="font-size: 90%">)([^<]*)',
                              haystack)

        # Clean up the findall results: drop the full-match group, then
        # flatten each tuple of alternates to a single string.
        for idx, user in enumerate(user_ids):
            user_ids[idx] = ''.join(user[1:])
        for idx, revision in enumerate(revision_type):
            revision_type[idx] = ''.join(revision[1:])

        innerpayload = {}
        for row in range(len(revision_ids)):
            # We need to handle some edge cases for deleted and anonymous users.
            if not usernames[row]:
                # This can be either a deleted or an anonymous account.
                if "." in user_ids[row]:
                    # Anonymous account (the captured "id" is an IP address).
                    usernames[row] = "Anonymous User (" + str(
                        user_ids[row]) + ")"
                    user_ids[row] = 0
                else:
                    # Deleted account
                    usernames[row] = "Deleted Account (" + str(
                        user_ids[row]) + ")"
            if revision_type[row] == "/":
                revision_type[row] = "A"
            innerpayload[row] = {
                'revision_id': revision_ids[row],
                'username': usernames[row],
                'user_id': user_ids[row],
                'timestamp': timestamps[row],
                'revision_type': revision_type[row],
                'revision_number': revision_numbers[row],
                'comments': comments[row]
            }
        payload = {"wd_page_id": wd_page_id, "revisions": innerpayload}
        output = json.dumps(payload)

        #  Send everything to SCUTTLE
        headers = {
            "Authorization": "Bearer " + config.scuttle_token,
            "Content-Type": "application/json"
        }
        r = requests.put(callback_url + '/2stacks/page/revisions',
                         data=output,
                         headers=headers)

    return {'job': 'complete'}
Example #11
def get_latest_release_tag():
    fetch()
    tags = run("git tag").split('\n')
    release_tags = sorted(ReleaseTag.parse(t) for t in tags if release_tag_pattern.match(t))
    return str(release_tags[-1])
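
`fetch` and `run` are thin git wrappers here, not the HTTP helper from the earlier examples. Hypothetical stand-ins built on subprocess so the snippet runs (`ReleaseTag` and `release_tag_pattern` are still assumed to come from the surrounding release script):

import subprocess

def run(cmd):
    # Stand-in for the snippet's run(): execute a shell command and
    # return its stdout as text.
    return subprocess.check_output(cmd, shell=True).decode().strip()

def fetch():
    # Stand-in for the snippet's fetch(): update remote-tracking refs
    # and tags before listing them with `git tag`.
    subprocess.check_call(['git', 'fetch', '--tags'])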
Example #12
def lambda_handler(event, context):
    for record in event['Records']:
        callback_url = record['messageAttributes']['callback_url'][
            'stringValue']
        forum_id = record['messageAttributes']['forum_id']['stringValue']
        wikidot_site = record['messageAttributes']['wikidot_site'][
            'stringValue']
        # logger.info('Fetching forum ' + forum_id + ' for ' + wikidot_site)

        page_no = 1

        data = {
            'c': forum_id,
            'p': page_no,
            'moduleName': 'forum/ForumViewCategoryModule'
        }
        haystack = helpers.fetch(data, wikidot_site)
        try:
            threads = re.findall(
                r'(?:\n\t\t\t\t\t\t\t\t\t\t\t\t<a href="\/forum\/t-)([^\/]*)',
                haystack)
            # The pager label is localized per wiki. The findall technically
            # returns 2 indistinguishable objects because Wikidot.
            pager_labels = {
                'fondationscp': 'page 1 de ',     # SCP-FR
                'scp-wiki-de': 'Seite 1 von ',    # SCP-DE
                'scp-pl': 'strona 1 z ',          # SCP-PL
                'scp-pt-br': 'página 1 do ',      # SCP-PT
                'fondazionescp': 'pagina 1 di ',  # SCP-IT
                'scpko': '페이지: 1 / ',           # SCP-KO
            }
            # The default covers SCP-EN and English-speaking wikis (some -INT
            # sites didn't have this translated, like -RU, -UA, -CN...).
            label = pager_labels.get(wikidot_site, 'page 1 of ')
            pages = re.findall(
                '(?:<span class="pager-no">' + re.escape(label) + r')(\d*)',
                haystack)
        except TypeError:  # This only really fails on a deleted page.
            # TODO Make scuttle handle this.
            return False
        payload = {"wd_forum_id": forum_id, "threads": threads}
        output = json.dumps(payload)

        #  Send everything to SCUTTLE
        headers = {
            "Authorization": "Bearer " + config.scuttle_token,
            "Content-Type": "application/json"
        }
        r = requests.put(callback_url + '/2stacks/forum/threads',
                         data=output,
                         headers=headers)

        if not pages:  # The Pythonic™ way of checking if a list is empty.
            return {'job': 'complete'}

        else:
            for page_no in range(1, int(pages[0]) + 1):
                data = {
                    'c': forum_id,
                    'p': page_no,
                    'moduleName': 'forum/ForumViewCategoryModule'
                }
                haystack = helpers.fetch(data, wikidot_site)
                try:
                    threads = re.findall(
                        r'(?:\n\t\t\t\t\t\t\t\t\t\t\t\t<a href="\/forum\/t-)([^\/]*)',
                        haystack)
                except TypeError:  # This only really fails on a deleted page.
                    # TODO Make scuttle handle this.
                    return False
                payload = {"wd_forum_id": forum_id, "threads": threads}
                output = json.dumps(payload)
                # logger.info('Sending page ' + str(page_no) + ' to SCUTTLE')
                #  Send everything to SCUTTLE
                headers = {
                    "Authorization": "Bearer " + config.scuttle_token,
                    "Content-Type": "application/json"
                }
                r = requests.put(callback_url + '/2stacks/forum/threads',
                                 data=output,
                                 headers=headers)

    return {'job': 'complete'}
Example #13
    problems = tag_tickets(tickets, new_tag)
    if problems:
        print("Error updating YouTrack:")
        for problem in problems:
            print(problem)
        print("")


if __name__ == "__main__":
    args = docopt(__doc__)
    test_run = args["--test-run"]

    if not (git_is_clean() or test_run):
        print("Git status reports as not clean; aborting making release")
    else:
        fetch()
        latest_tag = get_latest_release_tag()
        print("The latest release was " + latest_tag)

        branches_and_tickets = check_tickets(latest_tag)
        new_tag = get_new_tag(latest_tag)

        print("* Writing release log")
        release_message = make_release_message(new_tag, branches_and_tickets)
        write_release_log(release_message)
        commit_and_tag()
        update_youtrack(branches_and_tickets, test_run)

        print(
            """---------------------------------------------------------------
Completed successfully. No changes have been pushed, so please review and then