Exemplo n.º 1
0
def comment_info(comment):
    """Return the Entity that caches ``comment``, creating it when missing.

    The entity is named ``comment_<id>`` and stores the comment's ``body``.
    A newly created entity is added to ``DBSession`` and committed before the
    final lookup.
    """
    cache_key = u'comment_{0}'.format(comment['id'])
    if Entity.by_name(cache_key):
        # Already cached: answer with a fresh lookup, exactly like the
        # creation path below does.
        return Entity.by_name(cache_key)

    print("Caching new comment {0}".format(cache_key))
    cached = Entity(cache_key)
    cached[u'body'] = comment['body']
    DBSession.add(cached)
    DBSession.commit()
    return Entity.by_name(cache_key)
Exemplo n.º 2
0
def comment_info(comment):
    """Look up (and on first sight store) the Entity for a comment.

    Entities are keyed ``comment_<id>``; a new one records the comment's
    ``body`` and is committed through ``DBSession`` right away.
    """
    name = u'comment_{0}'.format(comment['id'])
    is_new = not Entity.by_name(name)
    if is_new:
        print("Caching new comment {0}".format(name))
        record = Entity(name)
        record[u'body'] = comment['body']
        DBSession.add(record)
        DBSession.commit()
    # Always re-read by name so callers get the stored entity.
    return Entity.by_name(name)
Exemplo n.º 3
0
def issue_info(issue):
    """Return the Entity that caches ``issue``, creating it when missing.

    The entity is named ``issue_<id>`` and stores the issue's ``title`` and
    ``number``. A newly created entity is added to ``DBSession`` and
    committed before the final lookup.
    """
    cache_key = u'issue_{0}'.format(issue['id'])
    if Entity.by_name(cache_key):
        # Already cached: answer with a fresh lookup, exactly like the
        # creation path below does.
        return Entity.by_name(cache_key)

    print("Caching new issue {0}".format(cache_key))
    cached = Entity(cache_key)
    cached[u'title'] = issue['title']
    cached[u'number'] = issue['number']
    DBSession.add(cached)
    DBSession.commit()
    return Entity.by_name(cache_key)
Exemplo n.º 4
0
def issue_info(issue):
    """Look up (and on first sight store) the Entity for an issue.

    Entities are keyed ``issue_<id>``; a new one records the issue's
    ``title`` and ``number`` and is committed through ``DBSession`` right
    away.
    """
    name = u'issue_{0}'.format(issue['id'])
    is_new = not Entity.by_name(name)
    if is_new:
        print("Caching new issue {0}".format(name))
        record = Entity(name)
        record[u'title'] = issue['title']
        record[u'number'] = issue['number']
        DBSession.add(record)
        DBSession.commit()
    # Always re-read by name so callers get the stored entity.
    return Entity.by_name(name)
Exemplo n.º 5
0
    def build_repo(self, repo_id, repo_info):
        """Compose the summary lines for a repository and hand them to finish().

        ``repo_info`` maps contributor entity names to contribution totals;
        its ``'count'`` entry is skipped. The lines are passed to
        ``self.finish`` together with the repository owner's Entity.
        """
        project = Entity.by_name(repo_id)
        project_owner = Entity.by_name(project['owner'])

        lines = ["{0} is a cool project!".format(project['name'])]
        lines.extend(
            "{0} made {1} contributions this week.".format(
                Entity.by_name(contributor)['name'], tally)
            for contributor, tally in repo_info.items()
            if contributor != 'count')

        self.finish(project_owner, lines)
Exemplo n.º 6
0
    def build_repo(self, repo_id, repo_info):
        """Build the text for a repository and pass it to ``self.finish``.

        One headline is followed by one line per contributor found in
        ``repo_info`` (the ``'count'`` key is not a contributor and is left
        out). The owner's Entity is what ``finish`` receives first.
        """
        repo_entity = Entity.by_name(repo_id)
        owner_entity = Entity.by_name(repo_entity['owner'])

        headline = "{0} is a cool project!".format(repo_entity['name'])
        per_user = [
            "{0} made {1} contributions this week."
            .format(Entity.by_name(who)['name'], how_many)
            for who, how_many in repo_info.items()
            if who != 'count'
        ]

        self.finish(owner_entity, [headline] + per_user)
Exemplo n.º 7
0
def inject_knowledge():
    """Store every record of ``all_legs`` that has no Entity yet.

    For each record its ``full_name`` and ``state`` are printed. A record
    whose ``full_name`` already names an Entity is skipped, so existing
    entities are never updated. Otherwise a new Entity is created, every
    key/value pair of the record is copied onto it (values go through
    ``to_unicode``), and it is added to ``DBSession`` and committed on its
    own.
    """
    # SunlightAPI returns nicely Unicoded data, but not all APIs will...
    # Don't forget to use to_unicode()
    knowledge = DBSession
    for leg in all_legs:
        # Function-call form so the line is valid on Python 2 and 3 alike.
        print("%s %s" % (leg['full_name'], leg['state']))
        if Entity.by_name(leg['full_name']):
            # Already stored; the old code had an unreachable re-fetch after
            # this ``continue``, which has been removed.
            continue
        character = Entity(leg['full_name'])
        for key, value in leg.items():
            character[key] = to_unicode(value)
        knowledge.add(character)
        knowledge.commit()
Exemplo n.º 8
0
    def build_user(self, user_id, user_info):
        """Compose the weekly activity lines for a user and call finish().

        ``user_info`` maps event type names to totals; its ``'count'`` entry
        is skipped. Each remaining entry becomes one sentence built from
        ``display_key`` (unknown event types fall back to "made <type>").
        The lines are passed to ``self.finish`` with the user's Entity.
        """
        user = Entity.by_name(user_id)
        display_key = {
            'CommitCommentEvent': ('commented on', 'commits'),
            'CreateEvent': ('created', 'tags, branches, or repositories'),
            'DeleteEvent': ('deleted', 'tags, branches, or repositories'),
            'FollowEvent': ('followed', 'users'),
            'ForkEvent': ('forked', 'repositories'),
            'GistEvent': ('made or modified', 'gists'),
            'GollumEvent': ('made or modified', 'wiki pages'),
            'IssueCommentEvent': ('commented on', 'issues'),
            'IssuesEvent': ('made or modified', 'issues'),
            'PullRequestEvent': ('made or modified', 'pull requests'),
            'PullRequestReviewCommentEvent': ('commented on', 'pull requests'),
            'PushEvent': ('pushed', 'commits'),
            'WatchEvent': ('watched', 'repositories'),
        }
        social_events = (
            'CommitCommentEvent',
            'FollowEvent',
            'IssueCommentEvent',
            'WatchEvent',
            'PullRequestReviewCommentEvent',
        )

        text = ["{0} has been very busy this week!".format(user['name'])]
        for event_type, count in user_info.items():
            if event_type == 'count':
                continue
            if event_type in social_events:
                # top_contributions() in this file weights these types at .1
                # each, so the total is presumably a float sum of .1s. Such
                # sums drift (eight of them give 0.7999999999999999), so
                # round to the nearest integer instead of truncating.
                count = int(round(count * 10))
            display_text = display_key.get(event_type, ('made', event_type))
            text.append("{0} {1[0]} {2} {1[1]} this week."
                        .format(user['name'], display_text, count))

        self.finish(user, text)
Exemplo n.º 9
0
def top_contributions():
    """Tally the events from ``recent_events(7)`` per user and per repository.

    Returns a ``(user_activity, repo_activity)`` pair of nested defaultdicts.
    ``user_activity[actor]`` holds a running ``'count'`` plus one total per
    event type. ``repo_activity[repo]`` holds a running ``'count'`` plus one
    total per actor, and is only filled for repositories that have an Entity.
    """
    social_types = (
        'CommitCommentEvent',
        'FollowEvent',
        'IssueCommentEvent',
        'WatchEvent',
        'PullRequestReviewCommentEvent',
    )
    events = recent_events(7)
    user_activity = defaultdict(lambda: defaultdict(int))
    repo_activity = defaultdict(lambda: defaultdict(int))

    for event in events:
        kind = event['type']
        is_social = kind in social_types

        if kind == 'PushEvent':
            user_weight = event['payload']['size']
        elif is_social:
            # Social (non-coding) events carry less weight
            user_weight = .1
        else:
            user_weight = 1

        per_user = user_activity[event['actor']]
        per_user['count'] += user_weight
        per_user[kind] += user_weight

        if Entity.by_name(event['repo']):
            # On the repository side social events count in full.
            repo_weight = 1 if is_social else user_weight
            per_repo = repo_activity[event['repo']]
            per_repo['count'] += repo_weight
            per_repo[event['actor']] += repo_weight

    return user_activity, repo_activity
Exemplo n.º 10
0
    def build_user(self, user_id, user_info):
        """Compose the weekly activity lines for a user and call finish().

        ``user_info`` maps event type names to totals; its ``'count'`` entry
        is skipped. Each remaining entry becomes one sentence built from
        ``display_key`` (unknown event types fall back to "made <type>").
        The lines are passed to ``self.finish`` with the user's Entity.
        """
        user = Entity.by_name(user_id)
        display_key = {
            'CommitCommentEvent': ('commented on', 'commits'),
            'CreateEvent': ('created', 'tags, branches, or repositories'),
            'DeleteEvent': ('deleted', 'tags, branches, or repositories'),
            'FollowEvent': ('followed', 'users'),
            'ForkEvent': ('forked', 'repositories'),
            'GistEvent': ('made or modified', 'gists'),
            'GollumEvent': ('made or modified', 'wiki pages'),
            'IssueCommentEvent': ('commented on', 'issues'),
            'IssuesEvent': ('made or modified', 'issues'),
            'PullRequestEvent': ('made or modified', 'pull requests'),
            'PullRequestReviewCommentEvent': ('commented on', 'pull requests'),
            'PushEvent': ('pushed', 'commits'),
            'WatchEvent': ('watched', 'repositories'),
        }
        social_events = (
            'CommitCommentEvent',
            'FollowEvent',
            'IssueCommentEvent',
            'WatchEvent',
            'PullRequestReviewCommentEvent',
        )

        text = ["{0} has been very busy this week!".format(user['name'])]
        for event_type, count in user_info.items():
            if event_type == 'count':
                continue
            if event_type in social_events:
                # top_contributions() in this file weights these types at .1
                # each, so the total is presumably a float sum of .1s. Such
                # sums drift (eight of them give 0.7999999999999999), so
                # round to the nearest integer instead of truncating.
                count = int(round(count * 10))
            display_text = display_key.get(event_type, ('made', event_type))
            text.append("{0} {1[0]} {2} {1[1]} this week.".format(
                user['name'], display_text, count))

        self.finish(user, text)
Exemplo n.º 11
0
def top_contributions():
    """Aggregate the events from ``recent_events(7)`` by actor and by repo.

    The result is the tuple ``(user_activity, repo_activity)``. Both are
    defaultdicts of defaultdicts: per actor a ``'count'`` total and a total
    per event type; per repository (only those known as an Entity) a
    ``'count'`` total and a total per actor.
    """
    SOCIAL = ('CommitCommentEvent', 'FollowEvent', 'IssueCommentEvent',
              'WatchEvent', 'PullRequestReviewCommentEvent')

    week_activity = recent_events(7)
    user_activity = defaultdict(lambda: defaultdict(int))
    repo_activity = defaultdict(lambda: defaultdict(int))

    for event in week_activity:
        event_type = event['type']
        social = event_type in SOCIAL

        # Weight of this event in the actor's totals.
        if event_type == 'PushEvent':
            weight = event['payload']['size']
        else:
            # Social (non-coding) events carry less weight
            weight = .1 if social else 1

        actor_totals = user_activity[event['actor']]
        actor_totals['count'] += weight
        actor_totals[event_type] += weight

        if not Entity.by_name(event['repo']):
            continue

        # Repository totals count social events as a full contribution.
        if social:
            weight = 1
        repo_totals = repo_activity[event['repo']]
        repo_totals['count'] += weight
        repo_totals[event['actor']] += weight

    return user_activity, repo_activity
Exemplo n.º 12
0
def repo_info(repo):
    """Return the Entity that caches ``repo``, creating it when missing.

    The entity is named after the repository's ``full_name``; when that key
    is absent the name is rebuilt as ``<owner login>/<name>``. A new entity
    records ``name``, ``description``, ``url`` and ``owner`` (the name of the
    owner's user entity) and is committed through ``DBSession``.
    """
    # Build the fallback only when it is needed, so a payload that carries
    # 'full_name' does not also have to carry 'owner' and 'name' here.
    if 'full_name' in repo:
        repo_name = repo['full_name']
    else:
        repo_name = '{0}/{1}'.format(repo['owner']['login'], repo['name'])

    if not Entity.by_name(repo_name):
        print("Caching new repository {0}".format(repo_name))
        entity = Entity(repo_name)
        # Use the resolved name: indexing repo['full_name'] again would raise
        # KeyError in exactly the case the fallback above exists for.
        entity['name'] = repo_name
        # Evidently you cannot set facts to None. (?)
        entity['description'] = repo['description'] or u''
        entity['url'] = repo['html_url']
        entity['owner'] = user_info(repo['owner']).name
        DBSession.add(entity)
        DBSession.commit()
    return Entity.by_name(repo_name)
Exemplo n.º 13
0
def user_info(user):
    """Return the Entity that caches ``user``, creating it when missing.

    The entity is named ``user_<id>`` and stores ``login``, ``gravatar``, an
    ``avatar`` URL built from the gravatar id, and ``name``. A newly created
    entity is added to ``DBSession`` and committed before the final lookup.
    """
    cache_key = u'user_{0}'.format(user['id'])
    if Entity.by_name(cache_key):
        return Entity.by_name(cache_key)

    print("Caching new user {0}".format(cache_key))
    cached = Entity(cache_key)
    cached['login'] = user['login']
    cached['gravatar'] = user['gravatar_id']
    avatar_template = u'http://www.gravatar.com/avatar/{0}?s=200'
    cached['avatar'] = avatar_template.format(user['gravatar_id'])
    # Not everyone has set a name for their account.
    cached[u'name'] = user.get('name') or user['login']
    DBSession.add(cached)
    DBSession.commit()
    return Entity.by_name(cache_key)
Exemplo n.º 14
0
def repo_info(repo):
    """Return the Entity that caches ``repo``, creating it when missing.

    The entity is named after the repository's ``full_name``; when that key
    is absent the name is rebuilt as ``<owner login>/<name>``. A new entity
    records ``name``, ``description``, ``url`` and ``owner`` (the name of the
    owner's user entity) and is committed through ``DBSession``.
    """
    # Build the fallback only when it is needed, so a payload that carries
    # 'full_name' does not also have to carry 'owner' and 'name' here.
    if 'full_name' in repo:
        repo_name = repo['full_name']
    else:
        repo_name = '{0}/{1}'.format(repo['owner']['login'], repo['name'])

    if not Entity.by_name(repo_name):
        print("Caching new repository {0}".format(repo_name))
        entity = Entity(repo_name)
        # Use the resolved name: indexing repo['full_name'] again would raise
        # KeyError in exactly the case the fallback above exists for.
        entity['name'] = repo_name
        # Evidently you cannot set facts to None. (?)
        entity['description'] = repo['description'] or u''
        entity['url'] = repo['html_url']
        entity['owner'] = user_info(repo['owner']).name
        DBSession.add(entity)
        DBSession.commit()
    return Entity.by_name(repo_name)
Exemplo n.º 15
0
def user_info(user):
    """Look up (and on first sight store) the Entity for a user.

    Entities are keyed ``user_<id>``. A new one records the user's
    ``login``, ``gravatar`` id, an ``avatar`` URL derived from that id, and a
    ``name``; it is committed through ``DBSession`` right away.
    """
    name = u'user_{0}'.format(user['id'])
    is_new = not Entity.by_name(name)
    if is_new:
        print("Caching new user {0}".format(name))
        record = Entity(name)
        record['login'] = user['login']
        record['gravatar'] = user['gravatar_id']
        record['avatar'] = (
            u'http://www.gravatar.com/avatar/{0}?s=200'
            .format(user['gravatar_id']))
        # Not everyone has set a name for their account.
        display_name = user['name'] if user.get('name') else user['login']
        record[u'name'] = display_name
        DBSession.add(record)
        DBSession.commit()
    # Always re-read by name so callers get the stored entity.
    return Entity.by_name(name)
Exemplo n.º 16
0
def inject_knowledge():
    """Store every record of ``ny_legs`` that has no Entity yet.

    For each record its ``full_name`` is printed. A record whose
    ``full_name`` already names an Entity is skipped, so existing entities
    are never updated. Otherwise a new Entity is created, every key/value
    pair of the record is copied onto it (values go through ``to_unicode``),
    and it is added to ``DBSession`` and committed on its own.
    """
    # SunlightAPI returns nicely Unicoded data, but not all APIs will...
    # Don't forget to use to_unicode()
    knowledge = DBSession
    for leg in ny_legs:
        # Function-call form so the line is valid on Python 2 and 3 alike.
        print(leg['full_name'])
        # Check if entity is already in DB, and if so continue.
        # This is good for not duplicating entries, however
        # this will skip updates to the entity :(
        # This is why Sunlight included an "updated_at" field :)
        # TODO: Change check to "updated_at" instead of full_name
        if Entity.by_name(leg['full_name']):
            # The old code had an unreachable re-fetch after this
            # ``continue``; it has been removed.
            continue
        character = Entity(leg['full_name'])
        for key, value in leg.items():
            character[key] = to_unicode(value)
        knowledge.add(character)
        knowledge.commit()
Exemplo n.º 17
0
def event_info(event):
    """Return the Entity that caches ``event``, creating it when missing.

    The entity is named ``event_<id>``. A new one records ``name``,
    ``actor`` (name of the actor's user entity), ``repo``, ``type``,
    ``payload`` and ``created_at`` (parsed from ``%Y-%m-%dT%H:%M:%SZ``).
    Event types containing "Comment" also get a ``comment`` fact and types
    containing "Issue" an ``issue`` fact, each holding the name of the
    cached entity. The new entity is committed through ``DBSession``.
    """
    event_name = u'event_{0}'.format(event['id'])
    if not Entity.by_name(event_name):
        print("Caching new event {0}".format(event_name))
        entity = Entity(event_name)
        entity['name'] = event_name
        entity[u'actor'] = user_info(event['actor']).name
        # NOTE(review): this passes the repository *name*. repo_info() in
        # this file calls repo.get(...) on its argument, so if the name is a
        # plain string this call always raises and the fallback below is
        # what actually runs -- confirm the intent.
        try:
            entity[u'repo'] = repo_info(event['repo']['name']).name
        except Exception:
            # Best effort: keep the raw repository name. Exception (not a
            # bare except) lets KeyboardInterrupt/SystemExit propagate.
            entity['repo'] = event['repo']['name']
        entity[u'type'] = event['type']
        entity[u'payload'] = event['payload']
        entity[u'created_at'] = datetime.strptime(event['created_at'],
                                                  '%Y-%m-%dT%H:%M:%SZ')
        if 'Comment' in event['type']:
            entity[u'comment'] = comment_info(event['payload']['comment']).name
        if 'Issue' in event['type']:
            entity['issue'] = issue_info(event['payload']['issue']).name
        DBSession.add(entity)
        DBSession.commit()
    return Entity.by_name(event_name)
Exemplo n.º 18
0
def event_info(event):
    """Return the Entity that caches ``event``, creating it when missing.

    The entity is named ``event_<id>``. A new one records ``name``,
    ``actor`` (name of the actor's user entity), ``repo``, ``type``,
    ``payload`` and ``created_at`` (parsed from ``%Y-%m-%dT%H:%M:%SZ``).
    Event types containing "Comment" also get a ``comment`` fact and types
    containing "Issue" an ``issue`` fact, each holding the name of the
    cached entity. The new entity is committed through ``DBSession``.
    """
    event_name = u'event_{0}'.format(event['id'])
    if not Entity.by_name(event_name):
        print("Caching new event {0}".format(event_name))
        entity = Entity(event_name)
        entity['name'] = event_name
        entity[u'actor'] = user_info(event['actor']).name
        # NOTE(review): this passes the repository *name*. repo_info() in
        # this file calls repo.get(...) on its argument, so if the name is a
        # plain string this call always raises and the fallback below is
        # what actually runs -- confirm the intent.
        try:
            entity[u'repo'] = repo_info(event['repo']['name']).name
        except Exception:
            # Best effort: keep the raw repository name. Exception (not a
            # bare except) lets KeyboardInterrupt/SystemExit propagate.
            entity['repo'] = event['repo']['name']
        entity[u'type'] = event['type']
        entity[u'payload'] = event['payload']
        entity[u'created_at'] = datetime.strptime(event['created_at'],
                                                  '%Y-%m-%dT%H:%M:%SZ')
        if 'Comment' in event['type']:
            entity[u'comment'] = comment_info(event['payload']['comment']).name
        if 'Issue' in event['type']:
            entity['issue'] = issue_info(event['payload']['issue']).name
        DBSession.add(entity)
        DBSession.commit()
    return Entity.by_name(event_name)
Exemplo n.º 19
0
    def zip_exe_handler(self, entity):
        """ Handles self-extracting zip files

        Runs ``unzip -o`` on ``entity[u'filename']`` from within the file's
        own directory and deletes the archive when unzip wrote nothing to
        stderr. Then, for every "inflating" line in unzip's output, a child
        Entity for the extracted file is found or created, attached to
        ``entity`` and passed to ``call_magic_handler``.
        """
        self.log.debug("zip_exe_handler(%s)" % entity)
        entity[u'format'] = u'zip'
        dirname = os.path.dirname(entity[u'filename'])
        # Pass an argument list and no shell: the filename is then a single
        # argv entry, so quotes, `$` or backticks in it cannot be interpreted
        # as shell syntax. An empty dirname means "current directory", which
        # Popen expresses as cwd=None.
        p = subprocess.Popen(['unzip', '-o', entity[u'filename']],
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             cwd=dirname or None)
        out, err = p.communicate()
        if err:
            self.log.error("Error unzipping: " + err)
        else:
            # Delete compressed data after extracting
            os.unlink(entity[u'filename'])

        for line in out.split('\n'):
            if line.strip().startswith('inflating'):
                # The last whitespace-separated token of the line is taken
                # as the extracted path.
                extracted = os.path.join(dirname, line.strip().split()[-1])
                self.log.debug("extracted " + extracted)
                magic = utils.get_magic(extracted)

                # Create a new child Entity for each extracted file
                # NOTE(review): the lookup uses the full path but a new
                # child is named with the basename only -- confirm that a
                # re-run is meant to find previously created children.
                extracted = to_unicode(extracted)
                child = Entity.by_name(extracted)
                if not child:
                    child = Entity(name=os.path.basename(extracted))
                    child[u'filename'] = extracted
                    DBSession.add(child)
                    child.parent = entity
                    child[u'magic'] = to_unicode(magic)
                    self.log.debug("Created %s" % child)
                else:
                    child.parent = entity

                DBSession.flush()

                self.call_magic_handler(extracted, child)
Exemplo n.º 20
0
def cache_events(client, org):
    """Pull new events from Github.

    Fetches the members of ``org`` through ``client``, then each member's
    activity; for every event whose ``repo`` has no Entity yet,
    ``client.repo_information`` is called for that repo. Failures are
    reported with ``print`` and do not stop the remaining members. Finally
    the two values of ``client.rate_limiting`` are printed.

    Only ``Exception`` subclasses are treated as failures, so
    ``KeyboardInterrupt`` and ``SystemExit`` propagate to the caller.
    """

    try:
        members = client.organization_members(org)
    except Exception:
        print('Error getting members')
        return

    for user in members:
        try:
            events = client.user_activity(user['login'])
            for event in events:
                if not Entity.by_name(event['repo']):
                    client.repo_information(event['repo'])
        except Exception:
            # Best effort per member: report and move on to the next one.
            print("Something went wrong updating the events for {0}."
                  .format(user['login']))

    print("You have {0} of {1} calls left this hour."
          .format(*client.rate_limiting))
Exemplo n.º 21
0
    def populate(self, event):
        """Render ``event`` into this widget.

        Looks up the acting user (and, when it is cached, the repository)
        with ``Entity.by_name``, packs the result of ``url_to_image`` for the
        user's avatar and a label describing the event into ``self.box``, and
        sets the widget background to a colour chosen by event type. Event
        types without a dedicated branch are shown as their bare type name.
        """
        self.event = event
        user = Entity.by_name(event[u'actor'])
        user_name = user[u'name']
        repo = event[u'repo']

        # Repositories that were never cached have no Entity: show the bare
        # name and no description. Otherwise link to the repository URL.
        if not Entity.by_name(repo):
            repo_link = event[u'repo']
            repo_desc = ''
        else:
            repo = Entity.by_name(repo)
            repo_link = '<a href="{0}">{1}</a>'.format(repo['url'], repo['name'])
            repo_desc = repo['description']

        self.box.pack_start(url_to_image(user[u'avatar'], user[u'gravatar'], self.scale),
                            False, False, 10)

        # Background colours, keyed by broad event category.
        event_colors = {
            'commit': "#C9FFC1",
            'branch': "#C2C9FF",
            'issue': "#FFBAF9",
            'comment': "#FFDDBD",
            'social': "#FFFF80",
        }
        # Lines of the label; joined with newlines at the end.
        event_text = []
        # White unless a branch below picks a category colour.
        color = "#FFFFFF"
        if event[u'type'] == "CommitCommentEvent":
            color = event_colors['comment']
            event_text.append("{0} commented on a commit in {1}."
                .format(user_name, repo_link))
            comment = Entity.by_name(event[u'comment'])
            event_text.append(comment[u'body'])
        elif event[u'type'] == "CreateEvent":
            color = event_colors['branch']
            new_type = event[u'payload']['ref_type']
            # A new repository has no ref to name; other ref types do.
            if new_type == 'repository':
                event_text.append("{0} created a new {1}, {2}."
                    .format(user_name, new_type, repo_link))
            else:
                event_text.append("{0} created {1} <tt>{3}</tt> in {2}."
                    .format(user_name, new_type, repo_link,
                            event[u'payload']['ref']))
            event_text.append(repo_desc)
        elif event[u'type'] == "DeleteEvent":
            color = event_colors['branch']
            new_type = event[u'payload']['ref_type']
            event_text.append("{0} deleted {1} <tt>{3}</tt> in {2}."
                .format(user_name, new_type, repo_link,
                        event[u'payload']['ref']))
        #DownloadEvent
        elif event[u'type'] == "FollowEvent":
            color = event_colors['social']
            target = event['payload']['target']
            # Prefer the target's name; use the login when 'name' is absent.
            try:
                event_text.append("{0} is now following {1}."
                    .format(user_name, target['name']))
            except KeyError:
                event_text.append("{0} is now following {1}."
                    .format(user_name, target['login']))
        elif event[u'type'] == "ForkEvent":
            color = event_colors['branch']
            # When the forkee lacks 'full_name', rebuild it from the owner
            # login and the repository name.
            try:
                event_text.append("{0} forked {1} to {2}."
                    .format(user_name, repo_link,
                            event[u'payload']['forkee']['full_name']))
            except KeyError:
                event_text.append("{0} forked {1} to {2}/{3}."
                    .format(user_name, repo_link,
                            event[u'payload']['forkee']['owner']['login'],
                            event[u'payload']['forkee']['name']))
            event_text.append(repo_desc)
        #ForkApplyEvent
        elif event[u'type'] == "GistEvent":
            # The payload action gets a "d" appended to form the verb.
            event_text.append("{0} {1}d a gist"
                .format(user_name, event['payload']['action']))
        elif event[u'type'] == "GollumEvent":
            event_text.append("{0} updated {2} wiki pages in {1}."
                              .format(user_name, repo_link,
                                      len(event['payload']['pages'])))
            # One extra line per page title.
            for page in event['payload']['pages']:
                event_text.append(page['title'])
        elif event[u'type'] == "IssueCommentEvent":
            color = event_colors['comment']
            issue = Entity.by_name(event['issue'])
            event_text.append("{0} commented on issue #{1} in {2}."
                .format(user_name, issue['number'], repo_link))
            comment = Entity.by_name(event[u'comment'])
            event_text.append(issue[u'title'])
            event_text.append(comment[u'body'])
        elif event[u'type'] == "IssuesEvent":
            color = event_colors['issue']
            issue = Entity.by_name(event['issue'])
            event_text.append("{0} {1} issue #{2} in {3}."
                .format(user_name, event['payload']['action'],
                        issue['number'], repo_link))
            event_text.append(issue[u'title'])
        elif event[u'type'] == "MemberEvent":
            # Prefer the member's name; use the login when 'name' is absent.
            try:
                event_text.append("{0} added {1} as a collaborator to {2}."
                    .format(user_name, event['payload']['member']['name'],
                            repo_link))
            except KeyError:
                event_text.append("{0} added {1} as a collaborator to {2}."
                    .format(user_name, event['payload']['member']['login'],
                            repo_link))
            event_text.append(repo_desc)
        elif event[u'type'] == "PublicEvent":
            event_text.append("{0} made {1} public."
                              .format(user_name, repo_link))
            event_text.append(repo_desc)
        elif event[u'type'] == "PullRequestEvent":
            color = event_colors['issue']
            # request = Entity.by_name(event['request'])
            event_text.append("{0} {1} pull request #{2} in {3}."
                .format(user_name, event['payload']['action'],
                        event['payload']['number'], repo_link))
        elif event['type'] == 'PullRequestReviewCommentEvent':
            # NOTE(review): the text says "issue" and the colour is 'social'
            # although this is a pull-request review comment -- confirm.
            color = event_colors['social']
            event_text.append("{0} commented on an issue in {1}."
                              .format(user_name, repo_link))
            comment = Entity.by_name(event['comment'])
            event_text.append(comment['body'])
        elif event[u'type'] == "PushEvent":
            color = event_colors['commit']
            event_text.append("{0} pushed {1} commit(s) to {2}."
                .format(user_name, event[u'payload']['size'],
                        repo_link))
            # One bulleted line per commit message.
            for commit in event[u'payload']['commits']:
                event_text.append(u'• ' + commit['message'])
        #TeamAddEvent
        elif event[u'type'] == "WatchEvent":
            color = event_colors['social']
            event_text.append("{0} is now watching {1}"
                .format(user_name, repo_link))
            event_text.append(repo_desc)
        else:
            # Unhandled event types are shown as their bare type name.
            event_text.append(event['type'])

        self.modify_bg(Gtk.StateType.NORMAL, Gdk.color_parse(color))
        event_label = mk_label('\n'.join(event_text))
        self.box.pack_start(event_label, False, False, 0)
Exemplo n.º 22
0
    def twill_handler(self, url, links, login_func):
        """
        This function uses twill to download the files defined in links
        and passing them to the magic handler to be processed.

        ``links`` maps a category name to a list of link objects; each link
        is indexed with ``['href']`` and read through ``.contents[0]``.
        ``login_func`` is called with no arguments before every download.
        Nothing is downloaded when an Entity named ``url`` already exists.
        """

        self.log.debug('twill_handler(%s)' % locals())

        parsed_url = urlparse(url)
        # Network location of the URL with every 'www.' occurrence removed.
        hostname = parsed_url[1].replace('www.', '')

        # Find or create the Entity for this host; a newly created one is
        # attached under the u'CIVX' root Entity (created on demand too).
        parent = Entity.by_name(hostname)
        if not parent:
            parent = Entity(name=hostname)
            DBSession.add(parent)
            root = Entity.by_name(u'CIVX')
            if not root:
                root = Entity(name=u'CIVX')
                DBSession.add(root)
            parent.parent = root

        # See if this entity already exists
        entity = Entity.by_name(url)
        if entity:
            self.log.info('Entity(%r) already exists; skipping.' % url)
            return

        #DBSession.flush()
        for category, link_list in links.items():

            # Download directory: <git_dir>/<hostname>[/<category>]
            dest = [self.config['git_dir'], hostname]

            # With a single category the files hang directly off the host
            # Entity; otherwise each category gets its own Entity and its
            # own sub-directory.
            if len(links) == 1:
                entity = parent
            else:
                entity = Entity(name=category)
                entity[u'url'] = url
                entity[u'repo'] = hostname
                entity.parent = parent
                dest.append(entity.name)

                DBSession.add(entity)
                DBSession.flush()

            dest = os.path.join(*dest)
            if not os.path.isdir(dest):
                os.makedirs(dest)

            b = self.get_browser()
            for link in link_list:
                # We might have timed out, try to log in again.
                login_func()
                b.go(link['href'])
                # Try to pick out the filename if there is a query
                # ([-2] of the parse result is the query string; the text
                # after its first '=' becomes the filename).
                if link['href'].find('=') >= 0:
                    filename = urlparse(link['href'])[-2].split('=')[1]
                else:
                    # No '=' in the href: use its last path segment.
                    filename = link['href'].split('/')[-1]
                filename = os.path.join(dest, filename)
                # presumably writes the page the browser just loaded to
                # `filename` -- save_html is defined outside this block.
                save_html(filename)

                # One Entity per downloaded file, named after the first
                # content node of the link.
                file_entity = Entity(name=link.contents[0])
                file_entity[u'filename'] = filename
                file_entity[u'repo'] = hostname
                DBSession.add(file_entity)
                file_entity.parent = entity
                self.log.debug("Created entity %r (parent %r)" % (
                    file_entity.name, file_entity.parent.name))

                # The handler's return value is stored as the file's magic.
                magic = self.call_magic_handler(filename, file_entity)
                file_entity[u'magic'] = magic

                DBSession.flush()
Exemplo n.º 23
0
    def populate(self, event):
        """Render ``event`` into this widget.

        Looks up the acting user (and, when it is cached, the repository)
        with ``Entity.by_name``, packs the result of ``url_to_image`` for the
        user's avatar and a label describing the event into ``self.box``, and
        sets the widget background to a colour chosen by event type. Event
        types without a dedicated branch are shown as their bare type name.
        """
        self.event = event
        user = Entity.by_name(event[u'actor'])
        user_name = user[u'name']
        repo = event[u'repo']

        # Repositories that were never cached have no Entity: show the bare
        # name and no description. Otherwise link to the repository URL.
        if not Entity.by_name(repo):
            repo_link = event[u'repo']
            repo_desc = ''
        else:
            repo = Entity.by_name(repo)
            repo_link = '<a href="{0}">{1}</a>'.format(repo['url'],
                                                       repo['name'])
            repo_desc = repo['description']

        self.box.pack_start(
            url_to_image(user[u'avatar'], user[u'gravatar'], self.scale),
            False, False, 10)

        # Background colours, keyed by broad event category.
        event_colors = {
            'commit': "#C9FFC1",
            'branch': "#C2C9FF",
            'issue': "#FFBAF9",
            'comment': "#FFDDBD",
            'social': "#FFFF80",
        }
        # Lines of the label; joined with newlines at the end.
        event_text = []
        # White unless a branch below picks a category colour.
        color = "#FFFFFF"
        if event[u'type'] == "CommitCommentEvent":
            color = event_colors['comment']
            event_text.append("{0} commented on a commit in {1}.".format(
                user_name, repo_link))
            comment = Entity.by_name(event[u'comment'])
            event_text.append(comment[u'body'])
        elif event[u'type'] == "CreateEvent":
            color = event_colors['branch']
            new_type = event[u'payload']['ref_type']
            # A new repository has no ref to name; other ref types do.
            if new_type == 'repository':
                event_text.append("{0} created a new {1}, {2}.".format(
                    user_name, new_type, repo_link))
            else:
                event_text.append(
                    "{0} created {1} <tt>{3}</tt> in {2}.".format(
                        user_name, new_type, repo_link,
                        event[u'payload']['ref']))
            event_text.append(repo_desc)
        elif event[u'type'] == "DeleteEvent":
            color = event_colors['branch']
            new_type = event[u'payload']['ref_type']
            event_text.append("{0} deleted {1} <tt>{3}</tt> in {2}.".format(
                user_name, new_type, repo_link, event[u'payload']['ref']))
        #DownloadEvent
        elif event[u'type'] == "FollowEvent":
            color = event_colors['social']
            target = event['payload']['target']
            # Prefer the target's name; use the login when 'name' is absent.
            try:
                event_text.append("{0} is now following {1}.".format(
                    user_name, target['name']))
            except KeyError:
                event_text.append("{0} is now following {1}.".format(
                    user_name, target['login']))
        elif event[u'type'] == "ForkEvent":
            color = event_colors['branch']
            # When the forkee lacks 'full_name', rebuild it from the owner
            # login and the repository name.
            try:
                event_text.append("{0} forked {1} to {2}.".format(
                    user_name, repo_link,
                    event[u'payload']['forkee']['full_name']))
            except KeyError:
                event_text.append("{0} forked {1} to {2}/{3}.".format(
                    user_name, repo_link,
                    event[u'payload']['forkee']['owner']['login'],
                    event[u'payload']['forkee']['name']))
            event_text.append(repo_desc)
        #ForkApplyEvent
        elif event[u'type'] == "GistEvent":
            # The payload action gets a "d" appended to form the verb.
            event_text.append("{0} {1}d a gist".format(
                user_name, event['payload']['action']))
        elif event[u'type'] == "GollumEvent":
            event_text.append("{0} updated {2} wiki pages in {1}.".format(
                user_name, repo_link, len(event['payload']['pages'])))
            # One extra line per page title.
            for page in event['payload']['pages']:
                event_text.append(page['title'])
        elif event[u'type'] == "IssueCommentEvent":
            color = event_colors['comment']
            issue = Entity.by_name(event['issue'])
            event_text.append("{0} commented on issue #{1} in {2}.".format(
                user_name, issue['number'], repo_link))
            comment = Entity.by_name(event[u'comment'])
            event_text.append(issue[u'title'])
            event_text.append(comment[u'body'])
        elif event[u'type'] == "IssuesEvent":
            color = event_colors['issue']
            issue = Entity.by_name(event['issue'])
            event_text.append("{0} {1} issue #{2} in {3}.".format(
                user_name, event['payload']['action'], issue['number'],
                repo_link))
            event_text.append(issue[u'title'])
        elif event[u'type'] == "MemberEvent":
            # Prefer the member's name; use the login when 'name' is absent.
            try:
                event_text.append(
                    "{0} added {1} as a collaborator to {2}.".format(
                        user_name, event['payload']['member']['name'],
                        repo_link))
            except KeyError:
                event_text.append(
                    "{0} added {1} as a collaborator to {2}.".format(
                        user_name, event['payload']['member']['login'],
                        repo_link))
            event_text.append(repo_desc)
        elif event[u'type'] == "PublicEvent":
            event_text.append("{0} made {1} public.".format(
                user_name, repo_link))
            event_text.append(repo_desc)
        elif event[u'type'] == "PullRequestEvent":
            color = event_colors['issue']
            # request = Entity.by_name(event['request'])
            event_text.append("{0} {1} pull request #{2} in {3}.".format(
                user_name, event['payload']['action'],
                event['payload']['number'], repo_link))
        elif event['type'] == 'PullRequestReviewCommentEvent':
            # NOTE(review): the text says "issue" and the colour is 'social'
            # although this is a pull-request review comment -- confirm.
            color = event_colors['social']
            event_text.append("{0} commented on an issue in {1}.".format(
                user_name, repo_link))
            comment = Entity.by_name(event['comment'])
            event_text.append(comment['body'])
        elif event[u'type'] == "PushEvent":
            color = event_colors['commit']
            event_text.append("{0} pushed {1} commit(s) to {2}.".format(
                user_name, event[u'payload']['size'], repo_link))
            # One bulleted line per commit message.
            for commit in event[u'payload']['commits']:
                event_text.append(u'• ' + commit['message'])
        #TeamAddEvent
        elif event[u'type'] == "WatchEvent":
            color = event_colors['social']
            event_text.append("{0} is now watching {1}".format(
                user_name, repo_link))
            event_text.append(repo_desc)
        else:
            # Unhandled event types are shown as their bare type name.
            event_text.append(event['type'])

        self.modify_bg(Gtk.StateType.NORMAL, Gdk.color_parse(color))
        event_label = mk_label('\n'.join(event_text))
        self.box.pack_start(event_label, False, False, 0)
Exemplo n.º 24
0
 def setUp(self):
     """Prepare an in-memory SQLite database and scrape one dataset into it.

     A fresh engine is handed to ``init_model`` and ``metadata.create_all``,
     a ``PolyScraper`` consumes a data.gov raw-dataset URL, and the entity
     named ``data.gov`` is kept on ``self.entity`` for the tests to inspect.
     """
     memory_db = create_engine('sqlite:///:memory:')
     init_model(memory_db)
     metadata.create_all(memory_db)

     scraper = PolyScraper()
     scraper.consume('http://www.data.gov/raw/994')

     self.entity = Entity.by_name('data.gov')
Exemplo n.º 25
0
    def consume(self, url):
        """
        Scrape a URI and record what was found as entities.

        First the protocol and the hostname are pulled out of the url
        (falling back to "http" and "localhost").  An Entity named after the
        url is looked up or created underneath an Entity named after the
        hostname, which in turn sits underneath the root ``CIVX`` Entity.

        If the hostname is known to be tricky, it has a special handler in
        ``self.url_handlers`` which is called with the url and does the rest.
        Otherwise the general path for the protocol is taken: "ftp" is not
        implemented, "file" collects local files, and anything else is
        treated as http and scraped for file links.  Every collected file
        that is not already known is fetched with ``self.download_file``,
        placed below the configured ``git_dir``, recorded as a child Entity
        and passed to ``self.call_magic_handler``.

        When everything is done, a changelog entry (start, finish and elapsed
        time plus the number of downloads) is appended to the entity and the
        session is committed.

        NOTE: initialising a git repository and sending a "new entity"
        message are both commented out in the body below.

        ``url`` is the URI to scrape.
        """
        self.log.debug("PolyScraper(%s)" % url)
        # Remembered so the changelog can report how long the scrape took.
        start = datetime.utcnow()

        # Try to pull a protocol off the URI
        protocol_end = url.find("://")
        protocol = "http"
        if not protocol_end == -1:
            protocol = url[:protocol_end]

        parsed_url = urlparse(url)
        # Index 1 of the parse result is the network location; every
        # occurrence of 'www.' is removed from it, not only a leading one.
        hostname = parsed_url[1].replace('www.', '')
        # Set a hostname if none is set.
        if not hostname:
            hostname = u"localhost"

        # See if we already know about this URL
        entity = Entity.by_name(url)
        if entity:
            self.log.info('Entity(%r) already exists' % url)
        else:
            # Build the chain CIVX -> hostname -> url, creating whichever
            # links are missing.
            root = Entity.by_name(u'CIVX')
            if not root:
                root = Entity(name=u'CIVX')
                DBSession.add(root)
                DBSession.flush()

            parent = Entity.by_name(hostname)
            if not parent:
                parent = Entity(name=hostname)
                DBSession.add(parent)
                parent.parent = root
                self.log.debug("Created entity %r" % parent.name)

            entity = Entity(name=url)
            DBSession.add(entity)
            # hide the exact url entity from our tree
            entity.parent = parent
            self.log.debug("Created entity %r" % entity.name)

            #self.send_message('civx.knowledge.entities.new', {
            #    'msg': 'New entity created: %s' % url
            #    })

        DBSession.flush()

        # Record which repo (named after the hostname) this data source
        # belongs to.
        entity[u'repo'] = hostname
        #entity[u'url'] = url

        # Initialize the git repository for this domain (currently disabled)
        #~ self.init_git_repo(repo=hostname)
        DBSession.flush()

        # Scrape the url (to a certain depth) for data
        # Counts the files handled by the generic loop below; a dedicated
        # url handler does not touch it, so it stays 0 on that path.
        num_downloads = 0

        # Provide a URL handler method that is called with each file pass
        # in the soup entity for the link instead, so we can easily look
        # around the DOM and pull out titles, etc.
        if hostname in self.url_handlers:
            #self.url_handlers[hostname](self, soup_link, file_entity)
            # The handler is looked up as a plain callable, hence the
            # explicit ``self`` argument.
            self.url_handlers[hostname](self, url)
        else:
            # If we do not specifically handle this file, take a basic approach
            # based on the protocol.  These could probably also be split off
            # into $protocol_handler methods.
            self.log.warning('Cannot find %s URL handler' % hostname)
            # Collected as (directory part, file name, full link or path).
            files = []
            if protocol == "ftp":
                # NOTE: FTP is imported but not used; this branch only logs.
                from ftplib import FTP
                self.log.debug("FTP support is not implemented yet.")
            elif protocol == "file":
                # Everything after "://" is taken as the local path.
                search_path = url[protocol_end+3:]
                local_files = []
                if os.path.isdir(search_path):
                    # Find all files in directory
                    for directory in os.walk(search_path):
                        # os.walk yields (dirpath, dirnames, filenames).
                        dirpath = directory[0]
                        for filename in directory[2]:
                            local_files.append(os.path.join(dirpath, filename))
                else:
                    local_files.append(search_path)

                # NOTE: this value is never read; ``dest`` is recomputed for
                # each file in the loop further down.
                dest = os.path.join(self.config['git_dir'], hostname)

                # FIXME: what about for links to epa.gov from data.gov?
                # we probably want our own epa.gov repo namespace to download
                # and extract this to
                #if not os.path.isdir(dest):
                #    self.log.debug("mkdir %s" % dest)
                #    os.makedirs(dest)

                # I think this section is deprecated and unnecessary...
                #for ext in extensions.split(','):
                ##    if link.endswith('%s' % ext) or '/%s/' % ext in link:
                ##        entity[u'format'] = ext
                #    if ext not in civx.model.models[Entity]:
                #        civx.model.models[Entity][ext] = []

                for path in local_files:
                    #raw = self.download_file(link)
                    #file_name = os.path.basename(link)
                    #filename = to_unicode(os.path.join(dest, file_name))
                    #num_downloads += 1

                    #shutil.copy(raw, filename)
                    #self.log.debug("Copied %s to %s" % (raw,
                    #    os.path.join(dest, file_name)))

                    ##file_entity = Entity(name=os.path.basename(file_name))
                    #file_entity = Entity(name=link)
                    ##file_entity[u'url'] = link
                    #file_entity[u'filename'] = filename
                    #file_entity[u'repo'] = hostname
                    #DBSession.add(file_entity)
                    #file_entity.parent = entity
                    ##file_entity.parent = parent
                    #self.log.debug("Created entity %r (parent %r)" % (
                    #    file_entity.name, file_entity.parent.name))

                    # Only queue the file here; fetching and entity creation
                    # happen in the shared loop below.
                    file_path = os.path.split(path)[0]
                    file_name = os.path.split(path)[1]
                    self.log.info("%s is a local file" % file_name)
                    files.append((file_path, file_name, path))
                    #files.append((os.path.dirname(filename), file_name, filename))

            else:
                # Assume protocol is http
                # The bare string below is disabled code kept as a literal.
                """
####
            f = urllib2.urlopen(url) # XXX: does this load everything into mem?
            if f.info().type == 'text/html':
                soup = self.get_soup(f.read())
            else: # Assume the url is a link to a direct file
                # Save the file to disk.
                # throw file at magic handlers
####
"""
                # NOTE: the returned soup is not referenced again in this
                # method.
                soup = self.get_soup(url)

                for link, soup_link in self.scrape_files_from_url(url, soup_links=True):
                    parsed_link = urlparse(link)
                    # Index 2 is the path component: everything before the
                    # last '/' is the directory, the remainder the file name.
                    file_path = '/'.join(parsed_link[2].split('/')[:-1])
                    file_name = parsed_link[2].split('/')[-1]
                    files.append((file_path, file_name, link))

            # Shared handling for whatever the protocol branch collected.
            for (file_path, file_name, link) in files:
                # Plain string concatenation, not os.path.join.
                dest = self.config['git_dir'] + hostname + file_path
                # True when ``link`` is a path that exists on this machine.
                local = os.path.exists(link)

                # See if this file already exists
                file_entity = Entity.by_name(link)
                #file_entity = Entity.by_name(os.path.basename(file_name))
                if file_entity:
                    self.log.info('Entity(%r) already exists; skipping.' % link)
                    continue

                # FIXME: what about for links to epa.gov from data.gov?
                # we probably want our own epa.gov repo namespace to download
                # and extract this to
                if not os.path.isdir(dest):
                    os.makedirs(dest)

                # I think this section is deprecated and unnecessary...
                # Makes sure every configured extension has a list registered
                # under civx.model.models[Entity].
                for ext in extensions.split(','):
                #    if link.endswith('%s' % ext) or '/%s/' % ext in link:
                #        entity[u'format'] = ext
                    if ext not in civx.model.models[Entity]:
                        civx.model.models[Entity][ext] = []

                raw = self.download_file(link)
                filename = os.path.join(dest, file_name)
                num_downloads += 1

                # Local sources are copied so the original stays in place;
                # anything else is moved into the destination.
                # NOTE: both messages are logged before the operation runs.
                if local:
                    self.log.debug("Copied %s to %s" % (raw, filename))
                    shutil.copy(raw, filename)
                else:
                    self.log.debug("Moved %s to %s" % (raw, filename))
                    shutil.move(raw, filename)

                #file_entity = Entity(name=os.path.basename(file_name))
                # The file's Entity is named after its link and hangs off the
                # url Entity.
                file_entity = Entity(name=link)
                #file_entity[u'url'] = link
                file_entity[u'filename'] = filename
                file_entity[u'repo'] = hostname
                DBSession.add(file_entity)
                file_entity.parent = entity
                #file_entity.parent = parent
                self.log.debug("Created entity %r (parent %r)" % (
                    file_entity.name, file_entity.parent.name))

                # Determine the file magic, and call the appropriate handler
                file_entity[u'magic'] = self.call_magic_handler(filename, file_entity)
                DBSession.flush()

# To do this stuff we'll need to return an entity from the url handler?
        #if 'num_files' in entity.facts:
        #    num_files = int(entity['num_files'])
        #    print repr(num_files)
#
#            if num_files != num_downloads:
#                self.log.info('Downloaded %d more files from previous scrape' %
#                              num_downloads - num_files)
#            entity[u'num_files'] += num_downloads
#        else:
        #entity[u'num_files'] = num_downloads
        #if u'date_added' not in entity.facts:
        #    entity[u'date_added'] = unicode(datetime.utcnow())
        #entity[u'date_last_scraped'] = unicode(datetime.utcnow())

        # Start a changelog on the first scrape of this entity.
        if 'changelog' not in entity.facts:
            entity[u'changelog'] = []

        finish = datetime.utcnow()

        # One record per call, with the timestamps stored as text.
        changelog = {
            u'start_time': unicode(start),
            u'finish_time': unicode(finish),
            u'elapsed_time': unicode(finish-start),
            u'num_downloads': num_downloads,
            #u'num_children': len(entity.children),
            #~ u'git_commit': self.get_latest_commit_id(),
            }
        # NOTE(review): this appends to the value returned by the lookup;
        # confirm that Entity persists in-place changes to a stored list.
        entity[u'changelog'].append(changelog)

        DBSession.commit()

        self.log.info("== Statistics ==")
        self.log.info("Scraped url: " + url)
        self.log.info("Number of downloaded files: %d" % num_downloads)
Exemplo n.º 26
0
 def test_associating_facts_unicode_by_get(self):
     """A unicode fact set by item assignment can be read back with ``get``."""
     fruit = Entity('apple')
     DBSession.add(fruit)
     DBSession.commit()

     fruit['foo'] = u'bar'

     fetched = fruit.get('foo')
     eq_(fetched, 'bar')
Exemplo n.º 27
0
 def test_associating_facts_get_custom_default(self):
     """``get`` hands back the caller's default for a fact that was never set."""
     fruit = Entity('apple')
     DBSession.add(fruit)
     DBSession.commit()

     fruit['foo'] = u'bar'

     fallback = fruit.get('baz', 'zomg')
     eq_(fallback, 'zomg')
Exemplo n.º 28
0
    def data_gov_handler(self, url):
        """ data.gov handler.

        Scrapes a data.gov raw dataset profile (a url containing ``/raw/``)
        and records it in the entity tree:

        Entity(CIVX)
        |-Entity(data.gov)
        | |-Entity(Agency)
        |    |-Entity(Sub-Agency)                 (only if the page lists one)
        |    | |-Entity(<dataset title>)
        |    | | |-Fact(url), Fact(repo), Fact(<field>), ...
        |    | | |-Entity(http://explore.data.gov/download/...)

        When the page lists no Agency, the dataset entity is attached
        directly to the ``data.gov`` entity.  Every ``/download`` link on the
        page is fetched with ``self.download_file``, moved to
        ``<git_dir>/<hostname>[/Agency[/Sub-Agency]]/<title>/`` and passed to
        ``self.call_magic_handler``.

        ``url`` is the data.gov page to scrape.

        Raises NotImplementedError for urls without ``/raw/``; scraping
        titles from result tables is not supported yet.
        """
        self.log.debug('data_gov_handler(%s)' % locals())
        parsed_url = urlparse(url)
        hostname = parsed_url[1].replace('www.', '')
        # Labels searched for on the page; each one found becomes a fact.
        fields = ('Agency', 'Sub-Agency', 'Category', 'Date Released',
                  'Date Updated', 'Time Period', 'Frequency',
                  'Description', 'Data.gov Data Category Type',
                  'Specialized Data Category Designation',
                  'Keywords', 'Unique ID', 'Citation',
                  'Agency Program Page', 'Agency Data Series Page',
                  'Unit of Analysis', 'Granularity', 'Geographic Coverage',
                  'Collection Mode', 'Data Collection Instrument',
                  'Data Dictionary/Variable List', 'Technical Documentation',
                  'Additional Metadata')

        # Our top-level data.gov entity
        data_gov = Entity.by_name('data.gov')
        if not data_gov:
            data_gov = Entity(name=u'data.gov')
            DBSession.add(data_gov)
            root = Entity.by_name(u'CIVX')
            if not root:
                root = Entity(name=u'CIVX')
                DBSession.add(root)
            data_gov.parent = root

        # NOTE: there is no "entity already exists; skipping" check here, so
        # scraping the same url again creates a fresh set of entities.

        soup = self.get_soup(url)

        # If this is a raw data profile, grab the title of the dataset
        if '/raw/' in url:
            # Create a new Entity for this URL, named after the dataset title
            title = soup.find('h2', {'id': 'datasetName'}).string.decode('utf-8', 'replace')
            entity = Entity(name=title)
            entity[u'url'] = url
            entity[u'repo'] = hostname
            DBSession.add(entity)
            # Path components of the download directory, joined further down.
            dest = [self.config['git_dir'], hostname]

            # Extract data for each field
            for field in fields:
                data = soup.find(text=field)
                # The value is read from the second node after the label.
                if data and data.next and data.next.next:
                    data = data.next.next.string
                    if data:
                        entity[unicode(field)] = data.decode('utf-8').strip()

            DBSession.flush()

            # Fall back to the data.gov entity so that ``parent`` is always
            # bound, even for a page that lists no Agency.
            parent = data_gov

            # Create separate parent Agency Entity
            if u'Agency' in entity.facts:
                agency = Entity(name=entity[u'Agency'])
                agency.parent = data_gov
                parent = agency
                DBSession.add(agency)
                dest.append(entity[u'Agency'])
                if u'Sub-Agency' in entity.facts:
                    subagency = Entity(name=entity[u'Sub-Agency'])
                    subagency.parent = agency
                    parent = subagency
                    DBSession.add(subagency)
                    dest.append(entity[u'Sub-Agency'])
                DBSession.flush()

            # Have the URL be the child of the sub-agency, agency or data.gov
            entity.parent = parent

            # Elegant repo paths: data.gov/Agency[/Sub-Agency]/title/filename
            dest.append(entity.name)
            dest = os.path.join(*dest)
            if not os.path.isdir(dest):
                os.makedirs(dest)

            # Scrape all available raw data types
            downloads = soup.find_all('a', href=re.compile(r'^/download'))
            for button in downloads:
                # The first word of the button text names the data type.
                data = button.string.split()[0]
                link = button['href']
                if link:
                    link = urljoin('http://explore.data.gov', link)
                    entity[data.lower()] = link
                    parsed_link = urlparse(link)
                    file_name = parsed_link[2].split('/')[-1]

                    raw = self.download_file(link)
                    filename = os.path.join(dest, file_name)
                    shutil.move(raw, filename)
                    self.log.debug("Moved %s to %s" % (raw, filename))

                    # Create a new entity for this file
                    file_entity = Entity(name=link)
                    DBSession.add(file_entity)
                    file_entity[u'filename'] = filename
                    file_entity.parent = entity

                    # Process this file accordingly
                    self.call_magic_handler(filename, file_entity)

            # Find external map links
            map_anchor = soup.find('a', href=re.compile(r'^/externallink/map/'))
            if map_anchor:
                # 18 == len('/externallink/map/'): drop that prefix, keep the
                # first remaining path segment and turn '###' back into '/'.
                map_link = urllib.unquote(map_anchor.get('href', '')[18:]).split('/')[0].replace('###', '/')
                entity[u'map'] = map_link

            DBSession.flush()

        # If this is from a table of results, grab the title from this row
        else:
            # No entity exists on this path, so log the url itself rather
            # than reading it back from an unbound ``entity``.
            self.log.debug("entity[url] = %r" % url)
            raise NotImplementedError("Scraping titles from data.gov tables not yet supported")