示例#1
0
文件: data.py 项目: FOSSRIT/infoboard
def comment_info(comment):
    """Return the cached Entity for a GitHub comment, creating it on a miss.

    Args:
        comment: dict-like GitHub comment payload; must supply 'id' and 'body'.

    Returns:
        The Entity named ``comment_<id>``.
    """
    comment_name = u'comment_{0}'.format(comment['id'])
    # Look the entity up once instead of re-querying up to two more times.
    entity = Entity.by_name(comment_name)
    if not entity:
        print("Caching new comment {0}".format(comment_name))
        entity = Entity(comment_name)
        entity[u'body'] = comment['body']
        DBSession.add(entity)
        DBSession.commit()
    return entity
示例#2
0
def comment_info(comment):
    """Look up (and lazily create) the cached Entity for *comment*."""
    name = u'comment_{0}'.format(comment['id'])
    cached = Entity.by_name(name)
    if not cached:
        print("Caching new comment {0}".format(name))
        fresh = Entity(name)
        fresh[u'body'] = comment['body']
        DBSession.add(fresh)
        DBSession.commit()
    # Re-query by name, exactly as callers have always received it.
    return Entity.by_name(name)
示例#3
0
文件: data.py 项目: FOSSRIT/infoboard
def issue_info(issue):
    """Return the cached Entity for a GitHub issue, creating it on a miss.

    Args:
        issue: dict-like GitHub issue payload with 'id', 'title', 'number'.

    Returns:
        The Entity named ``issue_<id>``.
    """
    issue_name = u'issue_{0}'.format(issue['id'])
    # Look the entity up once instead of re-querying up to two more times.
    entity = Entity.by_name(issue_name)
    if not entity:
        print("Caching new issue {0}".format(issue_name))
        entity = Entity(issue_name)
        entity[u'title'] = issue['title']
        entity[u'number'] = issue['number']
        DBSession.add(entity)
        DBSession.commit()
    return entity
示例#4
0
def issue_info(issue):
    """Look up (creating if absent) the cached Entity for a GitHub issue."""
    key = u'issue_{0}'.format(issue['id'])
    if not Entity.by_name(key):
        print("Caching new issue {0}".format(key))
        record = Entity(key)
        record[u'title'] = issue['title']
        record[u'number'] = issue['number']
        DBSession.add(record)
        DBSession.commit()
    return Entity.by_name(key)
示例#5
0
文件: data.py 项目: FOSSRIT/infoboard
def recent_events(days=0, limit=0):
    """Return cached GitHub event entities, newest first.

    Args:
        days: if > 0, keep only events created within the last *days* days.
        limit: if > 0, cap the result at *limit* events.

    Returns:
        A list of Entity objects sorted by 'created_at', descending.
    """
    # Commit first so the query sees events cached by other sessions.
    DBSession.commit()
    events = DBSession.query(Entity) \
                      .filter(Entity.name.startswith('event\_', escape='\\')) \
                      .all()
    if days > 0:
        cutoff = datetime.now() - timedelta(days=days)
        # A list comprehension (not filter()) so .sort() below also works
        # on Python 3, where filter() returns an iterator.
        events = [event for event in events if event['created_at'] > cutoff]
    events.sort(key=lambda event: event['created_at'], reverse=True)
    if len(events) > limit > 0:
        events = events[:limit]
    return events
示例#6
0
def recent_events(days=0, limit=0):
    """Fetch cached 'event_*' entities, most recent first, optionally capped."""
    DBSession.commit()
    matches = DBSession.query(Entity) \
                       .filter(Entity.name.startswith('event\_', escape='\\')) \
                       .all()
    if days > 0:
        cutoff = datetime.now() - timedelta(days=days)
        matches = [item for item in matches if item['created_at'] > cutoff]
    ordered = sorted(matches, key=lambda item: item['created_at'], reverse=True)
    if 0 < limit < len(ordered):
        ordered = ordered[:limit]
    return ordered
示例#7
0
文件: ny.py 项目: decause/prowlitics
def inject_test_knowledge():
    """Seed the session with three hand-made sample entities and commit."""
    monster = Entity(u'Monster')
    fairy = Entity(u'Fairy')
    rjbean = Entity(u'rjbean')

    # Assign every fact in the same order the original data was laid out.
    for being, fact, value in (
            (monster, u'color', u'Green'),
            (monster, u'name', u'Lotharrr'),
            (fairy, u'flies', True),
            (fairy, u'name', u'Bell'),
            (rjbean, u'name', u'ralph'),
            (rjbean, u'flies', False),
            (rjbean, u'hacks', True)):
        being[fact] = value

    for being in (monster, fairy, rjbean):
        DBSession.add(being)
    DBSession.commit()
示例#8
0
文件: data.py 项目: FOSSRIT/infoboard
def user_info(user):
    """Return the cached Entity for a GitHub user, creating it on a miss.

    Args:
        user: dict-like GitHub user payload with 'id', 'login',
            'gravatar_id', and optionally 'name'.

    Returns:
        The Entity named ``user_<id>``.
    """
    user_name = u'user_{0}'.format(user['id'])
    # Look the entity up once instead of re-querying up to two more times.
    entity = Entity.by_name(user_name)
    if not entity:
        print("Caching new user {0}".format(user_name))
        entity = Entity(user_name)
        entity['login'] = user['login']
        entity['gravatar'] = user['gravatar_id']
        entity['avatar'] = u'http://www.gravatar.com/avatar/{0}?s=200' \
                             .format(user['gravatar_id'])
        # Not everyone has set a name for their account; fall back to login.
        entity[u'name'] = user.get('name') or user['login']
        DBSession.add(entity)
        DBSession.commit()
    return entity
示例#9
0
def user_info(user):
    """Look up (creating if absent) the cached Entity for a GitHub user."""
    key = u'user_{0}'.format(user['id'])
    if not Entity.by_name(key):
        print("Caching new user {0}".format(key))
        record = Entity(key)
        record['login'] = user['login']
        record['gravatar'] = user['gravatar_id']
        avatar_url = u'http://www.gravatar.com/avatar/{0}?s=200'.format(
            user['gravatar_id'])
        record['avatar'] = avatar_url
        # Not everyone has set a name for their account.
        display_name = user['name'] if user.get('name') else user['login']
        record[u'name'] = display_name
        DBSession.add(record)
        DBSession.commit()
    return Entity.by_name(key)
示例#10
0
def repo_info(repo):
    """Return the cached Entity for a GitHub repository, creating on a miss.

    Args:
        repo: dict-like GitHub repo payload with 'name', 'owner',
            'html_url', 'description', and usually 'full_name'.

    Returns:
        The Entity named ``<owner>/<name>``.
    """
    repo_name = repo.get(
        'full_name', '{0}/{1}'.format(repo['owner']['login'], repo['name']))
    # Look the entity up once instead of re-querying up to two more times.
    entity = Entity.by_name(repo_name)
    if not entity:
        print("Caching new repository {0}".format(repo_name))
        entity = Entity(repo_name)
        # Use repo_name: payloads without 'full_name' would otherwise
        # KeyError here even though the fallback above handled them.
        entity['name'] = repo_name
        # Evidently you cannot set facts to None. (?)
        entity['description'] = repo['description'] or u''
        entity['url'] = repo['html_url']
        entity['owner'] = user_info(repo['owner']).name
        DBSession.add(entity)
        DBSession.commit()
    return entity
示例#11
0
文件: data.py 项目: FOSSRIT/infoboard
def repo_info(repo):
    """Return the cached Entity for a GitHub repository, creating on a miss.

    Args:
        repo: dict-like GitHub repo payload with 'name', 'owner',
            'html_url', 'description', and usually 'full_name'.

    Returns:
        The Entity named ``<owner>/<name>``.
    """
    repo_name = repo.get('full_name', '{0}/{1}'.format(repo['owner']['login'],
                                                       repo['name']))
    # Single lookup, reused for both the existence check and the return.
    entity = Entity.by_name(repo_name)
    if not entity:
        print("Caching new repository {0}".format(repo_name))
        entity = Entity(repo_name)
        # Use repo_name: payloads without 'full_name' would otherwise
        # KeyError here even though the fallback above handled them.
        entity['name'] = repo_name
        # Evidently you cannot set facts to None. (?)
        entity['description'] = repo['description'] or u''
        entity['url'] = repo['html_url']
        entity['owner'] = user_info(repo['owner']).name
        DBSession.add(entity)
        DBSession.commit()
    return entity
示例#12
0
文件: data.py 项目: FOSSRIT/infoboard
def event_info(event):
    """Return the cached Entity for a GitHub event, creating it on a miss.

    Args:
        event: dict-like GitHub event payload with 'id', 'actor', 'repo',
            'type', 'payload', and an ISO-8601 'created_at' string.

    Returns:
        The Entity named ``event_<id>``.
    """
    event_name = u'event_{0}'.format(event['id'])
    # Look the entity up once instead of re-querying up to two more times.
    entity = Entity.by_name(event_name)
    if not entity:
        print("Caching new event {0}".format(event_name))
        entity = Entity(event_name)
        entity['name'] = event_name
        entity[u'actor'] = user_info(event['actor']).name
        try:
            entity[u'repo'] = repo_info(event['repo']['name']).name
        except Exception:
            # Narrowed from a bare except (which also swallowed
            # KeyboardInterrupt/SystemExit): event payloads only carry the
            # repo's name, so fall back to storing that raw name.
            entity['repo'] = event['repo']['name']
        entity[u'type'] = event['type']
        entity[u'payload'] = event['payload']
        entity[u'created_at'] = datetime.strptime(event['created_at'],
                                                  '%Y-%m-%dT%H:%M:%SZ')
        if 'Comment' in event['type']:
            entity[u'comment'] = comment_info(event['payload']['comment']).name
        if 'Issue' in event['type']:
            entity['issue'] = issue_info(event['payload']['issue']).name
        DBSession.add(entity)
        DBSession.commit()
    return entity
示例#13
0
def event_info(event):
    """Return the cached Entity for a GitHub event, creating it on a miss.

    Args:
        event: dict-like GitHub event payload with 'id', 'actor', 'repo',
            'type', 'payload', and an ISO-8601 'created_at' string.

    Returns:
        The Entity named ``event_<id>``.
    """
    event_name = u'event_{0}'.format(event['id'])
    # Single lookup, reused for both the existence check and the return.
    entity = Entity.by_name(event_name)
    if not entity:
        print("Caching new event {0}".format(event_name))
        entity = Entity(event_name)
        entity['name'] = event_name
        entity[u'actor'] = user_info(event['actor']).name
        try:
            entity[u'repo'] = repo_info(event['repo']['name']).name
        except Exception:
            # Narrowed from a bare except (which also swallowed
            # KeyboardInterrupt/SystemExit): event payloads only carry the
            # repo's name, so fall back to storing that raw name.
            entity['repo'] = event['repo']['name']
        entity[u'type'] = event['type']
        entity[u'payload'] = event['payload']
        entity[u'created_at'] = datetime.strptime(event['created_at'],
                                                  '%Y-%m-%dT%H:%M:%SZ')
        if 'Comment' in event['type']:
            entity[u'comment'] = comment_info(event['payload']['comment']).name
        if 'Issue' in event['type']:
            entity['issue'] = issue_info(event['payload']['issue']).name
        DBSession.add(entity)
        DBSession.commit()
    return entity
示例#14
0
    def polymorphic_csv_populator(self, entity):
        """
        Read the CSV file referenced by *entity* into Knowledge.

        Consumes only the header row to create a uniquely named database
        table whose columns mirror that header, records the table/column
        metadata as facts on *entity*, then bulk-loads all rows via
        ``populate_csv``.  Any failure is logged rather than raised.

        TODO: no dynamic dialects?
        """
        try:
            #flush_after = asint(config.get('transaction_size', 1000))
            # The repo this file belongs to is a fact on one of the
            # entity's ancestors.
            repo = utils.get_fact_from_parents(u'repo', entity)
            # Per-repo CSV dialect, if one was registered.
            custom_dialect = self.dialects.get(repo, None)
            if not custom_dialect:
                # Nothing to see here, carry on.
                pass
            elif custom_dialect not in csv.list_dialects():
                self.log.error("Dialect '%s' not found!" % custom_dialect)
            # NOTE(review): csv.reader is handed entity['filename'] directly.
            # If that fact is a filename *string*, the reader iterates its
            # characters, not the file's lines -- confirm it is a file object.
            csv_reader = csv.reader(entity['filename'], dialect=custom_dialect)

            columns = None
            # TODO: See if this file has already been parsed!

            # Only the header row (i == 0) is processed; the loop breaks
            # immediately on the first data row.
            for i, line in enumerate(csv_reader):
                if i == 0:
                    columns = line

                    # then create a Table object with the appropriate columns
                    table_name = u'civx_' + unicode(uuid.uuid4()).replace('-', '')
                    entity[u'table_name'] = table_name
                    entity[u'column_names'] = columns
                    # The actual column names behind the scenes.  CIVX will
                    # map them to the 'column_names'
                    entity[u'columns'] = [u'col_%d' % i for i in
                                          range(len(columns))]
                    #DBSession.flush()
                    table, model = utils.get_mapped_table_model_from_entity(entity)
                    model.__table__ = table

                    # Register the new model so the rest of CIVX can find
                    # the CSV source and column mapping for it.
                    civx.model.models[model] = {
                            'csv': [entity[u'filename']],
                            'columns': entity[u'columns'],
                            'tmp_csv': {}
                            }

                    metadata.create_all()
                    continue
                break

            # NOTE(review): if the CSV is empty, 'model' is never bound and
            # the lines below raise NameError (caught by the except).
            self.log.info("%d entries in %r table" % (
                    DBSession.query(model).count(),
                    entity[u'table_name']))

            # Bulk-load every row of the file into the freshly created table.
            populate_csv((
                    utils.get_fact_from_parents('repo', entity),
                    entity[u'filename'],
                    model,
                    self.engine), dialect=custom_dialect)

            self.log.info("%d entries in %r table" % (
                    DBSession.query(model).count(),
                    entity[u'table_name']))

            DBSession.commit()

        except Exception, e:
            self.log.error('Unable to parse file as CSV')
            self.log.exception(e)
示例#15
0
 def test_basic_one(self):
     """ Basic usage. """
     # A freshly committed entity keeps the name it was built with.
     fruit = Entity('apple')
     DBSession.add(fruit)
     DBSession.commit()
     eq_(fruit.name, 'apple')
示例#16
0
 def test_associating_facts_get_custom_default(self):
     """ Entity.get returns the supplied default for an unset fact. """
     fruit = Entity('apple')
     DBSession.add(fruit)
     DBSession.commit()
     fruit['foo'] = u'bar'
     # 'baz' was never set, so the custom default must come back.
     eq_(fruit.get('baz', 'zomg'), 'zomg')
示例#17
0
 def test_associating_facts_unicode_by_attr(self):
     """ A fact set via item access is readable as an attribute. """
     fruit = Entity('apple')
     DBSession.add(fruit)
     DBSession.commit()
     fruit['foo'] = u'bar'
     eq_(fruit.foo, 'bar')
示例#18
0
 def test_basic_two(self):
     """ An entity knows its name even before being added to the session. """
     fruit = Entity('apple')
     eq_(fruit.name, 'apple')
     DBSession.commit()
示例#19
0
    def consume(self, url):
        """
        This method attempts to scrape a URI.  First it tries to figure out the
        protocol, then tries to pull a hostname out of the url.  Then the git
        repo is initialized, and we take a close look at the url.

        If the hostnme is known to be tricky, it will have a special handler
        method written for it and leave from there.  Otherwise it goes through
        the general path for its protocol, attempting to find useful data.

        When everything is done, the entity is updated and messages are sent
        out announcing that the scrape is done.
        """
        self.log.debug("PolyScraper(%s)" % url)
        start = datetime.utcnow()

        # Try to pull a protocol off the URI
        protocol_end = url.find("://")
        protocol = "http"
        if not protocol_end == -1:
            protocol = url[:protocol_end]

        parsed_url = urlparse(url)
        hostname = parsed_url[1].replace('www.', '')
        # Set a hostname if none is set.
        if not hostname:
            hostname = u"localhost"

        # See if we already know about this URL
        entity = Entity.by_name(url)
        if entity:
            self.log.info('Entity(%r) already exists' % url)
        else:
            root = Entity.by_name(u'CIVX')
            if not root:
                root = Entity(name=u'CIVX')
                DBSession.add(root)
                DBSession.flush()

            parent = Entity.by_name(hostname)
            if not parent:
                parent = Entity(name=hostname)
                DBSession.add(parent)
                parent.parent = root
                self.log.debug("Created entity %r" % parent.name)

            entity = Entity(name=url)
            DBSession.add(entity)
            # hide the exact url entity from our tree
            entity.parent = parent
            self.log.debug("Created entity %r" % entity.name)

            #self.send_message('civx.knowledge.entities.new', {
            #    'msg': 'New entity created: %s' % url
            #    })

        DBSession.flush()

        # Initialize a git repo for this data source
        entity[u'repo'] = hostname
        #entity[u'url'] = url

        # Initialize the git repository for this domain
        #~ self.init_git_repo(repo=hostname)
        DBSession.flush()

        # Scrape the url (to a certain depth) for data
        num_downloads = 0

        # Provide a URL handler method that is called with each file pass
        # in the soup entity for the link instead, so we can easily look
        # around the DOM and pull out titles, etc.
        if hostname in self.url_handlers:
            #self.url_handlers[hostname](self, soup_link, file_entity)
            self.url_handlers[hostname](self, url)
        else:
            # If we do not specifically handle this file, take a basic approach
            # based on the protocol.  These could probably also be split off
            # into $protocol_handler methods.
            self.log.warning('Cannot find %s URL handler' % hostname)
            files = []
            if protocol == "ftp":
                from ftplib import FTP
                self.log.debug("FTP support is not implemented yet.")
            elif protocol == "file":
                search_path = url[protocol_end+3:]
                local_files = []
                if os.path.isdir(search_path):
                    # Find all files in directory
                    for directory in os.walk(search_path):
                        dirpath = directory[0]
                        for filename in directory[2]:
                            local_files.append(os.path.join(dirpath, filename))
                else:
                    local_files.append(search_path)

                dest = os.path.join(self.config['git_dir'], hostname)

                # FIXME: what about for links to epa.gov from data.gov?
                # we probably want our own epa.gov repo namespace to download
                # and extract this to
                #if not os.path.isdir(dest):
                #    self.log.debug("mkdir %s" % dest)
                #    os.makedirs(dest)

                # I think this section is deprecated and unnecessary...
                #for ext in extensions.split(','):
                ##    if link.endswith('%s' % ext) or '/%s/' % ext in link:
                ##        entity[u'format'] = ext
                #    if ext not in civx.model.models[Entity]:
                #        civx.model.models[Entity][ext] = []

                for path in local_files:
                    #raw = self.download_file(link)
                    #file_name = os.path.basename(link)
                    #filename = to_unicode(os.path.join(dest, file_name))
                    #num_downloads += 1

                    #shutil.copy(raw, filename)
                    #self.log.debug("Copied %s to %s" % (raw,
                    #    os.path.join(dest, file_name)))

                    ##file_entity = Entity(name=os.path.basename(file_name))
                    #file_entity = Entity(name=link)
                    ##file_entity[u'url'] = link
                    #file_entity[u'filename'] = filename
                    #file_entity[u'repo'] = hostname
                    #DBSession.add(file_entity)
                    #file_entity.parent = entity
                    ##file_entity.parent = parent
                    #self.log.debug("Created entity %r (parent %r)" % (
                    #    file_entity.name, file_entity.parent.name))

                    file_path = os.path.split(path)[0]
                    file_name = os.path.split(path)[1]
                    self.log.info("%s is a local file" % file_name)
                    files.append((file_path, file_name, path))
                    #files.append((os.path.dirname(filename), file_name, filename))

            else:
                # Assume protocol is http
                """
####
            f = urllib2.urlopen(url) # XXX: does this load everything into mem?
            if f.info().type == 'text/html':
                soup = self.get_soup(f.read())
            else: # Assume the url is a link to a direct file
                # Save the file to disk.
                # throw file at magic handlers
####
"""
                soup = self.get_soup(url)

                for link, soup_link in self.scrape_files_from_url(url, soup_links=True):
                    parsed_link = urlparse(link)
                    file_path = '/'.join(parsed_link[2].split('/')[:-1])
                    file_name = parsed_link[2].split('/')[-1]
                    files.append((file_path, file_name, link))

            for (file_path, file_name, link) in files:
                dest = self.config['git_dir'] + hostname + file_path
                local = os.path.exists(link)

                # See if this file already exists
                file_entity = Entity.by_name(link)
                #file_entity = Entity.by_name(os.path.basename(file_name))
                if file_entity:
                    self.log.info('Entity(%r) already exists; skipping.' % link)
                    continue

                # FIXME: what about for links to epa.gov from data.gov?
                # we probably want our own epa.gov repo namespace to download
                # and extract this to
                if not os.path.isdir(dest):
                    os.makedirs(dest)

                # I think this section is deprecated and unnecessary...
                for ext in extensions.split(','):
                #    if link.endswith('%s' % ext) or '/%s/' % ext in link:
                #        entity[u'format'] = ext
                    if ext not in civx.model.models[Entity]:
                        civx.model.models[Entity][ext] = []

                raw = self.download_file(link)
                filename = os.path.join(dest, file_name)
                num_downloads += 1

                if local:
                    self.log.debug("Copied %s to %s" % (raw, filename))
                    shutil.copy(raw, filename)
                else:
                    self.log.debug("Moved %s to %s" % (raw, filename))
                    shutil.move(raw, filename)

                #file_entity = Entity(name=os.path.basename(file_name))
                file_entity = Entity(name=link)
                #file_entity[u'url'] = link
                file_entity[u'filename'] = filename
                file_entity[u'repo'] = hostname
                DBSession.add(file_entity)
                file_entity.parent = entity
                #file_entity.parent = parent
                self.log.debug("Created entity %r (parent %r)" % (
                    file_entity.name, file_entity.parent.name))

                # Determine the file magic, and call the appropriate handler
                file_entity[u'magic'] = self.call_magic_handler(filename, file_entity)
                DBSession.flush()

# To do this stuff we'll need to return an entity from the url handler?
        #if 'num_files' in entity.facts:
        #    num_files = int(entity['num_files'])
        #    print repr(num_files)
#
#            if num_files != num_downloads:
#                self.log.info('Downloaded %d more files from previous scrape' %
#                              num_downloads - num_files)
#            entity[u'num_files'] += num_downloads
#        else:
        #entity[u'num_files'] = num_downloads
        #if u'date_added' not in entity.facts:
        #    entity[u'date_added'] = unicode(datetime.utcnow())
        #entity[u'date_last_scraped'] = unicode(datetime.utcnow())

        if 'changelog' not in entity.facts:
            entity[u'changelog'] = []

        finish = datetime.utcnow()

        changelog = {
            u'start_time': unicode(start),
            u'finish_time': unicode(finish),
            u'elapsed_time': unicode(finish-start),
            u'num_downloads': num_downloads,
            #u'num_children': len(entity.children),
            #~ u'git_commit': self.get_latest_commit_id(),
            }
        entity[u'changelog'].append(changelog)

        DBSession.commit()

        self.log.info("== Statistics ==")
        self.log.info("Scraped url: " + url)
        self.log.info("Number of downloaded files: %d" % num_downloads)