Example #1
    def process_user(self, job, js, headers, raw_response):
        """
        Process user detail data
        :param job:
        :param js:
        :param headers:
        :param raw_response:
        :return:
        """
        if 'id' not in js:
            logger.error('Field ID not found in user')
            return

        s = self.session()
        try:
            user_id = int(js['id'])
            dbu = s.query(GitHubUserDetails).filter(
                GitHubUserDetails.id == user_id).one_or_none()
            is_new = False

            if dbu is None:
                is_new = True
                dbu = GitHubUserDetails()
                dbu.id = user_id

            dbu.date_last_check = salch.func.now()
            dbu.username = js['login']
            dbu.name = utils.utf8ize(utils.defvalkey(js, 'name'))

            dbu.company = utils.utf8ize(utils.defvalkey(js, 'company'))
            dbu.blog = utils.defvalkey(js, 'blog')
            dbu.email = utils.defvalkey(js, 'email')
            dbu.bio = utils.utf8ize(utils.defvalkey(js, 'bio'))
            dbu.usr_type = utils.defvalkey(js, 'type')

            dbu.public_repos = js['public_repos']
            dbu.public_gists = js['public_gists']
            dbu.followers = js['followers']
            dbu.following = js['following']

            dbu.created_at = utils.dt_norm(
                utils.try_parse_timestamp(utils.defvalkey(js, 'created_at')))
            dbu.updated_at = utils.dt_norm(
                utils.try_parse_timestamp(utils.defvalkey(js, 'updated_at')))

            if is_new:
                s.add(dbu)
            else:
                s.merge(dbu)
            s.commit()
            s.flush()
            s.expunge_all()

        except Exception as e:
            logger.error('Exception storing user details: %s: %s' %
                         (js['id'], e))
            logger.debug(traceback.format_exc())

        finally:
            utils.silent_close(s)
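
A note on utils.defvalkey, which Example #1 (and most examples below) uses for optional JSON fields: its source is not included in this listing, but from the call sites (defvalkey(js, 'name'), defvalkey(job.meta, 'page', 1)) it reads like dict.get with an optional default. A minimal stand-in under that assumption - not the project's actual code:

    def defvalkey(js, key, default=None):
        """Best-guess stand-in for utils.defvalkey: js[key] if the key is
        present, otherwise the default. The real signature may differ."""
        if js is None or key not in js:
            return default
        return js[key]

    # e.g. cur_page = defvalkey(job.meta, 'page', 1)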
Example #2
    def process_item(self, item, spider):
        """
        Process item for persisting
        :param item: 
        :param spider: 
        :return: 
        """
        s = self.session()
        try:
            if isinstance(item, PomItem):
                self.store_pom(item, s)
            elif isinstance(item, AscItem):
                self.store_asc(item, s)
            elif isinstance(item, ArtifactItem):
                self.store_index(item, s)
            elif isinstance(item, LinkItem):
                pass
            else:
                logger.warning('Unknown item: %s type %s' % (item, type(item)))
                return

            s.commit()
            s.flush()  # writes changes to DB
            s.expunge_all()  # removes objects from session
        except Exception as e:
            logger.warning('Exception in storing key %s' % e)

        finally:
            utils.silent_close(s)
        return item
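
Examples #1 and #2 - and most snippets below - repeat one session lifecycle: open, commit, flush, expunge, log on failure, silent_close in finally. A sketch of how that could be consolidated into a context manager; session_factory stands in for the examples' self.session, and the sketch assumes an SQLAlchemy-style session (commit, rollback, expunge_all, close):

    import contextlib
    import logging
    import traceback

    logger = logging.getLogger(__name__)

    @contextlib.contextmanager
    def scoped_session(session_factory):
        """Commit on success, log and roll back on failure, always close."""
        s = session_factory()
        try:
            yield s
            s.commit()
            s.expunge_all()  # detach stored objects, as the originals do
        except Exception:
            logger.warning('Exception in DB operation')
            logger.debug(traceback.format_exc())
            s.rollback()
        finally:
            try:
                s.close()  # mirror utils.silent_close: never raise on cleanup
            except Exception:
                pass

With this, the body of process_item would reduce to a with scoped_session(self.session) as s: block around the isinstance dispatch.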
Example #3
    def work(self):
        """
        Entry point after argument processing.
        :return: 
        """
        self.config_file = self.args.config
        self.init_config()
        self.init_db()

        s = self.session()
        js_res = []
        try:
            res = s.query(MavenSignature.sig_key_id).group_by(
                MavenSignature.sig_key_id).all()
            for keyrec in res:
                js_res.append(keyrec.sig_key_id)

            js_res.sort()
            print(json.dumps(js_res))

        except Exception as e:
            logger.error('Exception in dump: %s' % e)
            logger.debug(traceback.format_exc())
        finally:
            utils.silent_close(s)
Example #4
    def process_colab(self, job, js, headers, raw_response):
        """
        Process colaborators for org owned repos
        :param job:
        :param js:
        :param headers:
        :param raw_response:
        :return:
        """
        for colab in js:
            if 'id' not in colab:
                logger.error('Field ID not found in colab')
                continue

            s = self.session()
            try:
                # delete any existing row first to avoid duplicate-key exceptions
                s.query(GitHubRepoColab)\
                    .filter(GitHubRepoColab.user_name == colab['login'])\
                    .filter(GitHubRepoColab.repo_name == job.meta['repo'])\
                    .delete()

                dbu = GitHubRepoColab()
                dbu.repo_name = job.meta['repo']
                dbu.user_name = colab['login']
                dbu.can_pull = colab['permissions']['pull']
                dbu.can_push = colab['permissions']['push']
                dbu.can_admin = colab['permissions']['admin']

                s.add(dbu)
                s.commit()
                s.flush()
                s.expunge_all()

            except Exception as e:
                logger.error('Exception storing colab details: %s:%s: %s' %
                             (colab['login'], job.meta['repo'], e))
                logger.debug(traceback.format_exc())

            finally:
                utils.silent_close(s)

        if len(js) == 0:
            return

        # Load next page
        cur_page = utils.defvalkey(job.meta, 'page', 1)
        new_url = (self.ORG_REPO_COLAB_URL %
                   (job.meta['repo'])) + ('?page=%s' % (cur_page + 1))
        new_meta = dict(job.meta)
        new_meta['page'] = cur_page + 1

        job = DownloadJob(url=new_url,
                          jtype=DownloadJob.TYPE_REPO_COLAB,
                          meta=new_meta)
        self.link_queue.put(job)
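
The tail of Example #4 - read the current page from job.meta, build a ?page=N+1 URL, enqueue a fresh DownloadJob - recurs in Examples #6, #12 and #13. A hypothetical helper (not in the original codebase) factoring it out; DownloadJob, link_queue and utils.defvalkey are the source's own names:

    def enqueue_next_page(self, job, url_base, jtype):
        """Schedule the next page of a paginated GitHub API listing."""
        cur_page = utils.defvalkey(job.meta, 'page', 1)
        new_meta = dict(job.meta)
        new_meta['page'] = cur_page + 1
        self.link_queue.put(DownloadJob(url='%s?page=%s' % (url_base, cur_page + 1),
                                        jtype=jtype,
                                        meta=new_meta))

Example #4's pagination block would then be a single call: self.enqueue_next_page(job, self.ORG_REPO_COLAB_URL % job.meta['repo'], DownloadJob.TYPE_REPO_COLAB).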
Example #5
    def store_users_list(self, users):
        """
        Stores all user in the list
        :param users
        :return:
        """
        # Handle gaps in the user ID space. With the user-only optimization
        # they can cause overlaps.
        reduced_by = 0
        with self.processed_user_set_lock:
            ids = [user.user_id for user in users]
            ids_ok = set()
            for user_id in ids:
                if user_id in self.processed_user_set:
                    reduced_by += 1
                    continue
                self.processed_user_set.add(user_id)
                ids_ok.add(user_id)
            users = [user for user in users if user.user_id in ids_ok]

        # Bulk user load
        s = self.session()
        id_list = sorted([user.user_id for user in users])
        db_users = s.query(GitHubUserDb).filter(
            GitHubUserDb.id.in_(id_list)).all()
        db_user_map = {user.id: user for user in db_users}

        for user in users:
            self.new_users_events.insert()

            # Store user to the DB
            try:
                db_user = utils.defvalkey(db_user_map, key=user.user_id)
                self.store_user(user, s, db_user=db_user, db_user_loaded=True)

            except Exception as e:
                logger.warning('[%02d] Exception in storing user %s' %
                               (self.local_data.idx, e))
                self.trace_logger.log(e)
                logger.info('[%02d] idlist: %s' %
                            (self.local_data.idx, id_list))
                self.trigger_quit()
                break

        try:
            s.commit()
            # logger.info('[%02d] Commited, reduced by: %s' % (self.local_data.idx, reduced_by))
        except Exception as e:
            logger.warning('[%02d] Exception in storing bulk users' %
                           self.local_data.idx)
            logger.warning(traceback.format_exc())
            logger.info('[%02d] idlist: %s' % (self.local_data.idx, id_list))
            self.trigger_quit()
        finally:
            utils.silent_close(s)
Example #6
    def process_assignee(self, job, js, headers, raw_response):
        """
        Process assignees for org owned repos
        :param job:
        :param js:
        :param headers:
        :param raw_response:
        :return:
        """
        for assignee in js:
            if 'id' not in assignee:
                logger.error('Field ID not found in assignees')
                continue

            s = self.session()
            try:
                # delete any existing row first to avoid duplicate-key exceptions
                s.query(GitHubRepoAssignee)\
                    .filter(GitHubRepoAssignee.user_name == assignee['login'])\
                    .filter(GitHubRepoAssignee.repo_name == job.meta['repo'])\
                    .delete()

                dbu = GitHubRepoAssignee()
                dbu.repo_name = job.meta['repo']
                dbu.user_name = assignee['login']

                s.add(dbu)
                s.commit()
                s.flush()
                s.expunge_all()

            except Exception as e:
                logger.error('Exception storing assignee details: %s:%s: %s' %
                             (assignee['login'], job.meta['repo'], e))
                logger.debug(traceback.format_exc())

            finally:
                utils.silent_close(s)

        if len(js) == 0:
            return

        # Load next page
        cur_page = utils.defvalkey(job.meta, 'page', 1)
        new_url = (self.ORG_REPO_ASSIGNEES_URL %
                   (job.meta['repo'])) + ('?page=%s' % (cur_page + 1))
        new_meta = dict(job.meta)
        new_meta['page'] = cur_page + 1

        job = DownloadJob(url=new_url,
                          jtype=DownloadJob.TYPE_REPO_ASSIGNEE,
                          meta=new_meta)
        self.link_queue.put(job)
Example #7
    def flush_sqlite(self, buff):
        if len(buff) == 0:
            return
        if self.sqlite_file is None:
            return
        if not self.sqlite_data:
            return

        s = self.sqlite_session()
        try:
            for elem in buff:
                s.merge(elem)

            logger.debug('Committing %d elems %s' % (len(buff), s))
            s.commit()  # commit flushes pending changes itself
        finally:
            utils.silent_close(s)
Example #8
    def _init_queue(self):
        """
        Initializes link queue
        :return: 
        """
        sess = self.session()
        logger.debug('Loading users...')

        users_cnt = 0
        all_keys = sess.query(GitHubKey).filter(
            GitHubKey.is_interesting == 1).all()
        logger.debug('All users loaded')

        for rec in all_keys:
            users_cnt += 1

            # Enqueue the three per-user fetches: user detail, orgs, repos.
            for url_tpl, jtype in ((self.USER_DETAIL_URL, DownloadJob.TYPE_USER),
                                   (self.USER_ORGS_URL, DownloadJob.TYPE_ORG),
                                   (self.USER_REPOS_URL, DownloadJob.TYPE_REPOS_USER)):
                job = DownloadJob(url=url_tpl % rec.key_user_found,
                                  jtype=jtype,
                                  meta={
                                      'user': rec.key_user_found,
                                      'user_id': rec.key_user_id_found
                                  })
                self.link_queue.put(job)

        logger.info('Queue initialized, users cnt: %s' % users_cnt)
        utils.silent_close(sess)
Example #9
    def fill_user_key_links(self):
        """
        Loads next X users from the database, advances since_id
        :return:
        """
        s = self.session()
        try:
            db_users = s.query(GitHubUserDb)\
                .filter(GitHubUserDb.id > self.since_id)\
                .order_by(GitHubUserDb.id)\
                .limit(self.user_load_bulk)\
                .all()

            for user in db_users:
                key_url = self.KEYS_ID_URL % user.id
                github_user = GitHubUser(user_id=user.id,
                                         user_name=user.username,
                                         user_type=user.usr_type,
                                         user_url=self.USER_URL %
                                         user.username)
                new_job = DownloadJob(url=key_url,
                                      jtype=DownloadJob.TYPE_KEYS,
                                      user=github_user,
                                      priority=random.randint(0, 1000),
                                      time_added=time.time())
                self.link_queue.put(new_job)

                if user.id > self.since_id:
                    self.since_id = user.id

        except Exception as e:
            logger.warning('Exception in loading users: %s' % e)
            utils.silent_rollback(s)

        finally:
            utils.silent_close(s)
Example #10
    def process_keys_data(self, job, js, headers, raw_response):
        """
        Processing key loaded data
        :param job:
        :param js:
        :param headers:
        :param raw_response:
        :return:
        """
        js_empty = js is None or len(js) == 0

        # Failures are expected below - commit earlier work first.
        if self.merge and not js_empty:
            try:
                s = self.session()
                s.commit()
            except Exception as e:
                logger.warning('Could not pre-commit: %s' % e)

        # Store each key.
        key_ids = []
        for key in (js or []):
            s = None
            self.new_keys_events.insert()

            try:
                s = self.session()
                self.store_key(job.user, key, s)
                key_ids.append(int(key['id']))
                s.commit()

                self.assoc_key(job.user.user_id, key['id'], s)
                s.commit()

                s.flush()  # writes changes to DB
                s.expunge_all()  # removes objects from session

            except Exception as e:
                logger.warning('Exception in storing key %s' % e)
                self.trace_logger.log(e)

            finally:
                utils.silent_close(s)

        # De-associate keys no longer present
        s = None
        try:
            s = self.session()
            self.deassoc_lost_keys(job.user.user_id, key_ids, s)
            s.commit()

        except Exception as e:
            logger.warning('Exception in deassoc for users %s : %s' %
                           (job.user.user_id, e))
            self.trace_logger.log(e)

        finally:
            utils.silent_close(s)

        self.on_keys_processed()
Example #11
    def parse_page(self, response):
        """
        General page parser
        :param response: 
        :return: 
        """
        links_visit = set()
        links = set()
        for link in LxmlLinkExtractor(allow=(), deny=()).extract_links(response):
            # Add all links except up link.
            if link.text != '../':
                links.add(link.url)

        # Links extracted from the current page.
        # Extract links only if landed in the artifact directory.
        is_artifact = False
        art_conf = 0
        if len(links) < 100:
            art_conf += 3

        versions = []
        misc_files = []
        for link in links:
            if link.endswith('/maven-metadata.xml'):
                is_artifact = True

            last_segment = link
            last_slash = link.endswith('/')

            if last_slash:
                last_segment = link[:-1]
            last_segment = last_segment.rsplit('/', 1)[1]

            if self.is_version_folder(last_segment):
                art_conf += 1
                versions.append({'v': last_segment, 'l': self.remove_prefix(link, response.url)})

            elif link != response.url:
                misc_files.append(self.remove_prefix(link, response.url))

        # TODO: if non-standard format, download also maven-metadata.xml
        # Store only artifacts related URLs
        if is_artifact or art_conf > 5:
            logger.info('New artifact(%s), confidence(%s): %s' % (is_artifact, art_conf, response.url))
            item = ArtifactItem()
            item['url'] = response.url
            item['versions'] = versions
            item['misc_files'] = misc_files
            item['artifact_detected'] = is_artifact
            item['confidence'] = art_conf
            yield item

            # Generate request for the newest version
            if is_artifact and len(versions) > 0:
                cur_sess = None
                try:
                    cur_sess = self.session()

                    burl = utils.strip_leading_slash(response.url)
                    grp_id, art_id = get_maven_id_from_url(burl)

                    for cur_version in pick_versions([x['v'] for x in versions]):
                        if self.pom_exists(grp_id, art_id, cur_version, cur_sess):
                            continue
                        logger.info('Enqueueing artifact %s %s %s' % (grp_id, art_id, cur_version))
                        meta = {'burl': burl, 'artifact_id': art_id, 'group_id': grp_id,
                                'max_version': cur_version}
                        art_url = '%s/%s' % (burl, cur_version)
                        art_base_name = '%s-%s' % (art_id, cur_version)
                        pom_link = '%s/%s.pom' % (art_url, art_base_name)
                        yield Request(pom_link, callback=self.parse_pom, meta=dict(meta))

                except Exception as e:
                    logger.debug('Exception in POM exist check: %s, self: %s, sess: %s' % (e, self, self.session))
                    logger.debug(traceback.format_exc())

                finally:
                    # close the session on success as well as on failure
                    utils.silent_close(cur_sess)

            # Case: maven-metadata is present, but there are also other directories here -> crawl them.
            # Otherwise do not follow any more links from this page.
            base_url = response.url
            if base_url[-1] != '/':
                base_url += '/'

            links = [base_url + x for x in misc_files if x.endswith('/')]

        # Links post processing
        for link in links:
            if not self.should_follow_link(link, response):
                continue
            links_visit.add(link)

        logger.debug('Extracted %s links from %s' % (len(links_visit), response.url))
        for link in list(links_visit):
            yield Request(link, callback=self.parse_page)
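
parse_page defers to self.is_version_folder, whose implementation is not shown in this listing. Purely for orientation, a plausible predicate - an assumption, not the project's actual check - would accept Maven-style version directory names such as 1.0, 2.3.1 or 1.0-SNAPSHOT:

    import re

    # Illustrative only: the real is_version_folder is not shown here.
    VERSION_RE = re.compile(r'^\d+(\.\d+)*([.-][0-9A-Za-z]+)*$')

    def is_version_folder(last_segment):
        """Heuristic: does a directory name look like a version folder?"""
        return bool(VERSION_RE.match(last_segment))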
Example #12
    def process_repo(self, job, js, headers, raw_response, from_user):
        """
        Process repo list page
        :param job: 
        :param js: 
        :param headers: 
        :param raw_response: 
        :param from_user: 
        :return: 
        """
        for repo in js:
            if 'id' not in repo:
                logger.error('Field ID not found in repos')
                continue

            s = self.session()
            try:
                repo_id = int(repo['id'])
                dbe = s.query(GitHubRepo).filter(
                    GitHubRepo.id == repo_id).one_or_none()

                dbu = GitHubRepo()
                dbu.id = repo_id
                dbu.user_repo = from_user
                if from_user:
                    dbu.username = job.meta['user']
                else:
                    dbu.org_name = job.meta['org']

                if 'owner' in repo:
                    dbu.owner_id = repo['owner']['id']
                    dbu.owner_login = repo['owner']['login']

                dbu.repo_name = repo['full_name']
                dbu.repo_stars = repo['stargazers_count']
                dbu.repo_forks = repo['forks']
                dbu.repo_watchers = repo['watchers']
                dbu.repo_is_fork = repo['fork']
                dbu.repo_size = repo['size']
                dbu.repo_homepage = utils.defvalkey(repo, 'homepage')
                dbu.repo_language = utils.defvalkey(repo, 'language')
                dbu.created_at = utils.dt_norm(
                    utils.try_parse_timestamp(
                        utils.defvalkey(repo, 'created_at')))
                dbu.updated_at = utils.dt_norm(
                    utils.try_parse_timestamp(
                        utils.defvalkey(repo, 'updated_at')))
                dbu.pushed_at = utils.dt_norm(
                    utils.try_parse_timestamp(
                        utils.defvalkey(repo, 'pushed_at')))

                dbu.repo_description = utils.utf8ize(repo['description'])

                dbu.repo_stargazers_url = repo['stargazers_url']
                dbu.repo_forks_url = repo['forks_url']

                if not from_user and repo['stargazers_count'] > 100:
                    new_meta = dict(job.meta)
                    new_meta['page'] = 1
                    new_meta['repo'] = repo['full_name']
                    new_meta['owner'] = repo['owner']['login']

                    # Collaborator fetch is skipped - it requires auth.

                    # Assignee fetch
                    new_job = DownloadJob(url=self.ORG_REPO_ASSIGNEES_URL %
                                          (repo['full_name']),
                                          jtype=DownloadJob.TYPE_REPO_ASSIGNEE,
                                          meta=dict(new_meta))

                    self.link_queue.put(new_job)

                # DB save
                if dbe is None:
                    s.add(dbu)

                else:
                    if dbe.username != dbu.username:
                        logger.warning('Username does not match for %s %s %s' %
                                       (repo_id, dbe.username, dbu.username))
                    if dbe.org_name != dbu.org_name:
                        logger.warning('org_name does not match for %s %s %s' %
                                       (repo_id, dbe.org_name, dbu.org_name))
                    if dbe.owner_login != dbu.owner_login:
                        logger.warning(
                            'owner_login does not match for %s %s %s' %
                            (repo_id, dbe.owner_login, dbu.owner_login))

                s.commit()
                s.flush()
                s.expunge_all()

            except Exception as e:
                logger.error(
                    'Exception storing repo details: %s:%s meta: %s, url: %s, exc: %s'
                    % (repo['id'], repo['full_name'], json.dumps(
                        job.meta), job.url, e))
                logger.debug(traceback.format_exc())

            finally:
                utils.silent_close(s)

        if len(js) == 0:
            return

        # Load next page
        cur_page = utils.defvalkey(job.meta, 'page', 1)
        new_meta = dict(job.meta)
        new_meta['page'] = cur_page + 1

        if from_user:
            new_url = (self.USER_REPOS_URL %
                       job.meta['user']) + ('?page=%s' % (cur_page + 1))
            job = DownloadJob(url=new_url,
                              jtype=DownloadJob.TYPE_REPOS_USER,
                              meta=new_meta)
        else:
            new_url = (self.ORG_REPOS_URL % job.meta['org']) + ('?page=%s' %
                                                                (cur_page + 1))
            job = DownloadJob(url=new_url,
                              jtype=DownloadJob.TYPE_REPOS_ORG,
                              meta=new_meta)

        self.link_queue.put(job)
Example #13
    def process_org(self, job, js, headers, raw_response):
        """
        Process user -> orgs data
        :param job:
        :param js:
        :param headers:
        :param raw_response:
        :return:
        """
        new_orgs = []
        for org in js:
            if 'id' not in org:
                logger.error('Field ID not found in orgs')
                continue

            s = self.session()
            try:
                org_id = int(org['id'])

                # delete any existing row first to avoid duplicate-key exceptions
                s.query(GitHubUserOrgs)\
                    .filter(GitHubUserOrgs.org_id == org_id)\
                    .filter(GitHubUserOrgs.username == job.meta['user'])\
                    .delete()

                dbu = GitHubUserOrgs()
                dbu.username = job.meta['user']
                dbu.org_id = org_id
                dbu.org_name = org['login']
                dbu.org_desc = utils.utf8ize(org['description'])
                new_orgs.append(org['login'])

                s.add(dbu)

                s.commit()
                s.flush()
                s.expunge_all()

            except Exception as e:
                logger.error('Exception storing user->org details: %s: %s' %
                             (org['id'], e))
                logger.debug(traceback.format_exc())

            finally:
                utils.silent_close(s)

        if len(js) == 0:
            return

        # Load next page
        cur_page = utils.defvalkey(job.meta, 'page', 1)
        new_url = (self.USER_ORGS_URL % job.meta['user']) + ('?page=%s' %
                                                             (cur_page + 1))
        new_meta = dict(job.meta)
        new_meta['page'] = cur_page + 1

        next_job = DownloadJob(url=new_url,
                               jtype=DownloadJob.TYPE_ORG,
                               meta=new_meta)
        self.link_queue.put(next_job)

        # Load repositories for new organisations
        with self.orgs_loaded_lock:
            not_loaded_orgs = set(new_orgs) - self.orgs_loaded_set
            self.orgs_loaded_set.update(new_orgs)

        for org_name in not_loaded_orgs:
            org_meta = dict(job.meta)
            org_meta['page'] = 1
            org_meta['org'] = org_name
            org_job = DownloadJob(url=self.ORG_REPOS_URL % org_name,
                                  jtype=DownloadJob.TYPE_REPOS_ORG,
                                  meta=org_meta)
            self.link_queue.put(org_job)
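
Examples #5 and #13 share one idiom: under a lock, diff the incoming IDs against an already-processed set, claim the new ones, and act only on those. A generic sketch of that idiom (the class and its names are hypothetical, not part of the original code):

    import threading

    class SeenFilter(object):
        """Atomically claim items not processed before; return only those."""

        def __init__(self):
            self._lock = threading.Lock()
            self._seen = set()

        def claim_new(self, items):
            with self._lock:
                fresh = [x for x in items if x not in self._seen]
                self._seen.update(fresh)
                return fresh

In Example #13 this would replace the orgs_loaded_lock block: not_loaded_orgs = seen_orgs.claim_new(new_orgs).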